In [None]:
import logging
import importlib
from pprint import pprint

from IPython.display import Audio
import librosa
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow_io as tfio
import tensorflow as tf
import numpy as np
import src.preprocess
import src.io

# Pick up on-disk edits to the local helper modules without restarting the kernel.
importlib.reload(src.preprocess)
importlib.reload(src.io)

logging.basicConfig(level=logging.INFO)

# Storage locations.
DATA_ROOT = "gs://bird-clef-kimmo/data"
TRAIN_SHORT_AUDIO_DATA = f"{DATA_ROOT}/train_short_audio"

# Audio parameters.
SR = 32000  # samples per second used when plotting, playing and splitting audio
SPLIT_SECS = 5  # length of one training segment, in seconds

In [None]:
# List the top level of the data bucket; IPython substitutes the Python
# variable DATA_ROOT into the shell command before running it.
!gsutil ls $DATA_ROOT

In [None]:
# Training metadata from the project helper in src.io. The cells below use it
# through .take/.map/.flat_map, so it is presumably a tf.data.Dataset of
# per-recording rows — confirm against src.io.
train_metadata_ds = src.io.train_metadata_ds()

In [None]:
# Peek at a single metadata element to see which fields are available.
for sample in train_metadata_ds.take(1):
    pprint(sample)

In [None]:
# Class list loaded by the project helper; it is passed to
# src.io.primary_label_to_tensor when labels are built further down.
CLASSES = src.io.read_classes()
print(CLASSES)

In [None]:
def read_file(url) -> tf.Tensor:
    """Decode the audio file at ``url`` and return its samples without the channel axis.

    Args:
        url: Location of the audio file, handed straight to
            ``tfio.audio.AudioIOTensor``. When this function runs inside
            ``tf.py_function`` the value arrives as an eager string tensor.

    Returns:
        The decoded audio with axis 1 (the channel axis) squeezed out.

    Raises:
        tf.errors.InvalidArgumentError: If axis 1 does not have size 1,
            i.e. the file is not single-channel.
    """
    # Lazy %-style arguments: the message is only rendered when a handler
    # actually emits DEBUG records, so nothing is stringified per file while
    # the notebook logs at INFO.
    logging.debug("Reading file: %s", url)
    decoded = tfio.audio.AudioIOTensor(url).to_tensor()
    return tf.squeeze(decoded, axis=1)  # remove channel axis

def add_audio(row):
    """Attach the decoded waveform and its source URL to a metadata row.

    The file is looked up at
    ``TRAIN_SHORT_AUDIO_DATA/<primary_label>/<filename>``.

    Args:
        row: Dict of metadata tensors; must contain ``"filename"`` and
            ``"primary_label"``.

    Returns:
        A new dict with every entry of ``row`` plus ``"file_url"`` (the
        resolved location) and ``"audio"`` (float32 samples from ``read_file``).
    """
    filename = row["filename"]
    primary_label = row["primary_label"]
    file_url = TRAIN_SHORT_AUDIO_DATA + "/" + primary_label + "/" + filename

    # read_file needs eager execution (tensorflow-io decoding), so it is
    # wrapped in py_function when this runs inside Dataset.map.
    [audio] = tf.py_function(read_file, [file_url], [tf.float32])

    # py_function outputs carry no static shape. read_file squeezes the channel
    # axis, so the result is rank 1 with an unknown number of samples; declare
    # that so downstream graph ops know the rank.
    audio.set_shape([None])

    return {**row, "file_url": file_url, "audio": audio}

# Smoke test: decode one recording and print the enriched row.
for row in train_metadata_ds.map(add_audio).take(1):
    pprint(row)


In [None]:
# Waveform preview: one panel per recording for the first rows * cols recordings.
rows = 3
cols = 3
n = rows * cols
fig, axes = plt.subplots(rows, cols, figsize=(10, 12))

for i, sample in enumerate(train_metadata_ds.map(add_audio).take(n)):
    r, c = divmod(i, cols)  # row-major position of panel i in the grid
    ax = axes[r][c]
    audio = sample["audio"].numpy()
    label = sample["scientific_name"].numpy().decode()
    # Sample index / SR gives seconds, assuming the recordings are sampled at SR.
    ax.plot(np.arange(len(audio)) / SR, audio)
    # linspace includes both endpoints, so the ticks are symmetric around zero
    # (-1.2 ... 1.2 in steps of 0.2); a float-step arange stops short of +1.2.
    ax.set_yticks(np.linspace(-1.2, 1.2, 13))
    ax.set_title(label)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Amplitude")

fig.tight_layout()  # keep titles and axis labels of neighbouring panels apart
plt.show()

# `audio` is whatever the loop left behind, i.e. the last recording plotted.
Audio(audio, rate=SR)

In [None]:
def split_audio(audio):
    """Cut a waveform into consecutive, non-overlapping ``SPLIT_SECS``-second segments.

    Samples after the last complete segment are discarded, so a clip shorter
    than one segment yields zero segments.

    Args:
        audio: Rank-1 tensor of samples with a concrete shape (it is called
            through ``tf.py_function``, so it receives an eager tensor).
            Assumed to be sampled at ``SR`` Hz.

    Returns:
        Tensor of shape ``(n_splits, SR * SPLIT_SECS)`` holding the segments in
        order; ``n_splits`` is 0 when the clip is too short.
    """
    segment_len = SR * SPLIT_SECS
    n_splits = audio.shape[-1] // segment_len

    # One slice + reshape replaces the per-segment Python loop and keeps the
    # trailing dimension defined even when there are no complete segments.
    usable = audio[: n_splits * segment_len]
    return tf.reshape(usable, (n_splits, segment_len))

def split_to_segments(rows):
    """Expand one recording into a dataset with one element per audio segment.

    Args:
        rows: Dict of tensors describing a single recording; must contain
            ``"audio"``.

    Returns:
        A dataset whose elements hold every entry of ``rows`` plus a
        ``"segment"`` entry produced by ``split_audio``.
    """
    segments = tf.py_function(split_audio, [rows["audio"]], tf.float32)

    # The recording's fields are repeated endlessly; zip stops as soon as the
    # finite segment dataset runs out, so every segment gets exactly one copy.
    repeated_fields = tf.data.Dataset.from_tensors(rows).repeat()
    segment_ds = tf.data.Dataset.from_tensor_slices({"segment": segments})

    paired = tf.data.Dataset.zip((repeated_fields, segment_ds))
    return paired.map(lambda fields, segment: {**fields, **segment})

def add_label(sample):
    """Pair a sample with the label tensor derived from its ``"primary_label"``.

    Args:
        sample: Dict of tensors containing ``"primary_label"``.

    Returns:
        ``(sample, label)`` where ``label`` comes from
        ``src.io.primary_label_to_tensor`` evaluated against ``CLASSES``.
    """
    return sample, src.io.primary_label_to_tensor(sample["primary_label"], CLASSES)

def drop_keys(*keys):
    """Build a mapper that returns a copy of a dict without the given keys.

    Args:
        *keys: Names of the entries to remove.

    Returns:
        A function ``drop(rows)`` that leaves ``rows`` untouched and returns a
        shallow copy lacking ``keys``. It raises ``KeyError`` if any of the
        keys is absent from ``rows``.
    """

    def drop(rows):
        remaining = rows.copy()  # never mutate the caller's mapping
        for unwanted in keys:
            del remaining[unwanted]
        return remaining

    return drop

# metadata row -> decoded audio -> one element per segment -> (features, label)
short_audio_ds = (
    train_metadata_ds
    .map(add_audio)
    .flat_map(split_to_segments)
    .map(drop_keys("filename"))
    .map(add_label)
)

# Inspect the first few (features, label) pairs.
for sample, label in short_audio_ds.take(3):
    pprint(sample)
    pprint(label)