In [None]:
import logging
import importlib
from pprint import pprint

from IPython.display import Audio
import librosa
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow_io as tfio
import tensorflow as tf
import numpy as np
import src.preprocess
from src import dataset

import src.spectrogram

# Reload the project modules so that edits made to them on disk are picked up
# without restarting the kernel.
importlib.reload(src.preprocess)
importlib.reload(dataset) 
importlib.reload(src.spectrogram)

# Root of the dataset in Google Cloud Storage and the sub-folder holding the
# short training recordings (one directory per primary label, see add_audio).
DATA_ROOT = "gs://bird-clef-kimmo/data"
TRAIN_SHORT_AUDIO_DATA = f"{DATA_ROOT}/train_short_audio"

# INFO level: logging.debug messages (such as the one in read_file) are not shown.
logging.basicConfig(level=logging.INFO)

# Sample rate used for time axes, playback and spectrogram computation.
SR = 32000
# Length, in seconds, of each segment produced by split_audio.
SPLIT_SECS = 5

In [None]:
# Shell escape: list the contents of the data root (IPython expands $DATA_ROOT from the Python variable).
!gsutil ls $DATA_ROOT

In [None]:
# Metadata dataset built by the project helper. Later cells call .take/.map/.flat_map
# on it and read the "filename", "primary_label" and "scientific_name" fields.
short_audio_metadata_ds = src.dataset.short_audio_metadata_ds()

In [None]:
# Peek at a single element to see which fields each metadata row carries.
for sample in short_audio_metadata_ds.take(1):
    pprint(sample)

In [None]:
# Class list read by the project helper; passed to primary_label_to_tensor and
# tensor_to_class in later cells.
CLASSES = src.dataset.read_classes()
print(CLASSES)

In [None]:
def read_file(url) -> tf.Tensor:
    """Load one audio file and return its samples without the channel axis.

    Args:
        url: Location of the audio file, handed straight to
            ``tfio.audio.AudioIOTensor``.

    Returns:
        The decoded audio with axis 1 squeezed out. ``tf.squeeze`` raises if
        that axis does not have size 1.
    """
    logging.debug(f"Reading file: {url}")
    samples = tfio.audio.AudioIOTensor(url).to_tensor()
    # Axis 1 is the channel axis; drop it.
    mono = tf.squeeze(samples, axis=1)
    return mono

def add_audio(row):
    """Attach the file URL and the decoded audio to a metadata row.

    Args:
        row: Mapping with at least ``"filename"`` and ``"primary_label"``.

    Returns:
        A new dict holding every entry of ``row`` plus ``"file_url"`` and
        ``"audio"``.
    """
    name = row["filename"]
    species = row["primary_label"]
    # Recordings are stored as <root>/<primary_label>/<filename>.
    file_url = TRAIN_SHORT_AUDIO_DATA + "/" + species + "/" + name
    # read_file is ordinary Python, so it is wrapped in py_function; the
    # single declared output comes back as a one-element list.
    outputs = tf.py_function(read_file, [file_url], [tf.float32])
    enriched = {**row}
    enriched["file_url"] = file_url
    enriched["audio"] = outputs[0]
    return enriched

# Smoke-test add_audio on a single row and print the enriched element.
for row in short_audio_metadata_ds.map(add_audio).take(1):
    pprint(row)


In [None]:
# Plot the waveforms of the first rows * cols recordings in a grid, then offer
# the last one plotted for playback.
rows = 3
cols = 3
n = rows * cols
fig, axes = plt.subplots(rows, cols, figsize=(10, 12))

# axes.flat walks the grid row by row, i.e. the same order as indexing with
# (i // cols, i % cols).
for ax, sample in zip(axes.flat, short_audio_metadata_ds.map(add_audio).take(n)):
    audio = sample["audio"].numpy()
    label = sample["scientific_name"].numpy().decode()
    ax.plot(np.arange(len(audio)) / SR, audio)  # x axis: sample index / SR
    ax.set_yticks(np.arange(-1.2, 1.2, 0.2))
    ax.set_title(label)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Amplitude")

fig.tight_layout()
plt.show()

# `audio` still holds the last recording plotted above.
Audio(audio, rate=SR)

In [None]:
def split_audio(audio):
    """Cut a 1-D audio signal into consecutive, non-overlapping segments.

    Every segment is ``SR * SPLIT_SECS`` samples long. Trailing samples that
    do not fill a whole segment are discarded, so a signal shorter than one
    segment produces no segments.

    Args:
        audio: 1-D tensor of samples.

    Returns:
        Tensor of shape ``(n_splits, SR * SPLIT_SECS)``. ``n_splits`` is 0
        when the signal is shorter than one segment.
    """
    segment_len = SR * SPLIT_SECS
    n_splits = audio.shape[-1] // segment_len

    # A single slice + reshape replaces the per-segment Python loop and keeps
    # the result 2-D even when there is no complete segment.
    return tf.reshape(audio[: n_splits * segment_len], (n_splits, segment_len))

def split_to_segments(rows):
    """Expand one recording into a dataset with one element per audio segment.

    Args:
        rows: Dict describing a single recording; must contain ``"audio"``.

    Returns:
        A ``tf.data.Dataset`` whose elements hold every entry of ``rows`` plus
        a ``"segment"`` entry sliced from ``split_audio``'s output.
    """
    segments = tf.py_function(split_audio, [rows["audio"]], tf.float32)

    per_segment = tf.data.Dataset.from_tensor_slices({"segment": segments})
    # The recording's entries are repeated without end; zipping against the
    # finite segment dataset pairs one copy with each segment.
    recording = tf.data.Dataset.from_tensors(rows).repeat()

    paired = tf.data.Dataset.zip((recording, per_segment))
    return paired.map(lambda meta, seg: {**meta, **seg})

def add_label(sample):
    """Pair a sample with the label tensor derived from its primary label.

    Args:
        sample: Dict containing ``"primary_label"``.

    Returns:
        ``(sample, label)`` where ``label`` is the result of
        ``src.dataset.primary_label_to_tensor`` for the primary label and
        ``CLASSES``.
    """
    primary = sample["primary_label"]
    return sample, src.dataset.primary_label_to_tensor(primary, CLASSES)

def drop_keys(*keys):
    """Build a mapper that removes the given keys from a dict.

    Args:
        *keys: Keys to remove.

    Returns:
        A function that takes a dict and returns a shallow copy without
        ``keys``. The input dict is left untouched; a key that is not present
        raises ``KeyError``.
    """

    def drop(rows):
        remaining = rows.copy()
        for unwanted in keys:
            del remaining[unwanted]
        return remaining

    return drop

# Full input pipeline: load audio, split it into fixed-length segments,
# drop the filename entry, then attach the label.
short_audio_ds = (
    short_audio_metadata_ds
    .map(add_audio)
    .flat_map(split_to_segments)
    .map(drop_keys("filename"))
    .map(add_label)
)

# Show the first three (sample, label) pairs.
for sample, label in short_audio_ds.take(3):
    pprint(sample)
    pprint(label)

In [None]:
"""
# Mel spectrogram calculation in reference notebook
# https://www.kaggle.com/stefankahl/birdclef2021-model-training
RANDOM_SEED = 1337
SAMPLE_RATE = 32000
SIGNAL_LENGTH = 5 # seconds
SPEC_SHAPE = (48, 128) # height x width
FMIN = 500
FMAX = 12500
MAX_AUDIO_FILES = 1500

hop_length = int(SIGNAL_LENGTH * SAMPLE_RATE / (SPEC_SHAPE[1] - 1))
mel_spec = librosa.feature.melspectrogram(y=chunk, 
                                          sr=SAMPLE_RATE, 
                                          n_fft=1024, 
                                          hop_length=hop_length, 
                                          n_mels=SPEC_SHAPE[0], 
                                          fmin=FMIN, 
                                          fmax=FMAX)
                                                
"""

# Spectrogram settings, mirroring the reference notebook quoted above.
# SPEC_SHAPE is height x width: [0] is the number of mel bands, [1] sets the hop length.
SPEC_SHAPE = (48, 128)
# Frequency bounds (Hz) passed to the mel computations below.
FMIN = 500
FMAX = 12500

def make_spectrogram_numpy(audio):
    """Compute a dB-scaled mel spectrogram with librosa, min-max normalized.

    Args:
        audio: Tensor holding one audio segment; converted with ``.numpy()``.

    Returns:
        float32 array with values scaled into ``[0, 1]``. When the dB
        spectrogram is constant (for example a silent segment) the result is
        all zeros instead of NaN.
    """
    # Hop length derived from the segment length and the target width
    # SPEC_SHAPE[1], as in the reference notebook.
    hop_length = int(SPLIT_SECS * SR / (SPEC_SHAPE[1] - 1))
    mel_spec = librosa.feature.melspectrogram(y=audio.numpy(),
                                              sr=SR,
                                              n_fft=1024,
                                              hop_length=hop_length,
                                              n_mels=SPEC_SHAPE[0],
                                              fmin=FMIN,
                                              fmax=FMAX)

    mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

    # Normalize: shift the minimum to 0, then scale the maximum to 1.
    mel_spec = mel_spec - mel_spec.min()
    peak = mel_spec.max()
    # A constant spectrogram has peak == 0 after the shift; dividing would
    # turn every value into NaN, so the all-zero array is returned as is.
    if peak > 0:
        mel_spec = mel_spec / peak

    # The py_function wrapper around this helper declares a tf.float32 output.
    return mel_spec.astype(np.float32, copy=False)

def add_spectrograms(sample, label):
    """Attach several spectrogram variants of the sample's segment.

    Three variants come from tensorflow-io (linear, mel-scaled, dB-scaled
    mel), each stored transposed; a fourth comes from librosa via
    ``make_spectrogram_numpy``.

    Args:
        sample: Dict containing ``"segment"``.
        label: Passed through unchanged.

    Returns:
        ``(new_sample, label)``. ``new_sample`` is a new dict with every entry
        of ``sample`` plus ``"spectrogram"``, ``"mel_spectrogram"``,
        ``"dbscale_mel_spectrogram"`` and ``"mel_spec"``. The input dict is
        not modified.
    """
    tensor = sample["segment"]

    spectrogram = tfio.audio.spectrogram(
        tensor, nfft=512, window=512, stride=256
    )

    mel_spectrogram = tfio.audio.melscale(
        spectrogram, rate=SR, mels=SPEC_SHAPE[0], fmin=FMIN, fmax=FMAX
    )

    dbscale_mel_spectrogram = tfio.audio.dbscale(
        mel_spectrogram, top_db=80
    )

    # Build a new dict instead of writing into the caller's one.
    enriched = {
        **sample,
        "spectrogram": tf.transpose(spectrogram),
        "mel_spectrogram": tf.transpose(mel_spectrogram),
        "dbscale_mel_spectrogram": tf.transpose(dbscale_mel_spectrogram),
    }

    # librosa works on numpy arrays, so it runs through py_function; the
    # single declared output comes back as a one-element list.
    [mel_spec] = tf.py_function(make_spectrogram_numpy, [tensor], [tf.float32])
    enriched["mel_spec"] = mel_spec

    return enriched, label

def add_spectrogram(sample, label):
    """Attach the project's mel spectrogram of the sample's segment.

    Args:
        sample: Dict containing ``"segment"``.
        label: Passed through unchanged.

    Returns:
        ``(new_sample, label)``. ``new_sample`` is a new dict with every entry
        of ``sample`` plus ``"mel_spec"``, the result of
        ``src.spectrogram.compute_mel_spectrogram``. The input dict is not
        modified.
    """
    mel_spec = src.spectrogram.compute_mel_spectrogram(sample["segment"])
    # Build a new dict instead of writing into the caller's one.
    return {**sample, "mel_spec": mel_spec}, label

spectrogram_ds = short_audio_ds.map(add_spectrogram)

# One figure row per segment: waveform on the left, mel spectrogram on the right.
rows = 3
cols = 2
fig, axes = plt.subplots(rows, cols, figsize=(16, 12))

# take(rows) keeps the number of samples tied to the number of figure rows.
for r, (sample, _) in enumerate(spectrogram_ds.take(rows)):
    axes[r][0].plot(sample["segment"].numpy())
    axes[r][0].set(title=sample["primary_label"].numpy().decode())
    # origin="lower" draws row 0 of the image at the bottom, the same
    # orientation the spectrogram grid in the following cell uses.
    axes[r][1].imshow(sample["mel_spec"].numpy().T, origin="lower")


In [None]:
# 3 x 4 grid of mel spectrograms for the first 12 segments, each titled with
# the class decoded from its label tensor.
plt.figure(figsize=(15, 7))

for i, (sample, label) in enumerate(spectrogram_ds.take(12)):
    spec = sample["mel_spec"].numpy()
    plt.subplot(3, 4, i + 1)  # subplot indices are 1-based
    # NOTE(review): label_index is not used in the lines visible here - confirm before removing.
    label_index = np.argmax(label.numpy())
    clazz = src.dataset.tensor_to_class(label, CLASSES)
    plt.title(clazz)
    # Transposed for display; origin='lower' draws row 0 of the image at the bottom.
    plt.imshow(spec.T, origin='lower')