In [None]:
# Notebook setup: (re)load the autoreload extension and have it reload edited
# modules before each cell executes, then render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import fastai
# Show the installed fastai version in the cell output. Calls used later in this
# notebook (e.g. `create_cnn`, `random_split_by_pct`) may differ between fastai
# releases, so the version is worth recording next to the results.
fastai.__version__

In [None]:
from fastai import *
from fastai_audio import *

from fastai.vision import models

In [None]:
# Dataset locations, all expressed relative to the notebook's working directory.
DATA = Path('data')
AUDIOSET = DATA.joinpath('audioset')

# Folder holding the training clips, the space-separated segment list, and the
# CSV that provides a `display_name` for each label `mid`.
TRAIN_AUDIO = AUDIOSET.joinpath('train')
TRAIN_CSV = AUDIOSET.joinpath('train_segments_cl.csv')
LABELS_CSV = AUDIOSET.joinpath('class_labels_indices.csv')

In [None]:
# Lookup table from label id (`mid`, used as the index) to its `display_name`.
# Only those two columns are read from the labels CSV.
label_df = pd.read_csv(
    LABELS_CSV,
    usecols=['mid', 'display_name'],
    index_col='mid',
)
print(label_df.shape)
label_df.head()

In [None]:
# The segment list is space-delimited; re-reading it here keeps the cell idempotent.
train_df = pd.read_csv(TRAIN_CSV, sep=' ')

# File stem of each clip: the `YTID` column followed by `_<start_seconds>` with
# three decimal places.
start_suffix = train_df['start_seconds'].apply(lambda secs: f"_{secs:.3f}")
train_df['fname'] = train_df['YTID'] + start_suffix

# `positive_labels` is a comma-separated string of label ids; look each id up in
# `label_df` and keep the display names as a Python list (one list per clip).
train_df['label_str'] = train_df['positive_labels'].apply(
    lambda mids: label_df.loc[mids.split(','), 'display_name'].tolist()
)

# Keep only the two columns used when building the data object.
train_df = train_df.loc[:, ['fname', 'label_str']]
train_df.head()

In [None]:
# --- Spectrogram settings (consumed by the transforms below) ---
n_fft = 1024     # passed to Spectrogram and FrequencyToMel
n_hop = 256      # passed to Spectrogram
n_mels = 128     # number of mel bands requested from FrequencyToMel
# NOTE(review): 22500 is an unusual sample rate (22050 Hz is the common one);
# confirm this matches the actual audio files before relying on the mel scale.
sr = 22500
top_db = 60.0    # passed to SpectrumToDb

# --- Batch size used when building the DataBunch ---
bs = 128

# Pipeline applied to raw waveforms: spectrogram -> mel filterbank -> dB scaling.
mel_spec_tfm = Compose([
    Spectrogram(n_hop=n_hop, n_fft=n_fft),
    FrequencyToMel(sr=sr, n_fft=n_fft, n_mels=n_mels, f_min=0.0, f_max=None),
    SpectrumToDb(ref='max', top_db=top_db, normalized=True),
])

def inputs_tfm(inputs):
    """Convert the waveform half of an `(x, y)` pair into a log-mel spectrogram.

    `inputs` is unpacked as `(signal, target)`. The signal is run through
    `mel_spec_tfm` (time domain -> log-mel spectrogram) and a new axis is
    inserted in place at position 1 to act as the channel dimension
    (presumably giving (batch, 1, n_mels, time) -- TODO confirm against the
    loader). The target is returned unchanged.
    """
    signal, target = inputs
    # `unsqueeze_` mutates the freshly computed spectrogram and returns it,
    # so the call can be chained directly into the return value.
    return mel_spec_tfm(signal).unsqueeze_(1), target


# Transform list handed to `.databunch(tfms=...)`.
tfms = [inputs_tfm]

In [None]:
# Build the DataBunch from the dataframe: `fname` + '.wav' is resolved under
# AUDIOSET/'train', items are split randomly into train/validation, labels come
# from the dataframe's label column, and `tfms` (waveform -> log-mel spectrogram)
# is passed through to the databunch.
data = (AudioItemList
            .from_df(train_df, path=AUDIOSET, folder='train', suffix='.wav')
            # Fixed seed: without it the validation set changes on every
            # Restart & Run All, so metrics are not comparable between runs.
            .random_split_by_pct(seed=42)
            .label_from_df()
            .databunch(bs=bs, tfms=tfms, equal_lengths=True))

In [None]:
# Sanity check: number of classes and the first five class names.
(data.c, data.classes[0:5])

In [None]:
# Backbone for the learner; models.resnet34 is a drop-in deeper alternative.
arch = models.resnet18
# NOTE(review): `label_str` holds a list of names per clip (multi-label targets);
# confirm `error_rate` is meaningful for that target format.
learn = create_cnn(data, arch, metrics=[error_rate, mapk])