In [1]:
# Notebook setup magics.
# (Re)load the autoreload extension and use mode 2 so edits to imported
# modules are picked up live without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline, directly below the cell that produced them.
%matplotlib inline

In [2]:
import fastai
# Display the installed fastai version so the environment these results came
# from is recorded next to them.
fastai.__version__

'1.0.37'

In [3]:
from fastai import *
from fastai_audio import *
from fastai.vision import models

In [4]:
# Dataset root, relative to the notebook's working directory.
DATA = Path('data')
# NSynth audio root; this is where the train and valid folders live.
NSYNTH_AUDIO = DATA.joinpath('nsynth_audio')

In [5]:
# Sanity check: list the NSynth audio root. `{NSYNTH_AUDIO}` is interpolated
# from the Python variable before the shell command runs.
!ls {NSYNTH_AUDIO}

[1m[34mmodels[m[m                          train_guitar_clean_40_88.csv
[1m[34mtest[m[m                            train_keyboard_clean_21_108.csv
test_guitar_clean_40_88.csv     [1m[34mvalid[m[m
test_keyboard_clean_21_108.csv  valid_guitar_clean_40_88.csv
[1m[34mtrain[m[m                           valid_keyboard_clean_21_108.csv


In [6]:
# Sanity check: list the label CSV directory that LABEL_DIR points at in the
# next cell.
!ls tmp_labels

test_guitar_clean_40_88.csv     train_keyboard_clean_21_108.csv
test_keyboard_clean_21_108.csv  valid_guitar_clean_40_88.csv
train_guitar_clean_40_88.csv    valid_keyboard_clean_21_108.csv


In [7]:
# Directory holding the per-split label CSVs.
LABEL_DIR = Path('tmp_labels')
# Guitar label files, unpacked in train / valid / test order.
GUITAR_TRN, GUITAR_VAL, GUITAR_TST = (
    LABEL_DIR/csv_name
    for csv_name in ('train_guitar_clean_40_88.csv',
                     'valid_guitar_clean_40_88.csv',
                     'test_guitar_clean_40_88.csv')
)

In [8]:
# Read the guitar label tables for the three splits (train, valid, test order).
trn_df, val_df, tst_df = map(pd.read_csv, (GUITAR_TRN, GUITAR_VAL, GUITAR_TST))
# Peek at the first two training rows.
trn_df.head(2)

Unnamed: 0,note_str,pitch
0,guitar_acoustic_001-082-050,82
1,guitar_electronic_035-062-127,62


In [9]:
# Row counts of the train / valid / test label frames.
tuple(map(len, (trn_df, val_df, tst_df)))

(16915, 1112, 347)

In [10]:
# One AudioItemList per split: each label frame is paired with its own
# sub-folder of NSYNTH_AUDIO, and '.wav' is passed as the filename suffix.
trn_list = AudioItemList.from_df(trn_df, path=NSYNTH_AUDIO, folder='train', suffix='.wav')
val_list = AudioItemList.from_df(val_df, path=NSYNTH_AUDIO, folder='valid', suffix='.wav')
tst_list = AudioItemList.from_df(tst_df, path=NSYNTH_AUDIO, folder='test', suffix='.wav')
# Item counts should line up with the label frame sizes.
len(trn_list), len(val_list), len(tst_list)

(16915, 1112, 347)

In [11]:
def get_frame(x, start=1024, frame_len=512):
    """Cut one fixed-length frame out of a sliceable signal.

    Args:
        x: Any object supporting slice indexing (tensor, array, list, ...).
        start: Index of the first sample to keep. Defaults to 1024.
        frame_len: Number of samples to keep. Defaults to 512, which is the
            input width SimpleModel is built for in this notebook.

    Returns:
        ``x[start:start + frame_len]``. Slicing does not raise on short
        inputs, so a signal shorter than ``start + frame_len`` yields fewer
        than ``frame_len`` samples.
    """
    return x[start:start + frame_len]

In [13]:
# Per-item transforms: every clip is reduced to a single frame by get_frame.
tfm_list = [get_frame]
tfms = (tfm_list,) * 2  # (train, valid) share the very same transform list

# Frequency-domain settings. They only feed the disabled block below.
n_fft, n_hop, n_mels = 512, 256, 64
sample_rate = 16000
ref, top_db = 'max', 50.0

# Batch size handed to the DataBunch.
bs = 32

# Disabled: frequency batch transforms are not built in this notebook.
# freq_tfms = get_frequency_batch_transforms(n_fft=n_fft,
#                                            n_hop=n_hop,
#                                            n_mels=n_mels,
#                                            ref=ref,
#                                            top_db=top_db,
#                                            sample_rate=sample_rate)

# Assemble the DataBunch: train/valid lists labelled from the 'pitch' column,
# the test list attached, the frame transform applied, then batched.
data = (
    ItemLists(NSYNTH_AUDIO, trn_list, val_list)
    .label_from_df('pitch')
    .add_test(tst_list)
    .transform(tfms)
    .databunch(bs=bs)
)

# Pull one batch and inspect its shapes and value range.
xs, ys = data.one_batch()
(xs.shape, ys.shape, xs.min(), xs.max())

(torch.Size([32, 512]), torch.Size([32]), tensor(-0.9259), tensor(0.8436))

In [20]:
class SimpleModel(nn.Module):
    """Fully connected classifier over a fixed-length input frame.

    The network is a stack of fastai ``bn_drop_lin`` stages. Every stage
    except the last is followed by an in-place ReLU; the last stage has no
    activation and produces one output per class.

    With the default arguments the stack is 512 -> 1024 -> 512 -> n_classes,
    the same layers in the same order as before the sizes were made
    configurable.
    """

    def __init__(self, n_classes, n_in=512, hidden=(1024, 512)):
        """Build the layer stack.

        Args:
            n_classes: Width of the final linear layer.
            n_in: Number of input features per example. Must match the
                length of the frames fed to the model (512 by default).
            hidden: Widths of the hidden stages, in order.
        """
        super().__init__()
        sizes = [n_in, *hidden, n_classes]
        last_stage = len(sizes) - 2
        blocks = []
        for i, (n_from, n_to) in enumerate(zip(sizes, sizes[1:])):
            # Hidden stages get a ReLU; the output stage stays linear.
            actn = nn.ReLU(inplace=True) if i < last_stage else None
            blocks.extend(bn_drop_lin(n_from, n_to, actn=actn))
        self.layers = nn.Sequential(*blocks)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return its output."""
        return self.layers(x)

In [21]:
# One output per class in the DataBunch.
model = SimpleModel(n_classes=data.c)
# Wrap data and model in a Learner that reports accuracy.
learn = Learner(data=data, model=model, metrics=[accuracy])
# Show the layer-by-layer summary.
learn.summary()

Layer (type)         Output Shape         Param #    Trainable 
BatchNorm1d          [32, 512]            1024       True      
______________________________________________________________________
Linear               [32, 1024]           525312     True      
______________________________________________________________________
ReLU                 [32, 1024]           0          False     
______________________________________________________________________
BatchNorm1d          [32, 1024]           2048       True      
______________________________________________________________________
Linear               [32, 512]            524800     True      
______________________________________________________________________
ReLU                 [32, 512]            0          False     
______________________________________________________________________
BatchNorm1d          [32, 512]            1024       True      
______________________________________________________________

In [None]:
# Stage 1: first training pass.
# `model_name` is the checkpoint prefix that every later stage cell reuses
# when loading and saving weights.
model_name = 'pitch_frame_v1'
# One-cycle schedule with cycle length 4; no max_lr is passed, so the
# library default applies.
learn.fit_one_cycle(4)
# Checkpoint read back by the LR-finder cell and by stage 2.
learn.save(model_name + '-stage-1')

In [None]:
# Learning-rate search starting from the stage-1 checkpoint.
learn.load(model_name + '-stage-1')
learn.lr_find()
# Plot what the recorder captured during the search (presumably used to pick
# max_lr for stage 2 -- confirm).
learn.recorder.plot()

In [None]:
# Stage 2: reload the stage-1 checkpoint first, so training starts from the
# saved weights rather than from whatever state the LR finder left behind.
learn.load(model_name + '-stage-1')
# One-cycle schedule with cycle length 8 and an explicit peak learning rate
# of 1e-3.
learn.fit_one_cycle(8, max_lr=1e-3)
learn.save(model_name + '-stage-2')

In [None]:
# Learning-rate search starting from the stage-2 checkpoint.
learn.load(model_name + '-stage-2')
# NOTE(review): SimpleModel is built from scratch and nothing visible in this
# notebook freezes any layers, so this unfreeze() looks like a no-op left over
# from a transfer-learning template -- confirm before relying on it.
learn.unfreeze()
learn.lr_find()
# Plot what the recorder captured during the search.
learn.recorder.plot()

In [None]:
# Stage 3: resume from the stage-2 checkpoint with a 10x smaller peak
# learning rate than stage 2 (1e-4 instead of 1e-3).
learn.load(model_name + '-stage-2')
learn.fit_one_cycle(8, max_lr=1e-4)
learn.save(model_name + '-stage-3')

In [None]:
# Learning-rate search starting from the stage-3 checkpoint.
learn.load(model_name + '-stage-3')
learn.lr_find()
# Plot what the recorder captured during the search.
learn.recorder.plot()

In [None]:
# Stage 4: resume from the stage-3 checkpoint with a further 10x smaller peak
# learning rate (1e-5).
learn.load(model_name + '-stage-3')
learn.fit_one_cycle(8, max_lr=1e-5)
# Final checkpoint of this notebook's training schedule.
learn.save(model_name + '-stage-4')

In [None]:
# Accuracy on get_preds()'s default dataset (the validation split in fastai v1).
val_preds, val_targs = learn.get_preds()
accuracy(val_preds, val_targs)

In [None]:
# Count training-set mistakes directly from the predictions rather than
# scaling a mean accuracy by len(trn_list). The direct count is an exact
# integer and stays correct even if the training DataLoader does not yield
# every item (for example when a trailing partial batch is dropped), in which
# case len(trn_list) would be the wrong multiplier.
trn_preds, trn_targs = learn.get_preds(DatasetType.Train)
# Same comparison the accuracy metric makes: arg-max class vs. target.
n_errors = int((trn_preds.argmax(dim=-1).view(-1) != trn_targs.view(-1)).sum())
print(n_errors, 'errors')