In [1]:
# Re-import edited modules automatically before each cell executes, so changes
# to local helper modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import fastai
# Display the installed fastai version so the environment of this run is recorded.
fastai.__version__

'1.0.37'

In [3]:
from fastai import *
from fastai_audio import *
from utils import *

In [4]:
# Dataset locations, all relative to the notebook's working directory.
DATA = Path('data')
FREESOUND = DATA/'freesound'

# Audio files read by the data pipeline.
AUDIO = FREESOUND/'audio_22050_trimmed'

# Metadata tables (file name, label, sample count).
# NOTE(review): these CSVs live under `audio_44KHz` while the audio comes from
# `audio_22050_trimmed`; the `n_samples` column is presumably measured on the
# 44 kHz originals rather than on the files actually loaded -- confirm.
CSV = FREESOUND/'audio_44KHz'/'train_with_lens.csv'
TEST_CSV = FREESOUND/'audio_44KHz'/'test_with_lens.csv'

In [5]:
# Load the training metadata table.
df = pd.read_csv(CSV)

# Strip a trailing '.wav' row by row (the '.wav' suffix is added back when the
# item list is built). Checking every row, instead of only the first one,
# avoids chopping four characters off names that carry no extension, and also
# works on an empty table.
has_wav_ext = df['fname'].str.endswith('.wav', na=False)
df.loc[has_wav_ext, 'fname'] = df.loc[has_wav_ext, 'fname'].str[:-4]

# Drop the verification flag whenever it is present, independently of the
# filename format, so the column layout seen by later cells (which address
# columns by position) is always the same. `errors='ignore'` keeps the cell
# idempotent if the column is already gone.
df = df.drop(['manually_verified'], axis=1, errors='ignore')
df.head()

Unnamed: 0,fname,label,n_samples
0,00044347,Hi-hat,617400
1,001ca53d,Saxophone,455112
2,002d256b,Trumpet,19404
3,0033e230,Glockenspiel,352800
4,00353774,Cello,199332


In [6]:
# --- Front-end settings -----------------------------------------------------
sample_rate = 22050
n_fft, n_hop = 512, 256
n_mels = 128
# `top_db` and `ref` are defined here but not referenced by any cell visible
# in this part of the notebook.
top_db, ref = 50.0, 1.0

# --- Loader settings --------------------------------------------------------
bs = 8

# Keyword arguments shared by the training and validation batch transforms.
spec_kwargs = dict(n_fft=n_fft, n_hop=n_hop, n_mels=n_mels,
                   sample_rate=sample_rate)

# Only the training transforms receive `rand_hop_pct`; validation uses the
# helper's default for that argument.
trn_batch_tfms = get_frequency_batch_transforms(rand_hop_pct=0.15, **spec_kwargs)
val_batch_tfms = get_frequency_batch_transforms(**spec_kwargs)

# Item-level transforms; `min_len` is tied to the FFT window size.
tfms = get_transforms(min_len=n_fft)

In [10]:
# Seed NumPy's global RNG right before the split so the train/validation
# partition is the same after every kernel restart (without this the split,
# and therefore every validation metric, changes from run to run).
# Assumes the split draws from NumPy's global RNG, as stock fastai does.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

# Build the DataBunch: file names come from `df` (resolved under AUDIO/'train'
# with a '.wav' suffix), items are split randomly into train/validation,
# labelled from `df`, and passed through the item transforms.
# NOTE(review): `length_col=2` addresses a dataframe column by position --
# presumably the `n_samples` column; confirm against the frame's column order.
data = (AudioItemList
            .from_df(df, path=AUDIO, folder='train', suffix='.wav')
            .random_split_by_pct()
            .label_from_df()
            .transform(tfms)
            .databunch(bs=bs, equal_lengths=False, length_col=2))
len(data.train_ds), len(data.valid_ds)

(7579, 1894)

In [11]:
# Register the first batch transform of each set on its matching loader:
# training transforms on the training loader, validation transforms on the
# validation loader.
# NOTE(review): re-running this cell presumably registers the transforms a
# second time -- rebuild `data` before re-executing.
for loader, batch_tfms in ((data.train_dl, trn_batch_tfms),
                           (data.valid_dl, val_batch_tfms)):
    loader.add_tfm(batch_tfms[0])

In [12]:
# Build a throwaway 32-channel residual block just to display its structure.
res_block(32)

SequentialEx(
  (layers): ModuleList(
    (0): Sequential(
      (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): ReLU(inplace)
      (2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): Sequential(
      (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): ReLU(inplace)
      (2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (2): MergeLayer()
  )
)

In [18]:
class AudioResNet(nn.Module):
    """Convolutional classifier made of residual blocks and strided convolutions.

    The whole network is one ``nn.Sequential`` stored in ``self.layers``:

    * a stem ``conv2d`` from 1 input channel to 16 filters;
    * for each consecutive pair of widths in 16, 32, 64, 128, 256, 512, 1024:
      a ``res_block`` at the current width followed by a stride-2 ``conv2d``
      to the next width;
    * a head of ``PoolFlatten`` and two ``bn_drop_lin`` groups
      (1024 -> 1024 with p=0.25, then 1024 -> ``n_classes`` with p=0.5).

    Args:
        n_classes: number of outputs of the final linear layer.
    """

    def __init__(self, n_classes):
        super().__init__()
        widths = [16, 32, 64, 128, 256, 512, 1024]

        # Modules are created in the same order they appear in the network.
        stem = conv2d(1, widths[0])

        body = []
        for idx in range(len(widths) - 1):
            current, following = widths[idx], widths[idx + 1]
            body.append(res_block(current))
            body.append(conv2d(current, following, stride=2))

        head = [PoolFlatten(),
                *bn_drop_lin(widths[-1], 1024, p=0.25),
                *bn_drop_lin(1024, n_classes, p=0.5)]

        self.layers = nn.Sequential(stem, *body, *head)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return its output."""
        return self.layers(x)

In [19]:
# Instantiate the network with one output per class in the databunch (`data.c`).
model = AudioResNet(data.c)

In [20]:
# Wrap the databunch and the model in a plain Learner, then print the
# per-layer output shapes, parameter counts and trainable flags.
learn = Learner(data, model)
learn.summary()

Layer (type)         Output Shape         Param #    Trainable 
Conv2d               [8, 16, 128, 2248]   144        True      
______________________________________________________________________
Conv2d               [8, 16, 128, 2248]   2304       True      
______________________________________________________________________
ReLU                 [8, 16, 128, 2248]   0          False     
______________________________________________________________________
BatchNorm2d          [8, 16, 128, 2248]   32         True      
______________________________________________________________________
Conv2d               [8, 16, 128, 2248]   2304       True      
______________________________________________________________________
ReLU                 [8, 16, 128, 2248]   0          False     
______________________________________________________________________
BatchNorm2d          [8, 16, 128, 2248]   32         True      
______________________________________________________________

In [24]:
# NOTE(review): `cut` is presumably the index at which `create_audio_cnn`
# splits the model's layer stack; -7 would match the seven head modules of
# AudioResNet (PoolFlatten + 2 x BatchNorm/Dropout/Linear) if `bn_drop_lin`
# yields three modules per call -- confirm before changing the head.
cut = -7
# Rebinds `learn`, replacing the plain Learner created in the earlier cell.
learn = create_audio_cnn(data, model, cut)
learn.summary()

Layer (type)         Output Shape         Param #    Trainable 
Conv2d               [8, 16, 128, 2360]   144        False     
______________________________________________________________________
Conv2d               [8, 16, 128, 2360]   2304       False     
______________________________________________________________________
ReLU                 [8, 16, 128, 2360]   0          False     
______________________________________________________________________
BatchNorm2d          [8, 16, 128, 2360]   32         True      
______________________________________________________________________
Conv2d               [8, 16, 128, 2360]   2304       False     
______________________________________________________________________
ReLU                 [8, 16, 128, 2360]   0          False     
______________________________________________________________________
BatchNorm2d          [8, 16, 128, 2360]   32         True      
______________________________________________________________