In [74]:
import webrtcvad
from scipy.io import wavfile
import numpy as np
import librosa

import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
import math
import copy
import matplotlib.pyplot as plt
%matplotlib inline
from tensorboardX import SummaryWriter


import torch.optim as optim
import pandas as pd
import numpy as np
import sys
import os
sys.path.append('/home/ilya/workspace/ESC-50')
from utils import ESC50

def filter_VAD(filename, frame_duration, frame_length):
    """Return the indices of the analysis frames that WebRTC VAD marks as speech.

    The WAV file is read with ``scipy.io.wavfile``, reflect-padded by
    ``frame_length // 2`` samples on each side and cut into overlapping frames
    of ``frame_length`` samples with a hop of ``frame_length // 4``.  Each
    frame is then passed to ``webrtcvad.Vad(3).is_speech`` together with the
    file's sample rate.

    Parameters
    ----------
    filename : str
        Path of the WAV file to analyse.
    frame_duration
        Not used by this function; kept so existing callers keep working.
    frame_length : int
        Frame size in samples.

    Returns
    -------
    numpy.ndarray
        1-D integer array holding the column indices (into the framed signal)
        of the frames classified as speech.  Empty, but still integer-typed,
        when no frame is voiced.

    Notes
    -----
    The frame and voiced-frame shapes are printed as a side effect.

    NOTE(review): the webrtcvad README states that the detector only accepts
    16-bit mono PCM at 8/16/32/48 kHz in frames of 10, 20 or 30 ms.  Nothing
    here checks the dtype, channel count or ``frame_length`` against that --
    confirm at the call site.
    """
    sample_rate, data = wavfile.read(filename)
    vad = webrtcvad.Vad(3)
    data = np.pad(data, int(frame_length // 2), mode='reflect')
    # Keyword arguments: recent librosa releases reject a positional frame_length.
    frames = librosa.util.frame(data, frame_length=frame_length,
                                hop_length=frame_length // 4)

    # dtype=int keeps the result usable as an index array even when it is empty
    # (np.array([]) would otherwise default to float64).
    voiced_frames = np.array(
        [i for i in range(frames.shape[1])
         if vad.is_speech(frames[:, i].tobytes(), sample_rate)],
        dtype=int,
    )
    print('frames shape:', frames.shape,
          'voiced_frames shape:', voiced_frames.shape)

    return voiced_frames


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [69]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# are picked up without restarting the kernel.
# The recorded output shows that running %load_ext again on a live kernel only
# prints a notice suggesting %reload_ext autoreload.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [70]:
# filename = '/home/ilya/workspace/ESC-50/audio/16000/1-137-A-32.wav'
# sample_rate, y = wavfile.read(filename)
# y = np.array(y,dtype=float)
# n_fft = 2048
# S = librosa.feature.melspectrogram(y, sr=sample_rate, n_mels=128,
#                                    fmax=8000, n_fft=n_fft,
#                                    hop_length=n_fft//2)
# log_S = librosa.core.power_to_db(S)
# mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=13)
# delta_mfcc  = librosa.feature.delta(mfcc)
# delta2_mfcc = librosa.feature.delta(mfcc, order=2)
# np.vstack((mfcc,delta_mfcc,delta2_mfcc)).shape

In [117]:
# Fold assignment: folds 1-4 feed the training generator, fold 5 the evaluation batch.
train_splits = [1, 2, 3, 4]
test_split = 5

# Root of the local ESC-50 checkout.  Defaults to the path this notebook was
# written against; set the ESC50_ROOT environment variable to run elsewhere.
ESC50_ROOT = os.environ.get('ESC50_ROOT', '/home/ilya/workspace/ESC-50')

# Keyword arguments shared by the training and evaluation ESC50 loaders.
shared_params = {'csv_path': os.path.join(ESC50_ROOT, 'esc50.csv'),
                 'wav_dir': os.path.join(ESC50_ROOT, 'audio'),
                 'dest_dir': os.path.join(ESC50_ROOT, 'audio', '16000'),
                 'audio_rate': 16000,
                 'only_ESC10': True,
                 'pad': 0,
                 'normalize': True}

# Training generator with augmentation and random cropping enabled.
# The training cell further down rebuilds train_gen every epoch with its own batch size.
train_gen = ESC50(folds=train_splits,
                  randomize=True,
                  strongAugment=True,
                  random_crop=True,
                  inputLength=2,
                  mix=False,
                  frames=True,
                  n_fft=2048,
                  **shared_params).batch_gen(16)

# Evaluation generator: no shuffling, augmentation or cropping.
test_gen = ESC50(folds=[test_split],
                 randomize=False,
                 strongAugment=False,
                 random_crop=False,
                 inputLength=4,
                 mix=False,
                 frames=True,
                 n_fft=2048,
                 **shared_params).batch_gen(2048)

# Pull a single batch from the evaluation generator and keep it on the GPU as the
# fixed evaluation set (the recorded run printed torch.Size([2048, 39, 79])).
eval_inputs, eval_labels = next(test_gen)
eval_inputs = torch.Tensor(eval_inputs).cuda()
eval_labels = torch.LongTensor(eval_labels).cuda()
print(eval_inputs.shape)



  b = a[a_slice]


torch.Size([2048, 39, 79])


In [80]:
class classifier_mfcc(nn.Module):
    """1-D convolutional classifier over 39-channel feature sequences.

    Three ``Conv1d`` layers (39 -> 15 -> 20 -> 25 channels, kernel size 3),
    with max-pooling by 2 after the first two, are followed by global average
    pooling over the time axis and a two-layer fully connected head
    (25 -> 120 -> 10) that returns 10 raw class scores.

    The 39 input channels are presumably 13 MFCCs plus their first and second
    deltas -- TODO confirm against the feature extractor.

    Parameters
    ----------
    stride, padding, dilation, groups
        Accepted for backward compatibility only; none of them is forwarded
        to any layer, so the convolutions use the ``nn.Conv1d`` defaults.
    """

    def __init__(self, stride=1,padding=1, dilation=1, groups=1):
        super(classifier_mfcc, self).__init__()

        # Attribute names and registration order are kept as-is so saved
        # state_dicts (e.g. 'best_mfcc') keep loading.
        self.conv1 = nn.Conv1d(39, 15, 3)
        self.pool = nn.MaxPool1d(2)
        self.conv2 = nn.Conv1d(15, 20, 3)
        self.conv3 = nn.Conv1d(20, 25, 3)
        self.lastpool = nn.AdaptiveAvgPool1d(1)
        self.fc1 = nn.Linear(25, 120)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(120, 10)

    def forward(self, x):
        """Map a ``(batch, 39, time)`` tensor to ``(batch, 10)`` class scores."""
        output = self.pool(self.conv1(x))
        output = self.pool(self.conv2(output))
        output = self.conv3(output)
        output = self.lastpool(output)
        # Drop only the pooled length-1 time axis.  A bare squeeze() would also
        # remove a batch axis of size 1 and return shape (10,) for one sample.
        output = self.relu(self.fc1(output.squeeze(-1)))
        output = self.fc2(output)

        return output

In [96]:
# Build the classifier with its default constructor arguments and move it to the GPU.
# net_mfcc.cuda() is left as the cell's last expression, so the notebook shows
# the module's layer summary as the cell output.
net_mfcc = classifier_mfcc()
net_mfcc.cuda()

classifier_mfcc(
  (conv1): Conv1d(39, 15, kernel_size=(3,), stride=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(15, 20, kernel_size=(3,), stride=(1,))
  (conv3): Conv1d(20, 25, kernel_size=(3,), stride=(1,))
  (lastpool): AdaptiveAvgPool1d(output_size=1)
  (fc1): Linear(in_features=25, out_features=120, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=120, out_features=10, bias=True)
)

In [108]:
# TensorBoard event writer; the training cell logs its scalars through it under runs/mfcc_log.
writer = SummaryWriter('runs/mfcc_log')
# Cross-entropy on the network's raw scores; the training cell passes it class-index targets.
criterion = nn.CrossEntropyLoss()

In [110]:
# SGD with momentum over every parameter of the MFCC classifier.
optimizer = optim.SGD(
    net_mfcc.parameters(),
    lr=0.001,
    momentum=0.9,
)

# Scale the learning rate by 0.7 once the monitored value has stopped improving
# (by more than the 0.01 threshold) for over 100 consecutive scheduler steps;
# verbose=True prints a line for each reduction.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    factor=0.7,
    patience=100,
    verbose=True,
    threshold=0.01,
)

In [None]:
# Loop settings for this cell.
NUM_EPOCHS = 100
TRAIN_BATCH_SIZE = 256
EVAL_EVERY = 100  # run the held-out evaluation every EVAL_EVERY training steps

i = 0  # global training-step counter; also the step index of every logged scalar
C = 0.07   # not referenced anywhere in this cell
l2 = 0.03  # not referenced anywhere in this cell
best_loss = float('inf')  # lowest evaluation loss seen so far

# The evaluation batch never changes, so reduce its label matrix to class
# indices once instead of on every evaluation.
labels_2 = eval_labels.argmax(dim=1)

for epoch in range(NUM_EPOCHS):  # loop over the dataset multiple times

    # The inner loop runs the generator to exhaustion, so a new one is built per epoch.
    train_gen = ESC50(folds=train_splits,
                      randomize=True,
                      strongAugment=True,
                      random_crop=True,
                      inputLength=2,
                      mix=False,
                      frames=True,
                      n_fft=2048,
                      **shared_params).batch_gen(TRAIN_BATCH_SIZE)

    for inputs, labels in train_gen:
        inputs = torch.Tensor(inputs).cuda()
        # Labels arrive as a (batch, classes) matrix (presumably one-hot);
        # CrossEntropyLoss is given class indices instead.
        labels = torch.LongTensor(labels).cuda().argmax(dim=1)

        # forward + backward + optimize
        optimizer.zero_grad()
        outputs = net_mfcc(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Step the plateau scheduler after the optimizer, once per batch, on the
        # training loss.  A plain float is passed so no autograd graph is retained.
        train_loss = loss.item()
        scheduler.step(train_loss)

        # Pass the step index explicitly; without it every point is logged
        # without a position on the x-axis.
        train_acc = (outputs.argmax(dim=1) == labels).float().mean().item()
        writer.add_scalar("loss", train_loss, global_step=i)
        writer.add_scalar("acc", train_acc, global_step=i)

        if i % EVAL_EVERY == 0:
            # Evaluate without building an autograd graph for the 2048-sample batch.
            # eval()/train() is a no-op for the current layers but keeps this
            # correct if dropout or batch-norm is added to the model.
            net_mfcc.eval()
            with torch.no_grad():
                out_ev = net_mfcc(eval_inputs)
                loss_ev = criterion(out_ev, labels_2)
            net_mfcc.train()

            eval_loss = loss_ev.item()
            if eval_loss < best_loss:
                # Keep the weights with the lowest evaluation loss so far.
                best_loss = eval_loss
                torch.save(net_mfcc.state_dict(), 'best_mfcc')

            eval_acc = (out_ev.argmax(dim=1) == labels_2).float().mean().item()
            writer.add_scalar("loss_ev", eval_loss, global_step=i)
            writer.add_scalar("acc_ev", eval_acc, global_step=i)

        i += 1

  b = a[a_slice]


Epoch   106: reducing learning rate of group 0 to 4.9000e-04.
Epoch   223: reducing learning rate of group 0 to 3.4300e-04.
