# saved models are named as FreeSound_1D_conv_

In [1]:
import os
import pickle
import pandas as pd
import librosa
import librosa.display
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import tqdm.notebook as tqdm
from torchsummary import summary
import torch.optim as optim
os.listdir('data/freesound-audio-tagging')

['audio_test',
 'audio_train',
 'free_sound_11025.pkl',
 'free_sound_22050.pkl',
 'sample_submission.csv',
 'test_post_competition.csv',
 'train.csv',
 'train_post_competition.csv']

In [2]:
len(os.listdir('data/freesound-audio-tagging/audio_train'))
df = pd.read_csv('data/freesound-audio-tagging/train.csv')
df.head()

Unnamed: 0,fname,label,manually_verified
0,00044347.wav,Hi-hat,0
1,001ca53d.wav,Saxophone,1
2,002d256b.wav,Trumpet,0
3,0033e230.wav,Glockenspiel,1
4,00353774.wav,Cello,1


In [3]:
Loaded_data = {}
if os.path.exists('data/freesound-audio-tagging/free_sound_11025.pkl'):
    Loaded_data = pickle.load(open('data/freesound-audio-tagging/free_sound_11025.pkl', 'rb'))
    
def audio_norm(data):
    max_data = np.max(data)
    min_data = np.min(data)
    data = (data-min_data)/(max_data-min_data+1e-6)
    return data-0.5

def load_audio_file(file_path, input_length=4096):
    
    if file_path not in Loaded_data:
        data = librosa.core.load(file_path, sr=None) 
        data = librosa.core.resample(data[0], data[1], 11025)
        Loaded_data[file_path] = data
    else:
        data = Loaded_data[file_path]
    
    if len(data)>input_length:
        max_offset = len(data)-input_length
        offset = np.random.randint(max_offset)
        data = data[offset:input_length+offset]
        
    else:
        T = np.zeros(input_length, dtype=float)
        T[:len(data)] = data
        data = T
        #max_offset = input_length - len(data)
        ##offset = np.random.randint(max_offset)
        #data = np.pad(data, (offset, input_length - len(data) - offset), "constant")
        
    data = audio_norm(data)
    return np.array([data])

In [7]:
librosa.display.waveplot(load_audio_file('data/freesound-audio-tagging/audio_train/001ca53d.wav')[0],
                         sr=11025, 
                         max_points=50000.0, 
                         x_axis='time', 
                         offset=0.0)

<matplotlib.collections.PolyCollection at 0x257f0e8ad88>

# Data Loading

In [4]:
labels = sorted(set(df.label))
label_to_indice = {l:i for i,l in enumerate(labels)}
indice_to_label = {i:l for i,l in enumerate(labels)}

In [5]:
class FreeSoundDataset(torch.utils.data.Dataset):
    def __init__(self, df_path, data_path, train=True, split=0.8):
        
        self.df = pd.read_csv(df_path)
        
        self.df = self.df[:int(len(self.df)*split)] if train else self.df[int(len(self.df)*split):]
        
        self.data_path = data_path
        self.input_length = 4096
        self.batch_size = 32
        
        self.labels = sorted(set(self.df.label))
        self.label_to_indice = {l:i for i,l in enumerate(self.labels)}
        self.indice_to_label = {i:l for i,l in enumerate(self.labels)}
    
    def __len__(self):
        return len(self.df)-2
    
    def __getitem__(self, idx):
        file_path = self.data_path + list(df[idx: idx+1].fname)[0]
        label_indice = label_to_indice[list(df[idx: idx+1].label)[0]]
        return load_audio_file(file_path), label_indice
        

In [6]:
sr = 11025
def bandpass_filter(signal, low, high, order = 5):
    sos = butter(order, [low, high], analog = False, btype = 'band', output = 'sos')
    y = sosfilt(sos, signal)
    return y
    
def make_signal(raw_signal, nyq = sr/2):
    return_signal = np.zeros((8, self.input_length))
    return_signal[0] = raw_signal

    cut_offs = [i/nyq for i in [1, 256, 512, 1024, 2048, 4096, 8192, 11024]]
    for i in range(1, len(cut_offs), 1):
        return_signal[i] = bandpass_filter(raw_signal, cut_offs[i-1], cut_offs[i])
    return return_signal
        
    
def shuffletwo(x, y):
    rng_state = np.random.get_state()
    np.random.shuffle(x)
    np.random.set_state(rng_state)
    np.random.shuffle(y)

In [7]:
mini_batch_size = 96
FreeSoundData = FreeSoundDataset('data/freesound-audio-tagging/train.csv',
                                 'data/freesound-audio-tagging/audio_train/')
FreeSoundDataTest = FreeSoundDataset('data/freesound-audio-tagging/train.csv',
                                     'data/freesound-audio-tagging/audio_train/',
                                     train=False)
FreeSoundDataLoader = DataLoader(FreeSoundData, batch_size=mini_batch_size, shuffle=True)
FreeSoundDataTestLoader = DataLoader(FreeSoundDataTest, batch_size=mini_batch_size, shuffle=True)

In [12]:
FreeSoundData[0][0].shape

(1, 4096)

# Model

In [None]:
'''
class FreeSound_Sense(torch.nn.Module):
    
    def __init__(self):
        super(FreeSound_Sense, self).__init__()
        self.conv1d_1_16_9 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=9, padding=True)
        self.conv1d_16_16_9 = nn.Conv1d(in_channels=16, out_channels=16, kernel_size=9, padding=True)
        self.conv1d_16_32_3 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, padding=True)
        self.conv1d_32_32_3 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=3, padding=True)
        self.conv1d_32_64_3 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=True)
        self.conv1d_64_64_3 = nn.Conv1d(in_channels=64, out_channels=64, kernel_size=3, padding=True)
        
        self.maxpool_16 = nn.MaxPool1d(16)
        self.maxpool_8 = nn.MaxPool1d(8)
        self.maxpool_4 = nn.MaxPool1d(4)
        
        self.relu = nn.ReLU()
        self.sigm = nn.Sigmoid()
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(0.1)
        
        self.fc_64_64 = nn.Linear(in_features=64, out_features=64)
        self.fc_64_512 = nn.Linear(in_features=64, out_features=512)
        self.fc_512_42 = nn.Linear(in_features=512, out_features=42)
        
        
    def forward(self, x):
        
        # First Block
        x = self.conv1d_1_16_9(x)
        x = self.relu(x)
        x = self.conv1d_16_16_9(x)
        x = self.relu(x)
        x = self.maxpool_16(x)
        x = self.dropout(x)
        
        # Second Block
        x = self.conv1d_16_32_3(x)
        x = self.relu(x)
        x = self.conv1d_32_32_3(x)
        x = self.relu(x)
        x = self.maxpool_4(x)
        x = self.dropout(x)
        
        # Third Block
        x = self.conv1d_32_32_3(x)
        x = self.relu(x)
        x = self.conv1d_32_32_3(x)
        x = self.relu(x)
        x = self.maxpool_4(x)
        x = self.dropout(x)
        
        # Fourth Block
        x = self.conv1d_32_64_3(x)
        x = self.relu(x)
        x = self.conv1d_64_64_3(x)
        x = self.relu(x)
        x = self.maxpool_4(x)
        x = torch.mean(x, 2)
 
        # Final Layers
        x = torch.flatten(x, start_dim=1)
        x = self.fc_64_64(x)
        x = self.relu(x)
        x = self.fc_64_512(x)
        x = self.relu(x)
        x = self.fc_512_42(x)
        x = self.softmax(x)
        
        return x
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Model = FreeSound_Sense()
Model.float()
Model.to(device)
summary(Model, (1, 4096))

from thop import profile
macs, params = profile(Model, inputs=(torch.randn(1, 1, 4096).to(device), ))
macs, params
'''

In [33]:
class FreeSound_Sense(torch.nn.Module):
    
    def __init__(self):
        super(FreeSound_Sense, self).__init__()
        self.conv1d_1_16_9 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=9)
        self.conv1d_16_16_9 = nn.Conv1d(in_channels=16, out_channels=16, kernel_size=9)
        self.conv1d_16_32_3 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3)
        self.conv1d_32_32_3_1 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=3)
        self.conv1d_32_32_3_2 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=3)
        self.conv1d_32_32_3_3 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=3)
        self.conv1d_32_64_3 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3)
        self.conv1d_64_64_3 = nn.Conv1d(in_channels=64, out_channels=64, kernel_size=3)
        
        self.maxpool_16 = nn.MaxPool1d(16)
        self.maxpool_8 = nn.MaxPool1d(8)
        self.maxpool_4 = nn.MaxPool1d(4)
        
        self.relu = nn.ReLU()
        self.sigm = nn.Sigmoid()
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(0.1)
        
        self.fc_64_64 = nn.Linear(in_features=64, out_features=64)
        self.fc_64_512 = nn.Linear(in_features=64, out_features=512)
        self.fc_512_42 = nn.Linear(in_features=512, out_features=42)
        
        
    def forward(self, x):
        
        # First Block
        x = self.conv1d_1_16_9(x)
        x = self.relu(x)
        x = self.conv1d_16_16_9(x)
        x = self.relu(x)
        x = self.maxpool_16(x)
        x = self.dropout(x)
        
        # Second Block
        x = self.conv1d_16_32_3(x)
        x = self.relu(x)
        x = self.conv1d_32_32_3_1(x)
        x = self.relu(x)
        x = self.maxpool_4(x)
        x = self.dropout(x)
        
        # Third Block
        x = self.conv1d_32_32_3_2(x)
        x = self.relu(x)
        x = self.conv1d_32_32_3_3(x)
        x = self.relu(x)
        x = self.maxpool_4(x)
        x = self.dropout(x)
        
        # Fourth Block
        x = self.conv1d_32_64_3(x)
        x = self.relu(x)
        x = self.conv1d_64_64_3(x)
        x = self.relu(x)
        x = self.maxpool_4(x)
        x = torch.mean(x, 2)

        # Final Layers
        x = torch.flatten(x, start_dim=1)
        x = self.fc_64_64(x)
        x = self.relu(x)
        x = self.fc_64_512(x)
        x = self.relu(x)
        x = self.fc_512_42(x)
        x = self.softmax(x)
        
        return x
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Model = FreeSound_Sense()
Model.float()
Model.to(device)
summary(Model, (1, 4096))

from thop import profile
macs, params = profile(FreeSound_Sense().to(device), inputs=(torch.randn(1, 1, 4096).to(device), ))
macs, params

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv1d-1             [-1, 16, 4088]             160
              ReLU-2             [-1, 16, 4088]               0
            Conv1d-3             [-1, 16, 4080]           2,320
              ReLU-4             [-1, 16, 4080]               0
         MaxPool1d-5              [-1, 16, 255]               0
           Dropout-6              [-1, 16, 255]               0
            Conv1d-7              [-1, 32, 253]           1,568
              ReLU-8              [-1, 32, 253]               0
            Conv1d-9              [-1, 32, 251]           3,104
             ReLU-10              [-1, 32, 251]               0
        MaxPool1d-11               [-1, 32, 62]               0
          Dropout-12               [-1, 32, 62]               0
           Conv1d-13               [-1, 32, 60]           3,104
             ReLU-14               [-1,

In [34]:
criterion = nn.CrossEntropyLoss()
#optimizer = optim.SGD(Model.parameters(), lr=0.001, momentum=0.9)
optimizer = optim.Adam(Model.parameters(), lr=0.0005, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)

In [None]:
epoch_progress_bar = tqdm.tqdm(range(0, 500))
for epoch in epoch_progress_bar:
    avg_epoch_loss = 0
    data_progress_bar = tqdm.tqdm(FreeSoundDataLoader)
    positives=0
    for data, targets in data_progress_bar:
        data = data.float().to(device)
        targets = targets.long().to(device)
        

        optimizer.zero_grad()
        outputs = Model(data)
        

        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        loss_val = loss.item()
        avg_epoch_loss+=loss_val
        data_progress_bar.set_description(desc="Loss: "+str(loss_val))
        
        outputs = np.argmax(outputs.detach().cpu().numpy(), axis=1)
        targets = targets.cpu().numpy()
        positives += np.sum(targets==outputs)
    
    print('Epoch Loss: ', str(avg_epoch_loss/len(FreeSoundDataLoader)))
    print('Train Acc ', str(positives*100/(len(FreeSoundDataLoader)*mini_batch_size)))
    
    # Validation
    data_test_progress_bar = tqdm.tqdm(FreeSoundDataTestLoader)
    positives=0
    for data, targets in data_test_progress_bar:
        data = data.float().to(device)
        targets = targets.numpy()
        outputs = Model(data)
        outputs = np.argmax(outputs.detach().cpu().numpy(), axis=1)
        positives += np.sum(targets==outputs)
        
    print('Valid Acc ', str(positives*100/(len(FreeSoundDataTestLoader)*mini_batch_size)))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.7344435589222966
Train Acc  3.059071729957806


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  2.9166666666666665


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.7266770857798903
Train Acc  2.979957805907173


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  2.9166666666666665


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.7171148831331275
Train Acc  3.850210970464135


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  5.729166666666667


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.7099015923995005
Train Acc  6.47415611814346


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  7.135416666666667


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.6991743800006334
Train Acc  7.093881856540085


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  8.854166666666666


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.68970866142949
Train Acc  8.452004219409282


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  9.322916666666666


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.6899076413504686
Train Acc  8.399261603375528


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  8.958333333333334


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.6850461084631423
Train Acc  8.900316455696203


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  9.583333333333334


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.683735382707813
Train Acc  9.071729957805907


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  8.489583333333334


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.6796279044090947
Train Acc  9.572784810126583


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  8.697916666666666


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.6905860719801504
Train Acc  8.082805907172995


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  8.072916666666666


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.686921720263324
Train Acc  8.70253164556962


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  8.854166666666666


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.682556562785861
Train Acc  9.203586497890296


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  8.59375


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.678787557384636
Train Acc  9.717827004219409


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  9.427083333333334


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.6764035285273686
Train Acc  9.691455696202532


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  8.177083333333334


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.6791800999943214
Train Acc  9.546413502109704


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  9.270833333333334


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.6732315956791743
Train Acc  10.271624472573839


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  9.010416666666666


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.670765641369397
Train Acc  10.429852320675106


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  10.729166666666666


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.669883227046532
Train Acc  10.495780590717299


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  10.520833333333334


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.664170014707348
Train Acc  11.023206751054852


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  10.46875


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.668742496756059
Train Acc  10.469409282700422


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  11.40625


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.669775685177574
Train Acc  10.271624472573839


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  10.833333333333334


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


Epoch Loss:  3.6623015645184096
Train Acc  11.102320675105485


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


Valid Acc  11.354166666666666


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))

In [31]:
torch.save(Model.state_dict(), "model_weights/FreeSound_1D_conv_smaller_0_epoch.stDict")

In [32]:
M2 = FreeSound_Sense()
M2.load_state_dict(torch.load("model_weights/FreeSound_1D_conv_smaller_0_epoch.stDict"))

<All keys matched successfully>

In [174]:
i=0
for data in FreeSoundDataLoader:
    if i==4:
        O = Model(data[0].float().to(device))
        A = data[1]
        #print(data[1])
        break
    i+=1
O = O.detach().cpu().numpy()
K = np.argmax(O, axis=1)==A.numpy()
print(np.sum(K)/len(K), np.sum(K))
list(zip(A, K))

0.3125 10


[(tensor(0), False),
 (tensor(3), True),
 (tensor(12), False),
 (tensor(25), False),
 (tensor(15), True),
 (tensor(20), False),
 (tensor(4), False),
 (tensor(12), False),
 (tensor(34), False),
 (tensor(13), False),
 (tensor(24), False),
 (tensor(26), True),
 (tensor(7), False),
 (tensor(22), False),
 (tensor(40), True),
 (tensor(20), True),
 (tensor(17), True),
 (tensor(32), True),
 (tensor(7), False),
 (tensor(24), False),
 (tensor(29), False),
 (tensor(10), False),
 (tensor(38), True),
 (tensor(34), False),
 (tensor(30), True),
 (tensor(17), False),
 (tensor(26), True),
 (tensor(18), False),
 (tensor(12), False),
 (tensor(17), False),
 (tensor(1), False),
 (tensor(39), False)]