In [1]:
import os

import librosa
import librosa.display
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm.notebook as tqdm
from scipy.signal import butter, sosfilt
from torch.utils.data import DataLoader
from torchsummary import summary
# Show the contents of the dataset directory (audio folders and CSV files).
os.listdir('data/freesound-audio-tagging')

['audio_test',
 'audio_train',
 'sample_submission.csv',
 'test_post_competition.csv',
 'train.csv',
 'train_post_competition.csv']

In [2]:
# Count the entries in the training-audio folder. The value is not displayed
# because this is not the last expression of the cell.
len(os.listdir('data/freesound-audio-tagging/audio_train'))
# Training metadata: one row per clip with its file name and label.
df = pd.read_csv('data/freesound-audio-tagging/train.csv')
df.head()

Unnamed: 0,fname,label,manually_verified
0,00044347.wav,Hi-hat,0
1,001ca53d.wav,Saxophone,1
2,002d256b.wav,Trumpet,0
3,0033e230.wav,Glockenspiel,1
4,00353774.wav,Cello,1


In [3]:
# Sample rate requested from librosa when loading audio.
sr = 44100
# Number of samples in every clip fed to the model (sr / 2).
input_length = int(sr/2)
# NOTE(review): the DataLoaders further down are created with batch_size=64,
# not with this value — confirm which one is intended.
batch_size = 32


def audio_norm(data):
    """Min-max scale ``data`` and centre it around zero.

    The minimum maps to -0.5 and the maximum to just under +0.5. The small
    epsilon in the denominator avoids a division by zero for constant input.
    The input array is not modified.
    """
    lowest = np.min(data)
    span = np.max(data) - lowest + 1e-6
    scaled = (data - lowest) / span
    return scaled - 0.5


def load_audio_file(file_path, input_length=input_length):
    """Load an audio file and return a fixed-length, normalised clip.

    The file is decoded at the module-level sample rate ``sr``. Recordings
    longer than ``input_length`` samples are cropped at a random offset;
    shorter (or equal-length) ones are zero-padded on the right.

    Args:
        file_path: path of the audio file to read.
        input_length: number of samples in the returned clip.

    Returns:
        Array of shape ``(1, input_length)`` scaled by ``audio_norm``.
    """
    data = librosa.load(file_path, sr=sr)[0]

    if len(data) > input_length:
        max_offset = len(data) - input_length
        # randint's upper bound is exclusive; +1 makes the final window
        # (offset == max_offset) reachable as well.
        offset = np.random.randint(max_offset + 1)
        data = data[offset:input_length + offset]
    else:
        # Pad with the decoded dtype so both branches return the same dtype
        # and samples can be stacked into one batch without mixed precision.
        padded = np.zeros(input_length, dtype=data.dtype)
        padded[:len(data)] = data
        data = padded

    data = audio_norm(data)
    return np.array([data])

In [4]:
# Plot one normalised training clip as a sanity check of the loading code.
# NOTE(review): `waveplot` may be unavailable in newer librosa releases
# (`waveshow` is its successor) — confirm against the installed version.
librosa.display.waveplot(load_audio_file('data/freesound-audio-tagging/audio_train/001ca53d.wav')[0],
                         sr=sr, 
                         max_points=50000.0, 
                         x_axis='time', 
                         offset=0.0)

<matplotlib.collections.PolyCollection at 0x1602a2b3f48>

# Data Loading

In [5]:
# Alphabetically sorted list of the distinct labels found in the training CSV.
labels = sorted(set(df.label))
# Label name -> integer class index (position in `labels`).
label_to_indice = {l:i for i,l in enumerate(labels)}
# Integer class index -> label name (inverse of the mapping above).
indice_to_label = {i:l for i,l in enumerate(labels)}

In [6]:
class FreeSoundDataset(torch.utils.data.Dataset):
    """Map-style dataset over the clips listed in a FreeSound metadata CSV.

    The CSV is split by row position: the first ``split`` fraction of rows is
    the training subset, the remainder the validation subset.

    Args:
        df_path: path of the CSV file; it must have ``fname`` and ``label``
            columns.
        data_path: directory that contains the audio files named in ``fname``.
        train: select the training subset when True, otherwise the rest.
        split: fraction of rows assigned to the training subset.
    """

    def __init__(self, df_path, data_path, train=True, split=0.8):
        full_df = pd.read_csv(df_path)

        # Build the label vocabulary from the complete CSV, before splitting,
        # so the training and validation subsets always share one mapping even
        # when a label is missing from one of them.
        self.labels = sorted(set(full_df.label))
        self.label_to_indice = {l: i for i, l in enumerate(self.labels)}
        self.indice_to_label = {i: l for i, l in enumerate(self.labels)}

        split_at = int(len(full_df) * split)
        self.df = full_df[:split_at] if train else full_df[split_at:]

        self.data_path = data_path
        self.sr = 44100
        self.input_length = int(self.sr / 2)
        self.batch_size = 32

    def __len__(self):
        # Every row of the selected subset is a sample.
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(clip, class_index)`` for the ``idx``-th row of this subset."""
        # Positional lookup in this instance's own subset, not the notebook
        # global dataframe, so the train/validation split is honoured.
        row = self.df.iloc[idx]
        file_path = os.path.join(self.data_path, row['fname'])
        label_indice = self.label_to_indice[row['label']]
        return load_audio_file(file_path), label_indice
        

In [7]:
def bandpass_filter(signal, low, high, order = 5):
    """Band-pass ``signal`` with a digital Butterworth filter.

    ``low`` and ``high`` are the band edges handed to ``butter``. The filter
    is designed as second-order sections and applied with ``sosfilt``.
    """
    band_edges = [low, high]
    sections = butter(order, band_edges, btype = 'band', analog = False, output = 'sos')
    return sosfilt(sections, signal)
    
def make_signal(raw_signal, nyq = sr/2):
    """Stack a signal with band-passed copies of itself.

    Row 0 of the result is ``raw_signal``; each following row is the signal
    filtered by ``bandpass_filter`` between two consecutive cut-offs.

    Args:
        raw_signal: array of samples.
        nyq: frequency used to normalise the cut-offs (defaults to ``sr / 2``).

    Returns:
        Array with one row per entry of the cut-off list (8 rows) and as many
        columns as ``raw_signal`` has samples.
    """
    raw_signal = np.asarray(raw_signal)
    cut_offs = [i/nyq for i in [1, 256, 512, 1024, 2048, 4096, 8192, 11024]]

    # Size the buffer from the signal itself; there is no `self` in a free
    # function, so the previous `self.input_length` raised NameError.
    return_signal = np.zeros((len(cut_offs), raw_signal.shape[-1]))
    return_signal[0] = raw_signal

    for i in range(1, len(cut_offs)):
        return_signal[i] = bandpass_filter(raw_signal, cut_offs[i-1], cut_offs[i])
    return return_signal
        
    
def shuffletwo(x, y):
    """Shuffle ``x`` and ``y`` in place from the same random-generator state.

    The global NumPy generator state is captured once and restored before each
    shuffle, so both sequences are shuffled starting from an identical state.
    """
    saved_state = np.random.get_state()
    for sequence in (x, y):
        # Restoring before the first shuffle is a no-op; before the second it
        # rewinds the generator to where the first shuffle started.
        np.random.set_state(saved_state)
        np.random.shuffle(sequence)

In [26]:
# Training subset (first 80% of the CSV rows) and held-out subset (the rest).
FreeSoundData = FreeSoundDataset('data/freesound-audio-tagging/train.csv',
                                 'data/freesound-audio-tagging/audio_train/')
FreeSoundDataTest = FreeSoundDataset('data/freesound-audio-tagging/train.csv',
                                     'data/freesound-audio-tagging/audio_train/',
                                     train=False)
FreeSoundDataLoader = DataLoader(FreeSoundData, batch_size=64, shuffle=True)
# `shuffle` is a boolean flag; the previous `shuffle=32` was truthy and so
# silently shuffled the evaluation data. Evaluation order is kept fixed.
FreeSoundDataTestLoader = DataLoader(FreeSoundDataTest, batch_size=64, shuffle=False)

# Model

In [27]:
class FreeSound_Sense(torch.nn.Module):
    """1-D CNN that maps a single-channel waveform to scores for 42 classes.

    The network has six convolutional stages. Every stage after the first
    concatenates its input with its output along the channel axis, so the
    channel count doubles per stage (16 -> 32 -> 64 -> 128 -> 256 -> 512).
    The time axis is then averaged away and three fully connected layers
    produce the class scores.

    Input:  tensor of shape ``(batch, 1, time)``. The pooling layers shrink
            the time axis by 4*4*4*4*2*2 = 1024, so ``time`` must be at least
            1024.
    Output: tensor of shape ``(batch, 42)`` holding raw logits.

    ``forward`` returns logits (no softmax) because ``nn.CrossEntropyLoss``
    applies log-softmax itself; feeding it probabilities flattens the
    gradients. The arg-max class is unaffected. Use ``predict_proba`` when
    normalised probabilities are needed.

    NOTE(review): each ``conv1d_*_*_3`` layer is applied twice per stage and
    ``batchnorm_16`` follows four different convolutions, so those weights and
    running statistics are shared. This is kept unchanged so existing
    checkpoints still load — confirm the sharing is intended.
    """

    def __init__(self):
        super(FreeSound_Sense, self).__init__()
        # "Same" padding for odd kernels: convolutions keep the time length.
        padding_k_9 = (9 - 1) // 2
        padding_k_3 = (3 - 1) // 2

        # First Block
        self.conv1d_1_16_9 = nn.Conv1d(in_channels=1, out_channels=16,
                                       kernel_size=9, padding=padding_k_9)
        self.conv1d_16_16_9 = nn.Conv1d(in_channels=16, out_channels=16,
                                        kernel_size=9, padding=padding_k_9)

        # Second Block
        self.conv1d_16_16_3 = nn.Conv1d(in_channels=16, out_channels=16,
                                        kernel_size=3, padding=padding_k_3)

        # Third Block
        self.conv1d_32_32_3 = nn.Conv1d(in_channels=32, out_channels=32,
                                        kernel_size=3, padding=padding_k_3)

        # Fourth Block
        self.conv1d_64_64_3 = nn.Conv1d(in_channels=64, out_channels=64,
                                        kernel_size=3, padding=padding_k_3)

        # Fifth Block
        self.conv1d_128_128_3 = nn.Conv1d(in_channels=128, out_channels=128,
                                          kernel_size=3, padding=padding_k_3)

        # Sixth Block
        self.conv1d_256_256_3 = nn.Conv1d(in_channels=256, out_channels=256,
                                          kernel_size=3, padding=padding_k_3)

        # maxpool_16, maxpool_8 and sigm are not used by forward(); they are
        # kept so the module's attributes and printed layout stay the same.
        self.maxpool_16 = nn.MaxPool1d(16)
        self.maxpool_8 = nn.MaxPool1d(8)
        self.maxpool_4 = nn.MaxPool1d(4)
        self.maxpool_2 = nn.MaxPool1d(2)

        self.relu = nn.ReLU()
        self.sigm = nn.Sigmoid()
        self.softmax = nn.Softmax(dim=1)  # used by predict_proba only
        self.dropout_0_1 = nn.Dropout(0.1)

        self.batchnorm_16 = nn.BatchNorm1d(16)
        self.batchnorm_32 = nn.BatchNorm1d(32)
        self.batchnorm_64 = nn.BatchNorm1d(64)
        self.batchnorm_128 = nn.BatchNorm1d(128)
        self.batchnorm_256 = nn.BatchNorm1d(256)

        # 512 input features = 256 + 256 channels after the last concatenation.
        self.fc_512_64 = nn.Linear(in_features=512, out_features=64)
        self.fc_64_1024 = nn.Linear(in_features=64, out_features=1024)
        self.fc_1024_42 = nn.Linear(in_features=1024, out_features=42)

    def _skip_stage(self, x, conv, norm, pool):
        """Run (conv -> norm -> ReLU) twice, concat with the input, pool, dropout."""
        skip = x
        x = self.relu(norm(conv(x)))
        x = self.relu(norm(conv(x)))
        x = torch.cat((skip, x), 1)
        return self.dropout_0_1(pool(x))

    def forward(self, x):
        """Return class logits of shape ``(batch, 42)`` for ``x`` of shape ``(batch, 1, time)``."""
        # First Block: two distinct 9-tap convolutions, no skip concatenation.
        x = self.relu(self.batchnorm_16(self.conv1d_1_16_9(x)))
        x = self.relu(self.batchnorm_16(self.conv1d_16_16_9(x)))
        x = self.dropout_0_1(self.maxpool_4(x))

        # Second to Sixth Block share one structure and differ only in the
        # layers they use.
        stages = (
            (self.conv1d_16_16_3, self.batchnorm_16, self.maxpool_4),
            (self.conv1d_32_32_3, self.batchnorm_32, self.maxpool_4),
            (self.conv1d_64_64_3, self.batchnorm_64, self.maxpool_4),
            (self.conv1d_128_128_3, self.batchnorm_128, self.maxpool_2),
            (self.conv1d_256_256_3, self.batchnorm_256, self.maxpool_2),
        )
        for conv, norm, pool in stages:
            x = self._skip_stage(x, conv, norm, pool)

        # Global average pooling over the time axis.
        x = torch.mean(x, 2)

        # Final Layers
        x = torch.flatten(x, start_dim=1)
        x = self.relu(self.fc_512_64(x))
        x = self.relu(self.fc_64_1024(x))
        return self.fc_1024_42(x)

    def predict_proba(self, x):
        """Return softmax class probabilities of shape ``(batch, 42)``."""
        return self.softmax(self.forward(x))

In [28]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Model = FreeSound_Sense()
Model.float()
Model.to(device)
# Layer-by-layer summary for a single-channel input of 22050 samples.
summary(Model, (1, 22050))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv1d-1            [-1, 16, 22050]             160
       BatchNorm1d-2            [-1, 16, 22050]              32
              ReLU-3            [-1, 16, 22050]               0
            Conv1d-4            [-1, 16, 22050]           2,320
       BatchNorm1d-5            [-1, 16, 22050]              32
              ReLU-6            [-1, 16, 22050]               0
         MaxPool1d-7             [-1, 16, 5512]               0
           Dropout-8             [-1, 16, 5512]               0
            Conv1d-9             [-1, 16, 5512]             784
      BatchNorm1d-10             [-1, 16, 5512]              32
             ReLU-11             [-1, 16, 5512]               0
           Conv1d-12             [-1, 16, 5512]             784
      BatchNorm1d-13             [-1, 16, 5512]              32
             ReLU-14             [-1, 1

In [29]:
# Prefer the second GPU as before, but only when it exists: a hard-coded
# "cuda:1" raises an invalid-device error on single-GPU machines.
if torch.cuda.device_count() > 1:
    device = torch.device("cuda:1")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Fresh, untrained model in float32 on the selected device.
Model = FreeSound_Sense()
Model.float()
Model.to(device)

FreeSound_Sense(
  (conv1d_1_16_9): Conv1d(1, 16, kernel_size=(9,), stride=(1,), padding=(4,))
  (conv1d_16_16_9): Conv1d(16, 16, kernel_size=(9,), stride=(1,), padding=(4,))
  (conv1d_16_16_3): Conv1d(16, 16, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv1d_32_32_3): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv1d_64_64_3): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv1d_128_128_3): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv1d_256_256_3): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (maxpool_16): MaxPool1d(kernel_size=16, stride=16, padding=0, dilation=1, ceil_mode=False)
  (maxpool_8): MaxPool1d(kernel_size=8, stride=8, padding=0, dilation=1, ceil_mode=False)
  (maxpool_4): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (maxpool_2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (relu): ReLU()
  (sigm): Sigmoid()
  (softmax): Soft

In [30]:
# CrossEntropyLoss combines log-softmax with negative log-likelihood, so it is
# meant to receive raw, unnormalised class scores and integer class targets.
# NOTE(review): verify that the model's forward() returns logits rather than
# softmax probabilities.
criterion = nn.CrossEntropyLoss()
#optimizer = optim.SGD(Model.parameters(), lr=0.001, momentum=0.9)
# Adam over all model parameters; apart from lr these are the library defaults.
optimizer = optim.Adam(Model.parameters(), lr=0.0005, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)

In [None]:
# Train for 80 epochs; after every epoch report the mean training loss, the
# training accuracy and the accuracy on the held-out loader.
epoch_progress_bar = tqdm.tqdm(range(0, 80))
for epoch in epoch_progress_bar:
    # Training mode: dropout active, BatchNorm uses batch statistics.
    Model.train()
    avg_epoch_loss = 0
    positives = 0
    seen = 0
    data_progress_bar = tqdm.tqdm(FreeSoundDataLoader)
    for data, targets in data_progress_bar:
        data = data.float().to(device)
        targets = targets.long().to(device)

        optimizer.zero_grad()
        outputs = Model(data)

        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        loss_val = loss.item()
        avg_epoch_loss += loss_val
        data_progress_bar.set_description(desc="Loss: "+str(loss_val))

        predictions = outputs.detach().argmax(dim=1)
        positives += (predictions == targets).sum().item()
        # Count the samples actually processed rather than assuming a batch
        # size: the loaders use 64 per batch and the last batch may be smaller.
        seen += targets.size(0)

    print('Epoch Loss: ', str(avg_epoch_loss/max(len(FreeSoundDataLoader), 1)))
    print('Train Acc ', str(positives*100/max(seen, 1)))

    # Validation
    # Evaluation mode disables dropout and makes BatchNorm use its running
    # statistics; no_grad avoids building the autograd graph.
    Model.eval()
    positives = 0
    seen = 0
    data_test_progress_bar = tqdm.tqdm(FreeSoundDataTestLoader)
    with torch.no_grad():
        for data, targets in data_test_progress_bar:
            data = data.float().to(device)
            targets = targets.long().to(device)
            predictions = Model(data).argmax(dim=1)
            positives += (predictions == targets).sum().item()
            seen += targets.size(0)

    print('Valid Acc ', str(positives*100/max(seen, 1)))

HBox(children=(IntProgress(value=0, max=80), HTML(value='')))

HBox(children=(IntProgress(value=0, max=119), HTML(value='')))

In [32]:
# Persist only the learned parameters (state dict), not the whole module.
# NOTE(review): the file name says "20_epoch" while the training cell iterates
# over range(0, 80) — confirm which checkpoint this actually is.
torch.save(Model.state_dict(), "FreeSound_1D_conv_global_pool_deep_batchnorm_20_epoch.stDict")

In [43]:
# Inspect the predictions on the fifth batch (index 4) of the training loader.
# Evaluation mode + no_grad: dropout is disabled, BatchNorm running statistics
# are not updated by this inspection, and no autograd graph is built.
Model.eval()
O = A = None
with torch.no_grad():
    for i, data in enumerate(FreeSoundDataLoader):
        if i == 4:
            O = Model(data[0].float().to(device))
            A = data[1]
            break
if O is None:
    raise RuntimeError("FreeSoundDataLoader yielded fewer than 5 batches")

O = O.detach().cpu().numpy()
# K[j] is True when the arg-max prediction for sample j matches its label.
K = np.argmax(O, axis=1)==A.numpy()
print(np.sum(K)/len(K), np.sum(K))
list(zip(A, K))

0.40625 13


[(tensor(36), True),
 (tensor(22), False),
 (tensor(27), False),
 (tensor(40), False),
 (tensor(1), False),
 (tensor(37), False),
 (tensor(25), True),
 (tensor(6), True),
 (tensor(25), False),
 (tensor(34), False),
 (tensor(10), False),
 (tensor(8), True),
 (tensor(11), False),
 (tensor(25), False),
 (tensor(15), True),
 (tensor(3), True),
 (tensor(10), True),
 (tensor(23), False),
 (tensor(39), False),
 (tensor(10), True),
 (tensor(30), True),
 (tensor(36), False),
 (tensor(17), False),
 (tensor(30), True),
 (tensor(12), True),
 (tensor(39), True),
 (tensor(18), False),
 (tensor(5), False),
 (tensor(30), True),
 (tensor(8), False),
 (tensor(4), False),
 (tensor(30), False)]