In [1]:
from __future__ import print_function, division
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import pandas as pd
from tqdm import tqdm

In [2]:
def label_to_num(name):
    """Map an audio file name to its integer class index.

    The class tag is the second ``_``-separated token of the part of
    ``name`` that precedes the first ``.`` (for example ``"12_gui.npy"``
    yields the tag ``"gui"``).

    Args:
        name: File name to parse (a bare file name, not a path containing
            dots in its directory part).

    Returns:
        int: 0 for 'gui', 1 for 'hi', 2 for 'lau', 3 for 'sax', 4 for 'vio'.

    Raises:
        ValueError: If ``name`` has no ``_``-separated tag, or the tag is not
            one of the five known classes. (Previously an unknown tag
            silently returned ``None``.)
    """
    class_index = {'gui': 0, 'hi': 1, 'lau': 2, 'sax': 3, 'vio': 4}

    tokens = name.split(".")[0].split("_")
    if len(tokens) < 2:
        raise ValueError(
            "cannot extract a class tag from file name {!r}".format(name))

    label = tokens[1]
    if label not in class_index:
        raise ValueError(
            "unknown class tag {!r} in file name {!r}".format(label, name))
    return class_index[label]

class Audio_dataset(Dataset):
    """Dataset that serves one stored feature array per file in a directory.

    Every entry of ``file_dir`` is treated as one sample. The class label is
    derived from the file name via ``label_to_num``.
    """

    def __init__(self,file_dir):
        """Index the files found in ``file_dir``.

        Args:
            file_dir: Directory whose entries are the samples.
        """
        self.file_dir = file_dir

        # os.listdir returns entries in arbitrary order; sort them so that the
        # index -> file mapping is the same on every run and every machine.
        self.files = sorted(os.listdir(self.file_dir))

    def __len__(self):
        """Return the number of files (samples) in the directory."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(windows, label)``.

        The stored 2-D array of shape ``(F, T)`` is cut into overlapping
        windows of 50 steps (stride 1) along dimension 1 and rearranged to
        ``(T - 49, F, 50)``, i.e. one ``F x 50`` patch per time step.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        file_name = self.files[idx]

        # torch.from_numpy needs an ndarray, so each file is expected to hold
        # a NumPy array that was written with torch.save.
        # NOTE(review): torch.load unpickles the file - only load trusted data.
        # Newer PyTorch releases may default to weights_only=True and reject
        # pickled NumPy arrays; confirm against the installed version.
        audio = torch.from_numpy(torch.load(os.path.join(self.file_dir, file_name)))

        audio = audio.unfold(1,50,1)  # (F, T) -> (F, T - 49, 50): a window at each time step

        audio = audio.permute(1,0,2)  # -> (T - 49, F, 50): window index first

        label = label_to_num(file_name)

        return audio,label


In [3]:
# Switches read by train_model after every epoch.
log = True # append the epoch's loss and accuracy values to the CSV log file
log_w = True # save the model weights (state_dict) to a checkpoint file

In [4]:
# Datasets: one per split, each reading the files of its own folder.
dataset_train = Audio_dataset(file_dir="./data/melspectrogram/train")
dataset_valid = Audio_dataset(file_dir="./data/melspectrogram/validation")

# Loaders: one sample per batch; only the training split is shuffled.
train_loader = DataLoader(dataset_train, batch_size=1, shuffle=True, num_workers=0)
valid_loader = DataLoader(dataset_valid, batch_size=1, shuffle=False, num_workers=0)

# Split sizes, used later to normalise the per-epoch loss and accuracy.
train_set_size, valid_set_size = len(dataset_train), len(dataset_valid)
print(train_set_size)
print(valid_set_size)

1500
340


In [5]:
# Fetch one batch from the training loader as a quick sanity check.
audio,label = next(iter(train_loader))
print(audio.shape,label) # check the shape of the inputs

torch.Size([1, 102, 60, 50]) tensor([1])


We will use a CNN-GRU model; alternatively, you can try a different model.

In [6]:
class Net(nn.Module):
    """CNN-GRU classifier over a sequence of 60x50 patches.

    Each patch is encoded by a small CNN into a 128-d vector, the sequence of
    vectors is fed through a single-layer GRU, and the GRU output at the last
    time step is mapped to 5 class scores.
    """

    def __init__(self):
        super(Net, self).__init__()

        # Three (3x3 conv, padding 1) + ReLU + 2x2 max-pool stages.
        # Spatial size per patch: 60x50 -> 30x25 -> 15x12 -> 7x6, 32 channels.
        self.audio_encoder = nn.Sequential(
            nn.Conv2d(1, 32, 3, 1, 1, bias=True),
            nn.ReLU(),
            nn.MaxPool2d(2, stride=2),
            nn.Conv2d(32, 32, 3, 1, 1, bias=True),
            nn.ReLU(),
            nn.MaxPool2d(2, stride=2),
            nn.Conv2d(32, 32, 3, 1, 1, bias=True),
            nn.ReLU(),
            nn.MaxPool2d(2, stride=2)
        )

        # Projects the flattened 32*7*6 CNN features to a 128-d vector.
        self.audio_linear = nn.Sequential(nn.LeakyReLU(0.2, inplace=True),
        nn.Linear(32*7*6,128))

        # Attribute name (including its spelling) is kept for compatibility.
        self.gru_hiddend_dim = 128

        self.gru = nn.GRU(128, 128, 1, batch_first=True)

        self.out = nn.Sequential(nn.Linear(128,5)) # 5 classes

    def forward(self,audio_data):
        """Classify a batch of patch sequences.

        Args:
            audio_data: Tensor whose first dimension is the batch size and
                whose remaining elements can be viewed as ``(-1, 1, 60, 50)``,
                e.g. shape ``(batch, seq_len, 60, 50)``.

        Returns:
            Tensor of shape ``(batch, 5)`` with one score per class.
        """
        batch_size = audio_data.size(0)

        # Fold batch and time together so the CNN sees one patch per row.
        x = self.audio_encoder(audio_data.view(-1,1,60,50))

        x = self.audio_linear(x.view(-1,32*7*6))

        # Unfold back to (batch, seq_len, 128) for the GRU.
        x = x.view((batch_size,-1,128))

        # Initial hidden state of ones, created on the same device and with the
        # same dtype as the activations instead of relying on a module-level
        # `device` variable being defined and in sync with the model.
        h0 = x.new_ones(1, batch_size, self.gru_hiddend_dim)

        x, _ = self.gru(x, h0)  # x: (batch_size, seq_length, hidden_size)

        x = x[:,-1,:] # take the last time step for prediction

        prediction = self.out(x.reshape(-1,128))
        return prediction

In [7]:
# Fetch a fresh batch; `audio` is reused below as the input of the model smoke test.
audio,label = next(iter(train_loader))
print(audio.shape,label)

torch.Size([1, 102, 60, 50]) tensor([2])


In [8]:
# Smoke test: push one batch through an untrained model.
device = "cpu" # keep everything on the CPU for now, just to check that the model runs
model = Net()

out = model(audio)
print(out.shape) # check the output shape for one input batch

torch.Size([1, 5])


In [9]:
# Train on the GPU when one is available; otherwise fall back to the CPU so
# that the cell still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

def _run_epoch(model, loader, run_device, optimizer=None):
    """Run one pass over ``loader``: train if ``optimizer`` is given, else evaluate.

    Uses the module-level ``criterion`` as the loss function.

    Returns:
        tuple: ``(loss_sum, n_correct)`` where ``loss_sum`` is the sum of the
        batch losses weighted by batch size and ``n_correct`` is the number of
        correctly classified samples (a Python int).
    """
    training = optimizer is not None
    loss_sum = 0.0
    n_correct = 0

    # Gradients are only tracked while training.
    with torch.set_grad_enabled(training):
        for data, labels in loader:
            # Same float cast in both phases, so validation cannot fail on a
            # dtype that training would have accepted.
            data = data.to(run_device).float()
            labels = labels.long().to(run_device)

            if training:
                optimizer.zero_grad()

            outputs = model(data)
            loss = criterion(outputs, labels)

            if training:
                loss.backward()
                optimizer.step()

            _, preds = torch.max(outputs, 1)

            # statistics
            loss_sum += loss.item() * data.size(0)
            n_correct += torch.sum(preds == labels).item()

    return loss_sum, n_correct


def train_model(model, optimizer, num_epochs=25):
    """Train ``model`` for ``num_epochs`` epochs, validating after each one.

    Besides its arguments, the function reads these module-level names:
    ``train_loader``, ``valid_loader``, ``train_set_size``, ``valid_set_size``,
    ``criterion``, ``log``, ``log_w``, ``my_file`` and ``save_dir``.

    After every epoch it prints the train/validation loss and accuracy. If
    ``log`` is true it appends them as one row to the CSV file ``my_file``.
    If ``log_w`` is true it saves the model's ``state_dict`` to
    ``save_dir + "/model<epoch>.pth"``.

    Args:
        model: Network to train; batches are moved to the device of its
            parameters.
        optimizer: Optimizer already bound to ``model``'s parameters.
        num_epochs: Number of passes over the training set.

    Returns:
        The trained ``model`` (the same object that was passed in).
    """
    # Follow the model rather than a global, so data and weights always agree.
    run_device = next(model.parameters()).device

    for epoch in tqdm(range(0,num_epochs)):
        # Each epoch has a training and a validation phase.
        model.train()  # Set model to training mode
        loss_sum, n_correct = _run_epoch(model, train_loader, run_device, optimizer)
        train_loss = loss_sum / train_set_size
        train_acc = n_correct / train_set_size

        model.eval()   # Set model to evaluate mode
        loss_sum, n_correct = _run_epoch(model, valid_loader, run_device)
        val_loss = loss_sum / valid_set_size
        val_acc = n_correct / valid_set_size

        print("train loss:{:.4f} train_acc: {:.2} val loss:{:.4f} val_acc: {:.2}".format(train_loss,train_acc,val_loss,val_acc))

        if log:
            log_data = {'epoch': epoch,
                't_loss': train_loss,
                't_acc': train_acc,
                'v_loss': val_loss,
                'v_acc': val_acc,
            }
            df = pd.DataFrame(log_data,index=[0])
            # Rows are appended; the header is written only when the file is
            # new or empty, so re-running the training does not insert a
            # second header line in the middle of the CSV.
            needs_header = (not os.path.exists(my_file)) or os.path.getsize(my_file) == 0
            df.to_csv(my_file, header=needs_header, index=False, mode="a")

        if log_w:
            torch.save(model.state_dict(), save_dir + "/model{}.pth".format(epoch))
    return model

print("Device: ",device)

# Model, loss and optimiser.
model = Net().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()  # cross-entropy loss over the class scores

# Output locations: checkpoints go to ./models/<file_name>/,
# per-epoch loss and accuracy go to <file_name>.csv.
file_name = "cnn_gru"
save_dir = os.path.join("./models", file_name)
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
my_file = file_name + ".csv"

# Run the training.
num_epochs = 10
model = train_model(model, optimizer, num_epochs=num_epochs)

Device:  cuda


 10%|█         | 1/10 [00:17<02:36, 17.42s/it]

train loss:1.0617 train_acc: 0.58 val loss:0.6740 val_acc: 0.72


 20%|██        | 2/10 [00:34<02:18, 17.30s/it]

train loss:0.6335 train_acc: 0.79 val loss:0.4690 val_acc: 0.85


 30%|███       | 3/10 [00:51<02:01, 17.32s/it]

train loss:0.4648 train_acc: 0.85 val loss:0.4535 val_acc: 0.84


 40%|████      | 4/10 [01:09<01:43, 17.32s/it]

train loss:0.3648 train_acc: 0.89 val loss:0.2960 val_acc: 0.91


 50%|█████     | 5/10 [01:26<01:26, 17.32s/it]

train loss:0.2744 train_acc: 0.91 val loss:0.3683 val_acc: 0.88


 60%|██████    | 6/10 [01:44<01:09, 17.41s/it]

train loss:0.2431 train_acc: 0.92 val loss:0.2643 val_acc: 0.91


 70%|███████   | 7/10 [02:01<00:52, 17.49s/it]

train loss:0.1721 train_acc: 0.95 val loss:0.4827 val_acc: 0.84


 80%|████████  | 8/10 [02:19<00:34, 17.38s/it]

train loss:0.1399 train_acc: 0.96 val loss:0.2574 val_acc: 0.92


 90%|█████████ | 9/10 [02:36<00:17, 17.33s/it]

train loss:0.1130 train_acc: 0.97 val loss:0.2608 val_acc: 0.9


100%|██████████| 10/10 [02:53<00:00, 17.34s/it]

train loss:0.1013 train_acc: 0.97 val loss:0.2168 val_acc: 0.92





Now that training is finished, we will evaluate the performance of our model more thoroughly in another notebook.
Remember that the samples in the test set are not equally distributed, so the accuracy we calculated for the test set during training is the unweighted accuracy.
In the next notebook we will check the performance of the model for each individual class with a confusion matrix. You can try using a different model or different hyperparameters to get better results. Also note that our model is overfitting. You can try to address this by adding dropout or L2 regularization.