# Genre Classification Using Recurrent Neural Networks

In [11]:
import pandas as pd
import numpy as np
from random import shuffle

import torch
from torch import nn
from torch import optim
from torch.optim import lr_scheduler
from torch.nn import functional as F
from torch import optim
from torch.autograd import Variable


import copy
import time

from model import SequenceClassification
from utils import *
from train_classifier import train_model

import os

# Dataset Class

- The dataset class lets the training loop iterate over the dataset and allows looping over it indefinitely
- During each iteration, the dataset returns a tuple of torch.tensor objects for the temporal inputs and audio inputs, the target variable, and the track IDs
- The target variable is encoded with the one-hot encoder from scikit-learn preprocessing

In [3]:
class AudioDatasetClassification(Dataset):
    """Dataset serving temporal features, audio features and genre targets.

    Each item is ``((temporal, audio), target, track_ids)`` where the first
    three are ``torch.Tensor`` objects and ``track_ids`` is a 1-D NumPy array
    holding the ID(s) the item was built from.
    """

    # Number of values in one track's temporal sequence; ``__getitem__``
    # reshapes the temporal row to ``(TEMPORAL_LENGTH, 1)``.
    TEMPORAL_LENGTH = 224

    def __init__(self, ids):
        """Load the feature tables and fit the genre encoder.

        Args:
            ids: Indexable collection of the track IDs belonging to this
                split (for example a NumPy array or a list).
        """
        self.temporal_features = normalize_temporal_features(
            pd.read_csv('data/Temporal_Features.csv'))

        # The first CSV column is skipped (presumably a saved index column;
        # TODO confirm against the data files).
        self.audio_features = normalize_df(
            pd.read_csv('data/Audio_Features.csv').iloc[:, 1:])

        self.track_ids = ids

        track_info = pd.read_csv('data/trackinfo.csv').iloc[:, 1:]
        # Fit on every genre in the file *before* filtering to this split so
        # that all splits share one and the same one-hot layout.
        self.encoder = OneHotEncoder()
        self.encoder.fit(track_info['genre_top'].values.reshape(-1, 1))
        in_split = track_info['trackID'].isin(self.track_ids)
        self.track_info = track_info[in_split][['trackID', 'genre_top']]

    def __len__(self):
        """Return the number of track IDs in this split."""
        return len(self.track_ids)

    def __getitem__(self, idx):
        """Build the tensors for the track(s) selected by ``idx``.

        Args:
            idx: Index into the ID collection passed to the constructor.

        Returns:
            ``((temporal, audio), target, track_ids)``: ``temporal`` has shape
            ``(TEMPORAL_LENGTH, 1)``, ``audio`` holds the matching audio
            feature row(s), ``target`` is the dense one-hot genre encoding
            and ``track_ids`` is a 1-D array of the selected ID(s).
        """
        # ``Series.isin`` rejects scalars.  ``np.atleast_1d`` wraps *any*
        # scalar ID (np.int64, np.int32, plain int, ...) and leaves arrays
        # untouched, instead of special-casing np.int64 only.
        track_ids = np.atleast_1d(self.track_ids[idx])

        def rows_for(frame):
            # Rows of ``frame`` for the selected tracks, without the ID column.
            selected = frame[frame['trackID'].isin(track_ids)]
            return selected.drop(labels='trackID', axis=1)

        sample_temporal_features = torch.tensor(
            rows_for(self.temporal_features).values
        ).reshape(self.TEMPORAL_LENGTH, 1)
        sample_audio_features = torch.tensor(
            rows_for(self.audio_features).values)

        encoded = self.encoder.transform(rows_for(self.track_info).values)
        sample_target = torch.tensor(encoded.toarray())

        return (sample_temporal_features, sample_audio_features), sample_target, track_ids

# Model

- The model is implemented using PyTorch
- The network has two branches.
- The recurrent layers (bi-directional LSTM) process the temporal sequence, and the LSTM output at the last time step is used.
- A bi-directional LSTM processes the sequence from both left and right, to mitigate a common issue in recurrent networks: forgetting earlier signals in the sequence
- The audio features go through a fully connected layer
- The Bi-LSTM and fully connected outputs are concatenated and passed through a second fully connected layer
- The dropout in these layers helps prevent over-fitting
- The output has 16 nodes and the output logits are used for calculating cross entropy loss

In [17]:
class SequenceClassification(nn.Module):
    """Two-branch genre classifier: Bi-LSTM over time plus a dense audio branch.

    The temporal sequence goes through a bi-directional LSTM, the audio
    features through a fully connected layer; both results are concatenated
    and mapped to class logits by two further fully connected layers.
    """

    def __init__(self, batch_size, hidden_size=64, num_layers=2,
                 num_audio_features=8, num_classes=16):
        """Create the layers.

        Args:
            batch_size: Stored on the instance for callers; the forward pass
                does not depend on it.
            hidden_size: Width of the LSTM hidden state and the dense layers.
            num_layers: Number of stacked LSTM layers.
            num_audio_features: Size of the audio feature vector.
            num_classes: Number of output logits.
        """
        super().__init__()

        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.BiLSTM = nn.LSTM(
            input_size=1,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            # Inter-layer dropout only exists with more than one layer;
            # PyTorch warns when it is requested for a single layer.
            dropout=0.5 if self.num_layers > 1 else 0.0,
            bidirectional=True,
        )

        self.fc1 = nn.Linear(num_audio_features, self.hidden_size)
        self.dropout1 = nn.Dropout(0.5)

        # Input: audio branch (hidden_size) + both LSTM directions (2 * hidden_size).
        self.fc2 = nn.Linear(self.hidden_size + self.hidden_size * 2, self.hidden_size)
        self.fc3 = nn.Linear(self.hidden_size, num_classes)

        self.dropout2 = nn.Dropout(0.5)

    def forward(self, inputs):
        """Compute class logits.

        Args:
            inputs: Pair ``(temporal_features, audio_features)``.  The
                temporal tensor is permuted from ``(batch, seq, 1)`` to the
                ``(seq, batch, 1)`` layout the LSTM is configured for.

        Returns:
            Tensor of logits with one row per sample.
        """
        temporal_features, audio_features = inputs
        temporal_features = temporal_features.permute(1, 0, 2).float()
        audio_features = audio_features.float()

        # No explicit (h0, c0): the LSTM then starts from zero states that
        # already match the input's batch size and device.
        lstm_output, _ = self.BiLSTM(temporal_features)

        # NOTE(review): at the last time step the backward direction has only
        # seen the final element of the sequence.  Using the final hidden
        # states of both directions may work better, but would change the
        # behaviour of already-trained weights, so this is left as is.
        lstm_final_output = lstm_output[-1]

        audio_branch = self.dropout1(F.relu(self.fc1(audio_features)))

        # The audio branch may carry a singleton middle dimension; flatten it
        # to (batch, hidden_size) before concatenating.
        concat_features = torch.cat(
            [lstm_final_output, audio_branch.reshape(-1, self.hidden_size)],
            dim=1,
        )

        hidden = self.dropout2(F.relu(self.fc2(concat_features)))
        return self.fc3(hidden)


# Training Loop

- The training loop runs for the given number of epochs and keeps the model weights that achieve the best accuracy on the validation dataset.
- The loss criterion is cross-entropy for multi-class classification
- The optimizer is stochastic gradient descent (SGD) with momentum, paired with a step learning-rate scheduler that multiplies the learning rate by 0.1 every 7 epochs during training
- Other auxiliary functions help with inference on the test set and with calculating test-set accuracy.

In [15]:
def _run_phase(model, criterion, optimizer, batches, device, training):
    """Run one pass over ``batches`` and return ``(mean_loss, accuracy)``."""
    model.train(training)

    loss_sum = 0.0
    correct = 0
    seen = 0

    for inputs, target, _ids in batches:
        temporal_data, audio_data = inputs
        temporal_data = temporal_data.to(device)
        audio_data = audio_data.to(device)
        batch_len = temporal_data.size(0)

        # One-hot targets -> class indices.  Reshaping by batch length (rather
        # than ``squeeze``) keeps a batch of one sample two-dimensional.
        labels = target.reshape(batch_len, -1).argmax(dim=1).to(device)

        optimizer.zero_grad()

        with torch.set_grad_enabled(training):
            outputs = model((temporal_data, audio_data)).reshape(batch_len, -1)
            preds = outputs.argmax(dim=1)
            loss = criterion(outputs, labels)

            if training:
                loss.backward()
                optimizer.step()

        loss_sum += loss.item() * batch_len
        correct += int((preds == labels).sum().item())
        seen += batch_len

    if seen == 0:
        # An empty loader (e.g. drop_last with too few samples) has no metrics.
        return 0.0, 0.0
    return loss_sum / seen, correct / seen


def _save_training_curves(history):
    """Write the loss and accuracy curves to 'loss.pdf' and 'Acc.pdf'."""
    frame = pd.DataFrame(history)
    curves = (
        (['train_loss', 'val_loss'], 'Loss', 'loss.pdf'),
        (['train_acc', 'val_acc'], 'Acc', 'Acc.pdf'),
    )
    for columns, label, path in curves:
        ax = frame[columns].plot(figsize=(15, 8))
        ax.set_ylabel(label)
        ax.get_figure().savefig(path, dpi=300)


def train_model(model, criterion, optimizer, scheduler, num_epochs, dataloaders,
                save_plots=True):
    """Train ``model`` and return it loaded with the best validation weights.

    If 'classification_tained_model.pth' exists in the working directory its
    weights are loaded first, so training resumes from that checkpoint.

    Args:
        model: Network called as ``model((temporal, audio))``.
        criterion: Loss taking ``(logits, class_indices)``.
        optimizer: Optimizer over the model's parameters.
        scheduler: Learning-rate scheduler, stepped once per epoch.
        num_epochs: Number of epochs to run.
        dataloaders: Mapping with 'train' and 'val' iterables that yield
            ``((temporal, audio), one_hot_target, ids)`` batches.
        save_plots: When true (the default) the loss/accuracy curves are
            written to 'loss.pdf' and 'Acc.pdf' after training.

    Returns:
        ``model`` with the weights of the epoch that reached the highest
        validation accuracy (or the initial weights if none exceeded 0).
    """
    # Use the device the model already lives on instead of a module-level
    # global, so inputs, targets and checkpoint all end up in the same place.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device('cpu')

    checkpoint_path = 'classification_tained_model.pth'
    if os.path.isfile(checkpoint_path):
        # map_location lets a checkpoint saved on GPU load on a CPU-only host.
        checkpoint = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint)

    since = time.time()

    history = {'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': []}
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        for phase in ['train', 'val']:
            training = phase == 'train'

            # Metrics are averaged over the samples actually processed, which
            # stays correct when the loader drops its last partial batch.
            epoch_loss, epoch_acc = _run_phase(
                model, criterion, optimizer, dataloaders[phase], device, training)

            if training:
                # Step the schedule *after* the epoch's optimizer updates;
                # stepping first would skip the initial learning rate.
                scheduler.step()

            history[phase + '_loss'].append(epoch_loss)
            history[phase + '_acc'].append(epoch_acc)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # With zero epochs there is nothing to plot (pandas raises on empty data).
    if save_plots and num_epochs > 0:
        _save_training_curves(history)

    model.load_state_dict(best_model_wts)

    return model

In [6]:
class PDF(object):
    """Notebook display wrapper that embeds a PDF file in the rendered output.

    Args:
        pdf: Path or URL of the PDF file.
        size: ``(width, height)`` of the HTML iframe.
    """

    def __init__(self, pdf, size=(200, 200)):
        self.pdf = pdf
        self.size = size

    def _repr_html_(self):
        """Return the iframe markup used by HTML front ends."""
        return '<iframe src={0} width={1[0]} height={1[1]}></iframe>'.format(self.pdf, self.size)

    def _repr_latex_(self):
        """Return the ``\\includegraphics`` command used for LaTeX export."""
        return r'\includegraphics[width=1.0\textwidth]{{{0}}}'.format(self.pdf)

# Training/Validation loss

In [7]:
# Display the saved loss curves inline (last expression of the cell).
PDF(pdf='loss.pdf', size=(1000, 800))

# Training/Validation Accuracy

In [11]:
# Display the saved accuracy curves inline (last expression of the cell).
PDF(pdf='Acc.pdf', size=(1000, 800))

# Conclusion

- The gap between training and validation loss shows that the model is overfitting
- Adding regularization or increasing the dropout rate can reduce the gap between training and validation loss (time consuming)


# Training Illustration

## Loading Dataframes and dataloaders

In [13]:
BATCH_SIZE = 128

temp_feature_df = pd.read_csv('data/Temporal_Features.csv')
audio_df = pd.read_csv('data/Audio_Features.csv')
audio_df = audio_df.iloc[:, 1:]
info_df = pd.read_csv('data/trackinfo.csv').iloc[:, 1:]

# Shuffle a *copy* of the IDs.  Shuffling ``.values`` in place can reorder the
# DataFrame's own 'trackID' column (detaching IDs from their feature rows) and
# fails outright when pandas hands back a read-only array.
track_ids = np.random.permutation(audio_df['trackID'].values)
len_ids = len(track_ids)

# 60 % train / 30 % validation / 10 % test.  Slicing assigns every ID to
# exactly one split; strict </> masks would drop the two boundary elements.
train_end = int(0.6 * len_ids)
val_end = int(0.9 * len_ids)
train_ids = track_ids[:train_end]
val_ids = track_ids[train_end:val_end]
test_ids = track_ids[val_end:]

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
train_dataset = AudioDatasetClassification(train_ids)
val_dataset = AudioDatasetClassification(val_ids)
test_dataset = AudioDatasetClassification(test_ids)

dataloaders = {'train': DataLoader(train_dataset, batch_size=BATCH_SIZE, drop_last=True),
               'val': DataLoader(val_dataset, batch_size=BATCH_SIZE, drop_last=True),
               'test': DataLoader(test_dataset, batch_size=1, drop_last=True, shuffle=False)}

dataset_sizes = {'train': len(train_dataset),
                 'val': len(val_dataset),
                 'test': len(test_dataset)}


## Instantiating model, loss criterion, optimizer and learning rate scheduler

In [19]:
# Network on the selected device, loss, SGD-with-momentum optimizer, and a
# step scheduler that scales the learning rate by 0.1 every 7 scheduler steps.
model = SequenceClassification(batch_size=BATCH_SIZE, hidden_size=256)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(
    optimizer,
    step_size=7,
    gamma=0.1,
)

## Training for a few epochs

In [None]:
# Train for 20 epochs; the returned model carries the best validation weights.
trained_model = train_model(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=exp_lr_scheduler,
    num_epochs=20,
    dataloaders=dataloaders,
)

Epoch 0/19
----------


## Saving Model Weights

In [None]:
# Persist the trained weights; train_model looks for this exact file name
# when it starts, so the spelling of the path must stay as it is.
torch.save(
    obj=trained_model.state_dict(),
    f='classification_tained_model.pth',
)

## Inference on test-set and saving predictions

In [None]:
# Run inference on the test loader and write the predictions without the
# DataFrame index.
preds = predict_classification(
    trained_model,
    dataloaders['test'],
)
preds.to_csv(path_or_buf='classification_prediction.csv', index=False)