### Dependencies and imports

In [None]:
# Colab-only step: mount Google Drive at /content/drive so files stored on Drive
# are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# !pip install -U sentence-transformers
!pip install -U transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 5.1 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 77.4 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 83.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [None]:
import os
import torch
from torch import nn
import torch.optim as optim

from torch.utils.data import DataLoader
from torchvision import datasets, transforms
# from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer, BertModel

import pandas as pd
import numpy as np
import tqdm, glob

from torch.utils.data import Dataset, DataLoader
from torchvision.models.feature_extraction import create_feature_extractor

import random


In [None]:
# Switch the working directory to the shared project folder on Drive so the
# project-local modules below can be imported and the relative paths used later
# in the notebook (*.npy, *.pth) resolve against it.
%cd '/content/drive/MyDrive/multi-modal-music-genre-classification/'
from CNN import CNN
from utils import load_data

/content/drive/.shortcut-targets-by-id/199GHGZBtQOuthTQ8c7i5neHpvqm7epIw/multi-modal-music-genre-classification


In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


### Lyric Module

In [None]:
class LyricModule(nn.Module):
  """Wrapper that turns tokenized lyrics into embeddings via a pretrained encoder.

  The wrapped model is called with ``input_ids`` and ``attention_mask`` keyword
  arguments and must return an object exposing ``last_hidden_state`` and
  ``pooler_output`` (the interface of a Hugging Face ``BertModel``).
  """

  def __init__(self, pretrained_lyric_model):
    """Store the pretrained encoder and advertise its embedding width.

    Args:
      pretrained_lyric_model: encoder module used to embed the lyric tokens.
    """
    super().__init__()

    # Width of the pooled embedding handed to downstream layers. Hard-coded to 768
    # (the hidden size of BERT-base models) rather than read from the encoder.
    self.embedding_dim = 768
    self.pretrained = pretrained_lyric_model

  def forward(self, input_ids, attention_mask):
    """Encode a batch of tokenized lyrics.

    Args:
      input_ids: token ids, shape (batch_size, num_tokens).
      attention_mask: padding mask, shape (batch_size, num_tokens).

    Returns:
      Tuple ``(embeddings, pooled)`` where ``embeddings`` has shape
      (batch_size, num_tokens, embedding_dim) and ``pooled`` has shape
      (batch_size, embedding_dim).
    """
    encoded = self.pretrained(input_ids=input_ids, attention_mask=attention_mask)
    return encoded.last_hidden_state, encoded.pooler_output

### Genre Classifier Module

In [None]:
class GenreClassifier(nn.Module):
    """Late-fusion genre classifier over lyric and spectrogram embeddings.

    The pooled lyric embedding is reduced to 16 features by a small MLP, the
    spectrogram embedding is passed through a ReLU, and the two are concatenated
    and mapped to ``num_classes`` raw scores (logits) by a second MLP.

    Args:
        spectrogram_model: module called as ``spectrogram_model(x)`` that returns a
            mapping containing the key ``'linear1'`` with a
            (batch_size, spectrogram_embedding_dim) tensor.
        lyric_model: module exposing ``embedding_dim`` and returning
            ``(token_embeddings, pooled)`` from ``lyric_model(input_ids, attention_mask)``.
        cma_model: accepted for interface compatibility only; it is neither stored
            nor used by this concatenation-based classifier.
        num_classes: number of output classes.
        spectrogram_embedding_dim: width of the ``'linear1'`` spectrogram embedding.
            Defaults to 64, the size that was previously hard-coded.
    """

    def __init__(
        self,
        spectrogram_model,
        lyric_model,
        cma_model,
        num_classes,
        spectrogram_embedding_dim=64,
        ):
        super(GenreClassifier, self).__init__()

        self.lyric_model = lyric_model
        self.spectrogram_model = spectrogram_model

        self.num_classes = num_classes
        self.spectrogram_embedding_dim = spectrogram_embedding_dim

        # NOTE: the position of every layer inside each nn.Sequential determines its
        # state_dict key (e.g. "lyric_linear_layers.1.weight"); keep the order stable
        # so previously saved checkpoints continue to load.
        self.lyric_linear_layers = nn.Sequential(
            nn.ReLU(),
            nn.Linear(self.lyric_model.embedding_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 16),
            nn.ReLU(),
            )
        # The spectrogram embedding only gets a non-linearity before fusion.
        self.cnn_linear_layers = nn.Sequential(
            nn.ReLU(),
        )
        # Fusion head: [spectrogram embedding | 16 lyric features] -> class scores.
        self.concat_linear_layers = nn.Sequential(
            nn.Linear(spectrogram_embedding_dim + 16, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, self.num_classes)
        )

    def forward(self, input_ids, attention_mask, spectrograms):
        """Compute class scores for a batch of (lyrics, spectrogram) pairs.

        Args:
            input_ids: token ids, shape (batch_size, num_tokens).
            attention_mask: padding mask, shape (batch_size, num_tokens).
            spectrograms: batch of spectrograms; cast to float32 before being
                passed to the spectrogram model.

        Returns:
            Tensor of shape (batch_size, num_classes) with unnormalized scores.
        """
        # Only the pooled sentence-level embedding is used; token embeddings are ignored.
        _, pooled = self.lyric_model(input_ids, attention_mask)
        spectrogram_embeddings = self.spectrogram_model(spectrograms.float())['linear1']

        fused = torch.cat(
            (
                self.cnn_linear_layers(spectrogram_embeddings),
                self.lyric_linear_layers(pooled),
            ),
            1,
        )
        return self.concat_linear_layers(fused)

## Training

### Load data

In [None]:
# Mini-batch size passed to the train and validation DataLoaders.
BATCH_SIZE = 64

In [None]:
# Lyric encoder: pretrained BERT-base (uncased), switched to eval mode right away.
# (An earlier variant used SentenceTransformer('all-mpnet-base-v2') instead.)
pretrained_lyric_model = BertModel.from_pretrained('bert-base-uncased').eval()

# Freeze the encoder: none of its weights should receive gradient updates.
for param in pretrained_lyric_model.parameters():
  param.requires_grad = False

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Load the saved array of genre labels; its length defines the number of output classes.
genre_list = np.load("genre_list.npy")
# zipped_data, genre_list = load_data()
NUM_CLASSES = len(genre_list)
print(NUM_CLASSES)

5


In [None]:
# custom data loader

class CustomDataset(Dataset):
    """In-memory dataset pairing spectrograms with tokenized lyrics.

    For a given ``split`` prefix the constructor reads four NumPy files:
    ``{split}_specs.npy``, ``{split}_tokens.npy``, ``{split}_attention_masks.npy``
    and ``{split}_y.npy``. Each item is
    ``((spectrogram, (tokens, attention_mask)), label)``.
    """

    def __init__(self,split):
        """Load every array of the split into tensors and print their shapes.

        Args:
            split: filename prefix of the split (e.g. "train", "val", "test");
                may include a directory component.
        """
        print("Creating custom dataset ...")

        # File locations, all derived from the split prefix.
        (
            self.specs_path,
            self.tokens_path,
            self.attention_mask_path,
            self.y_path,
        ) = (f"{split}_{part}.npy" for part in ("specs", "tokens", "attention_masks", "y"))

        # NOTE(review): allow_pickle=True executes pickled content on load; only
        # use it with spectrogram files from a trusted source.
        spec_array = np.stack(np.load(self.specs_path, allow_pickle=True))
        # Insert a singleton dimension at position 1 (after the sample dimension).
        self.melspec = torch.tensor(spec_array).unsqueeze(1)
        self.tokens = torch.tensor(np.load(self.tokens_path))
        self.attention_mask = torch.tensor(np.load(self.attention_mask_path))
        self.target = torch.tensor(np.load(self.y_path))

        for caption, loaded in (
            ("melspec shape:", self.melspec),
            ("tokens shape:", self.tokens),
            ("attention mask shape:", self.attention_mask),
            ("target shape:", self.target),
        ):
            print(caption, loaded.shape)

    def __getitem__(self, index):
        """Return ``((spectrogram, (tokens, attention_mask)), label)`` for ``index``."""
        spectrogram = self.melspec[index]
        lyric_inputs = (self.tokens[index], self.attention_mask[index])
        return (spectrogram, lyric_inputs), self.target[index]

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.target)

In [None]:
#### Build the train / validation / test datasets and their loaders.
# Each split is read by CustomDataset from its "<split>_*.npy" files.
trainDataset = CustomDataset("train")
valDataset = CustomDataset("val")
testDataset = CustomDataset("test")

# Reshuffle the training samples every epoch so mini-batches differ between
# epochs; validation and test keep their stored order.
trainLoader = DataLoader(trainDataset, batch_size=BATCH_SIZE, shuffle=True)
valLoader = DataLoader(valDataset, batch_size=BATCH_SIZE)
# No batch_size given: the test loader yields one sample per step (DataLoader default).
testLoader = DataLoader(testDataset)


Creating custom dataset ...
melspec shape: torch.Size([6896, 1, 128, 431])
tokens shape: torch.Size([6896, 512])
attention mask shape: torch.Size([6896, 512])
target shape: torch.Size([6896])
Creating custom dataset ...
melspec shape: torch.Size([862, 1, 128, 431])
tokens shape: torch.Size([862, 512])
attention mask shape: torch.Size([862, 512])
target shape: torch.Size([862])
Creating custom dataset ...
melspec shape: torch.Size([863, 1, 128, 431])
tokens shape: torch.Size([863, 512])
attention mask shape: torch.Size([863, 512])
target shape: torch.Size([863])


In [None]:
# Remove the notebook-level dataset names. The DataLoaders created earlier were
# constructed from these same objects, so this presumably only drops the names
# rather than freeing the underlying tensors — TODO confirm.
del(trainDataset)
del(valDataset)
del(testDataset)
# del(zipped_data)
# del(trainLoader)
# del(testLoader)
# del(valLoader)
# del(model)

In [None]:
# Force a garbage-collection pass after the deletions above; the cell output is
# the value returned by gc.collect().
import gc
gc.collect()

121

### Define model

In [None]:
# Rebuild the spectrogram CNN and restore its trained weights.
# map_location remaps the stored tensors onto the selected device, so a checkpoint
# saved from a GPU run also loads on a CPU-only runtime.
pretrained_cnn_model = CNN(num_classes = NUM_CLASSES).to(device)
pretrained_cnn_model.load_state_dict(
    torch.load('CNN_pytorch_training/saved_model.pth', map_location=device)
)

<All keys matched successfully>

In [None]:
# Lyric branch: wrap the pretrained lyric encoder.
lyric_model = LyricModule(pretrained_lyric_model)

# Spectrogram branch: expose the CNN's 'linear1' node as the embedding output and
# freeze every weight of the resulting extractor.
spectrogram_model = create_feature_extractor(pretrained_cnn_model, return_nodes=['linear1'])
spectrogram_model.requires_grad_(False)

# Placeholder: no CMA module is built; None is passed to fill the constructor argument.
cma_model = None

model = GenreClassifier(spectrogram_model, lyric_model, cma_model, NUM_CLASSES).float().to(device)

In [None]:
# List every parameter together with its trainable flag to verify what is frozen.
# The flag is read from the parameter itself: indexing into the tensor first
# (value[1]) raises IndexError for parameters whose first dimension has length 1.
for name, param in model.named_parameters():
  print(name, param.requires_grad)

lyric_model.pretrained.embeddings.word_embeddings.weight False
lyric_model.pretrained.embeddings.position_embeddings.weight False
lyric_model.pretrained.embeddings.token_type_embeddings.weight False
lyric_model.pretrained.embeddings.LayerNorm.weight False
lyric_model.pretrained.embeddings.LayerNorm.bias False
lyric_model.pretrained.encoder.layer.0.attention.self.query.weight False
lyric_model.pretrained.encoder.layer.0.attention.self.query.bias False
lyric_model.pretrained.encoder.layer.0.attention.self.key.weight False
lyric_model.pretrained.encoder.layer.0.attention.self.key.bias False
lyric_model.pretrained.encoder.layer.0.attention.self.value.weight False
lyric_model.pretrained.encoder.layer.0.attention.self.value.bias False
lyric_model.pretrained.encoder.layer.0.attention.output.dense.weight False
lyric_model.pretrained.encoder.layer.0.attention.output.dense.bias False
lyric_model.pretrained.encoder.layer.0.attention.output.LayerNorm.weight False
lyric_model.pretrained.encoder.lay

### Hyperparameters

In [None]:
# Step size handed to the Adam optimizer.
learning_rate = 1e-3
# Number of full passes over the training loader.
epochs = 10

### Loss function and optimizer

In [None]:
# Cross-entropy over the classifier's raw class scores.
criterion = nn.CrossEntropyLoss()
# Adam is bound to the parameters of the `model` object that exists when this cell runs.
# NOTE(review): if `model` is re-created afterwards, the optimizer has to be re-created
# as well, otherwise it keeps pointing at the old model's tensors.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

### Training Loop

In [None]:
import tqdm

In [None]:
# Rebuild the classifier so training can resume from the last saved checkpoint.
model = GenreClassifier(spectrogram_model, lyric_model, cma_model, NUM_CLASSES).float().to(device)

# Re-create the optimizer for THIS model instance. An optimizer built earlier holds
# the parameters of the previous `model` object, so its step() would never touch the
# freshly created layers (the symptom is a validation loss that is identical in
# every epoch).
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Restore the weights in place (the optimizer keeps referencing the same tensors).
# map_location lets a GPU-saved checkpoint load on a CPU-only runtime too.
model.load_state_dict(torch.load('model_concat_trial1.pth', map_location=device))

<All keys matched successfully>

In [None]:
# Train for `epochs` passes over the training loader; after each pass evaluate on
# the validation loader and report the mean per-batch loss of both.
for epoch in range(epochs):

    ############## train loop
    model.train()
    # The two pretrained encoders are frozen feature extractors. model.train() flips
    # them back to train mode, so put them in eval mode again to keep their
    # train-only layers (e.g. dropout) switched off while the new layers train.
    model.lyric_model.eval()
    model.spectrogram_model.eval()

    train_loss = 0.0
    # Iterating the loader directly lets tqdm show the total number of batches.
    for X, y in tqdm.notebook.tqdm(trainLoader):
        specs = X[0]
        input_ids, attention_mask = X[1]

        optimizer.zero_grad()
        pred = model(input_ids.to(device), attention_mask.to(device), specs.to(device))

        # calculate loss
        loss = criterion(pred, y.to(device))
        # credit assignment
        loss.backward()
        # update model weights
        optimizer.step()

        train_loss += loss.item()

    ############## validation loop
    model.eval()
    valid_loss = 0.0
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for X, y in tqdm.notebook.tqdm(valLoader):
            specs = X[0]
            input_ids, attention_mask = X[1]
            pred = model(input_ids.to(device), attention_mask.to(device), specs.to(device))

            # calculate loss
            loss = criterion(pred, y.to(device))
            valid_loss += loss.item()

    print(f'Epoch {epoch+1} \t\t Training Loss: {train_loss / len(trainLoader)} \t\t Validation Loss: {valid_loss / len(valLoader)}')

    # TODO: track the best validation loss and save model.state_dict() whenever it improves.

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 1 		 Training Loss: 1.1564701067076788 		 Validation Loss: 1.126680122954505


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 2 		 Training Loss: 1.1513830191559262 		 Validation Loss: 1.126680122954505


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 3 		 Training Loss: 1.1543089134825602 		 Validation Loss: 1.126680122954505


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 4 		 Training Loss: 1.1568427803339782 		 Validation Loss: 1.126680122954505


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 5 		 Training Loss: 1.1531141781144671 		 Validation Loss: 1.126680122954505


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 6 		 Training Loss: 1.15808361823912 		 Validation Loss: 1.126680122954505


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 7 		 Training Loss: 1.1568266699711482 		 Validation Loss: 1.126680122954505


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 8 		 Training Loss: 1.1588512201000143 		 Validation Loss: 1.126680122954505


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 9 		 Training Loss: 1.147485015017015 		 Validation Loss: 1.126680122954505


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 10 		 Training Loss: 1.1590603882515873 		 Validation Loss: 1.126680122954505


In [None]:
############## test loop
# Evaluate on the held-out test loader: accuracy and mean per-sample loss.
model.eval()  # make sure train-only layers are disabled during evaluation
test_loss = 0.0
total = 0
num_correct = 0

# loop over all batches in test set (no gradients needed for evaluation)
with torch.no_grad():
    for X, y in tqdm.notebook.tqdm(testLoader):
        specs = X[0]
        input_ids, attention_mask = X[1]
        targets = y.to(device)

        pred = model(input_ids.to(device), attention_mask.to(device), specs.to(device))
        res = torch.argmax(pred, 1)

        # Element-wise comparison works for any batch size, not only batch_size=1.
        batch_size = targets.size(0)
        num_correct += (res == targets).sum().item()
        total += batch_size
        # criterion returns the batch mean; weight it so the final value is per sample.
        test_loss += criterion(pred, targets).item() * batch_size


print(f"Test Accuracy of the model: {float(num_correct)/float(total)*100:.2f}")
print(f"Test Loss of the model: {test_loss / total:.4f}")

0it [00:00, ?it/s]

Test Accuracy of the model: 53.65


In [None]:
# Show the current working directory, i.e. where the relative paths in this notebook resolve.
%pwd

'/content/drive/.shortcut-targets-by-id/199GHGZBtQOuthTQ8c7i5neHpvqm7epIw/multi-modal-music-genre-classification'

In [None]:
# Persist the classifier weights. NOTE: this is the same filename that is loaded
# before the training loop, so the previous checkpoint is overwritten.
torch.save(model.state_dict(), "model_concat_trial1.pth")

In [None]:
import sys
def sizeof_fmt(num, suffix='B'):
    """Format a byte count as a human-readable string with binary (1024) prefixes.

    Adapted from Fred Cirera, https://stackoverflow.com/a/1094933/1870254.

    Args:
        num: size to format (may be negative).
        suffix: unit suffix appended after the prefix, ``'B'`` by default.

    Returns:
        A string such as ``"1.5 KiB"``; values of 1024 ZiB and above are
        expressed in ``Yi`` units.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    position = 0
    while position < len(prefixes):
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, prefixes[position], suffix)
        # Too large for this prefix: scale down and try the next one.
        num = num / 1024.0
        position += 1
    return "%.1f %s%s" % (num, 'Yi', suffix)

# Print every name in the current namespace with its size, largest first.
# NOTE(review): sys.getsizeof reports only the size of the object itself, not of the
# data it references, so the figures are not a measure of total memory use.
for name, size in sorted(((name, sys.getsizeof(value)) for name, value in locals().items()),
                         key= lambda x: -x[1]):
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

                   zipped_data: 72.1 KiB
                         train: 53.9 KiB
                          test:  6.8 KiB
                           val:  6.8 KiB
                 BertTokenizer:  2.0 KiB
                           _i6:  1.8 KiB
                          _iii:  1.6 KiB
                           _i9:  1.6 KiB
                     BertModel:  1.4 KiB
                           CNN:  1.4 KiB
                    DataLoader:  1.0 KiB
                       Dataset:  1.0 KiB
               GenreClassifier:  1.0 KiB
                 CustomDataset:  1.0 KiB
                           _ii:  598.0 B
                          _i10:  598.0 B
                          _i12:  582.0 B
                            _i:  533.0 B
                          _i11:  533.0 B
                           _i3:  510.0 B
                           _oh:  248.0 B
                           Out:  248.0 B
                           _ih:  200.0 B
                            In:  200.0 B
                

### Embeddings visualization

In [None]:
# Build a fresh classifier and load the saved "model_concatenation_55.pth" weights
# for the embedding visualization.
# map_location lets a checkpoint saved from a GPU run load on a CPU-only runtime.
classifier = GenreClassifier(spectrogram_model, lyric_model, cma_model, NUM_CLASSES).float().to(device)
classifier.load_state_dict(torch.load("model_concatenation_55.pth", map_location=device))

1. Fine-tuning the CNN model --> validation loss increases