# Deep Music Genre Classification

In [444]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Lets first download our dataset

In [445]:
import pandas as pd

url = "https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/tcc_ceds_music.csv"
df = pd.read_csv(url)

Let's check out a sample row

In [446]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711,sadness,1.0


Here, we can see the different genres represented and their associated frequencies

In [447]:
df.groupby("genre").size()

genre
blues      4604
country    5445
hip hop     904
jazz       3845
pop        7042
reggae     2498
rock       4034
dtype: int64

Lets make sure to encode each of these genres:

In [448]:
genres = {
    "blues"     :  0,
    "country"   :  1,
    "hip hop"   :  2,
    "jazz"      :  3,
    "pop"       :  4,
    "reggae"    :  5,
    "rock"      :  6,
}

df = df[df["genre"].apply(lambda x: x in genres.keys())]

In [449]:
df["genre"] = df["genre"].apply(genres.get)

Now, we can wrap our Pandas dataframe as a Torch dataset. 

In [450]:
from torch.utils.data import Dataset, DataLoader

class TextDataFromDF(Dataset):
    def __init__(self, df):
        self.df = df
    
    def __getitem__(self, index):
        return self.df.iloc[index, 5], self.df.iloc[index, 4]

    def __len__(self):
        return len(self.df) 

Now, we can perform a train/validation split and make datasets from each one

In [451]:
from sklearn.model_selection import train_test_split

df_train, df_val = train_test_split(df,shuffle = True, test_size = 0.2)

Now, we have a training data set with two columns:  

1. Lyrics
2. Genre  

Lets see an example

In [452]:
train_data = TextDataFromDF(df_train)
val_data   = TextDataFromDF(df_val)
train_data[100]

('somebody better shake somebody better turn head scratch like wild spit grind venom run gonna wild snakebite snakebite lover hide baby understand snakebite drag snakebite snakebite gonna gonna touch rebel skin break like matchstick baby kind mood face tattoo shoulder scratch bike yeah venom runnin gonna run scar right',
 0)

Now, we are ready to build a vocabulary. First, we need to split each sentence into individual words. To do this, we will use a tokenizer.

In [453]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')

# Example
tokenized = tokenizer(train_data[194][0])


Next, we will write a function to get tokens from the lyrics column

In [454]:
def yield_tokens(data_iter):
    for text, _ in data_iter:
        yield tokenizer(text)

# Make vocab

vocab = build_vocab_from_iterator(yield_tokens(train_data), specials=["<unk>"], min_freq = 10)
vocab.set_default_index(vocab["<unk>"])

Each word will have a unique mapping to an integer in the vocabulary. Here is the start of our vocab:

In [455]:
vocab.get_itos()[0:10]

['<unk>',
 'know',
 'like',
 'time',
 'come',
 'go',
 'feel',
 'away',
 'heart',
 'yeah']

##### Batch collation

Here, we will develop the steps necessary to pass a batch of data to our training loop. Here are the steps:  

1. Pull feature data (ex. batch of lyrics)
2. Represent each lyrics as sequence of integers from vocab
3. Pad the lyrics with unused integer index to keep length consistent
4. Return the batch of lyrics as consolidated tensor

In [456]:
# Max length for lyrics
max_len = 50

# Count total number of tokens in vocab
num_tokens = len(vocab.get_itos())

# Make pipeline function
def text_pipeline(x):

    # First, we will make tokens for each word in lyrics
    tokens = vocab(tokenizer(x))
    
    # Here, we will make a torch dataset with all 0's
    # The length will be of size max_len
    # We will add num_tokens to each value
    y = torch.zeros(max_len, dtype=torch.int64) + num_tokens
    
    # If tokens > max tokens allowed, subset
    if len(tokens) > max_len:
        tokens = tokens[0:max_len]
    
    # Fix y to be the correct value for each token
    # If there are not enough tokens, 
    # they will be represented by num_tokens
    y[0:len(tokens)] = torch.tensor(tokens,dtype=torch.int64)
    return y

# Here, we write a simple function to convert 
# our label to integers instead of strings
label_pipeline = lambda x: int(x)

Lets test our our function with simple lyrics

In [457]:
text_pipeline("Apple Banana Carrot Tomato")

tensor([1453, 2486,    0, 7529, 8516, 8516, 8516, 8516, 8516, 8516, 8516, 8516,
        8516, 8516, 8516, 8516, 8516, 8516, 8516, 8516, 8516, 8516, 8516, 8516,
        8516, 8516, 8516, 8516, 8516, 8516, 8516, 8516, 8516, 8516, 8516, 8516,
        8516, 8516, 8516, 8516, 8516, 8516, 8516, 8516, 8516, 8516, 8516, 8516,
        8516, 8516])

As we can see, the first 4 words are found in the vocabulary and represented as their corresponding integers. The function sucessfully pads the remaining values.

In [458]:
def collate_batch(batch):
    label_list, text_list = [], []
    for (_text, _label) in batch:

        # add label to list
         label_list.append(label_pipeline(_label))

         # add text (as sequence of integers) to list
         processed_text = text_pipeline(_text)
         text_list.append(processed_text)

    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.stack(text_list)
    return label_list.to(device), text_list.to(device)

In [459]:
train_loader = DataLoader(train_data, batch_size=8, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_data, batch_size=8, shuffle=True, collate_fn=collate_batch)

Now, lets take a look at our batch of data now:  

The first element is a list of the labels, and the second is the concatenated sequence of integers representing 8 song lyrics.

In [460]:
#next(iter(train_loader))

1. How do the embeddings work?
2. Go over model - forward steps
3. After we flatten, the tensor is 8x150, and the words leave their arrays - how does this make sense? Will they regain significance in the next loop?

In [461]:
from torch import nn

class TextClassificationModel(nn.Module):
    
    def __init__(self,vocab_size, embedding_dim, max_len, num_class):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size+1, embedding_dim)
        self.dropout = nn.Dropout(p=0.2)
        self.fc   = nn.Linear(max_len*embedding_dim, 50)
        self.fc2   = nn.Linear(max_len*embedding_dim, num_class)
        
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.dropout(x)
        x = torch.flatten(x, 1)
        # Insert mean
        x = self.fc(x)
        return(x)

Now, we will create a word embedding to relate words together:

In [462]:
vocab_size = len(vocab)
embedding_dim = 20
model = TextClassificationModel(vocab_size, embedding_dim, max_len, 8).to(device)

Before we run our model, lets find a base rate so we know if our model learns:

In [463]:
class_counts = df['genre'].value_counts()
most_common_class = class_counts.idxmax()
base_rate = (class_counts[most_common_class] / len(df)) * 100

print(f"Base rate: {base_rate:.2f}%")

Base rate: 24.82%


In [464]:
import time

optimizer = torch.optim.Adam(model.parameters(), lr=.1)
loss_fn = torch.nn.CrossEntropyLoss()

def train(dataloader):
    epoch_start_time = time.time()
    # keep track of some counts for measuring accuracy
    total_acc, total_count = 0, 0
    log_interval = 300
    start_time = time.time()

    for idx, (label, text) in enumerate(dataloader):
        # zero gradients
        optimizer.zero_grad()
        # form prediction on batch
        predicted_label = model(text)
        # evaluate loss on prediction
        loss = loss_fn(predicted_label, label)
        # compute gradient
        loss.backward()
        # take an optimization step
        optimizer.step()

        # for printing accuracy
        total_acc   += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        
    print(f'| epoch {epoch:3d} | train accuracy {total_acc/total_count:8.3f} | time: {time.time() - epoch_start_time:5.2f}s')
    # print('| end of epoch {:3d} | time: {:5.2f}s | '.format(epoch,
    #                                        time.time() - epoch_start_time))
    
def evaluate(dataloader):

    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text) in enumerate(dataloader):
            predicted_label = model(text)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [465]:
#%pip install torchinfo

from torchinfo import summary

INPUT_SHAPE = (1,max_len)
summary(model, INPUT_SHAPE, dtypes=[torch.long])

Layer (type:depth-idx)                   Output Shape              Param #
TextClassificationModel                  [1, 50]                   8,008
├─Embedding: 1-1                         [1, 50, 20]               170,340
├─Dropout: 1-2                           [1, 50, 20]               --
├─Linear: 1-3                            [1, 50]                   50,050
Total params: 228,398
Trainable params: 228,398
Non-trainable params: 0
Total mult-adds (M): 0.22
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 0.88
Estimated Total Size (MB): 0.89

In [None]:
EPOCHS = 50
for epoch in range(1, EPOCHS + 1):
    train(train_loader)

| epoch   1 | train accuracy    0.195 | time: 14.40s
| epoch   2 | train accuracy    0.264 | time: 17.16s
| epoch   3 | train accuracy    0.324 | time: 16.85s
| epoch   4 | train accuracy    0.377 | time: 17.22s
| epoch   5 | train accuracy    0.428 | time: 18.19s
| epoch   6 | train accuracy    0.462 | time: 19.10s
| epoch   7 | train accuracy    0.501 | time: 18.19s
| epoch   8 | train accuracy    0.546 | time: 19.03s
| epoch   9 | train accuracy    0.571 | time: 18.67s
| epoch  10 | train accuracy    0.590 | time: 19.10s
| epoch  11 | train accuracy    0.612 | time: 19.05s
| epoch  12 | train accuracy    0.631 | time: 23.60s
| epoch  13 | train accuracy    0.657 | time: 19.77s
| epoch  14 | train accuracy    0.671 | time: 19.03s
| epoch  15 | train accuracy    0.688 | time: 19.22s
| epoch  16 | train accuracy    0.697 | time: 18.94s
| epoch  17 | train accuracy    0.713 | time: 19.28s
| epoch  18 | train accuracy    0.720 | time: 19.19s
| epoch  19 | train accuracy    0.731 | time: 