# Abstract

-- Enter Here --

# Data

In [1]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from torchinfo import summary

import pandas as pd
import numpy as np
import time

# for train-test split
from sklearn.model_selection import train_test_split

# for suppressing bugged warnings from torchinfo
import warnings
warnings.filterwarnings("ignore", category = UserWarning)

# tokenizers from HuggingFace
from transformers import BertTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


We are loading in a [Kaggle dataset](https://www.kaggle.com/datasets/saurabhshahane/music-dataset-1950-to-2019) that contains information about music made between the years 1950 and 2019 collected through Spotify. The dataset contains lyrics, artist info, track names, etc. Importantly it also includes music metadata like sadness, danceability, loudness, acousticness, etc.

In [2]:
url = "https://raw.githubusercontent.com/PhilChodrow/PIC16B/master/datasets/tcc_ceds_music.csv"
df = pd.read_csv(url)

Lets have a look at some of the raw data!

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711,sadness,1.0
1,4,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.331745,0.64754,0.954819,2e-06,0.325021,0.26324,world/life,1.0
2,6,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,...,0.00277,0.225422,0.456298,0.585288,0.840361,0.0,0.351814,0.139112,music,1.0
3,10,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,...,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.77535,0.743736,romantic,1.0
4,12,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...,48,0.00135,0.00135,0.417772,...,0.0688,0.00135,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,romantic,1.0


Here is a brief look at how many songs we have in each represented genre.

In [4]:
df.groupby("genre").size()

genre
blues      4604
country    5445
hip hop     904
jazz       3845
pop        7042
reggae     2498
rock       4034
dtype: int64

This is a pretty large number of songs to classify... and some genres I personally dont care for. So, to make the dataframe more manageable and applicable to me personally, we are going to narrow down to only observe reggae, hip hop, rock and jazz.

In [5]:
genres = {
    "hip hop"   : 0,
    "jazz" : 1,
    "reggae" : 2,
    "rock" : 3,
}

df = df[df["genre"].apply(lambda x: x in genres.keys())]
df.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
17091,54304,gene ammons,it's the talk of the town,1950,jazz,lovers sweethearts hard understand know happen...,61,0.001096,0.001096,0.001096,...,0.31957,0.001096,0.352323,0.620388,0.868474,0.23583,0.430132,0.28226,sadness,1.0
17092,54305,gene ammons,you go to my head,1950,jazz,head linger like haunt refrain spin round brai...,48,0.001754,0.340964,0.001754,...,0.001754,0.001754,0.3794,0.638541,0.90763,0.90081,0.22197,0.184159,violence,1.0
17093,54307,bud powell,yesterdays,1950,jazz,music speak start hear musicians like dizzy gi...,107,0.001144,0.001144,0.074762,...,0.001144,0.097082,0.489873,0.4674,0.992972,0.927126,0.334295,0.228204,music,1.0
17094,54311,tony bennett,stranger in paradise,1950,jazz,hand stranger paradise lose wonderland strange...,41,0.002105,0.180524,0.002105,...,0.527429,0.002105,0.179032,0.55947,0.983936,0.001781,0.086974,0.235211,sadness,1.0
17095,54313,dean martin,zing-a zing-a zing boom,1950,jazz,zinga zinga zinga zinga zinga zinga zinga zing...,160,0.001253,0.001253,0.001253,...,0.425721,0.001253,0.580851,0.687409,0.655622,0.0,0.936109,0.4184,sadness,1.0


In [6]:
df["genre"] = df["genre"].apply(genres.get)
df

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
17091,54304,gene ammons,it's the talk of the town,1950,1,lovers sweethearts hard understand know happen...,61,0.001096,0.001096,0.001096,...,0.319570,0.001096,0.352323,0.620388,0.868474,0.235830,0.430132,0.282260,sadness,1.000000
17092,54305,gene ammons,you go to my head,1950,1,head linger like haunt refrain spin round brai...,48,0.001754,0.340964,0.001754,...,0.001754,0.001754,0.379400,0.638541,0.907630,0.900810,0.221970,0.184159,violence,1.000000
17093,54307,bud powell,yesterdays,1950,1,music speak start hear musicians like dizzy gi...,107,0.001144,0.001144,0.074762,...,0.001144,0.097082,0.489873,0.467400,0.992972,0.927126,0.334295,0.228204,music,1.000000
17094,54311,tony bennett,stranger in paradise,1950,1,hand stranger paradise lose wonderland strange...,41,0.002105,0.180524,0.002105,...,0.527429,0.002105,0.179032,0.559470,0.983936,0.001781,0.086974,0.235211,sadness,1.000000
17095,54313,dean martin,zing-a zing-a zing boom,1950,1,zinga zinga zinga zinga zinga zinga zinga zing...,160,0.001253,0.001253,0.001253,...,0.425721,0.001253,0.580851,0.687409,0.655622,0.000000,0.936109,0.418400,sadness,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28367,82447,mack 10,10 million ways,2019,0,cause fuck leave scar tick tock clock come kno...,78,0.001350,0.001350,0.001350,...,0.065664,0.001350,0.889527,0.759711,0.062549,0.000000,0.751649,0.695686,obscene,0.014286
28368,82448,m.o.p.,ante up (robbin hoodz theory),2019,0,minks things chain ring braclets yap fame come...,67,0.001284,0.001284,0.035338,...,0.001284,0.001284,0.662082,0.789580,0.004607,0.000002,0.922712,0.797791,obscene,0.014286
28369,82449,nine,whutcha want?,2019,0,get ban get ban stick crack relax plan attack ...,77,0.001504,0.154302,0.168988,...,0.001504,0.001504,0.663165,0.726970,0.104417,0.000001,0.838211,0.767761,obscene,0.014286
28370,82450,will smith,switch,2019,0,check check yeah yeah hear thing call switch g...,67,0.001196,0.001196,0.001196,...,0.001196,0.001196,0.883028,0.786888,0.007027,0.000503,0.508450,0.885882,obscene,0.014286


The base rate on our classification is the proportion of the data set occupied by the largest label class:

In [7]:
df.groupby("genre").size() / len(df)

genre
0    0.080135
1    0.340839
2    0.221434
3    0.357592
dtype: float64

If we always guessed category 3, then we would expect an accuracy of roughly 36%. So, our task will be to see whether we can train a model to beat this. 

As we try to predict the genre of the track, we will use lyrics alongside some other engineered features (metadata) that we define below.

In [8]:
engineered_features = ['dating', 'violence', 'world/life', 'night/time','shake the audience','family/gospel', 'romantic', 'communication','obscene', 'music', 'movement/places', 'light/visual perceptions','family/spiritual', 'like/girls', 'sadness', 'feelings', 'danceability','loudness', 'acousticness', 'instrumentalness', 'valence', 'energy']      

Our models will only need these engineered features, lyrics, and our target value which will be *genre* so we can throw them all into the same dataframe and use slicing to access different parts later.

In [9]:
df_clean= df[engineered_features + ['lyrics', 'genre']].copy()
df_clean.head()

Unnamed: 0,dating,violence,world/life,night/time,shake the audience,family/gospel,romantic,communication,obscene,music,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,lyrics,genre
17091,0.001096,0.001096,0.001096,0.001096,0.036316,0.001096,0.001096,0.460773,0.086498,0.001096,...,0.31957,0.001096,0.352323,0.620388,0.868474,0.23583,0.430132,0.28226,lovers sweethearts hard understand know happen...,1
17092,0.001754,0.340964,0.001754,0.001754,0.001754,0.001754,0.131872,0.001754,0.001754,0.001754,...,0.001754,0.001754,0.3794,0.638541,0.90763,0.90081,0.22197,0.184159,head linger like haunt refrain spin round brai...,1
17093,0.001144,0.001144,0.074762,0.046173,0.001144,0.018789,0.001144,0.001655,0.001144,0.421734,...,0.001144,0.097082,0.489873,0.4674,0.992972,0.927126,0.334295,0.228204,music speak start hear musicians like dizzy gi...,1
17094,0.002105,0.180524,0.002105,0.002105,0.002105,0.002105,0.002105,0.201965,0.002105,0.002105,...,0.527429,0.002105,0.179032,0.55947,0.983936,0.001781,0.086974,0.235211,hand stranger paradise lose wonderland strange...,1
17095,0.001253,0.001253,0.001253,0.001253,0.001253,0.081126,0.001253,0.111951,0.001253,0.268737,...,0.425721,0.001253,0.580851,0.687409,0.655622,0.0,0.936109,0.4184,zinga zinga zinga zinga zinga zinga zinga zing...,1


Finally, we will perform a train-validation split to later evaluate our data

In [10]:
df_train, df_val = train_test_split(df_clean,shuffle = True, test_size = 0.2)

# Text Vectorization

We now need to *vectorize* the lyrics. We’re going to use **tokenization** to break up the lyrics into a sequence of tokens, and then vectorize that sequence.

We will be using a tokenizer imported from HuggingFace.

In [11]:
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")

For our purposes it’s more convenient to assign an *integer* to each token, which we can do like this:

In [12]:
encoded = tokenizer("I love reggae music!")
encoded

{'input_ids': [101, 1045, 2293, 15662, 2189, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

To do the reverse, we can use the `.decode` method of the tokenizer:

In [13]:
tokenizer.decode(encoded["input_ids"])

'[CLS] i love reggae music! [SEP]'

Here is some code to help us prepare our dataset with encodings. A lot of our lyrics are different lengths so we will pad the shorter ones with 0s and truncate others that are especially long. We will make use of the torch `Dataset` class to help manage our data.

In [26]:
max_len = 512 # BERT capacity

def preprocess(df, tokenizer, max_len):
    lyrics_tokens = tokenizer(list(df["lyrics"]), padding="max_length", truncation=True, max_length=max_len)["input_ids"]
    engineered = df[engineered_features].values.tolist()
    y = list(df["genre"])
    return lyrics_tokens, engineered, y

class TextDataFromDF(Dataset):
    def __init__(self, df):
        self.lyrics_tokens, self.engineered_feats, self.y = preprocess(df, tokenizer, max_len)

    def __getitem__(self, ix):
        return self.lyrics_tokens[ix], self.engineered_feats[ix], self.y[ix]

    def __len__(self):
        return len(self.y)

Lets make our encoded datasets!

In [27]:
train_data = TextDataFromDF(df_train)
val_data   = TextDataFromDF(df_val)

Here is what a single songs information looks like now:

In [29]:
X_tokens, X_feats, y = train_data[1]
print(X_tokens, X_feats)
print(y)

[101, 4389, 18981, 2063, 2991, 2051, 2298, 19415, 3153, 2166, 2113, 9560, 7318, 9015, 7392, 2298, 3819, 5853, 4333, 2420, 4653, 9467, 3480, 3233, 9015, 7392, 9015, 7392, 9015, 7392, 2173, 16019, 6457, 8489, 2514, 2590, 10667, 6542, 11891, 14380, 2015, 4756, 2754, 9015, 7392, 9015, 7392, 9015, 7392, 9015, 7392, 5223, 7392, 19978, 2567, 3046, 2378, 16382, 2425, 2689, 3971, 15138, 19000, 11703, 20175, 10975, 18908, 5562, 2681, 25933, 4371, 2812, 7655, 2341, 11429, 2611, 5684, 4403, 2147, 2814, 2175, 2627, 2203, 2228, 2147, 2814, 2175, 2627, 2203, 2228, 2147, 2814, 2175, 2627, 2203, 2228, 2147, 2814, 2175, 2627, 2203, 2228, 5223, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

We are going to be feeding data in in batches, so we will need a dataloader which necessitates a collate function to ensure our we are imputing tensors of the right size.

In [47]:
def collate(data):
    tokens = torch.tensor([d[0] for d in data], dtype=torch.long)
    engineered = torch.tensor([d[1] for d in data], dtype=torch.float)
    y = torch.tensor([d[2] for d in data], dtype=torch.long)
    return (tokens, engineered), y

train_loader = DataLoader(train_data, batch_size=8, shuffle=True, collate_fn = collate)
val_loader = DataLoader(val_data, batch_size=8, shuffle=True, collate_fn = collate)

Here is what a batch of data looks like. The predictor data is now a tensor in which the entries give token indices, padded with 0s and another tensor with the  values of our engineered features. For visualization purposes we’ll show only the first 2 rows:

In [48]:
X, y = next(iter(train_loader))
X[:2]

(tensor([[  101,  4284,  2627,  ...,     0,     0,     0],
         [  101, 21110,  2638,  ...,     0,     0,     0],
         [  101,  3147,  6271,  ...,     0,     0,     0],
         ...,
         [  101,  2191,  3066,  ...,     0,     0,     0],
         [  101,  6428,  3114,  ...,     0,     0,     0],
         [  101,  4086,  2189,  ...,     0,     0,     0]]),
 tensor([[6.0496e-04, 1.3729e-01, 3.0337e-01, 6.0496e-04, 6.0496e-04, 6.0496e-04,
          6.0496e-04, 6.0496e-04, 1.4679e-01, 6.0496e-04, 1.3474e-01, 8.5468e-02,
          4.1353e-02, 6.0496e-04, 6.0496e-04, 8.4843e-02, 5.8410e-01, 5.1381e-01,
          2.5100e-01, 3.4818e-01, 6.2490e-01, 4.2841e-01],
         [7.5188e-03, 3.2063e-01, 7.5188e-03, 4.2682e-01, 7.5188e-03, 7.5188e-03,
          7.5188e-03, 7.5188e-03, 7.5188e-03, 7.5188e-03, 7.5188e-03, 7.5188e-03,
          1.3225e-01, 7.5188e-03, 7.5188e-03, 7.5188e-03, 1.7687e-01, 6.4375e-01,
          7.2088e-01, 4.5850e-03, 9.3157e-02, 3.4332e-01],
         [2.2838e-02

In [49]:
y[:2]

tensor([1, 1])

# Model Building 

We are going to train **three** neural networks to classify our genres.

- Using Lyrics to Classify
- Using Engineered Features (Metadata) to Classify
- Using Lyrics and Metadata to Classify

Lets build a model for classifying genres based on lyrics first.

In [91]:
class TextClassificationModel(nn.Module):

    def __init__(self,vocab_size, embedding_dim, max_len, num_class):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size+1, embedding_dim)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(embedding_dim, num_class) # max_len*embedding_dim
        

    def forward(self, x):
        x = self.embedding(x)
        x = self.dropout(x)
        x = x.mean(axis = 1)
        # x = torch.flatten(x, 1)
        x = self.fc(x)
        return(x)

In [None]:
vocab_size = len(tokenizer.vocab)
embedding_dim = 50
num_class = len(genres)

text_model = TextClassificationModel(vocab_size, embedding_dim, max_len, num_class).to(device)

In [96]:
summary(text_model, input_Size = (8, max_len))

Layer (type:depth-idx)                   Param #
TextClassificationModel                  --
├─Embedding: 1-1                         305,230
├─Dropout: 1-2                           --
├─Linear: 1-3                            44
Total params: 305,274
Trainable params: 305,274
Non-trainable params: 0

In [97]:
def train(model, dataloader):
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = torch.nn.CrossEntropyLoss()

    epoch_start_time = time.time()
    # keep track of some counts for measuring accuracy
    total_acc, total_count = 0, 0
    
    for X, y in dataloader:
        # unpack and move to device
        tokens, engineered = X
        tokens = tokens.to(device)
        engineered = engineered.to(device)
        y = y.to(device)

        # zero gradients
        optimizer.zero_grad()
        # form prediction on batch
        predicted_label = model(tokens)
        # evaluate loss on prediction
        loss = loss_fn(predicted_label, y)
        # compute gradient
        loss.backward()
        # take an optimization step
        optimizer.step()
                
        # for printing accuracy
        total_acc += (predicted_label.argmax(1) == y).sum().item()
        total_count += y.size(0)

    print(f'| epoch {epoch:3d} | train accuracy {total_acc/total_count:8.3f} | time: {time.time() - epoch_start_time:5.2f}s')

def accuracy(model, dataloader):

    total_acc, total_count = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            # unpack and move to device
            tokens, engineered = X
            tokens = tokens.to(device)
            engineered = engineered.to(device)
            y = y.to(device)

            predicted_label = model(tokens)
            total_acc += (predicted_label.argmax(1) == y).sum().item()
            total_count += y.size(0)
    return total_acc/total_count

In [102]:
EPOCHS = 20
for epoch in range(1, EPOCHS + 1):
    train(text_model, train_loader)

| epoch   1 | train accuracy    0.762 | time:  3.24s
| epoch   2 | train accuracy    0.761 | time:  3.09s
| epoch   3 | train accuracy    0.770 | time:  3.26s
| epoch   4 | train accuracy    0.770 | time:  3.10s
| epoch   5 | train accuracy    0.778 | time:  3.13s
| epoch   6 | train accuracy    0.781 | time:  3.60s
| epoch   7 | train accuracy    0.782 | time:  3.14s
| epoch   8 | train accuracy    0.783 | time:  3.23s
| epoch   9 | train accuracy    0.787 | time:  3.29s
| epoch  10 | train accuracy    0.791 | time:  3.10s
| epoch  11 | train accuracy    0.793 | time:  3.07s
| epoch  12 | train accuracy    0.802 | time:  2.87s
| epoch  13 | train accuracy    0.803 | time:  2.84s
| epoch  14 | train accuracy    0.806 | time:  2.94s
| epoch  15 | train accuracy    0.804 | time:  3.00s
| epoch  16 | train accuracy    0.807 | time:  3.20s
| epoch  17 | train accuracy    0.813 | time:  3.11s
| epoch  18 | train accuracy    0.813 | time:  3.10s
| epoch  19 | train accuracy    0.816 | time: 

In [103]:
accuracy(text_model, val_loader)

0.5631369073992025