# Abstract

-- Enter Here --

# Data

In [30]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from torchinfo import summary

import pandas as pd
import numpy as np
import time

# for train-test split
from sklearn.model_selection import train_test_split

# for suppressing bugged warnings from torchinfo
import warnings
warnings.filterwarnings("ignore", category = UserWarning)

# tokenizers from HuggingFace
from transformers import BertTokenizer

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

We are loading a [Kaggle dataset](https://www.kaggle.com/datasets/saurabhshahane/music-dataset-1950-to-2019) that contains information about music released between 1950 and 2019, collected through Spotify. The dataset contains lyrics, artist info, track names, etc. Importantly, it also includes music metadata like sadness, danceability, loudness, acousticness, etc.

In [31]:
# The CSV is read straight from a copy hosted on GitHub.
url = (
    "https://raw.githubusercontent.com/PhilChodrow/PIC16B/"
    "master/datasets/tcc_ceds_music.csv"
)
df = pd.read_csv(filepath_or_buffer=url)

Let's have a look at some of the raw data!

In [32]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711,sadness,1.0
1,4,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.331745,0.64754,0.954819,2e-06,0.325021,0.26324,world/life,1.0
2,6,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,...,0.00277,0.225422,0.456298,0.585288,0.840361,0.0,0.351814,0.139112,music,1.0
3,10,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,...,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.77535,0.743736,romantic,1.0
4,12,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...,48,0.00135,0.00135,0.417772,...,0.0688,0.00135,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,romantic,1.0


Here is a brief look at how many songs we have in each represented genre.

In [33]:
# Number of tracks per genre.
df.groupby(by="genre").size()

genre
blues      4604
country    5445
hip hop     904
jazz       3845
pop        7042
reggae     2498
rock       4034
dtype: int64

This is a pretty large number of songs to classify... and some genres I personally don't care for. So, to make the dataframe more manageable and applicable to me personally, we are going to narrow it down to only reggae, hip hop, rock, and jazz.

In [34]:
# Genres kept for classification, mapped to their integer class labels.
genres = {
    "hip hop": 0,
    "jazz": 1,
    "reggae": 2,
    "rock": 3,
}

# Keep only rows whose genre is one of the keys above. `isin` is the
# vectorized membership test (no per-row Python lambda), and `.copy()`
# makes the result an independent frame so that assigning to its columns
# later does not raise SettingWithCopyWarning.
df = df[df["genre"].isin(list(genres))].copy()
df.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
17091,54304,gene ammons,it's the talk of the town,1950,jazz,lovers sweethearts hard understand know happen...,61,0.001096,0.001096,0.001096,...,0.31957,0.001096,0.352323,0.620388,0.868474,0.23583,0.430132,0.28226,sadness,1.0
17092,54305,gene ammons,you go to my head,1950,jazz,head linger like haunt refrain spin round brai...,48,0.001754,0.340964,0.001754,...,0.001754,0.001754,0.3794,0.638541,0.90763,0.90081,0.22197,0.184159,violence,1.0
17093,54307,bud powell,yesterdays,1950,jazz,music speak start hear musicians like dizzy gi...,107,0.001144,0.001144,0.074762,...,0.001144,0.097082,0.489873,0.4674,0.992972,0.927126,0.334295,0.228204,music,1.0
17094,54311,tony bennett,stranger in paradise,1950,jazz,hand stranger paradise lose wonderland strange...,41,0.002105,0.180524,0.002105,...,0.527429,0.002105,0.179032,0.55947,0.983936,0.001781,0.086974,0.235211,sadness,1.0
17095,54313,dean martin,zing-a zing-a zing boom,1950,jazz,zinga zinga zinga zinga zinga zinga zinga zing...,160,0.001253,0.001253,0.001253,...,0.425721,0.001253,0.580851,0.687409,0.655622,0.0,0.936109,0.4184,sadness,1.0


In [35]:
# Replace each genre name with its integer label.
# - `assign` builds a new frame instead of writing into the filtered slice,
#   which avoids SettingWithCopyWarning.
# - The `genres.get(g, g)` fallback leaves already-encoded labels untouched,
#   so re-running this cell no longer turns every label into None.
df = df.assign(genre=df["genre"].map(lambda g: genres.get(g, g)))
df

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
17091,54304,gene ammons,it's the talk of the town,1950,1,lovers sweethearts hard understand know happen...,61,0.001096,0.001096,0.001096,...,0.319570,0.001096,0.352323,0.620388,0.868474,0.235830,0.430132,0.282260,sadness,1.000000
17092,54305,gene ammons,you go to my head,1950,1,head linger like haunt refrain spin round brai...,48,0.001754,0.340964,0.001754,...,0.001754,0.001754,0.379400,0.638541,0.907630,0.900810,0.221970,0.184159,violence,1.000000
17093,54307,bud powell,yesterdays,1950,1,music speak start hear musicians like dizzy gi...,107,0.001144,0.001144,0.074762,...,0.001144,0.097082,0.489873,0.467400,0.992972,0.927126,0.334295,0.228204,music,1.000000
17094,54311,tony bennett,stranger in paradise,1950,1,hand stranger paradise lose wonderland strange...,41,0.002105,0.180524,0.002105,...,0.527429,0.002105,0.179032,0.559470,0.983936,0.001781,0.086974,0.235211,sadness,1.000000
17095,54313,dean martin,zing-a zing-a zing boom,1950,1,zinga zinga zinga zinga zinga zinga zinga zing...,160,0.001253,0.001253,0.001253,...,0.425721,0.001253,0.580851,0.687409,0.655622,0.000000,0.936109,0.418400,sadness,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28367,82447,mack 10,10 million ways,2019,0,cause fuck leave scar tick tock clock come kno...,78,0.001350,0.001350,0.001350,...,0.065664,0.001350,0.889527,0.759711,0.062549,0.000000,0.751649,0.695686,obscene,0.014286
28368,82448,m.o.p.,ante up (robbin hoodz theory),2019,0,minks things chain ring braclets yap fame come...,67,0.001284,0.001284,0.035338,...,0.001284,0.001284,0.662082,0.789580,0.004607,0.000002,0.922712,0.797791,obscene,0.014286
28369,82449,nine,whutcha want?,2019,0,get ban get ban stick crack relax plan attack ...,77,0.001504,0.154302,0.168988,...,0.001504,0.001504,0.663165,0.726970,0.104417,0.000001,0.838211,0.767761,obscene,0.014286
28370,82450,will smith,switch,2019,0,check check yeah yeah hear thing call switch g...,67,0.001196,0.001196,0.001196,...,0.001196,0.001196,0.883028,0.786888,0.007027,0.000503,0.508450,0.885882,obscene,0.014286


As an English speaker, I would selfishly like to classify only English songs. To do so, we are going to make use of the `langdetect` package. **TODO:** add the language-filtering step here.

The base rate on our classification is the proportion of the data set occupied by the largest label class:

In [36]:
# Share of the dataset occupied by each genre label.
df.groupby("genre").size().div(len(df))

genre
0    0.080135
1    0.340839
2    0.221434
3    0.357592
dtype: float64

If we always guessed category 3, then we would expect an accuracy of roughly 36%. So, our task will be to see whether we can train a model to beat this. 

As we try to predict the genre of the track, we will use lyrics alongside some other engineered features (metadata) that we define below.

In [37]:
# Numeric metadata columns used as engineered (non-lyric) model inputs.
engineered_features = [
    "dating",
    "violence",
    "world/life",
    "night/time",
    "shake the audience",
    "family/gospel",
    "romantic",
    "communication",
    "obscene",
    "music",
    "movement/places",
    "light/visual perceptions",
    "family/spiritual",
    "like/girls",
    "sadness",
    "feelings",
    "danceability",
    "loudness",
    "acousticness",
    "instrumentalness",
    "valence",
    "energy",
]

# Text Vectorization

We now need to *vectorize* the lyrics. We’re going to use **tokenization** to break up the lyrics into a sequence of tokens, and then vectorize that sequence.

We will be using a tokenizer imported from HuggingFace.

In [42]:
# Load the pretrained uncased BERT tokenizer by its checkpoint name.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="google-bert/bert-base-uncased"
)

For our purposes it’s more convenient to assign an *integer* to each token, which we can do like this:

In [43]:
# Tokenize a sample sentence and show the resulting encoding.
encoded = tokenizer(text="I love reggae music!")
encoded

{'input_ids': [101, 1045, 2293, 15662, 2189, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

To do the reverse, we can use the `.decode` method of the tokenizer:

In [41]:
# Turn the integer token ids back into text.
tokenizer.decode(token_ids=encoded["input_ids"])

'[CLS] i love reggae music! [SEP]'

# Model Building 

We are going to define **three** neural networks for our classification tasks, so we will need three data frames.

- Using Lyrics to Classify
- Using Engineered Features (Metadata) to Classify
- Using Lyrics and Metadata to Classify

In [38]:
# One independent copy per model input: metadata only, lyrics only, and both.
df_engineered = df.loc[:, engineered_features].copy()
df_lyrics = df.loc[:, "lyrics"].copy()  # a Series, despite the df_ prefix
df_both = df.loc[:, [*engineered_features, "lyrics"]].copy()
df_both.head()

Unnamed: 0,dating,violence,world/life,night/time,shake the audience,family/gospel,romantic,communication,obscene,music,...,like/girls,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,lyrics
17091,0.001096,0.001096,0.001096,0.001096,0.036316,0.001096,0.001096,0.460773,0.086498,0.001096,...,0.001096,0.31957,0.001096,0.352323,0.620388,0.868474,0.23583,0.430132,0.28226,lovers sweethearts hard understand know happen...
17092,0.001754,0.340964,0.001754,0.001754,0.001754,0.001754,0.131872,0.001754,0.001754,0.001754,...,0.328447,0.001754,0.001754,0.3794,0.638541,0.90763,0.90081,0.22197,0.184159,head linger like haunt refrain spin round brai...
17093,0.001144,0.001144,0.074762,0.046173,0.001144,0.018789,0.001144,0.001655,0.001144,0.421734,...,0.230954,0.001144,0.097082,0.489873,0.4674,0.992972,0.927126,0.334295,0.228204,music speak start hear musicians like dizzy gi...
17094,0.002105,0.180524,0.002105,0.002105,0.002105,0.002105,0.002105,0.201965,0.002105,0.002105,...,0.002105,0.527429,0.002105,0.179032,0.55947,0.983936,0.001781,0.086974,0.235211,hand stranger paradise lose wonderland strange...
17095,0.001253,0.001253,0.001253,0.001253,0.001253,0.081126,0.001253,0.111951,0.001253,0.268737,...,0.001253,0.425721,0.001253,0.580851,0.687409,0.655622,0.0,0.936109,0.4184,zinga zinga zinga zinga zinga zinga zinga zing...
