# CS 6120: Natural Language Processing
## Final Project: Music Genre Classification
## Authors: Bharat Chawla and Himaja R. Ginkala

In [1]:
# importing libraries
import csv
import numpy as np
import pandas as pd
import re
import string
import tensorflow as tf

import torch
import torch.nn as nn
import torch.optim as optim

import keras
import keras.backend as K
from keras.layers import Activation, Attention, Dense, Dropout, Embedding, Flatten, Layer, LSTM, MaxPooling1D
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /home/chawla.bh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# load data from CSV file
df_english_cleaned = pd.read_csv('english_cleaned.csv')

#### Vocabulary

In [3]:
# load vocabulary from file
with open('vocabulary.txt', "r") as f:
    vocabulary = f.readlines()

print("Vocabulary Size: ", len(vocabulary))

# replace new line characters
vocabulary = list(map(lambda s: s.strip(), vocabulary))

Vocabulary Size:  186126


In [4]:
# create word to id dictionary
word_to_id = {}
for i, word in enumerate(vocabulary):
  word_to_id[word] = i + 1

#### Glove Embeddings

In [5]:
# open glove embeddings file
glove_file = open('glove.6B.100d.txt')

glove_embeddings = {}
for line in glove_file:
  values = line.split()
  word = values[0]
  coefs = np.asarray(values[1:], dtype = 'float32')
  glove_embeddings[word] = coefs

glove_file.close()

all_glove_words = list(glove_embeddings.keys())
print("Found %s word vectors." % len(glove_embeddings))

Found 400000 word vectors.


#### Embedding Matrix

In [6]:
embedding_matrix = np.zeros((len(vocabulary), 100))

for word, i in word_to_id.items():
  embedding_vector = glove_embeddings.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector

print(embedding_matrix.shape)

(186126, 100)


### Data Preparation

In [7]:
# get only data with cleaned lyrics
df_english = df_english_cleaned[df_english_cleaned['Cleaned_Lyric'].notna()]

#### Genre Separation

In [8]:
# convert multi-valued column to single value
for i, row in df_english.iterrows():
  if ";" in str(df_english.at[i, 'Genres']):
    genres = df_english.at[i, 'Genres'].split("; ")
    df_english.at[i, 'Genres'] = genres[0]

#rename column
df_english.rename(columns = {'Genres':'Genre'}, inplace = True)
df_english['Genre'] = df_english['Genre'].replace('Pop/Rock', 'Pop-Rock')
df_english

Unnamed: 0,Artist,SName,Genre,Lyric,Popularity,Language,Cleaned_Lyric
0,Ivete Sangalo,Careless Whisper,Pop,I feel so unsure\nAs I take your hand and lead...,4.4,en,feel unsure take hand lead dance floor music d...
1,Ivete Sangalo,Could You Be Loved / Citação Musical do Rap: S...,Pop,"Don't let them fool, ya\nOr even try to school...",4.4,en,let fool ya even try school ya oh got mind go ...
2,Ivete Sangalo,Cruisin' (Part. Saulo),Pop,"Baby, let's cruise, away from here\nDon't be c...",4.4,en,baby let cruise away confused way clear want g...
3,Ivete Sangalo,Easy,Pop,"Know it sounds funny\nBut, I just can't stand ...",4.4,en,know sounds funny stand pain girl leaving tomo...
4,Ivete Sangalo,For Your Babies (The Voice cover),Pop,You've got that look again\nThe one I hoped I ...,4.4,en,got look one hoped lad face beaming smile got ...
...,...,...,...,...,...,...,...
191382,Johnny Clegg,The Waiting,World Music,Chorus\nHere we stand waiting on the plain\nDa...,0.0,en,chorus stand waiting plain darkness taken last...
191383,Johnny Clegg,Too Early For The Sky,World Music,I nearly disappeared into the mouth of a croco...,0.0,en,nearly disappeared mouth crocodile nearly touc...
191384,Johnny Clegg,Warsaw 1943 (I Never Betrayed The Revolution),World Music,"Amambuka, amambuka azothengisa izwe lakithi, i...",0.0,en,amambuka amambuka azothengisa izwe lakithi izw...
191385,Johnny Clegg,When The System Has Fallen,World Music,Sweat in the heat for days on end\nwaiting for...,0.0,en,sweat heat days end waiting come hear words sp...


In [9]:
# get genre count
genre_counts = df_english['Genre'].value_counts()
print("Number of English Genres: ", genre_counts)

# get genre names
genre_names = df_english['Genre'].value_counts().index.tolist()

Number of English Genres:  Rock             25177
Pop              13759
Heavy Metal      13496
Indie            12998
Rap               9589
                 ...  
Electro Swing        6
Jovem Guarda         6
Forró                3
Regional             1
Lo-fi                1
Name: Genre, Length: 73, dtype: int64


In [10]:
# create dataframe of only top 10 genres in English
top_10_genres = genre_names[0:10]
df_english_top_10_genres = df_english[df_english['Genre'].isin(top_10_genres)]
df_english_top_10_genres

Unnamed: 0,Artist,SName,Genre,Lyric,Popularity,Language,Cleaned_Lyric
0,Ivete Sangalo,Careless Whisper,Pop,I feel so unsure\nAs I take your hand and lead...,4.4,en,feel unsure take hand lead dance floor music d...
1,Ivete Sangalo,Could You Be Loved / Citação Musical do Rap: S...,Pop,"Don't let them fool, ya\nOr even try to school...",4.4,en,let fool ya even try school ya oh got mind go ...
2,Ivete Sangalo,Cruisin' (Part. Saulo),Pop,"Baby, let's cruise, away from here\nDon't be c...",4.4,en,baby let cruise away confused way clear want g...
3,Ivete Sangalo,Easy,Pop,"Know it sounds funny\nBut, I just can't stand ...",4.4,en,know sounds funny stand pain girl leaving tomo...
4,Ivete Sangalo,For Your Babies (The Voice cover),Pop,You've got that look again\nThe one I hoped I ...,4.4,en,got look one hoped lad face beaming smile got ...
...,...,...,...,...,...,...,...
182627,Sleater-Kinney,Words And Guitar,Rock Alternativo,words + guitar\ni got it words + guitar\ni lik...,0.0,en,words guitar got words guitar like way way lou...
182628,Sleater-Kinney,"Write Me Back, Fucker",Rock Alternativo,i got your letter today\ni read the things you...,0.0,en,got letter today read things say things took b...
182629,Sleater-Kinney,You Ain't It,Rock Alternativo,you ...ain't it!\nyou ...ain't it!\n\nyou're t...,0.0,en,hottest band around biggest dicks town mean sh...
182630,Sleater-Kinney,You're No Rock N' Roll Fun,Rock Alternativo,You're no rock n' roll fun\nlike a party that'...,0.0,en,rock n roll fun like party begun walk park lik...


In [11]:
# reset indices
df_english_top_10_genres = df_english_top_10_genres.reset_index(drop = True)
df_english_top_10_genres

Unnamed: 0,Artist,SName,Genre,Lyric,Popularity,Language,Cleaned_Lyric
0,Ivete Sangalo,Careless Whisper,Pop,I feel so unsure\nAs I take your hand and lead...,4.4,en,feel unsure take hand lead dance floor music d...
1,Ivete Sangalo,Could You Be Loved / Citação Musical do Rap: S...,Pop,"Don't let them fool, ya\nOr even try to school...",4.4,en,let fool ya even try school ya oh got mind go ...
2,Ivete Sangalo,Cruisin' (Part. Saulo),Pop,"Baby, let's cruise, away from here\nDon't be c...",4.4,en,baby let cruise away confused way clear want g...
3,Ivete Sangalo,Easy,Pop,"Know it sounds funny\nBut, I just can't stand ...",4.4,en,know sounds funny stand pain girl leaving tomo...
4,Ivete Sangalo,For Your Babies (The Voice cover),Pop,You've got that look again\nThe one I hoped I ...,4.4,en,got look one hoped lad face beaming smile got ...
...,...,...,...,...,...,...,...
110685,Sleater-Kinney,Words And Guitar,Rock Alternativo,words + guitar\ni got it words + guitar\ni lik...,0.0,en,words guitar got words guitar like way way lou...
110686,Sleater-Kinney,"Write Me Back, Fucker",Rock Alternativo,i got your letter today\ni read the things you...,0.0,en,got letter today read things say things took b...
110687,Sleater-Kinney,You Ain't It,Rock Alternativo,you ...ain't it!\nyou ...ain't it!\n\nyou're t...,0.0,en,hottest band around biggest dicks town mean sh...
110688,Sleater-Kinney,You're No Rock N' Roll Fun,Rock Alternativo,You're no rock n' roll fun\nlike a party that'...,0.0,en,rock n roll fun like party begun walk park lik...


#### Label Creation

In [12]:
# create labels
genre_labels = []
for genre in df_english_top_10_genres['Genre'].tolist():
  genre_labels.append(top_10_genres.index(genre))

print("Length of Labels List: ", len(genre_labels))

Length of Labels List:  110690


In [13]:
df_english_top_10_genres['Label'] = genre_labels
print(df_english_top_10_genres['Label'].value_counts())

0    25177
1    13759
2    13496
3    12998
4     9589
5     9019
6     8411
7     7377
8     5555
9     5309
Name: Label, dtype: int64


#### Balancing Dataset

In [14]:
# create a balanced dataset starting with top genre
df_balanced = df_english_top_10_genres[df_english_top_10_genres.Genre == top_10_genres[0]].sample(2000)

# get n random rows from each of the other top genres
for genre in top_10_genres[1:]:
  df_genre = df_english_top_10_genres[df_english_top_10_genres.Genre == genre].sample(2000)
  df_balanced = pd.concat([df_balanced, df_genre], ignore_index = True)

# ensure dataset is balanced
df_balanced['Genre'].value_counts()

Pop-Rock            2000
Rap                 2000
Pop                 2000
Rock                2000
Hip Hop             2000
Country             2000
Heavy Metal         2000
Rock Alternativo    2000
R&B                 2000
Indie               2000
Name: Genre, dtype: int64

#### Tokenization and Encoding

In [15]:
lyrics = df_balanced['Cleaned_Lyric'].tolist()

tokens = []
for lyric in lyrics:
  try:
    words = word_tokenize(lyric)
  except:
    print(lyric)

  tokens.append(words)

print("Length of Tokens List: ", len(tokens))

Length of Tokens List:  20000


In [16]:
encoded_tokens = []

# for every lyric
for line in tokens:
  encoded_line = []

  # for every word in lyric
  for word in line:
    # set to id value
    encoded_line.append(word_to_id.get(word))

  encoded_tokens.append(encoded_line)

print("Length of Encoded Tokens: ", len(encoded_tokens))

Length of Encoded Tokens:  20000


#### Padding

In [17]:
# get max length in list
max_length = len(max(encoded_tokens, key = len))
# pad training data
X = pad_sequences(encoded_tokens, maxlen = max_length)

In [18]:
# convert to numpy arrays
X = np.array(X)
y = np.array(df_balanced['Label'].tolist())

# one hot encode labels
y = OneHotEncoder().fit_transform(y.reshape(-1, 1)).toarray()

### Model Creation

In [19]:
# global variables
EPOCHS = 5
DROPOUT = 0.2
LEARNING_RATE = 0.003
BATCH_SIZE = 100

VOCAB_SIZE = len(vocabulary)
OUTPUT_SIZE = 10
N_LAYERS = 2
HIDDEN_DIM = 128
EMBEDDING_DIM = 100

In [20]:
def build_model(X):

  # set input layer
  input_layer = tf.keras.Input((X.shape[1],))

  # add embedding layer
  m = tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, weights=[embedding_matrix])(input_layer)

  # add LSTM layer
  m = tf.keras.layers.LSTM(HIDDEN_DIM, return_sequences=True)(m)

  # add attention layer
  m = tf.keras.layers.Attention()([m, m])

  # add global average pooling layer
  m = tf.keras.layers.GlobalAveragePooling1D()(m)

  # add dropout layer
  m = tf.keras.layers.Dropout(DROPOUT)(m)

  # add linear layer
  m = tf.keras.layers.Dense(OUTPUT_SIZE, activation='softmax')(m)

  # build model
  model = tf.keras.models.Model(inputs = input_layer, outputs = m)

  # compile model
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['AUC', 'Precision', 'Accuracy', 'Recall'])

  return model

In [21]:
# keras.utils.plot_model(model, show_shapes = True)

### Model Execution

In [None]:
# track k fold history
history = []

kf = KFold(n_splits = 10)

for train_index, test_index in kf.split(X):

  print("TRAIN:", train_index, "TEST:", test_index)

  X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]

  model = build_model(X_train)

  model.summary()

  h = model.fit(X_train, y_train, batch_size = BATCH_SIZE, epochs = EPOCHS, validation_split = 0.2)

  history.append(h)

TRAIN: [ 2000  2001  2002 ... 19997 19998 19999] TEST: [   0    1    2 ... 1997 1998 1999]
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 1657)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, 1657, 100)            1861260   ['input_1[0][0]']             
                                                          0                                       
                                                                                                  
 lstm (LSTM)                 (None, 1657, 128)            117248    ['embedding[0][0]']           
                                                                                                  
 at

### Results

In [None]:
for (i, h) in enumerate(history, start = 1):
  print("Begin KFold: ", i)

  print(h.history['loss'])
  print(h.history['val_loss'])

  print(h.history['auc'])
  print(h.history['val_auc'])

  print(h.history['recall'])
  print(h.history['val_recall'])

  print(h.history['precision'])
  print(h.history['val_precision'])

  print(h.history['Accuracy'])
  print(h.history['val_Accuracy'])

  print("End KFold: ", i)