In [4]:
# !python -m pip install xgboost

In [5]:
# !python -m pip install svgling

In [6]:
# !python -m pip install scikit-multilearn

In [8]:
import requests
import json
from pandas import json_normalize
import ast

import pandas as pd
import numpy as np
import re
import string
import itertools
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_validate, KFold, cross_val_score, cross_validate, StratifiedKFold

from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import contractions
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ne_chunk, pos_tag, word_tokenize
from wordcloud import WordCloud
import gensim
from gensim.models import CoherenceModel
import pyLDAvis.gensim_models

import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import ggplot, aes, geom_line, geom_point, geom_col, geom_bar, geom_density, geom_hline, geom_vline, geom_text, theme, theme_minimal, labs, coord_flip, scale_fill_brewer, scale_color_manual, scale_x_discrete, scale_x_continuous, scale_y_continuous, position_stack

In [9]:
nltk.download('maxent_treebank_pos_tagger')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     /Users/julianadaikawa/nltk_data...
[nltk_data]   Package maxent_treebank_pos_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/julianadaikawa/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/julianadaikawa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     /Users/julianadaikawa/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [11]:
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/julianadaikawa/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [12]:
data = pd.read_csv('songs_lyrics_post_preprocessing.csv')
data['lyrics_nlp'] = data.lyrics_nlp.fillna('')

In [13]:
data.groupby('topic')['name'].count().reset_index().sort_values('name',ascending=False)

Unnamed: 0,topic,name
25,Songs about heartache,769
54,Songs written for a girl,512
40,Songs about sex,489
42,Songs about spirituality or religion,476
3,Songs about an ex-girlfriend or ex-boyfriend,473
0,Songs about a breakup,396
1,Songs about a mother or father,385
19,Songs about drugs,361
52,Songs that are tributes to friends who died,353
15,Songs about death,331


In [14]:
# Label encoding the target to train the models
data['topic_id'] = data['topic'].factorize()[0]
topic_id_df = data[['topic', 'topic_id']].drop_duplicates()

# Dictionaries for future use
topic_to_id = dict(topic_id_df.values)
id_to_topic = dict(topic_id_df[['topic_id', 'topic']].values)

In [15]:
# Removing rare words. Word should appear at least at 5 different songs

data_wo_repetition = data.groupby(['name','artist'])['lyrics_nlp'].max().reset_index()
allwords_list = data_wo_repetition['lyrics_nlp'].apply(lambda x: list(set(x.replace(',', ' ').split()))).values
all_words = [item for sublist in allwords_list for item in sublist]
freq_dist = nltk.FreqDist(all_words)
allwords = freq_dist.most_common(len(all_words))
words_over = list(filter(lambda x: x[1]>=5, allwords)) # At least in 5 of songs
rarewords = words_over[-5:]
print('All words: ',len(allwords))
print('Over 5: ', len(words_over))

All words:  15340
Over 5:  6435


In [16]:
# Most frequent words
words_over[:10]

[('go', 7990),
 ('know', 7470),
 ('get', 7465),
 ('like', 6343),
 ('say', 5719),
 ('love', 5501),
 ('see', 5301),
 ('make', 5299),
 ('time', 5264),
 ('come', 5229)]

In [17]:
# Least frequent words
words_over[-10:]

[('deceitful', 5),
 ('auction', 5),
 ('recourse', 5),
 ('burrow', 5),
 ('badness', 5),
 ('militia', 5),
 ('exaggerate', 5),
 ('unsteady', 5),
 ('geneva', 5),
 ('warp', 5)]

In [18]:
# We will separate a random portion of the dataset to test our model later.
train, test = train_test_split(data, test_size=0.1, random_state=42)

In [19]:
# Now let's stack lyrics for each topic
train_grouped = train.groupby('topic')['lyrics_nlp'].apply(list).reset_index(name='lyrics')
train_grouped['lyrics'] = train_grouped['lyrics'].apply(lambda x: ' '.join(x))
train_grouped['bow_lyrics'] = train_grouped['lyrics'].apply(lambda x: nltk.WordPunctTokenizer().tokenize(x))

In [20]:
# vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
vectorizer = TfidfVectorizer(use_idf=True)
vectors = vectorizer.fit_transform(train.lyrics_nlp.values)
idf = vectorizer.idf_
vectors_test = vectorizer.transform(test.lyrics_nlp.values)
feature_names = vectorizer.get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()
X_train = pd.DataFrame(denselist, columns=feature_names)
dense_test = vectors_test.todense()
denselist_test = dense_test.tolist()
X_test = pd.DataFrame(denselist_test, columns=feature_names)
y_train = train.topic_id.values
y_test = test.topic_id.values

# Removing rare words
n_min_docs = 5
word_to_remove = list(filter(lambda x: x[1]<n_min_docs, allwords)) 
word_to_remove = [w for w,v in word_to_remove]
X_train = X_train[[col for col in X_train.columns if col not in word_to_remove]]
X_test = X_test[[col for col in X_test.columns if col not in word_to_remove]]
print(X_train.shape)

(12569, 6435)


In [21]:
X_train # 6436

Unnamed: 0,abandon,abide,ability,ablaze,able,aboard,abort,abortion,abound,abroad,...,younger,youth,youthful,yuh,zip,zipper,zombie,zone,zoo,zoom
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12564,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12565,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12566,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12567,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [380]:
# Finding the three most correlated terms with each of the product categories
N = 3
topic_col = []
unigrams_col = []
topic_unigrams_df = pd.DataFrame()
for topic, topic_id in sorted(topic_to_id.items()):
  features_chi2 = chi2(denselist, train.topic_id == topic_id)
  indices = np.argsort(features_chi2[0])
  feature_names = np.array(vectorizer.get_feature_names_out())[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  topic_col.append(topic)
  unigrams_col.append(', '.join(unigrams[-N:]))
#   bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
#   print("\n==> %s:" %(topic))
#   print("  * Most Correlated Unigrams are: %s" %(', '.join(unigrams[-N:])))
#   print("  * Most Correlated Bigrams are: %s" %(', '.join(bigrams[-N:])))

topic_unigrams_df['topic'] = topic_col
topic_unigrams_df['unigrams'] = unigrams_col
topic_unigrams_df

Unnamed: 0,topic,unigrams
0,Songs about a breakup,"foo, unfriend, bye"
1,Songs about a mother or father,"father, daddy, mother"
2,Songs about alcohol,"tequila, beer, drink"
3,Songs about an ex-girlfriend or ex-boyfriend,"lo, rekindle, ex"
4,Songs about being away from loved ones,"miss, wanderer, home"
5,Songs about being free,"bondage, freedom, free"
6,Songs about being there for someone,"lade, generator, woohoo"
7,Songs about being young and confused,"sobriety, ripcord, naivety"
8,Songs about change,"evacuation, leathery, change"
9,Songs about cheating,"messin, womanizer, cheat"


In [22]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

seed=42
nb_modeldefault = GaussianNB()
lr_modeldefault = LogisticRegression(random_state=seed, penalty='l2', C=1.0, max_iter=1000) 
rf_modeldefault = RandomForestClassifier(random_state=seed, n_estimators=100, criterion='gini', max_depth=10, min_samples_split=5, min_samples_leaf=5)
xgb_modeldefault = XGBClassifier(random_state=seed, learning_rate=0.1, n_estimators=100, max_depth=10, subsample=0.8, colsample_bytree=0.8)

Naive Bayes

In [441]:
model = nb_modeldefault
scoring = 'f1_macro'
results = []
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
cv_results = cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring, return_train_score=True)
clf = model.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Macro F1 Train = ", cv_results['train_score'].mean()) 
print("Macro F1 Test = ", cv_results['test_score'].mean()) 

Accuracy Train =  0.809202790543484
Accuracy Test =  0.04232777225651588


Logistic Regression

In [445]:
model = lr_modeldefault
scoring = 'f1_macro'
results = []
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
cv_results = cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring, return_train_score=True)
clf = model.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Macro F1 Train = ", cv_results['train_score'].mean()) 
print("Macro F1 Test = ", cv_results['test_score'].mean()) 

Accuracy Train =  0.3842708368821527
Accuracy Test =  0.12719305895750127


Random Forest

In [443]:
model = rf_modeldefault
scoring = 'f1_macro'
results = []
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
cv_results = cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring, return_train_score=True)
clf = model.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Macro F1 Train = ", cv_results['train_score'].mean()) 
print("Macro F1 Test = ", cv_results['test_score'].mean()) 

Accuracy Train =  0.09089776696531984
Accuracy Test =  0.0347831820674238


In [39]:
random_grid = {
    'n_estimators': [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)],
    'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)],
    'max_features': ['auto', 'sqrt'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 10]
    }
clf = RandomizedSearchCV(RandomForestClassifier(), random_grid, random_state=seed, n_iter=100, cv=5, verbose=10, return_train_score=True, scoring='f1_macro')
clf.fit(X_train,y_train)
clf.best_params_
clf_final = RandomForestClassifier(**clf.best_params_).fit(X_train,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5; 1/100] START max_depth=90, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=600




[CV 1/5; 1/100] END max_depth=90, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=600;, score=(train=0.892, test=0.080) total time=  51.8s
[CV 2/5; 1/100] START max_depth=90, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=600




[CV 2/5; 1/100] END max_depth=90, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=600;, score=(train=0.891, test=0.087) total time=  51.9s
[CV 3/5; 1/100] START max_depth=90, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=600




[CV 3/5; 1/100] END max_depth=90, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=600;, score=(train=0.892, test=0.082) total time=  51.1s
[CV 4/5; 1/100] START max_depth=90, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=600




[CV 4/5; 1/100] END max_depth=90, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=600;, score=(train=0.890, test=0.083) total time=  52.5s
[CV 5/5; 1/100] START max_depth=90, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=600




[CV 5/5; 1/100] END max_depth=90, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=600;, score=(train=0.900, test=0.084) total time=  52.2s
[CV 1/5; 2/100] START max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=300




[CV 1/5; 2/100] END max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=300;, score=(train=0.117, test=0.036) total time=   5.8s
[CV 2/5; 2/100] START max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=300




[CV 2/5; 2/100] END max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=300;, score=(train=0.114, test=0.037) total time=   5.6s
[CV 3/5; 2/100] START max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=300




[CV 3/5; 2/100] END max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=300;, score=(train=0.112, test=0.040) total time=   5.5s
[CV 4/5; 2/100] START max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=300




[CV 4/5; 2/100] END max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=300;, score=(train=0.113, test=0.032) total time=   5.7s
[CV 5/5; 2/100] START max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=300




[CV 5/5; 2/100] END max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=300;, score=(train=0.114, test=0.034) total time=   5.7s
[CV 1/5; 3/100] START max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=300




[CV 1/5; 3/100] END max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=(train=0.939, test=0.075) total time=  33.9s
[CV 2/5; 3/100] START max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=300




[CV 2/5; 3/100] END max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=(train=0.935, test=0.083) total time=  34.2s
[CV 3/5; 3/100] START max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=300




[CV 3/5; 3/100] END max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=(train=0.934, test=0.080) total time=  33.8s
[CV 4/5; 3/100] START max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=300




[CV 4/5; 3/100] END max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=(train=0.935, test=0.077) total time=  34.5s
[CV 5/5; 3/100] START max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=300




[CV 5/5; 3/100] END max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=300;, score=(train=0.941, test=0.085) total time=  34.2s
[CV 1/5; 4/100] START max_depth=70, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=200




[CV 1/5; 4/100] END max_depth=70, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=200;, score=(train=0.938, test=0.079) total time=  22.7s
[CV 2/5; 4/100] START max_depth=70, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=200




[CV 2/5; 4/100] END max_depth=70, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=200;, score=(train=0.932, test=0.088) total time=  22.3s
[CV 3/5; 4/100] START max_depth=70, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=200




[CV 3/5; 4/100] END max_depth=70, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=200;, score=(train=0.934, test=0.091) total time=  22.2s
[CV 4/5; 4/100] START max_depth=70, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=200




[CV 4/5; 4/100] END max_depth=70, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=200;, score=(train=0.934, test=0.080) total time=  22.3s
[CV 5/5; 4/100] START max_depth=70, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=200




[CV 5/5; 4/100] END max_depth=70, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=200;, score=(train=0.941, test=0.085) total time=  22.0s
[CV 1/5; 5/100] START max_depth=20, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=900
[CV 1/5; 5/100] END max_depth=20, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=900;, score=(train=0.168, test=0.051) total time=  29.0s
[CV 2/5; 5/100] START max_depth=20, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=900
[CV 2/5; 5/100] END max_depth=20, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=900;, score=(train=0.170, test=0.055) total time=  28.9s
[CV 3/5; 5/100] START max_depth=20, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=900
[CV 3/5; 5/100] END max_depth=20, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=900;, score=(train=0.171, test=0.055) total time=



[CV 1/5; 6/100] END max_depth=60, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=300;, score=(train=0.859, test=0.073) total time=  22.9s
[CV 2/5; 6/100] START max_depth=60, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=300




[CV 2/5; 6/100] END max_depth=60, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=300;, score=(train=0.852, test=0.085) total time=  23.0s
[CV 3/5; 6/100] START max_depth=60, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=300




[CV 3/5; 6/100] END max_depth=60, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=300;, score=(train=0.861, test=0.075) total time=  23.0s
[CV 4/5; 6/100] START max_depth=60, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=300




In [None]:
clf.best_params_

In [None]:
pred_train = clf_final.predict(X_train)
pred_test = clf_final.predict(X_test)

In [None]:
classification_report_df_RF = pd.DataFrame(classification_report([id_to_topic[i] for i in y_test], 
                                                              [id_to_topic[i] for i in pred_test], 
                                                              output_dict=True)).transpose()
classification_report_df_RF.sort_values('f1-score', ascending=False)

In [None]:
classification_report_df_RF_train = pd.DataFrame(classification_report([id_to_topic[i] for i in y_train], 
                                                              [id_to_topic[i] for i in pred_train], 
                                                              output_dict=True)).transpose()
classification_report_df_RF_train.sort_values('f1-score', ascending=False)

XGBoost

In [430]:
# xgb_modeldefault.fit(X_train, y_train)
# # predict
# xgb_predictions = xgb_modeldefault.predict(X_test)
# # accuracy
# print("Accuracy Train = ", accuracy_score(y_train,xgb_modeldefault.predict(X_train))) 
# print("Accuracy Test = ", accuracy_score(y_test,xgb_predictions)) 

In [371]:
test_df = pd.concat([X_test,pd.DataFrame({'real':[id_to_topic[i] for i in y_test]}),pd.DataFrame({'pred':[id_to_topic[i] for i in predictions]})],axis=1)

In [368]:
classification_report_df = pd.DataFrame(classification_report([id_to_topic[i] for i in y_test], 
                                                              [id_to_topic[i] for i in predictions], 
                                                              output_dict=True)).transpose()
classification_report_df.sort_values('f1-score', ascending=False)



Unnamed: 0,precision,recall,f1-score,support
Songs about war,0.368421,0.5,0.424242,28.0
Songs about alcohol,0.333333,0.352941,0.342857,17.0
Songs about a mother or father,0.254902,0.433333,0.320988,30.0
Songs about computers or technology,0.5,0.166667,0.25,12.0
Songs about loneliness or isolation,0.32,0.195122,0.242424,41.0
Songs about sex,0.159091,0.477273,0.238636,44.0
Songs about clubbing or a night out,0.222222,0.25,0.235294,16.0
Songs about heartache,0.15035,0.5375,0.234973,80.0
Songs about desire or longing,0.384615,0.166667,0.232558,30.0
Songs about drugs,0.191489,0.290323,0.230769,31.0
