# Bag of words approach on lyrics_df

In [1]:
import pandas as pd

In [31]:
# Load the lyrics table from a semicolon-separated CSV in the working directory.
corpus = pd.read_csv('lyrics_df.csv', sep=';')
# Display the frame (artist, title and raw lyrics per song).
corpus

Unnamed: 0,artist,title,lyrics
0,Florence and the Machine,100 years,I believe in you and in our hearts we know the...
1,Florence and the Machine,addicted to love,The lights are on but you re not home Your mi...
2,Florence and the Machine,all this and heaven too,And the heart is hard to translate It has a la...
3,Florence and the Machine,are you hurting the one you love,Are you hurting the one you love You say you v...
4,Florence and the Machine,bedroom hymns,This is as good a place to fall as any We ll b...
...,...,...,...
561,ABBA,winner takes it all,I don t wanna talk about things we ve gone thr...
562,ABBA,wrap your arms around me,When you re so far away At the end of each day...
563,ABBA,you owe me one,Frida Now there s a shadow falling over our f...
564,ABBA,youre there,I tried not to see you Cause I couldn t take t...


## 1 - Clean lyrics (spaCy)

In [30]:
import spacy
# Load the 'en_core_web_sm' spaCy pipeline once; clean_lyrics reads it as the global `model`.
model = spacy.load('en_core_web_sm')

In [68]:
# Create function that cleans lyrics
def clean_lyrics(song):
    """Return the lemmas of the alphabetic, non-stopword tokens of `song` as one string.

    Parameters
    ----------
    song : str
        Raw lyrics of a single song. Any non-string value (e.g. None, or the
        float NaN pandas uses for an empty CSV cell) and any blank string is
        treated as an empty document.

    Returns
    -------
    str
        The kept lemmas in their original order, joined by single spaces;
        '' when nothing is kept.
    """
    # The spaCy pipeline only accepts text; without this guard a single missing
    # lyric makes the whole `.apply(clean_lyrics)` call fail.
    if not isinstance(song, str) or not song.strip():
        return ''

    # parse the song through the spacy model (module-level `model`)
    tok_song = model(song)

    # keep tokens that are purely alphabetic (drops punctuation) and are not
    # stopwords, then lemmatize what remains
    return ' '.join(
        word.lemma_
        for word in tok_song
        if word.is_alpha and not word.is_stop
    )

In [69]:
# Run clean_lyrics on every song and store the result in a new 'lyrics_clean' column.
corpus['lyrics_clean'] = corpus['lyrics'].apply(clean_lyrics)

In [70]:
# Inspect the frame, now including the 'lyrics_clean' column.
corpus

Unnamed: 0,artist,title,lyrics,lyrics_clean
0,Florence and the Machine,100 years,I believe in you and in our hearts we know the...,believe heart know truth believe love darker g...
1,Florence and the Machine,addicted to love,The lights are on but you re not home Your mi...,light home mind heart sweat body shake kiss ta...
2,Florence and the Machine,all this and heaven too,And the heart is hard to translate It has a la...,heart hard translate language s talk tongue qu...
3,Florence and the Machine,are you hurting the one you love,Are you hurting the one you love You say you v...,hurt love ve find Heaven t find God hurt love ...
4,Florence and the Machine,bedroom hymns,This is as good a place to fall as any We ll b...,good place fall will build alter Maria m knee ...
...,...,...,...,...
561,ABBA,winner takes it all,I don t wanna talk about things we ve gone thr...,don t wanna talk thing ve go s hurt s history ...
562,ABBA,wrap your arms around me,When you re so far away At the end of each day...,far away end day count moment till fade away i...
563,ABBA,you owe me one,Frida Now there s a shadow falling over our f...,Frida s shadow fall face doubt forever heart w...
564,ABBA,youre there,I tried not to see you Cause I couldn t take t...,try Cause couldn t heartache past Wouldn t com...


## 2 - Transform your corpus into a matrix (CountVectorizer)

In [71]:
from sklearn.feature_extraction.text import CountVectorizer

In [72]:
# Learn the vocabulary from the cleaned lyrics, additionally dropping sklearn's English stop words.
# NOTE(review): this is fitted on the full corpus, i.e. before the train/validation split made later,
# so validation songs contribute to the vocabulary.
cv = CountVectorizer(stop_words='english')
cv.fit(corpus['lyrics_clean'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [73]:
# Turn each cleaned lyric into a row of token counts over the fitted vocabulary.
lyrics_vec = cv.transform(corpus['lyrics_clean'])

## 3 - Normalize the counts (TfidfTransformer)

In [74]:
from sklearn.feature_extraction.text import TfidfTransformer

In [75]:
# Re-weight the raw counts with TF-IDF using the transformer's default settings.
# NOTE(review): fitted on all songs, not only the rows that later end up in the training split.
tf = TfidfTransformer()
lyrics_vec_tf = tf.fit_transform(lyrics_vec)

In [76]:
# Vocabulary terms become the column names. `get_feature_names()` was removed in
# scikit-learn 1.2, so prefer `get_feature_names_out()` and fall back for old versions.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
# Dense TF-IDF table rounded to 2 decimals, one row per song, indexed by artist name.
# `.toarray()` yields a plain ndarray (same values as `.todense()`, which returns np.matrix).
lyrics_df = pd.DataFrame(lyrics_vec_tf.toarray().round(2), columns=feature_names, index=corpus['artist'])

## 4 - Classification model (Naive Bayes)

**First, add a labels column to your dataframe by factorizing the artist name**

In [83]:
# factorize label column: every distinct artist name in the index gets an integer code
# (element [0] of the factorize result holds the codes)
lyrics_df['artist_fact'] = pd.factorize(lyrics_df.index)[0]

In [84]:
# Inspect the feature table with its label column (without the spaCy cleaning step it had 6031 columns).
lyrics_df

Unnamed: 0_level_0,aa,aaa,aaaaaaaarghh,aaaah,aaah,aaargh,aah,abandoned,abel,abide,...,youth,yuh,yup,zarkov,zero,zip,zombie,zoo,zoomin,artist_fact
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Florence and the Machine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Florence and the Machine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Florence and the Machine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Florence and the Machine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Florence and the Machine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ABBA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11,...,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
ABBA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
ABBA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
ABBA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3


**Define model data**

In [85]:
# Separate the features (every vocabulary column) from the target (integer artist codes)
X = lyrics_df.drop(columns='artist_fact')
y = lyrics_df['artist_fact']

In [86]:
# Split it in training and test data
from sklearn.model_selection import train_test_split
# Hold out 20% of the songs for validation; random_state=42 makes the shuffle repeatable.
# NOTE(review): no `stratify=y` is passed, so artist proportions may differ between the two parts.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# Show the shapes of the four pieces as a sanity check.
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((452, 4915), (114, 4915), (452,), (114,))

**Create baseline with dummy classifier**

In [87]:
from sklearn.dummy import DummyClassifier
# Baseline that always predicts the most frequent label seen in y_train.
dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(X_train, y_train)
# Accuracy on the training rows themselves (not on the validation split).
dummy_clf.score(X_train, y_train)
# dummy_clf.predict(X_train)

0.42035398230088494

**Create Naive Bayes classifier**

In [134]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with smoothing parameter alpha=0.0025.
# NOTE(review): 0.0025 is one of the top-ranked values in the randomized-search table further down,
# presumably copied from there (this cell's execution count is higher than the search cell's) — confirm.
nb = MultinomialNB(alpha=0.0025)
nb.fit(X_train, y_train)
# Accuracy on the training rows; the cross-validation and validation cells give the held-out figures.
nb.score(X_train, y_train)

0.995575221238938

**Cross validation**

In [135]:
from sklearn.model_selection import cross_val_score
# 4-fold cross-validated accuracy of the NB estimator, computed on the training split only.
scores = cross_val_score(nb, X_train, y_train, cv=4, scoring='accuracy')
# Display the per-fold accuracies.
scores

array([0.69026549, 0.53097345, 0.59292035, 0.62831858])

In [136]:
# Mean and standard deviation of the fold accuracies, each rounded to 3 decimals.
scores.mean().round(3), scores.std().round(3)

(0.611, 0.058)

**Hyperparameter Optimization**

In [122]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate smoothing values: 0.0006, 0.0007, ..., 0.0026
param_distributions = {
    'alpha' : [x/10000 for x in range(6, 27 , 1)], # it will uniformly sample from that list
}
# random_state pins which candidates are drawn, so the results table below is
# reproducible on a fresh run (same seed as the train/validation split).
randomizedcv = RandomizedSearchCV(nb, param_distributions=param_distributions, random_state=42)
randomizedcv.fit(X_train, y_train)
# One row per sampled candidate; show only the columns needed to compare them, best rank first.
random_results = pd.DataFrame(randomizedcv.cv_results_)
columns = ['mean_test_score', 'std_test_score', 'mean_fit_time', 'param_alpha', 'rank_test_score']
random_results[columns].sort_values('rank_test_score', ascending=True)

Unnamed: 0,mean_test_score,std_test_score,mean_fit_time,param_alpha,rank_test_score
0,0.619365,0.05515,0.029645,0.0024,1
6,0.619365,0.05515,0.013337,0.0025,1
1,0.617143,0.052931,0.013512,0.0023,3
7,0.617143,0.052462,0.013622,0.0016,3
2,0.614945,0.054086,0.013898,0.0006,5
5,0.614945,0.049409,0.013931,0.001,5
8,0.614945,0.049409,0.013358,0.0008,5
9,0.614921,0.050517,0.013111,0.0017,8
3,0.612698,0.052784,0.013364,0.002,9
4,0.612698,0.052784,0.013645,0.0021,9


**Test on validation data**

In [138]:
# Accuracy of the fitted NB model on the held-out validation split.
nb.score(X_val, y_val)

0.5877192982456141

In [142]:
# Predicted artist codes for the training rows and for the validation rows.
artist_pred = nb.predict(X_train)
artist_pred_val = nb.predict(X_val)

In [143]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the validation split: true codes are passed first, predictions second,
# so rows correspond to true artist codes and columns to predicted ones.
confusion_matrix(y_val, artist_pred_val)


array([[ 9,  1,  8,  4],
       [ 2,  5,  9,  2],
       [ 3,  3, 34,  4],
       [ 1,  1,  9, 19]])