# Bag of words approach on lyrics_df

In [1]:
import pandas as pd

In [2]:
# Load the lyrics table; the file uses ';' as its field separator.
corpus = pd.read_csv('lyrics_df.csv', sep=';')
corpus

Unnamed: 0,artist,title,lyrics
0,Florence and the Machine,100 years,I believe in you and in our hearts we know the...
1,Florence and the Machine,addicted to love,The lights are on but youre not homeYour mind ...
2,Florence and the Machine,all this and heaven too,And the heart is hard to translateIt has a lan...
3,Florence and the Machine,are you hurting the one you love,Are you hurting the one you loveYou say youve ...
4,Florence and the Machine,bedroom hymns,This is as good a place to fall as anyWell bui...
...,...,...,...
561,ABBA,winner takes it all,I dont wanna talk about things weve gone throu...
562,ABBA,wrap your arms around me,When youre so far awayAt the end of each dayCo...
563,ABBA,you owe me one,Frida Now theres a shadow falling over our fa...
564,ABBA,youre there,I tried not to see youCause I couldnt take the...


## 1 - clean lyrics (Spacy)

In [3]:
import spacy
# Load spaCy's small English pipeline once; clean_lyrics() reuses it for every song.
model = spacy.load('en_core_web_sm')

In [4]:
# Create function that cleans lyrics
def clean_lyrics(song):
    """Return a cleaned, lemmatized version of one song's lyrics.

    The text is run through the module-level spaCy pipeline ``model``.
    Tokens that are not alphabetic (``is_alpha``) or that are flagged as
    stop words (``is_stop``) are discarded; the lemma of every remaining
    token is kept.

    Parameters
    ----------
    song : str
        Raw lyrics of a single song.

    Returns
    -------
    str
        The kept lemmas, in their original order, joined by single spaces.
    """
    lemmas = [
        token.lemma_
        for token in model(song)
        if token.is_alpha and not token.is_stop
    ]
    return ' '.join(lemmas)

In [5]:
# Add a cleaned-lyrics column by running clean_lyrics on every song's text.
corpus['lyrics_clean'] = corpus['lyrics'].apply(clean_lyrics)

In [6]:
# Inspect the frame, which now carries the lyrics_clean column.
corpus

Unnamed: 0,artist,title,lyrics,lyrics_clean
0,Florence and the Machine,100 years,I believe in you and in our hearts we know the...,believe heart know truth andI believe love dar...
1,Florence and the Machine,addicted to love,The lights are on but youre not homeYour mind ...,light homeYour mind ownYour heart sweat body s...
2,Florence and the Machine,all this and heaven too,And the heart is hard to translateIt has a lan...,heart hard translateit language ownit talk ton...
3,Florence and the Machine,are you hurting the one you love,Are you hurting the one you loveYou say youve ...,hurt loveYou have find Heaven not find GodAre ...
4,Florence and the Machine,bedroom hymns,This is as good a place to fall as anyWell bui...,good place fall anyWell build alter hereMake m...
...,...,...,...,...
561,ABBA,winner takes it all,I dont wanna talk about things weve gone throu...,not wanna talk thing have go throughthough hur...
562,ABBA,wrap your arms around me,When youre so far awayAt the end of each dayCo...,far awayat end daycounting moment till fade aw...
563,ABBA,you owe me one,Frida Now theres a shadow falling over our fa...,Frida s shadow fall facesdoubt forever heartsA...
564,ABBA,youre there,I tried not to see youCause I couldnt take the...,try youcause not heartache againThe past meWou...


## 2 - transform your corpus into a matrix (CountVectorizer)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Learn the vocabulary from the cleaned lyrics, with scikit-learn's English stop-word list enabled
cv = CountVectorizer(stop_words='english').fit(corpus['lyrics_clean'])
cv

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [9]:
# Turn every cleaned song into its token-count vector using the fitted vocabulary.
lyrics_vec = cv.transform(corpus['lyrics_clean'])

## 3 - Normalize the counts (TfidfTransformer)

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer

In [11]:
# Reweight the raw counts with TF-IDF, using TfidfTransformer's default settings.
tf = TfidfTransformer()
lyrics_vec_tf = tf.fit_transform(lyrics_vec)

In [12]:
# Build a dense, labelled view of the TF-IDF matrix: one row per song (indexed
# by artist) and one column per vocabulary term.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older installs.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
lyrics_df = pd.DataFrame(
    # .toarray() yields a plain ndarray instead of the legacy np.matrix from .todense();
    # weights are rounded to two decimals, and this rounded frame is what gets modelled.
    lyrics_vec_tf.toarray().round(2),
    columns=feature_names,
    index=corpus['artist'],
)

## 4 - Classification model (LogisticRegression)

**First, add a labels column to your dataframe by factorizing the artist name**

In [13]:
# Encode each artist name (the frame's index) as an integer class label
artist_codes, artist_names = pd.factorize(lyrics_df.index)
lyrics_df['artist_fact'] = artist_codes

In [14]:
# Show the TF-IDF feature frame together with the appended artist_fact label column.
lyrics_df  # original remark: "without spacy 6031 columns" — NOTE(review): unclear which run this refers to; confirm it still holds

Unnamed: 0_level_0,aaaahlast,aaaahsometime,aaah,aaahyoure,aaaooh,aah,aahbody,aahfight,aahgo,aahin,...,zero,zeroand,zerolet,zip,zombieim,zoo,zooi,zoomin,zootheyll,artist_fact
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Florence and the Machine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Florence and the Machine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Florence and the Machine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Florence and the Machine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Florence and the Machine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ABBA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
ABBA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
ABBA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
ABBA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3


**Define model data**

In [51]:
# Separate the feature columns (X) from the integer artist label (y)
X = lyrics_df.drop(columns='artist_fact')
y = lyrics_df['artist_fact']

In [52]:
# Hold out 20% of the songs for validation; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)
tuple(part.shape for part in (X_train, X_val, y_train, y_val))

((452, 6030), (114, 6030), (452,), (114,))

**Create baseline with dummy classifier**

In [55]:
from sklearn.dummy import DummyClassifier
# Baseline: a classifier using the 'most_frequent' strategy, scored on the training split
dummy_clf = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
dummy_clf.score(X_train, y_train)

0.42035398230088494

**Create logistic regression classifier**

In [56]:
from sklearn.linear_model import LogisticRegression
# Fit with C=4 and report accuracy on the training split itself
lr = LogisticRegression(C=4).fit(X_train, y_train)
lr.score(X_train, y_train)

0.9933628318584071

**Cross validation**

In [57]:
from sklearn.model_selection import cross_val_score
# 4-fold cross-validated accuracy of `lr` on the training split
scores = cross_val_score(
    estimator=lr,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=4,
)
scores

array([0.63716814, 0.51327434, 0.5840708 , 0.5840708 ])

In [58]:
# Mean and standard deviation of the fold accuracies, rounded to three decimals.
scores.mean().round(3), scores.std().round(3)

(0.58, 0.044)

**Hyperparameter Optimization**

In [59]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate regularisation strengths for the logistic regression.
param_distributions = {
    'C': list(range(1, 5)),  # a list is sampled uniformly
}
# Only four candidates exist, so cap n_iter at the number of candidates: the
# default n_iter=10 exceeds it and makes scikit-learn emit a warning.
# A fixed random_state makes the candidate sampling reproducible.
randomizedcv = RandomizedSearchCV(
    lr,
    param_distributions=param_distributions,
    n_iter=len(param_distributions['C']),
    random_state=42,
)
randomizedcv.fit(X_train, y_train)

# Summarise the search: one row per candidate, best mean CV score first.
random_results = pd.DataFrame(randomizedcv.cv_results_)
columns = ['mean_test_score', 'std_test_score', 'mean_fit_time', 'param_C']
random_results[columns].sort_values('mean_test_score', ascending=False)



Unnamed: 0,mean_test_score,std_test_score,mean_fit_time,param_C
2,0.577436,0.045721,0.290575,3
3,0.577436,0.046788,0.308,4
1,0.575189,0.038805,0.251352,2
0,0.559707,0.02855,0.231535,1


**Test on validation data**

In [60]:
# Accuracy on the held-out split.
# NOTE(review): this scores `lr` (constructed with C=4), not the estimator
# selected by the randomized search (`randomizedcv.best_estimator_`) — confirm
# that this is the model intended for the final evaluation.
lr.score(X_val, y_val)

0.6052631578947368