# Bag of words approach on lyrics_df

In [1]:
import pandas as pd

In [31]:
corpus = pd.read_csv('lyrics_df.csv', sep=';')
corpus

Unnamed: 0,artist,title,lyrics
0,Florence and the Machine,100 years,I believe in you and in our hearts we know the...
1,Florence and the Machine,addicted to love,The lights are on but you re not home Your mi...
2,Florence and the Machine,all this and heaven too,And the heart is hard to translate It has a la...
3,Florence and the Machine,are you hurting the one you love,Are you hurting the one you love You say you v...
4,Florence and the Machine,bedroom hymns,This is as good a place to fall as any We ll b...
...,...,...,...
561,ABBA,winner takes it all,I don t wanna talk about things we ve gone thr...
562,ABBA,wrap your arms around me,When you re so far away At the end of each day...
563,ABBA,you owe me one,Frida Now there s a shadow falling over our f...
564,ABBA,youre there,I tried not to see you Cause I couldn t take t...


## 1 - clean lyrics (Spacy)

In [30]:
import spacy
model = spacy.load('en_core_web_sm')

In [33]:
# Create function that cleans lyrics
def clean_lyrics(song):
    clean_song = []
    
    # parse the song through the spacy model
    tok_song = model(song)
    
    # loop through words and drop stopwords, drop punctuation
    for word in tokenised_song:
        if not word.is_stop and not word.is_punct:   # loop through words and drop stopwords, drop punctuation
            clean_song.append(word.lemma_)           # lemmatize the remaining words
    clean_song = ' '.join(clean_song)                # return the lemmatized version to the caller as a string
    return clean_song

In [34]:
corpus['lyrics_clean'] = clean_lyrics(corpus['lyrics'])

TypeError: Argument 'string' has incorrect type (expected str, got Series)

## 1 - transform your corpus into a matrix (CountVectorizer)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
cv = CountVectorizer(stop_words='english')
cv.fit(corpus['lyrics'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [6]:
lyrics_vec = cv.transform(corpus['lyrics'])

## 2 - Normalize the counts (TfidfTransformer)

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer

In [9]:
tf = TfidfTransformer()
lyrics_vec_tf = tf.fit_transform(lyrics_vec)

In [10]:
lyrics_df = pd.DataFrame(lyrics_vec_tf.todense().round(2),  columns=cv.get_feature_names(), index=corpus['artist'])

## 3 - Classification model

**First, add a labels column to your dataframe by factorizing the artist name**

In [11]:
# factorize label column
lyrics_df['artist_fact'] = pd.factorize(lyrics_df.index)[0]

In [13]:
lyrics_df.head() # without spacy 6031 columns

Unnamed: 0_level_0,15,18,25,2x,39,4th,60,aa,aaa,aaaaaaaarghh,...,youth,yuh,yup,zarkov,zero,zip,zombie,zoo,zoomin,artist_fact
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Florence and the Machine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Florence and the Machine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Florence and the Machine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Florence and the Machine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Florence and the Machine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


**Define model data**

In [14]:
# Split the data into X and y
y = lyrics_df['artist_fact']
X = lyrics_df.drop('artist_fact', axis=1)

In [15]:
# Split it in training and test data
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((452, 6030), (114, 6030), (452,), (114,))

**Create baseline with dummy classifier**

In [16]:
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(X_train, y_train)
dummy_clf.score(X_train, y_train)

0.42035398230088494

**Create logisitc regression classifier**

In [24]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=4)
lr.fit(X_train, y_train)
lr.score(X_train, y_train) 

0.9933628318584071

**Cross validation**

In [25]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(lr, X_train, y_train, cv=4, scoring='accuracy')
scores

array([0.63716814, 0.51327434, 0.5840708 , 0.5840708 ])

In [26]:
scores.mean().round(3), scores.std().round(3)

(0.58, 0.044)

**Hyperparameter Optimization**

In [27]:
from sklearn.model_selection import RandomizedSearchCV
param_distributions = {
    'C' : list(range(1, 5, 1)), # it will uniformly sample from that list
}
randomizedcv = RandomizedSearchCV(lr, param_distributions=param_distributions)
randomizedcv.fit(X_train, y_train)
random_results = pd.DataFrame(randomizedcv.cv_results_)
columns = ['mean_test_score', 'std_test_score', 'mean_fit_time', 'param_C']
random_results[columns].sort_values('mean_test_score', ascending=False)



Unnamed: 0,mean_test_score,std_test_score,mean_fit_time,param_C
2,0.577436,0.045721,0.298601,3
3,0.577436,0.046788,0.310343,4
1,0.575189,0.038805,0.261145,2
0,0.559707,0.02855,0.207176,1


**Test on validation data**

In [28]:
lr.score(X_val, y_val) 

0.6052631578947368