# Bag of words approach on lyrics_df

In [4]:
import pandas as pd

In [18]:
# Read the semicolon-delimited lyrics table (artist, title, lyrics per row)
# and preview its first rows.
corpus = pd.read_csv('lyrics_df.csv', delimiter=';')
corpus.head()

Unnamed: 0,artist,title,lyrics
0,Florence and the Machine,100 years,I believe in you and in our hearts we know the...
1,Florence and the Machine,addicted to love,"The lights are on, but you're not home Your mi..."
2,Florence and the Machine,all this and heaven too,And the heart is hard to translate It has a la...
3,Florence and the Machine,are you hurting the one you love?,Are you hurting the one you love? You say you'...
4,Florence and the Machine,bedroom hymns,This is as good a place to fall as any We'll b...


## 1 - Transform your corpus into a matrix (CountVectorizer)

In [47]:
from sklearn.feature_extraction.text import CountVectorizer

In [48]:
# Learn the vocabulary from all lyrics, ignoring scikit-learn's built-in
# English stop-word list. The fitted vectorizer is the cell's displayed value.
cv = CountVectorizer(stop_words='english')
cv.fit(corpus['lyrics'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [49]:
# Encode every lyric as a row of token counts over the fitted vocabulary.
lyrics_vec = cv.transform(corpus['lyrics'])

## 2 - Normalize the counts (TfidfTransformer)

In [50]:
from sklearn.feature_extraction.text import TfidfTransformer

In [51]:
# Re-weight the raw token counts with TF-IDF so that terms common to many
# songs contribute less than terms specific to a few.
tf = TfidfTransformer()
# The count matrix produced by the vectorizer is named `lyrics_vec`
# (the previous `lyrics_vecs` was undefined and raised a NameError).
lyrics_vec_tf = tf.fit_transform(lyrics_vec)

In [69]:
# Build a dense, labelled view of the TF-IDF matrix: one row per song
# (indexed by artist), one column per vocabulary term, weights rounded to
# two decimals.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; fall back to the old name on older installs.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
lyrics_df = pd.DataFrame(
    # `.toarray()` yields a plain ndarray instead of the legacy np.matrix
    # returned by `.todense()`; the resulting frame is the same.
    lyrics_vec_tf.toarray().round(2),
    columns=feature_names,
    index=corpus['artist'],
)

## 3 - Classification model

**First, add a labels column to your dataframe by factorizing the artist name**

In [73]:
# Encode each artist name (the frame's index) as an integer class label.
artist_codes, artist_names = pd.factorize(lyrics_df.index)
lyrics_df['artist_fact'] = artist_codes

In [68]:
# Preview the first rows of the TF-IDF frame, including the new label column.
lyrics_df.head()

Unnamed: 0_level_0,15,18,4th,ablaze,able,absolution,academy,accurate,acres,act,...,yes,yessir,yesterday,york,young,youngsters,youth,zero,zombie,artist_fact
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Florence and the Machine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0
Florence and the Machine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Florence and the Machine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Florence and the Machine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Florence and the Machine,0.0,0.0,0.0,0.0,0.0,0.24,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


**Define model data**

In [76]:
# Separate the TF-IDF feature columns (X) from the integer artist label (y).
X = lyrics_df.drop(columns='artist_fact')
y = lyrics_df['artist_fact']

In [78]:
# Hold out 20% of the songs for validation; the fixed seed keeps the split
# reproducible across runs.
from sklearn.model_selection import train_test_split

splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = splits
tuple(part.shape for part in splits)

((137, 2638), (35, 2638), (137,), (35,))

**Create baseline with dummy classifier**

In [79]:
from sklearn.dummy import DummyClassifier

# Baseline: always predict the most frequent artist seen in the training set.
dummy_clf = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
# Training-set accuracy of the baseline (i.e. the majority-class share).
dummy_clf.score(X_train, y_train)

0.5036496350364964

**Create logistic regression classifier**

In [94]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with inverse regularisation strength C=4.
lr = LogisticRegression(C=4).fit(X_train, y_train)
# Accuracy on the data the model was trained on.
lr.score(X_train, y_train)

1.0

**Cross validation**

In [95]:
from sklearn.model_selection import cross_val_score

# 4-fold cross-validated accuracy of the classifier on the training data.
scores = cross_val_score(
    estimator=lr,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=4,
)
scores

array([0.68571429, 0.70588235, 0.73529412, 0.70588235])

In [96]:
# Summarise the folds: mean accuracy and its spread, to three decimals.
mean_acc = scores.mean().round(3)
std_acc = scores.std().round(3)
mean_acc, std_acc

(0.708, 0.018)

**Hyperparameter Optimization**

In [97]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for the inverse regularisation strength C; a list is
# sampled uniformly by RandomizedSearchCV.
param_distributions = {
    'C': list(range(1, 5)),
}
randomizedcv = RandomizedSearchCV(
    lr,
    param_distributions=param_distributions,
    # The default n_iter=10 exceeds the 4 available candidates and makes
    # scikit-learn emit a warning; request exactly as many draws as exist.
    n_iter=len(param_distributions['C']),
    # Seed the sampler so the search is reproducible on re-run.
    random_state=42,
)
randomizedcv.fit(X_train, y_train)

# Tabulate the per-candidate cross-validation results, best score first.
random_results = pd.DataFrame(randomizedcv.cv_results_)
columns = ['mean_test_score', 'std_test_score', 'mean_fit_time', 'param_C']
random_results[columns].sort_values('mean_test_score', ascending=False)



Unnamed: 0,mean_test_score,std_test_score,mean_fit_time,param_C
3,0.708466,0.048849,0.013295,4
2,0.701058,0.056789,0.018166,3
0,0.693651,0.067102,0.017984,1
1,0.693651,0.067102,0.01551,2
