In [17]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Process Data

In [18]:
# grab data
data_train = pd.read_csv('lyrics_train.csv', names =['lyric','artist'],  dtype=np.object_)
data_test = pd.read_csv('lyrics_test.csv', names =['lyric','artist'],  dtype=np.object_)
# join the sets with new label
data_train['label'] = 'train'
data_test['label'] = 'test'
data = pd.concat([data_train, data_test])
# ensure lyrics are in string format
data['lyric'] = data['lyric'].astype(str)
# encode artist names
le = preprocessing.LabelEncoder()
data.artist = le.fit_transform(data.artist)

print(dict(zip(le.classes_, le.transform(le.classes_))))

{'Ed Sheeran': 0, 'Kendrick Lamar': 1}


# Vectorizing By Frequency

In [19]:
cv = CountVectorizer()
x_vec = pd.DataFrame(cv.fit_transform(data['lyric']).toarray())
data = pd.merge(data, x_vec, left_index=True, right_index=True)

# Naive Bayes Classifier

In [20]:
x_train = data.loc[data['label'] == 'train'].drop(['lyric', 'artist', 'label'], axis=1)
x_test = data.loc[data['label'] == 'test'].drop(['lyric', 'artist', 'label'], axis=1)
y_train = data.loc[data['label'] == 'train']['artist']
y_test = data.loc[data['label'] == 'test']['artist']

In [21]:
clf = MultinomialNB().fit(x_train, y_train)
predictions = clf.predict(x_test)

In [22]:
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0], dtype=int64)

In [23]:
np.array(list(y_test))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1])

In [24]:
str(metrics.accuracy_score(y_test,predictions) * 100)+'%' 

'30.434782608695656%'

In [25]:
str(metrics.f1_score(y_test,predictions) * 100)+'%' 

'4.761904761904762%'