In [2]:
import pandas as pd

In [7]:
csv = pd.read_csv('SongCSV_filtered.csv')
csv[['song_hotttnesss', 'Lyrics']].head()

Unnamed: 0,song_hotttnesss,Lyrics
0,0.0,INSTRUMENTAL
1,0.663194,It's time that I rain on your parade Watch as ...
2,0.530026,"You who's coming up the stairs, Shouting- I’m ..."
3,0.552548,Life is like A merry go round Painted horses R...
4,0.588922,INSTRUMENTAL


In [8]:
#Everything in lowercase
csv['Lyrics'] = csv['Lyrics'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
csv['Lyrics'].head()

0                                         instrumental
1    it's time that i rain on your parade watch as ...
2    you who's coming up the stairs, shouting- i’m ...
3    life is like a merry go round painted horses r...
4                                         instrumental
Name: Lyrics, dtype: object

In [9]:
#Removing punctuation that does not add meaning to the song
csv['Lyrics'] = csv['Lyrics'].str.replace('[^\w\s]','')
csv['Lyrics'].head()

0                                         instrumental
1    its time that i rain on your parade watch as a...
2    you whos coming up the stairs shouting im comi...
3    life is like a merry go round painted horses r...
4                                         instrumental
Name: Lyrics, dtype: object

In [10]:
#Removing punctuation that does not add meaning to the song
csv['Lyrics'] = csv['Lyrics'].str.replace('[^\w\s]','')
csv['Lyrics'].head()

0                                         instrumental
1    its time that i rain on your parade watch as a...
2    you whos coming up the stairs shouting im comi...
3    life is like a merry go round painted horses r...
4                                         instrumental
Name: Lyrics, dtype: object

In [11]:
#Removing of stop words
from nltk.corpus import stopwords

stop = stopwords.words('english')
csv['Lyrics'] = csv['Lyrics'].apply(lambda x: " ".join(x for x in str(x).split() if x not in stop))
csv['Lyrics'].head()

0                                         instrumental
1    time rain parade watch hopes explode landmines...
2    whos coming stairs shouting im coming dying li...
3    life like merry go round painted horses riding...
4                                         instrumental
Name: Lyrics, dtype: object

In [18]:
#Number of RARE words. These words occur so rarely that their meaning don't really constitute anything.
freq = pd.Series(' '.join(csv['Lyrics']).split()).value_counts()[-10:]
freq

alkoi          1
caramel        1
sácate         1
learjet        1
inflation      1
apaguen        1
minneapolis    1
inte           1
hieno          1
hilfe          1
dtype: int64

In [19]:
#Removal of rare words
freq = list(freq.index)
csv['Lyrics'] = csv['Lyrics'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
csv['Lyrics'].head()

0                                         instrumental
1    time rain parade watch hopes explode landmines...
2    whos coming stairs shouting im coming dying li...
3    life like merry go round painted horses riding...
4                                         instrumental
Name: Lyrics, dtype: object

In [21]:
#Correction of Spelling mistakes
from textblob import TextBlob
csv['Lyrics'] = csv['Lyrics'].apply(lambda x: str(TextBlob(x).correct()))

In [22]:
#Lemmatization is basically converting a word into its root word. It is preferred over Stemming.
from textblob import Word
csv['Lyrics'] = csv['Lyrics'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
csv['Lyrics'].head()

0                                         instrumental
1    time rain parade watch hope explode landmines ...
2    who coming stair shouting in coming dying like...
3    life like merry go round painted horse riding ...
4                                         instrumental
Name: Lyrics, dtype: object

In [None]:
df = pd.DataFrame(csv['Lyrics'])
df.head()

In [48]:
testCSV = pd.read_csv('test.csv')

In [49]:
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=100000, lowercase=True, ngram_range=(1,1),analyzer = "word").fit(testCSV['Lyrics'].values.astype(str))

In [50]:
len(bow.vocabulary_)

11760

In [51]:
lyrics_bow = bow.transform(testCSV['Lyrics'].values.astype(str))

In [52]:
print('Shape of Sparse Matrix: ', lyrics_bow.shape)

Shape of Sparse Matrix:  (2580, 11760)


In [53]:
lyrics_bow.nnz

86667

In [54]:
from sklearn.feature_extraction.text import TfidfTransformer

In [57]:
tfidf_transformer = TfidfTransformer().fit(lyrics_bow)

In [58]:
lyrics_tfidf = tfidf_transformer.transform(lyrics_bow)

In [59]:
from sklearn.naive_bayes import MultinomialNB

In [60]:
model = MultinomialNB().fit(lyrics_tfidf, testCSV['hitOrNot'])

In [61]:
from sklearn.cross_validation import train_test_split

In [62]:
X_train, X_test, Y_train, Y_test = train_test_split(lyrics_tfidf, testCSV['hitOrNot'], test_size=0.5)

In [63]:
model.fit(X_train, Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [64]:
predictions = model.predict(X_test)

In [65]:
from sklearn.metrics import classification_report

In [66]:
print(classification_report(Y_test, predictions))

             precision    recall  f1-score   support

          0       0.60      1.00      0.75      1523
          1       0.00      0.00      0.00      1032

avg / total       0.36      0.60      0.45      2555



  'precision', 'predicted', average, warn_for)


In [45]:
from sklearn.linear_model import LogisticRegression

In [46]:
modelLog = LogisticRegression()

In [47]:
modelLog.fit(X_train, Y_train)
predictions = model.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(Y_test, predictions))

             precision    recall  f1-score   support

          0       0.60      1.00      0.75      1521
          1       0.00      0.00      0.00      1034

avg / total       0.35      0.60      0.44      2555



  'precision', 'predicted', average, warn_for)
