# Modelling part 1

## 1. Import libraries and explore data

In [65]:
#import libraries
import pandas as pd
import numpy as np
import collections
import re
import sys
import time
import matplotlib.pyplot as plt
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import wordninja
from itertools import islice

In [66]:
# NOTE(review): unpickling can execute arbitrary code, so 'df_music' must come from a
# trusted source. The pd.DataFrame(...) wrapper presumably guards against the pickle
# holding something other than a DataFrame - confirm.
df = pd.DataFrame(pd.read_pickle('df_music'))

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5100 entries, 0 to 5099
Data columns (total 29 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Rank              5100 non-null   int64  
 1   Song              5100 non-null   object 
 2   Artist            5100 non-null   object 
 3   Year              5100 non-null   int64  
 4   Lyrics            4913 non-null   object 
 5   Source            4913 non-null   float64
 6   Artists clean     5100 non-null   object 
 7   artist_song1      5100 non-null   object 
 8   songs_clean       5100 non-null   object 
 9   artist_song2      5100 non-null   object 
 10  danceability      5083 non-null   object 
 11  energy            5083 non-null   object 
 12  key               5083 non-null   object 
 13  loudness          5083 non-null   object 
 14  mode              5083 non-null   object 
 15  speechiness       5083 non-null   object 
 16  acousticness      5083 non-null   object 


In [68]:
df.shape

(5100, 29)

## 2. Drop songs without lyrics or with missing lyrics

In [69]:
# Drop songs whose lyrics are the " NA " placeholder or are missing altogether,
# then renumber the rows from 0.
# The steps are chained without inplace=True: calling dropna(inplace=True) on a
# filtered slice mutates a derived frame in place, which can raise pandas'
# SettingWithCopyWarning.
df = (
    df[df.Lyrics != " NA "]
    .dropna(how='any', subset=['Lyrics'])
    .reset_index(drop=True)
)
print("The number of remaining songs is: ", df.shape[0])

The number of remaining songs is:  4897


In [70]:
#remove songs that are instrumental
# The lyrics column marks them with the bare word "instrumental", optionally
# padded with one leading and/or trailing space; only these exact values match.
instrumental_markers = ['instrumental', ' instrumental', ' instrumental ', 'instrumental ']

# .copy() turns the filtered result into an independent frame, so the columns
# added in later cells are not written to a slice (SettingWithCopyWarning).
df = df[~df['Lyrics'].isin(instrumental_markers)].copy()
print("The number of remaining songs is: ", df.shape[0])

The number of remaining songs is:  4878


## 3. Clean lyrics and tokenize

In [71]:
# clean lyrics - remove punctuation

In [72]:
# Replace every run of non-letter characters (punctuation, digits, whitespace,
# line breaks) with a single space, then drop the leading space this can leave.
# Whitespace runs are already collapsed by this substitution, so no second pass
# is needed (the former second re.sub used JavaScript regex syntax and its
# result was never used).
lyrics = [re.sub(r'[^A-Za-z]+', ' ', lyric).lstrip() for lyric in df.Lyrics]


In [73]:
# Some lyrics have words crunched together in one string: let wordninja split
# each lyric into words and glue the pieces back together with single spaces.
lyrics_clean = [" ".join(wordninja.split(lyric)).strip() for lyric in lyrics]

In [74]:
#add to df
df['lyrics_clean']=lyrics_clean

In [75]:
# Per song: tokenize the cleaned lyric, drop English stopwords (compared in
# lower case) and lemmatize whatever is left.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

tokens = [
    [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(row)
        if w_is_kept
    ]
    if False else
    [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(row)
        if word.lower() not in stop_words
    ]
    for row in lyrics_clean
    for w_is_kept in (True,)
]
df['tokens'] = tokens

## 4. Make more features

In [76]:
# more features

# number of distinct tokens per song lyric, stored as a new column
count_words = [len(set(row)) for row in df.tokens]
df["unique_words"] = count_words

# Keep only songs that still have at least one token after cleaning.
# .copy() makes the filtered frame independent, so the feature columns added in
# the following cells are not written to a slice (SettingWithCopyWarning).
df = df[df['unique_words'] != 0].copy()
df.shape[0]

4831

In [77]:
## total number of tokens per song lyric (duplicates included)
total_words = list(map(len, df.tokens))
df['total_words'] = total_words

In [78]:
## average token length per song: total characters divided by number of tokens
word_length = [sum(map(len, row)) / len(row) for row in df.tokens]
df['word_length'] = word_length

In [79]:
## rhyming words

#!pip install phyme

In [80]:
from Phyme import Phyme

ph = Phyme()

In [81]:
# helpers to find the number of words per song that are perfect rhymes
# (perfect rhyme example: DOG -> COG)
def get_rhymes(word):
    """Return the distinct perfect rhymes that Phyme reports for ``word``.

    ``ph.get_perfect_rhymes`` yields a mapping whose values are collections of
    rhyming words; those collections are flattened and de-duplicated here.
    The order of the returned list is not defined.
    """
    rhyme_groups = ph.get_perfect_rhymes(word).values()
    distinct_rhymes = {rhyme for group in rhyme_groups for rhyme in group}
    return list(distinct_rhymes)
    
def count_rhymes(text):
    """Count how often a token of a song has a perfect rhyme in the same song.

    Every distinct token of ``text`` is looked up with ``get_rhymes``; each of
    its rhymes that also occurs in ``text`` adds one to the total.

    Args:
        text: iterable of token strings belonging to one song.

    Returns:
        int: number of (token, rhyme) matches; 0 when ``text`` is empty.
    """
    # Built once so each membership test is O(1) instead of a scan of the list.
    vocabulary = set(text)
    rhyme_count = 0
    for word in vocabulary:
        try:
            all_rhymes = get_rhymes(word)  # every word that rhymes with this token
        except Exception:
            # Best effort: tokens the rhyme lookup cannot handle contribute
            # nothing. Unlike a bare ``except`` this does not swallow
            # KeyboardInterrupt / SystemExit.
            continue
        rhyme_count += sum(1 for rhyme in all_rhymes if rhyme in vocabulary)
    return rhyme_count
        

In [82]:
# one rhyme count per song, computed from that song's token list
count_rhyming_words = [count_rhymes(song_tokens) for song_tokens in df.tokens]

In [83]:
df['count_rhyming_words']=count_rhyming_words

In [84]:
# apply tf-idf weighting on top of a count vectorizer that uses ngram_range=(1, 5)

In [85]:
from sklearn.feature_extraction.text import CountVectorizer

def dummy(tokens):
    """Return ``tokens`` unchanged.

    Identity function handed to CountVectorizer below as both tokenizer and
    preprocessor.
    """
    return tokens

# The identity function is plugged in as tokenizer and preprocessor, so the
# vectorizer works directly on the token lists stored in df.tokens and counts
# every 1- to 5-gram of tokens.
cv = CountVectorizer(
    tokenizer=dummy,
    preprocessor=dummy,
    ngram_range=(1, 5),
)
tokens = df.tokens
x = cv.fit_transform(tokens)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); fall back to the old name on older installs.
feature_name_getter = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
words = feature_name_getter()
len(words)

1862379

In [86]:
# Turn the bag-of-words count matrix into its tf-idf representation.
from sklearn.feature_extraction.text import TfidfTransformer

# norm=None: the tf-idf rows are left un-normalised
tfidf_transform = TfidfTransformer(norm=None)
tfidf_transform.fit(x)
X_tfidf = tfidf_transform.transform(x)

## 5. Join music features, lyrics features and tfidf features

In [87]:
music_features=['danceability', 'key', 'loudness','energy','acousticness', 'speechiness', 'mode', 'instrumentalness', 'liveness','valence', 'tempo']
# df.info() above reports the audio-feature columns it lists as dtype object, so
# convert them to numbers explicitly instead of relying on pandas to infer a
# numeric dtype; values that cannot be parsed become NaN and are imputed below.
X_music = df[music_features].apply(pd.to_numeric, errors='coerce')
# Fill gaps with the column mean.
# NOTE(review): the mean is taken over all songs, i.e. before the train/test split.
X_music = X_music.fillna(X_music.mean())


In [88]:
#list of lyrics features
#also include year as a feature
add_features=['word_length', 'unique_words', 'total_words','count_rhyming_words', 'Year']

In [89]:
# Combine musical features, tf-idf n-gram features and hand-made lyric features
# into one sparse design matrix (column order: music, tf-idf, lyric features).
from scipy.sparse import hstack
import scipy.sparse

X_m = scipy.sparse.csr_matrix(X_music.values)
X_l = scipy.sparse.csr_matrix(df[add_features].values)

# report the shape of every part before stacking
for part in (X_m, X_l, X_tfidf):
    print(part.shape)

X = hstack((X_m, X_tfidf, X_l))
print(X.shape)


(4831, 11)
(4831, 5)
(4831, 1862379)
(4831, 1862395)


In [90]:
#save labels: rank 1-50 -> 1, rank 51-100 -> 0
#(pd.cut bins are right-inclusive by default, so rank 50 itself is labelled 1)
y=pd.cut(df.Rank,bins=[0,50,100],labels=[1,0])
df['y']=y
# NOTE(review): the result of this groupby is neither stored nor shown, since a
# notebook cell only echoes its last expression - presumably a sanity check.
df[['y','Rank']].groupby('y').mean()
len(y)

4831

## 6. Build models

### simple

In [91]:
#make test_train_split
from sklearn.model_selection import train_test_split


In [92]:
# Initiate Logistic Classifier

In [93]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

def simple_logistic_classify(X_train, y_train, X_test, y_test, _C=1.0):
    """Fit a logistic regression and print its ROC AUC on the test split.

    Args:
        X_train, y_train: features and labels the model is fitted on.
        X_test, y_test: features and labels the score is computed on.
        _C: value passed to LogisticRegression as ``C``.

    Returns:
        None; the score is only printed.
    """
    classifier = LogisticRegression(C=_C, max_iter=10000)
    classifier.fit(X_train, y_train)
    # second predict_proba column = probability of the second class in classes_
    positive_scores = classifier.predict_proba(X_test)[:, 1]
    print('ROC-AUC Score Test', roc_auc_score(y_test, positive_scores))
    

In [104]:
# initiate random forest classifier
from sklearn.ensemble import RandomForestClassifier

def simple_rf_classify(X_train, y_train, X_test, y_test):
    """Fit a shallow random forest and print its ROC AUC on the test split.

    The forest uses ``max_depth=2`` and ``random_state=0``.

    Args:
        X_train, y_train: features and labels the model is fitted on.
        X_test, y_test: features and labels the score is computed on.

    Returns:
        None; the score is only printed.
    """
    forest = RandomForestClassifier(max_depth=2, random_state=0)
    forest.fit(X_train, y_train)
    # second predict_proba column = probability of the second class in classes_
    positive_scores = forest.predict_proba(X_test)[:, 1]
    print('ROC-AUC Score Test', roc_auc_score(y_test, positive_scores))

In [96]:
#simple logistic model of music lyrics word vectors, lyrics features and musical features 

In [97]:
# Hold out 33% of the songs for testing; random_state fixes the shuffle.
# The tuning helpers further down call train_test_split with the same arguments.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)

In [98]:
import warnings
warnings.filterwarnings('ignore')

In [99]:
simple_logistic_classify(X_train, y_train, X_test, y_test, _C=1.0)

ROC-AUC Score Test 0.49145978605824425


In [105]:
simple_rf_classify(X_train, y_train, X_test, y_test)

ROC-AUC Score Test 0.5227214825578653


### Hyperparameter tuning

#### Logistic regression

In [116]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve

In [117]:
# parameter grid for the logistic regression grid search:
# 50 values of C, log-spaced from 1e-4 to 1e4, combined with the l2 penalty
C = np.logspace(-4, 4, num=50)
penalty = ['l2']

parameters = {'C': C, 'penalty': penalty}

In [134]:
def logistic_cv(X,y):
    """Grid-search a logistic regression and return the fitted search object.

    The data is split 67/33 with ``random_state=42``; only the training part is
    used. Candidates come from the module-level ``parameters`` grid and are
    scored by ROC AUC with 3-fold cross-validation.

    Returns:
        The fitted GridSearchCV instance.
    """
    split = train_test_split(X, y, test_size=0.33, random_state=42)
    X_train, y_train = split[0], split[2]
    search = GridSearchCV(
        estimator=LogisticRegression(max_iter=500),
        param_grid=parameters,
        scoring="roc_auc",
        cv=3,
        n_jobs=-1,
    )
    return search.fit(X_train, y_train)


    

In [None]:
#Logistic Classifier
lr=logistic_cv(X,y)

In [None]:
#fit tuned model
# `lr` is the fitted GridSearchCV returned by logistic_cv(X, y); the name `gsl`
# only exists inside that function and would raise a NameError here.
print("tuned hyperparameters :(best parameters) ", lr.best_params_)
# best_score_ is the mean cross-validated ROC AUC of the best candidate
print("ROC AUC :", lr.best_score_)

logisticCV = LogisticRegression(
    C=lr.best_params_['C'],
    penalty=lr.best_params_['penalty'],
    max_iter=500,
)
logisticCV.fit(X_train, y_train)

# predicted probabilities, keeping the column for the positive outcome only
lr_probs = logisticCV.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, lr_probs)

# get tpr, fpr
fpr, tpr, _ = roc_curve(y_test, lr_probs)

print("Tuned random logit  model ", roc_auc)


#### Random forest

In [None]:
#random forest model
#define parameter grid for randomized search with forest model

from sklearn.model_selection import RandomizedSearchCV

# Number of trees in the forest
n_estimators = [250, 300, 350]
# Number of features to consider at every split.
# 'auto' was removed in scikit-learn 1.3 and was identical to 'sqrt' for
# classifiers, so it is replaced by 'log2' to keep two distinct options.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (no limit)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]


# Create the random grid
random_grid = {
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
    'n_estimators': n_estimators,
}


In [None]:
def rf_rs(X,y):
    """Randomized-search a random forest and return the fitted search object.

    The data is split 67/33 with ``random_state=42``; only the training part is
    used. 100 candidates are sampled from the module-level ``random_grid`` and
    scored by ROC AUC with 3-fold cross-validation.

    Returns:
        The fitted RandomizedSearchCV instance.
    """
    split = train_test_split(X, y, test_size=0.33, random_state=42)
    X_train, y_train = split[0], split[2]
    search = RandomizedSearchCV(
        estimator=RandomForestClassifier(n_jobs=-1),
        param_distributions=random_grid,
        n_iter=100,
        cv=3,
        verbose=2,
        random_state=42,
        n_jobs=-1,
        scoring='roc_auc',
    )
    return search.fit(X_train, y_train)

In [None]:
#Random forest classifier
rf=rf_rs(X,y)

In [None]:
# fit tuned model
# `rf` is the fitted RandomizedSearchCV returned by rf_rs(X, y); the name `rs`
# only exists inside that function and would raise a NameError here.
print("tuned hyperparameters :(best parameters) ", rf.best_params_)
# best_score_ is the mean cross-validated ROC AUC of the best candidate
print("ROC AUC :", rf.best_score_)


rfCV = RandomForestClassifier(
    max_features=rf.best_params_['max_features'],
    max_depth=rf.best_params_['max_depth'],
    min_samples_split=rf.best_params_['min_samples_split'],
    min_samples_leaf=rf.best_params_['min_samples_leaf'],
    bootstrap=rf.best_params_['bootstrap'],
    n_estimators=rf.best_params_['n_estimators'],
)
rfCV.fit(X_train, y_train)

# predicted probabilities, keeping the column for the positive outcome only
probs = rfCV.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, probs)

# get tpr, fpr
fpr, tpr, _ = roc_curve(y_test, probs)

print("Tuned random forest model ", roc_auc)

In [None]:
#drawing roc curve and auc


from matplotlib import pyplot

# generate a no skill prediction (majority class)
ns_probs = [0 for _ in range(len(y_test))]

## plot the roc curve: start with no skill prediction
fig, ax = plt.subplots(figsize=(15, 10))
ns_auc = roc_auc_score(y_test, ns_probs) #no skill
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')


#draw roc curves for tuned models
def draw_roc_curve(model, label):
    """Add one model's ROC curve to the current figure and print its ROC AUC.

    Args:
        model: either a fitted estimator (or fitted search object) exposing
            ``predict_proba``, in which case probabilities and the curve are
            computed on the global ``X_test`` / ``y_test``; or a tuple/list
            laid out as ``(_, probabilities, fpr, tpr)``.
        label: legend entry and name used in the printed summary.
    """
    if isinstance(model, (tuple, list)):
        # precomputed results: probabilities, fpr and tpr at positions 1, 2, 3
        prob = list(model[1])
        fpr = model[2]
        tpr = model[3]
    else:
        # Fitted search objects are not subscriptable; score the held-out set
        # directly (the searches were fitted on the training part of the same
        # random_state=42 split that produced X_test).
        prob = model.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, prob)
    # calculate roc auc score
    auc = roc_auc_score(y_test, prob)
    print('Model=%s: ROC AUC=%.3f' % (label, auc))
    plt.plot(fpr, tpr, marker='.', label=label)

draw_roc_curve(lr, 'Logistic regression, tuned')
draw_roc_curve(rf, 'Random forest, tuned')

# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# show the legend
plt.legend()
# show the plot
plt.show()
