# Simple logistic regression with lyrics

In [1]:
import pandas as pd
import numpy as np
import collections
import re
import sys
import time
import matplotlib.pyplot as plt
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import wordninja
from itertools import islice

In [2]:
# Load the pickled song table from the relative path 'df_music' and wrap it in a DataFrame.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling, so this
# file must come from a trusted source.
df = pd.DataFrame(pd.read_pickle('df_music'))

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5100 entries, 0 to 5099
Data columns (total 29 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Rank              5100 non-null   int64  
 1   Song              5100 non-null   object 
 2   Artist            5100 non-null   object 
 3   Year              5100 non-null   int64  
 4   Lyrics            4913 non-null   object 
 5   Source            4913 non-null   float64
 6   Artists clean     5100 non-null   object 
 7   artist_song1      5100 non-null   object 
 8   songs_clean       5100 non-null   object 
 9   artist_song2      5100 non-null   object 
 10  danceability      5083 non-null   object 
 11  energy            5083 non-null   object 
 12  key               5083 non-null   object 
 13  loudness          5083 non-null   object 
 14  mode              5083 non-null   object 
 15  speechiness       5083 non-null   object 
 16  acousticness      5083 non-null   object 


In [4]:
df.shape

(5100, 29)

In [5]:
# Drop songs without usable lyrics: the literal " NA " placeholder and real NaN values.
# Everything is chained into one expression so that no in-place call runs on a frame
# that was just produced by boolean filtering (that pattern triggers pandas'
# SettingWithCopyWarning); the result gets a fresh 0..n-1 index.
df = (
    df[df.Lyrics != " NA "]
    .dropna(how='any', subset=['Lyrics'])
    .reset_index(drop=True)
)
print("The number of remaining songs is: ", df.shape[0])

The number of remaining songs is:  4897


In [6]:
#generate some lyrics features 

In [7]:
# Keep letters only: every run of non-letter characters (digits, punctuation and
# whitespace alike) becomes a single space, then leading blanks are trimmed.
# No separate whitespace-collapsing pass is needed, because a run of whitespace is
# itself a run of non-letters and is already reduced to one space here.
lyrics = [re.sub(r'[^A-Za-z]+', ' ', lyric).lstrip() for lyric in df.Lyrics]


In [8]:
lyrics[1]

'sugar pie honey bunch you know that i love you i cant help myself i love you and nobody elsein and out my life you come and you go leaving just your picture behind and i kissed it a thousand timeswhen you snap your finger or wink your eye i come arunning to you im tied to your apron strings and theres nothing that i can docant help myself no i cant help myselfsugar pie honey bunch im weaker than a man should be i cant help myself im a fool in love you seewanna tell you i dont love you tell you that were through and ive tried but every time i see your face i get all choked up insidewhen i call your name girl it starts the flame burning in my heart tearing it all apart no matter how i try my love i cannot hidecause sugar pie honey bunch you know that im weak for you cant help myself i love you and nobody elsesugar pie honey bunch do anything you ask me to cant help myself i want you and nobody elsesugar pie honey bunch you know that i love you i cant help myself i cant help myself '

In [9]:
# Some lyrics arrive with words fused together in one string; wordninja splits each
# lyric into words, which are then re-joined with single spaces.
lyrics_clean = [" ".join(wordninja.split(text)).strip() for text in lyrics]

In [10]:
#add to df
df['lyrics_clean']=lyrics_clean

In [11]:
#clean

In [12]:
# Tokenize every cleaned lyric, drop English stop words (compared in lower case),
# and lemmatize the words that remain.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

tokens = [
    [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word.lower() not in stop_words
    ]
    for text in lyrics_clean
]
df['tokens'] = tokens

In [13]:
#build bag-of-words token counts (input for the tf-idf step)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

def dummy(tokens):
    """Identity function: return the already-tokenized lyric unchanged."""
    return tokens

# The lyrics are already lists of tokens, so the vectorizer's preprocessing and
# tokenizing hooks are both replaced by the identity function. token_pattern=None
# states that the default regex is intentionally unused (scikit-learn otherwise
# warns that token_pattern is ignored when a tokenizer is supplied).
cv = CountVectorizer(
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
)

x = cv.fit_transform(tokens)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back only on releases that predate get_feature_names_out().
if hasattr(cv, "get_feature_names_out"):
    words = list(cv.get_feature_names_out())
else:
    words = cv.get_feature_names()
len(words)

16118

In [15]:
#apply tfidf vectorizer
# Create the tf-idf representation using the bag-of-words matrix
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transform = TfidfTransformer(norm=None)
X_tfidf = tfidf_transform.fit_transform(x)

In [16]:
#convert sparse tfidf matrix to df
import scipy.sparse
X_lyrics=pd.DataFrame.sparse.from_spmatrix(X_tfidf)

In [17]:
# Binary target from chart rank: bins are (0, 50] -> 'top 50' and (50, 100] -> 'bottom 50'
# (string category labels, not 1/0).
y = pd.cut(
    df.Rank,
    bins=[0, 50, 100],
    labels=['top 50', 'bottom 50'],
)

In [18]:
#make test_train_split
from sklearn.model_selection import train_test_split


In [19]:
from sklearn.linear_model import LogisticRegression

def simple_logistic_classify(X_train, y_train, X_test, y_test, _C=1.0, n_top=5):
    """Fit a logistic regression and print its test score and top coefficients.

    Parameters
    ----------
    X_train, X_test : feature tables. ``X_train`` must expose ``.columns``; those
        labels are used to name the coefficients.
    y_train, y_test : target labels for the two splits.
    _C : value passed as ``C`` to ``LogisticRegression``.
    n_top : number of features to report (default 5, the previous fixed value).

    Returns
    -------
    None. The test score and the ``n_top`` (feature, coefficient) pairs are printed.

    Notes
    -----
    Features are ranked by the signed value of the first coefficient row, largest
    first, so features with strongly negative coefficients are not reported.
    """
    model = LogisticRegression(C=_C, max_iter=10000).fit(X_train, y_train)
    score = model.score(X_test, y_test)

    # Pair each column label with its coefficient, then rank from largest to smallest.
    importance = dict(zip(X_train.columns, list(model.coef_[0])))
    ranked = sorted(importance.items(), key=lambda item: item[1], reverse=True)
    top_items = ranked[:n_top]

    print('Test Score', score)
    print(n_top, 'most important items', top_items)


In [20]:
#simple logistic model of music lyrics word vectors only

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_lyrics,y, test_size=0.33, random_state=42)

In [22]:
simple_logistic_classify(X_train, y_train, X_test, y_test, _C=1.0)

Test Score 0.5058750773036488
5 most important items [(12717, 0.5209945252581154), (10197, 0.4151657263701951), (6078, 0.40896256123679614), (12282, 0.4080987480728239), (828, 0.4025992997929635)]


In [23]:
#simple logistic regression only based on musical features

In [24]:
music_features=['danceability', 'key', 'loudness','energy','acousticness', 'speechiness', 'mode', 'instrumentalness', 'liveness','valence', 'tempo']
# df.info() reports audio-feature columns such as danceability, energy and key as
# object dtype. Cast to float explicitly instead of relying on fillna() to downcast
# object columns (an implicit behaviour deprecated in recent pandas), then fill the
# remaining gaps with each column's mean.
X_music=df[music_features].astype('float64')
X_music=X_music.fillna(X_music.mean())
X_music.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4897 entries, 0 to 4896
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   danceability      4897 non-null   float64
 1   key               4897 non-null   float64
 2   loudness          4897 non-null   float64
 3   energy            4897 non-null   float64
 4   acousticness      4897 non-null   float64
 5   speechiness       4897 non-null   float64
 6   mode              4897 non-null   float64
 7   instrumentalness  4897 non-null   float64
 8   liveness          4897 non-null   float64
 9   valence           4897 non-null   float64
 10  tempo             4897 non-null   float64
dtypes: float64(11)
memory usage: 421.0 KB


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_music,y, test_size=0.33, random_state=42)

In [26]:
simple_logistic_classify(X_train, y_train, X_test, y_test, _C=1.0)

Test Score 0.5343228200371057
5 most important items [('danceability', 0.5341671733585669), ('acousticness', 0.40472554369139047), ('valence', 0.08016412424695919), ('loudness', 0.023661631102792048), ('tempo', -9.691288773170815e-05)]


In [27]:
#music features and lyrics

In [28]:
#join with musical features
# X_music has string column labels while X_lyrics has integer ones. scikit-learn
# (>= 1.2) raises a TypeError for feature names of mixed types, so every label is
# converted to str after the index-aligned join.
X=X_music.join(X_lyrics)
X.columns=X.columns.astype(str)
X.shape

(4897, 16129)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)

In [30]:
simple_logistic_classify(X_train, y_train, X_test, y_test, _C=1.0)



Test Score 0.5034013605442177
5 most important items [('acousticness', 0.9936572578200047), ('valence', 0.8880772610119414), (12717, 0.541030798064886), (6078, 0.43369765880562205), (7175, 0.41593905376849094)]


In [31]:
#with grid search cv

In [32]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [33]:
# Hyper-parameter grid for the search: 50 log-spaced values of C between 1e-4 and
# 1e4, with the L2 penalty only.
C = np.logspace(-4, 4, num=50)
penalty = ['l2']
parameters = {'C': C, 'penalty': penalty}

# Base estimator handed to the grid search.
logistic = LogisticRegression(max_iter=500)

In [34]:
#define function to perform logistic cv with different sets of features (text, music, both)

In [35]:
def logistic_cv(X,y):
    """Grid-search a logistic regression on a train split and report test accuracy.

    Splits ``X``/``y`` (33% test, random_state=42), runs a 3-fold ``GridSearchCV``
    over the module-level ``logistic`` estimator and ``parameters`` grid, then fits
    a new ``LogisticRegression`` (max_iter=1000) with the best ``C`` and ``penalty``.

    Prints, in order: the best cross-validation score, the best parameters, and the
    accuracy of the refitted model on the test split. Returns None.
    """
    features_fit, features_eval, labels_fit, labels_eval = train_test_split(
        X, y, test_size=0.33, random_state=42
    )

    search = GridSearchCV(logistic, parameters, cv=3, n_jobs=-1)
    search.fit(features_fit, labels_fit)

    # Refit from scratch with the winning settings and a larger iteration budget.
    tuned_model = LogisticRegression(
        C=search.best_params_['C'],
        penalty=search.best_params_['penalty'],
        max_iter=1000,
    )
    tuned_model.fit(features_fit, labels_fit)
    predictions = tuned_model.predict(features_eval)

    for report_line in (search.best_score_, search.best_params_):
        print(report_line)
    print(accuracy_score(labels_eval, predictions))

In [36]:
#Analysis only based on music features
logistic_cv(X_music,y)

0.5222589822888214
{'C': 10000.0, 'penalty': 'l2'}
0.5380333951762524


In [37]:
#Analysis only based on lyrics word vector features
logistic_cv(X_lyrics,y)

0.5125029758370395
{'C': 0.0062505519252739694, 'penalty': 'l2'}
0.5089672232529375


In [None]:
#Analysis based on both music features and lyrics word vector features
logistic_cv(X,y)