In [1]:
import pandas as pd
import nltk

In [2]:
# word tokenize will turn each word in a text string into a token
from nltk.tokenize import word_tokenize 

In [3]:
# punkt is a language aware model that can handle punctuation in text
nltk.download('punkt') 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mccal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# i don't think there is any punctuation in the data that we are using to train this model but best to be safe

In [5]:
# there are definitely stop words in the data we will use to build the model (words that do not have sentiment)
from nltk.corpus import stopwords
nltk.download('stopwords') 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mccal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# this weighs the importance of a word based on freqeuency (feature extraction tool)
from sklearn.feature_extraction.text import TfidfVectorizer 

In [7]:
# we all know what this is
from sklearn.model_selection import train_test_split 

In [8]:
 # this is our model of choice
from sklearn.linear_model import LogisticRegression

In [9]:
# we will use this to see how accurate the model is
from sklearn.metrics import accuracy_score, classification_report 

In [10]:
# reading in labeled text data to make the model
emotions=pd.read_csv('emotions.csv') 

In [11]:
# basic data information
emotions.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416809 entries, 0 to 416808
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    416809 non-null  object
 1   label   416809 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 6.4+ MB


In [12]:
# here's what the data looks like
emotions.head() 

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4


In [13]:
# sadness (0), joy (1), love (2), anger (3), fear (4), and surprise (5) emotions and their label

In [14]:
# we have some really unbalanced data here, this is what we adress first if we want to improve the model
emotions.label.value_counts() 

1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: label, dtype: int64

In [15]:
# i am writing a function that tokenizes text that we can apply to the text column
def tokenize(text):
    return nltk.word_tokenize(text.lower()) 

In [16]:
# applying the tokenizer function to create a tokens column
emotions['tokens']=emotions['text'].apply(tokenize) 

In [17]:
# this is going to be a set containing all of the english stop words
stop_words = set(stopwords.words('english')) 

In [18]:
# we are creating a column of tokens that do not contain any words we do not want to feed the model
emotions['filtered_tokens'] = emotions['tokens'].apply(lambda x: [word for word in x if word not in stop_words])

In [19]:
 # here's what our data looks like now with the new features
emotions.head()

Unnamed: 0,text,label,tokens,filtered_tokens
0,i just feel really helpless and heavy hearted,4,"[i, just, feel, really, helpless, and, heavy, ...","[feel, really, helpless, heavy, hearted]"
1,ive enjoyed being able to slouch about relax a...,0,"[ive, enjoyed, being, able, to, slouch, about,...","[ive, enjoyed, able, slouch, relax, unwind, fr..."
2,i gave up my internship with the dmrg and am f...,4,"[i, gave, up, my, internship, with, the, dmrg,...","[gave, internship, dmrg, feeling, distraught]"
3,i dont know i feel so lost,0,"[i, dont, know, i, feel, so, lost]","[dont, know, feel, lost]"
4,i am a kindergarten teacher and i am thoroughl...,4,"[i, am, a, kindergarten, teacher, and, i, am, ...","[kindergarten, teacher, thoroughly, weary, job..."


In [20]:
# what's happening here is we are creating a column combining the filtered tokens back together into a text string
emotions['text_combined'] = emotions['filtered_tokens'].apply(lambda x: ' '.join(x))

In [21]:
# i mentioned earlier that this is going to make the text readable for the logistic regression  model
# this is going to create vectors from our text that we can use for training
# the vectors are made with term frequency, relative to amount of documents, which in turn tells us something about their importance
vectorizer = TfidfVectorizer()

In [22]:
# vectorize the text
X = vectorizer.fit_transform(emotions['text_combined']) 

In [23]:
y=emotions['label']

In [24]:
# splitting the data into training data and testing data
# 42 was chosen arbitrarily
# random state accepts any number in range [-2,147,483,648 , 2,147,483,647]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
# Default is 'l2'
# i chose 200 for max_iter because it was large enough to reach convergence
# if you do not set the max_iter to 200, the default, 100 is not enough and you will get an error
model = LogisticRegression(penalty='l2', max_iter=200) 

In [26]:
# fit the model
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(max_iter=200)

In [27]:
# basic prediction function call
y_pred = model.predict(X_test)

In [28]:
accuracy = accuracy_score(y_test, y_pred)

In [29]:
# did pretty well
accuracy 

0.8939684748446535

In [30]:
# we aren't after the models prediction for the labels
# we are after the probability scores for each label because we want nuanced emotion vectors
# lets do a demo

In [31]:
# here is a string that i am going to write that will contain some things i want to say about my emotional state
hank_feeling = 'i had a stressfull day but i was able to get a lot done which made me proud'

In [32]:
# in order to pull an emotion vector from this string we need to apply the same principles we used to train the model
# we need to tokenize the text
# we need to filter the tokenized text for stopwords
# we need to re join that filtered token into plain text
# we need to extract features using tfidf vectorization
# we need to tell the model to predict probabilites NOT the label
# we're gonna investigate what the probabilities look like

In [33]:
#  recall this key; sadness (0), joy (1), love (2), anger (3), fear (4), and surprise (5) emotions and their label

In [34]:
# now let's write a function that does that whole process for us

In [35]:
def emotion_score(user_input):
    # tokenize the input
    input_tokens = nltk.word_tokenize(user_input.lower()) 
    # filter stopwords out of the input
    filtered_input_tokens = [word for word in input_tokens if word not in stop_words] 
    # re-join the filtered tokens
    input_combined = ' '.join(filtered_input_tokens) 
    # feature extraction
    input_vector = vectorizer.transform([input_combined]) 
    # pull the probabilities from the input
    probabilities = model.predict_proba(input_vector)
    # return the probabilities
    return probabilities.tolist()[0] 

In [36]:
emotion_score(hank_feeling) # test drive

[0.014172429431486432,
 0.9566514104423582,
 0.008914546684369156,
 0.008687934455954609,
 0.0048888348324429304,
 0.00668484415338865]

In [37]:
# i wanna run this function on an entire data frame of song lyrics
music=pd.read_csv('songs_with_lyrics_Cleaned.csv') 

In [38]:
 # here's what the data frame with music data and song lyrics
music.head()

Unnamed: 0.1,Unnamed: 0,artist,song,link,text
0,0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"look at her face, it's a wonderful face and it..."
1,1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"take it easy with me, please touch me gently l..."
2,2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,i'll never know why i had to go why i had to p...
3,3,ABBA,Bang,/a/abba/bang_20598415.html,making somebody happy is a question of give an...
4,4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,making somebody happy is a question of give an...


In [39]:
# demo for how the song handles user input
emotion_score(' i had a pretty rough day, can you recommend me a sad song?')

[0.7344534623529986,
 0.13323336314786657,
 0.02990271648282762,
 0.04125990395442898,
 0.03311831665915752,
 0.02803223740272064]

In [40]:
# write a function that vectorizes the song lyric data with an emotion score dictionary in order to create a table for the lyric scores

In [41]:
def emotion_dictionary(user_input):
    # make tokens
    input_tokens = nltk.word_tokenize(user_input.lower()) 
    # filter the tokens
    filtered_input_tokens = [word for word in input_tokens if word not in stop_words] 
    # recombine filtered tokens
    input_combined = ' '.join(filtered_input_tokens) 
    # tfidf feature extraction
    input_vector = vectorizer.transform([input_combined]) 
    # call the probabilities
    probabilities = model.predict_proba(input_vector) 
    # create the vector
    emotion_vector = probabilities.tolist()[0] 
    # use the vector to create a dictionary with the scores and a key that references the scores
    return {'sadness': emotion_vector[0], 
    'joy': emotion_vector[1],
    'love': emotion_vector[2],
    'anger': emotion_vector[3],
    'fear': emotion_vector[4],
    'surprise': emotion_vector[5] }

In [42]:
# time to make the score table

In [43]:
# import tqdm for progress bar
from tqdm import tqdm 

In [44]:
# call this to get the progress bar
tqdm.pandas() 

In [45]:
# get a series containing the emotion scores for every song
scores=music['text'].progress_apply(emotion_dictionary).apply(pd.Series) 

100%|███████████████████████████████████████████████████████████████████████████| 44795/44795 [05:33<00:00, 134.13it/s]


In [46]:
# here is the scores data series that we will concat to our original data frame
scores 

Unnamed: 0,sadness,joy,love,anger,fear,surprise
0,0.035792,0.868977,0.019480,0.032565,0.023263,0.019923
1,0.064300,0.637787,0.152012,0.053045,0.060365,0.032491
2,0.537424,0.316863,0.072477,0.022086,0.027428,0.023722
3,0.134805,0.365129,0.266093,0.142332,0.064678,0.026963
4,0.967412,0.005209,0.016959,0.006574,0.002430,0.001416
...,...,...,...,...,...,...
44790,0.150626,0.508841,0.054495,0.141112,0.109686,0.035241
44791,0.438188,0.191874,0.030315,0.196898,0.095205,0.047519
44792,0.169854,0.417889,0.089717,0.130409,0.152856,0.039274
44793,0.291949,0.297526,0.074459,0.200848,0.102419,0.032798


In [47]:
# concat the scores to the music to get a df with the scores
scored_music=pd.concat([music,scores],axis=1) 

In [48]:
# useless column
scored_music.drop(columns=['Unnamed: 0'],inplace=True) 

In [49]:
# we also want a column with just the vectors
vector=music['text'].progress_apply(emotion_score) 

100%|███████████████████████████████████████████████████████████████████████████| 44795/44795 [05:47<00:00, 128.81it/s]


In [50]:
# the vector for each song in the data set
vector 

0        [0.03579174590286366, 0.8689770047957636, 0.01...
1        [0.06429972188393214, 0.6377869365833222, 0.15...
2        [0.5374242825027009, 0.31686309768917464, 0.07...
3        [0.13480472056337753, 0.36512886018708607, 0.2...
4        [0.9674123512465452, 0.00520893792719057, 0.01...
                               ...                        
44790    [0.15062559050424443, 0.5088407635755823, 0.05...
44791    [0.438188434704547, 0.1918744712386625, 0.0303...
44792    [0.16985368845236432, 0.41788928089604205, 0.0...
44793    [0.2919489066741162, 0.2975261464479545, 0.074...
44794    [0.3067180083139707, 0.26479157366799827, 0.08...
Name: text, Length: 44795, dtype: object

In [51]:
# create a column to store the vectors
scored_music['vector']=vector 

In [52]:
# our final dataframe
scored_music.sample(5) 

Unnamed: 0,artist,song,link,text,sadness,joy,love,anger,fear,surprise,vector
44061,Wishbone Ash,Can't Fight Love,/w/wishbone+ash/cant+fight+love_20147313.html,i don't care about no curfew tonight - i just ...,0.135844,0.419008,0.097972,0.125352,0.183206,0.038618,"[0.1358438472028008, 0.41900769291534823, 0.09..."
26683,Face To Face,10-9-8,/f/face+to+face/10+9+8_20149571.html,i'll be your loaded dice you're holding all th...,0.194766,0.147385,0.098991,0.103784,0.2142,0.240875,"[0.19476599608244413, 0.14738528633766504, 0.0..."
23387,Christmas Songs,Born On Christmas Day,/c/christmas+songs/born+on+christmas+day_20767...,it was a cold and dark december night but a st...,0.084143,0.616557,0.102805,0.074441,0.083208,0.038846,"[0.08414342313057617, 0.6165571934188573, 0.10..."
12977,Oasis,Keep The Dream Alive,/o/oasis/keep+the+dream+alive_10196341.html,"four seasons, seconds, flicker, and flash, i'm...",0.474859,0.075932,0.09616,0.081997,0.196037,0.075016,"[0.47485912131492175, 0.07593164065044197, 0.0..."
42525,Underworld,Show Some Emotion,/u/underworld/show+some+emotion_20142185.html,wait you've gone too far who for god's sake wh...,0.099258,0.508163,0.039031,0.281315,0.049071,0.023162,"[0.09925820948350259, 0.5081632207990925, 0.03..."


In [53]:
# import the similarity algorithm for vectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [54]:
sample_user_input='yesterday i watched the lakers game and i lost money on it which was rough but i had fun watching the game anyways which was good'

In [55]:
# write a function that can compute the similarity between two vectors
def song_similarity(u_vec,s_vec):
    # convert the song vector into a numpy array
    song_array = np.array(s_vec) 
    # convert the user vector into a numpy array
    u_array = np.array(u_vec) 
    # reshape the song array so it fits into the cosine similarity function
    shaped_s_array = np.array(song_array).reshape(1,-1) 
    # reshape the user array so it fits into the cosine similarity function
    shaped_u_array = np.array(u_array).reshape(1,-1) 
    # compute the similarity
    similar = cosine_similarity(shaped_s_array,shaped_u_array) 
    # the similarity is inside a list inside a list so we need to index it twice
    return similar[0][0] 


In [56]:
# let's write a function that does the similarity search

def cosine_similarity_search(user_input):
    # the function takes in user input then turns it into a vector
    user_vector = emotion_score(user_input) 
    # it initializes a list called comparison
    comparison = [] 
    # it opens a loop that iterates through the length of the scored music data
    for i in range(len(scored_music)):  
        # the loop initializes a list that hold the similarity to the user vector song and artist
        user_vs_song=[] 
        
        # then the cosine similarity between the user vector and song vector is computed and stored as a variable 
        cos_sim = song_similarity(user_vector,scored_music['vector'][i]) 
        # the song name is stored to a variable
        song_name = scored_music['song'][i] 
        # the artist name is stored to a variable
        artist_name = scored_music['artist'][i] 
        
        # these three variables are appended to the user_vs_song list
        user_vs_song.append(cos_sim)
        user_vs_song.append(song_name)
        user_vs_song.append(artist_name)
        
        # the user_vs_song list is appended to the comparison list to later be sorted
        comparison.append(user_vs_song)

    # once the loop is done executing we convert our comparison list into a data frame so it can be sorted easiliy
    comp_df=pd.DataFrame(comparison,columns=['similarity','track','artist'])
    
    # return the sorted df
    return comp_df.sort_values(by='similarity',ascending=False)#.head(50).sample(5)



In [57]:
cosine_similarity_search(sample_user_input)

Unnamed: 0,similarity,track,artist
27522,0.999721,Entangled,Genesis
41486,0.999568,Think About The Times,Ten Years After
5302,0.999169,What Do I Get?,Everclear
2943,0.999082,My Woman,Chuck Berry
31264,0.999048,Drums,Johnny Cash
...,...,...,...
30424,0.074414,"Love Me, Lovely",Jackson Browne
23277,0.074409,One Sweet Tender Touch,Chris Rea
20747,0.074404,Tender Is The Night,Andy Williams
15078,0.074396,Hot In Here,Rascal Flatts


In [58]:
from sklearn.metrics.pairwise import euclidean_distances

In [59]:
def song_distance(u_vec,s_vec):
    # convert the song vector into a numpy array
    song_array = np.array(s_vec) 
    # convert the user vector into a numpy array
    u_array = np.array(u_vec) 
    # reshape the song array so it fits into the distance function
    shaped_s_array = np.array(song_array).reshape(1,-1) 
    # reshape the user array so it fits into the distance function
    shaped_u_array = np.array(u_array).reshape(1,-1) 
    # compute the distance
    similar =  euclidean_distances(shaped_s_array,shaped_u_array) 
    # the distance is inside a list inside a list so we need to index it twice
    return similar[0][0] 

In [60]:
def euclidean_distance_search(user_input):
    # vectorize the user input
    user_vector = emotion_score(user_input) 
    # initilize list that can compare the distances
    comparison = [] 
    
    for i in range(len(scored_music)):
        # initilize list that stores the distance song and artist
        user_vs_song = [] 
        # compute the distance
        euclid_distance = song_distance(user_vector,scored_music['vector'][i]) 
        # save song name to variable
        song_name = scored_music['song'][i] 
        # save artist name to a variable
        artist_name = scored_music['artist'][i] 
        # append distance
        user_vs_song.append(euclid_distance) 
        # append song
        user_vs_song.append(song_name) 
        # append artist
        user_vs_song.append(artist_name) 
        # throw all that info into the compairson list
        comparison.append(user_vs_song) 
    # make the comparison list into a data frame
    comp_df = pd.DataFrame(comparison, columns=['distance','track','artist']) 
    # return the df sorted by the distance in ascending order
    return comp_df.sort_values(by='distance') 



In [61]:
# use pickle to pickle the variables instead of joblib

In [62]:
import pickle

In [63]:
with open('emotion_model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [64]:
with open('vectorizer.pkl','wb') as file:
    pickle.dump(vectorizer,file)

In [65]:
with open('scored_music.pkl','wb') as file:
    pickle.dump(scored_music,file)