In [1]:
import pandas as pd
import nltk

In [2]:
from nltk.tokenize import word_tokenize # word tokenize will turn each word in a text string into a token

In [3]:
nltk.download('punkt') # punkt is a language aware model that can handle punctuation in text

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mccal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# i don't think there is any punctuation in the data that we are using to train this model but best to be safe

In [5]:
from nltk.corpus import stopwords
nltk.download('stopwords') # there are definitely stop words in the data we will use to build the model (words that do not have sentiment)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mccal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer # this weighs the importance of a word based on freqeuency (feature extraction tool)

In [7]:
from sklearn.model_selection import train_test_split # we all know what this is

In [8]:
from sklearn.linear_model import LogisticRegression # this is our model of choice

In [9]:
from sklearn.metrics import accuracy_score, classification_report # we will use this to see how accurate the model is

In [10]:
emotions=pd.read_csv('emotions.csv') # reading in labeled text data to make the model

In [11]:
emotions.info() # basic data information

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416809 entries, 0 to 416808
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    416809 non-null  object
 1   label   416809 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 6.4+ MB


In [12]:
emotions.head() # here's what the data looks like

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4


In [13]:
# sadness (0), joy (1), love (2), anger (3), fear (4), and surprise (5) emotions and their label

In [14]:
emotions.label.value_counts() # we have some really unbalanced data here, this is what we adress first if we want to improve the model

label
1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: count, dtype: int64

In [15]:
def tokenize(text):
    return nltk.word_tokenize(text.lower()) # i am writing a function that tokenizes text that we can apply to the text column

In [16]:
emotions['tokens']=emotions['text'].apply(tokenize) # applying the tokenizer function to create a tokens column

In [17]:
stop_words = set(stopwords.words('english')) # this is going to be a set containing all of the english stop words

In [18]:
emotions['filtered_tokens'] = emotions['tokens'].apply(lambda x: [word for word in x if word not in stop_words])
# we are creating a column of tokens that do not contain any words we do not want to feed the model

In [19]:
emotions.head() # here's what our data looks like now with the new features

Unnamed: 0,text,label,tokens,filtered_tokens
0,i just feel really helpless and heavy hearted,4,"[i, just, feel, really, helpless, and, heavy, ...","[feel, really, helpless, heavy, hearted]"
1,ive enjoyed being able to slouch about relax a...,0,"[ive, enjoyed, being, able, to, slouch, about,...","[ive, enjoyed, able, slouch, relax, unwind, fr..."
2,i gave up my internship with the dmrg and am f...,4,"[i, gave, up, my, internship, with, the, dmrg,...","[gave, internship, dmrg, feeling, distraught]"
3,i dont know i feel so lost,0,"[i, dont, know, i, feel, so, lost]","[dont, know, feel, lost]"
4,i am a kindergarten teacher and i am thoroughl...,4,"[i, am, a, kindergarten, teacher, and, i, am, ...","[kindergarten, teacher, thoroughly, weary, job..."


In [20]:
emotions['text_combined'] = emotions['filtered_tokens'].apply(lambda x: ' '.join(x))

# what's happening here is we are creating a column combining the filtered tokens back together into a text string

In [21]:
vectorizer = TfidfVectorizer()
# i mentioned earlier that this is going to make the text readable for the logistic regression  model
# this is going to create vectors from our text that we can use for training
# the vectors are made with term frequency, relative to amount of documents, which in turn tells us something about their importance

In [22]:
X = vectorizer.fit_transform(emotions['text_combined']) # vectorize the text

In [23]:
y=emotions['label']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# splitting the data into training data and testing data
# 42 was chosen arbitrarily
# random state accepts any number in range [-2,147,483,648 , 2,147,483,647]

In [25]:
model = LogisticRegression(penalty='l2', max_iter=200)  # Default is 'l2'
# i chose 200 for max_iter because it was large enough to reach convergence
# if you do not set the max_iter to 200, the default, 100 is not enough and you will get an error

In [26]:
model.fit(X_train, y_train) # fit the model

In [27]:
y_pred = model.predict(X_test) # basic prediction function call

In [28]:
accuracy = accuracy_score(y_test, y_pred)

In [29]:
accuracy # did pretty well

0.8939924665914926

In [30]:
# we aren't after the models prediction for the labels
# we are after the probability scores for each label because we want nuanced emotion vectors
# lets do a demo

In [31]:
# here is a string that i am going to write that will contain some things i want to say about my emotional state
hank_feeling = 'i had a stressfull day but i was able to get a lot done which made me proud'

In [32]:
# in order to pull an emotion vector from this string we need to apply the same principles we used to train the model
# we need to tokenize the text
# we need to filter the tokenized text for stopwords
# we need to re join that filtered token into plain text
# we need to extract features using tfidf vectorization
# we need to tell the model to predict probabilites NOT the label
# we're gonna investigate what the probabilities look like

In [33]:
#  recall this key; sadness (0), joy (1), love (2), anger (3), fear (4), and surprise (5) emotions and their label

In [34]:
# now let's write a function that does that whole process for us

In [35]:
def emotion_score(user_input):
    
    input_tokens = nltk.word_tokenize(user_input.lower()) # tokenize the input
    
    filtered_input_tokens = [word for word in input_tokens if word not in stop_words] # filter stopwords out of the input
    
    input_combined = ' '.join(filtered_input_tokens) # re-join the filtered tokens
    
    input_vector = vectorizer.transform([input_combined]) # feature extraction
    
    probabilities = model.predict_proba(input_vector) # pull the probabilities from the input
    
    return probabilities.tolist()[0] # return the probabilities

In [36]:
emotion_score(hank_feeling) # test drive

[0.015336456086055822,
 0.9519273437595572,
 0.010170432898996905,
 0.009528920934266534,
 0.005507372710613149,
 0.007529473610510535]

In [37]:
music=pd.read_csv('songs_with_lyrics_Cleaned.csv') # i wanna run this function on an entire data frame of song lyrics

In [38]:
music.head() # here's what the data frame with music data and song lyrics

Unnamed: 0.1,Unnamed: 0,artist,song,link,text
0,0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"look at her face, it's a wonderful face and it..."
1,1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"take it easy with me, please touch me gently l..."
2,2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,i'll never know why i had to go why i had to p...
3,3,ABBA,Bang,/a/abba/bang_20598415.html,making somebody happy is a question of give an...
4,4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,making somebody happy is a question of give an...


In [39]:
# demo for how the song handles user input
emotion_score(' i had a pretty rough day, can you recommend me a sad song?')

[0.734118761988165,
 0.132520592501066,
 0.029530023819752265,
 0.040496651943053505,
 0.034728021670850914,
 0.028605948077112395]

In [40]:
# write a function that vectorizes the song lyric data with an emotion score dictionary in order to create a table for the lyric scores

In [41]:
def emotion_dictionary(user_input):
    input_tokens = nltk.word_tokenize(user_input.lower()) # make tokens
    
    filtered_input_tokens = [word for word in input_tokens if word not in stop_words] # filter the tokens
    
    input_combined = ' '.join(filtered_input_tokens) # recombine filtered tokens
    
    input_vector = vectorizer.transform([input_combined]) # tfidf feature extraction
    
    probabilities = model.predict_proba(input_vector) # call the probabilities
    
    emotion_vector = probabilities.tolist()[0] # create the vector
    
    return {'sadness': emotion_vector[0], # use the vector to create a dictionary with the scores and a key that references the scores
    'joy': emotion_vector[1],
    'love': emotion_vector[2],
    'anger': emotion_vector[3],
    'fear': emotion_vector[4],
    'surprise': emotion_vector[5] }

In [42]:
# time to make the score table

In [43]:
from tqdm import tqdm # import tqdm for progress bar

In [44]:
tqdm.pandas() # call this to get the progress bar

In [45]:
 scores=music['text'].progress_apply(emotion_dictionary).apply(pd.Series) # get a series containing the emotion scores for every song

100%|███████████████████████████████████████████████████████████████████████████| 44795/44795 [05:38<00:00, 132.47it/s]


In [46]:
scores # here is the scores data series that we will concat to our original data frame

Unnamed: 0,sadness,joy,love,anger,fear,surprise
0,0.037772,0.865948,0.020204,0.032962,0.023208,0.019906
1,0.066016,0.637793,0.148777,0.056284,0.058281,0.032849
2,0.557590,0.304692,0.064213,0.023887,0.027135,0.022482
3,0.136736,0.362437,0.262408,0.145773,0.065213,0.027435
4,0.967274,0.005258,0.016713,0.006844,0.002461,0.001449
...,...,...,...,...,...,...
44790,0.152381,0.504729,0.055719,0.141408,0.109148,0.036615
44791,0.426336,0.203087,0.030651,0.199676,0.091436,0.048814
44792,0.164597,0.422916,0.090274,0.126768,0.155709,0.039737
44793,0.297446,0.294825,0.074738,0.199254,0.100945,0.032792


In [47]:
scored_music=pd.concat([music,scores],axis=1) # concat the scores to the music to get a df with the scores

In [48]:
scored_music.drop(columns=['Unnamed: 0'],inplace=True) # useless column

In [49]:
vector=music['text'].progress_apply(emotion_score) # we also want a column with just the vectors

100%|███████████████████████████████████████████████████████████████████████████| 44795/44795 [05:25<00:00, 137.73it/s]


In [50]:
vector # the vector for each song in the data set

0        [0.037771861392196945, 0.865948419112691, 0.02...
1        [0.0660160140254407, 0.6377930766113133, 0.148...
2        [0.5575900549063024, 0.3046922826265263, 0.064...
3        [0.13673553505589905, 0.3624368879213182, 0.26...
4        [0.9672743610382378, 0.0052580633129720306, 0....
                               ...                        
44790    [0.15238124763496208, 0.5047291126308601, 0.05...
44791    [0.4263364301563252, 0.2030869043838914, 0.030...
44792    [0.1645965162351848, 0.4229155166796501, 0.090...
44793    [0.2974464252724677, 0.2948254435155049, 0.074...
44794    [0.30711703875536295, 0.26534945267114374, 0.0...
Name: text, Length: 44795, dtype: object

In [51]:
scored_music['vector']=vector # create a column to store the vectors

In [52]:
scored_music.sample(5) # our final dataframe

Unnamed: 0,artist,song,link,text,sadness,joy,love,anger,fear,surprise,vector
38676,Primus,Too Many Puppies,/p/primus/too+many+puppies_20110905.html,too many puppies are being shot in the dark. t...,0.218101,0.278747,0.086362,0.173436,0.192426,0.050927,"[0.21810087025301583, 0.2787474023400775, 0.08..."
29401,HIM,Kiss The Void,/h/him/kiss+the+void_21057858.html,i'm losin' power and i don't know why? not rea...,0.23657,0.321162,0.068128,0.162477,0.157733,0.05393,"[0.23657001947113443, 0.32116217340875614, 0.0..."
41807,Tom Lehrer,"Fight Fiercely, Harvard!",/t/tom+lehrer/fight+fiercely+harvard_20138406....,now we come to that peculiar bit of americana ...,0.135705,0.213927,0.094511,0.348385,0.165611,0.04186,"[0.13570486582027735, 0.21392683266646487, 0.0..."
14941,Ramones,Go Home Ann,/r/ramones/go+home+ann_20604294.html,go home ann go home ann go home ann ann go hom...,0.187602,0.197193,0.164471,0.165206,0.121952,0.163575,"[0.1876022639855287, 0.1971932164138149, 0.164..."
29607,Howard Jones,Roll Right Up,/h/howard+jones/roll+right+up_20066233.html,will the moon keep shining on can you stop the...,0.174877,0.533135,0.110154,0.057986,0.084589,0.039259,"[0.17487678408736057, 0.5331352128682257, 0.11..."


In [53]:
# import the similarity algorithm for vectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [54]:
sample_user_input='yesterday i watched the lakers game and i lost money on it which was rough but i had fun watching the game anyways which was good'

In [55]:
# write a function that can compute the similarity between two vectors
def song_similarity(u_vec,s_vec):
    
    song_array = np.array(s_vec) # convert the song vector into a numpy array
    
    u_array = np.array(u_vec) # convert the user vector into a numpy array
    
    shaped_s_array = np.array(song_array).reshape(1,-1) # reshape the song array so it fits into the cosine similarity function
    
    shaped_u_array = np.array(u_array).reshape(1,-1) # reshape the user array so it fits into the cosine similarity function
    
    similar = cosine_similarity(shaped_s_array,shaped_u_array) # compute the similarity
    
    return similar[0][0] # the similarity is inside a list inside a list so we need to index it twice


In [56]:
# let's write a function that does the similarity search

def cosine_similarity_search(user_input):
    
    user_vector = emotion_score(user_input) # the function takes in user input then turns it into a vector
    
    comparison = [] # it initializes a list called comparison
    
    for i in range(len(scored_music)):  # it opens a loop that iterates through the length of the scored music data
        
        user_vs_song=[] # the loop initializes a list that hold the similarity to the user vector song and artist
        
        # then the cosine similarity between the user vector and song vector is computed and stored as a variable 
        cos_sim = song_similarity(user_vector,scored_music['vector'][i]) 
        
        song_name = scored_music['song'][i] # the song name is stored to a variable
        
        artist_name = scored_music['artist'][i] # the artist name is stored to a variable
        
        # these three variables are appended to the user_vs_song list
        user_vs_song.append(cos_sim)
        user_vs_song.append(song_name)
        user_vs_song.append(artist_name)
        
        # the user_vs_song list is appended to the comparison list to later be sorted
        comparison.append(user_vs_song)

    # once the loop is done executing we convert our comparison list into a data frame so it can be sorted easiliy
    comp_df=pd.DataFrame(comparison,columns=['similarity','track','artist'])
    
    # return the sorted df
    return comp_df.sort_values(by='similarity',ascending=False)#.head(50).sample(5)



In [57]:
cosine_similarity_search(sample_user_input)

Unnamed: 0,similarity,track,artist
31264,0.999245,Drums,Johnny Cash
38521,0.999129,Broken Thing,Point Of Grace
4167,0.998880,Breakaway,Donna Summer
27522,0.998843,Entangled,Genesis
2943,0.998817,My Woman,Chuck Berry
...,...,...,...
30424,0.074030,"Love Me, Lovely",Jackson Browne
23277,0.074018,One Sweet Tender Touch,Chris Rea
20747,0.074017,Tender Is The Night,Andy Williams
15078,0.074007,Hot In Here,Rascal Flatts


In [58]:
from sklearn.metrics.pairwise import euclidean_distances

In [59]:
def song_distance(u_vec,s_vec):
    song_array = np.array(s_vec) # convert the song vector into a numpy array
    
    u_array = np.array(u_vec) # convert the user vector into a numpy array
    
    shaped_s_array = np.array(song_array).reshape(1,-1) # reshape the song array so it fits into the distance function
    
    shaped_u_array = np.array(u_array).reshape(1,-1) # reshape the user array so it fits into the distance function
    
    similar =  euclidean_distances(shaped_s_array,shaped_u_array) # compute the distance
    
    return similar[0][0] # the distance is inside a list inside a list so we need to index it twice

In [60]:
def euclidean_distance_search(user_input):
    user_vector = emotion_score(user_input) # vectorize the user input
    
    comparison = [] # initilize list that can compare the distances
    
    for i in range(len(scored_music)):
        
        user_vs_song = [] # initilize list that stores the distance song and artist
        
        euclid_distance = song_distance(user_vector,scored_music['vector'][i]) # compute the distance
        
        song_name = scored_music['song'][i] # save song name to variable
        
        artist_name = scored_music['artist'][i] # save artist name to a variable
        
        user_vs_song.append(euclid_distance) # append distance
        
        user_vs_song.append(song_name) # append song
        
        user_vs_song.append(artist_name) # append artist
        
        comparison.append(user_vs_song) # throw all that info into the compairson list

    comp_df = pd.DataFrame(comparison, columns=['distance','track','artist']) # make the comparison list into a data frame
    return comp_df.sort_values(by='distance') # return the df sorted by the distance in ascending order



In [61]:
euclidean_distance_search(sample_user_input)

Unnamed: 0,distance,track,artist
31264,0.023243,Drums,Johnny Cash
38521,0.025997,Broken Thing,Point Of Grace
28583,0.029879,Jabberstroker,Guided By Voices
4797,0.029892,Freestyle 2,Eminem
5302,0.029977,What Do I Get?,Everclear
...,...,...,...
30424,1.122803,"Love Me, Lovely",Jackson Browne
20747,1.122827,Tender Is The Night,Andy Williams
23277,1.122828,One Sweet Tender Touch,Chris Rea
15078,1.122873,Hot In Here,Rascal Flatts


In [62]:
emotion_score(sample_user_input)

[0.5396963788057415,
 0.19285173072893422,
 0.04366610469701906,
 0.09854388358690694,
 0.07520161452573766,
 0.050040287655660556]