In [1]:
import pandas as pd
import nltk
# word tokenize will turn each word in a text string into a token
from nltk.tokenize import word_tokenize 
# punkt is a language aware model that can handle punctuation in text
nltk.download('punkt')
# i don't think there is any punctuation in the data that we are using to train this model but best to be safe
# there are definitely stop words in the data we will use to build the model (words that do not have sentiment)
from nltk.corpus import stopwords
nltk.download('stopwords')
# this weighs the importance of a word based on freqeuency (feature extraction tool)
from sklearn.feature_extraction.text import TfidfVectorizer
# we all know what this is
from sklearn.model_selection import train_test_split
 # this is our model of choice
from sklearn.linear_model import LogisticRegression
# we will use this to see how accurate the model is
from sklearn.metrics import accuracy_score, classification_report   
  

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mccal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mccal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# reading in labeled text data to make the model
emotions=pd.read_csv('emotions.csv') 

In [3]:
# basic data information
emotions.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416809 entries, 0 to 416808
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    416809 non-null  object
 1   label   416809 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 6.4+ MB


In [4]:
# here's what the data looks like
emotions.head() 

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4


In [5]:
# sadness (0), joy (1), love (2), anger (3), fear (4), and surprise (5) emotions and their label

In [6]:
# we have some really unbalanced data here, this is what we adress first if we want to improve the model
emotions.label.value_counts() 

1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: label, dtype: int64

In [7]:
# i am writing a function that tokenizes text that we can apply to the text column
def tokenize(text):
    return nltk.word_tokenize(text.lower()) 

In [8]:
# applying the tokenizer function to create a tokens column
emotions['tokens']=emotions['text'].apply(tokenize) 

In [9]:
# this is going to be a set containing all of the english stop words
stop_words = set(stopwords.words('english')) 

In [10]:
# we are creating a column of tokens that do not contain any words we do not want to feed the model
emotions['filtered_tokens'] = emotions['tokens'].apply(lambda x: [word for word in x if word not in stop_words])

In [11]:
 # here's what our data looks like now with the new features
emotions.head()

Unnamed: 0,text,label,tokens,filtered_tokens
0,i just feel really helpless and heavy hearted,4,"[i, just, feel, really, helpless, and, heavy, ...","[feel, really, helpless, heavy, hearted]"
1,ive enjoyed being able to slouch about relax a...,0,"[ive, enjoyed, being, able, to, slouch, about,...","[ive, enjoyed, able, slouch, relax, unwind, fr..."
2,i gave up my internship with the dmrg and am f...,4,"[i, gave, up, my, internship, with, the, dmrg,...","[gave, internship, dmrg, feeling, distraught]"
3,i dont know i feel so lost,0,"[i, dont, know, i, feel, so, lost]","[dont, know, feel, lost]"
4,i am a kindergarten teacher and i am thoroughl...,4,"[i, am, a, kindergarten, teacher, and, i, am, ...","[kindergarten, teacher, thoroughly, weary, job..."


In [12]:
# what's happening here is we are creating a column combining the filtered tokens back together into a text string
emotions['text_combined'] = emotions['filtered_tokens'].apply(lambda x: ' '.join(x))

In [13]:
# i mentioned earlier that this is going to make the text readable for the logistic regression  model
# this is going to create vectors from our text that we can use for training
# the vectors are made with term frequency, relative to amount of documents, which in turn tells us something about their importance
vectorizer = TfidfVectorizer()

In [14]:
X = emotions["text_combined"]
y = emotions["label"]

In [15]:
# splitting the data into training data and testing data
# 42 was chosen arbitrarily
# random state accepts any number in range [-2,147,483,648 , 2,147,483,647]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# fit a tfidf vectorizer on the training data then implement it
X_train = vectorizer.fit_transform(X_train)

In [17]:
# Default is 'l2'
# i chose 1000 for max_iter because it was large enough to reach convergence
model = LogisticRegression(penalty='l2', max_iter=1000) 

In [18]:
# fit the model
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [19]:
X_test = vectorizer.transform(X_test)

In [20]:
# basic prediction function call
y_pred = model.predict(X_test)

In [21]:
accuracy = accuracy_score(y_test, y_pred)

In [22]:
# did pretty well
accuracy 

0.8938005326167798

In [23]:
# we aren't after the models prediction for the labels
# we are after the probability scores for each label because we want nuanced emotion vectors
# lets do a demo

In [24]:
# here is a string that i am going to write that will contain some things i want to say about my emotional state
hank_feeling = 'i had a stressfull day but i was able to get a lot done which made me proud'

In [25]:
# in order to pull an emotion vector from this string we need to apply the same principles we used to train the model
# we need to tokenize the text
# we need to filter the tokenized text for stopwords
# we need to re join that filtered token into plain text
# we need to extract features using tfidf vectorization
# we need to tell the model to predict probabilites NOT the label
# we're gonna investigate what the probabilities look like

In [26]:
#  recall this key; sadness (0), joy (1), love (2), anger (3), fear (4), and surprise (5) emotions and their label

In [27]:
# now let's write a function that does that whole process for us

In [28]:
def emotion_score(user_input):
    # tokenize the input
    input_tokens = nltk.word_tokenize(user_input.lower()) 
    # filter stopwords out of the input
    filtered_input_tokens = [word for word in input_tokens if word not in stop_words] 
    # re-join the filtered tokens
    input_combined = ' '.join(filtered_input_tokens) 
    # feature extraction
    input_vector = vectorizer.transform([input_combined]) 
    # pull the probabilities from the input
    probabilities = model.predict_proba(input_vector)
    # return the probabilities
    return probabilities.tolist()[0] 

In [29]:
emotion_score(hank_feeling) # test drive

[0.01451814599024002,
 0.9553725169077298,
 0.009214358151940431,
 0.009108672106888238,
 0.005062137502834668,
 0.006724169340367127]

In [30]:
# i wanna run this function on an entire data frame of song lyrics
music=pd.read_csv('songs_with_lyrics_Cleaned.csv') 

In [31]:
 # here's what the data frame with music data and song lyrics
music.head()

Unnamed: 0.1,Unnamed: 0,artist,song,link,text
0,0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"look at her face, it's a wonderful face and it..."
1,1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"take it easy with me, please touch me gently l..."
2,2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,i'll never know why i had to go why i had to p...
3,3,ABBA,Bang,/a/abba/bang_20598415.html,making somebody happy is a question of give an...
4,4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,making somebody happy is a question of give an...


In [32]:
# demo for how the song handles user input
emotion_score(' i had a pretty rough day, can you recommend me a sad song?')

[0.7279565979649304,
 0.134221973501646,
 0.03065877030105599,
 0.04333536542173937,
 0.03509448096248684,
 0.0287328118481416]

In [33]:
# write a function that vectorizes the song lyric data with an emotion score dictionary in order to create a table for the lyric scores

In [34]:
def emotion_dictionary(user_input):
    # make tokens
    input_tokens = nltk.word_tokenize(user_input.lower()) 
    # filter the tokens
    filtered_input_tokens = [word for word in input_tokens if word not in stop_words] 
    # recombine filtered tokens
    input_combined = ' '.join(filtered_input_tokens) 
    # tfidf feature extraction
    input_vector = vectorizer.transform([input_combined]) 
    # call the probabilities
    probabilities = model.predict_proba(input_vector) 
    # create the vector
    emotion_vector = probabilities.tolist()[0] 
    # use the vector to create a dictionary with the scores and a key that references the scores
    return {'sadness': emotion_vector[0], 
    'joy': emotion_vector[1],
    'love': emotion_vector[2],
    'anger': emotion_vector[3],
    'fear': emotion_vector[4],
    'surprise': emotion_vector[5] }

In [35]:
# time to make the score table

In [36]:
# import tqdm for progress bar
from tqdm import tqdm 

In [37]:
# call this to get the progress bar
tqdm.pandas() 

In [38]:
# get a series containing the emotion scores for every song
scores=music['text'].progress_apply(emotion_dictionary).apply(pd.Series) 

100%|██████████| 44795/44795 [05:11<00:00, 143.72it/s]


In [39]:
# here is the scores data series that we will concat to our original data frame
scores 

Unnamed: 0,sadness,joy,love,anger,fear,surprise
0,0.036311,0.868506,0.019026,0.033213,0.022543,0.020402
1,0.063244,0.643494,0.147380,0.052924,0.059883,0.033075
2,0.554030,0.301641,0.071583,0.021935,0.027055,0.023757
3,0.137310,0.356770,0.268195,0.144623,0.065731,0.027370
4,0.966621,0.005189,0.017372,0.006792,0.002545,0.001481
...,...,...,...,...,...,...
44790,0.148298,0.511386,0.055229,0.140651,0.109590,0.034846
44791,0.433139,0.196215,0.030965,0.198344,0.094397,0.046940
44792,0.171882,0.412404,0.088898,0.132166,0.154708,0.039942
44793,0.294399,0.297507,0.073960,0.199487,0.102041,0.032606


In [40]:
# concat the scores to the music to get a df with the scores
scored_music=pd.concat([music,scores],axis=1) 

In [41]:
# useless column
scored_music.drop(columns=['Unnamed: 0'],inplace=True) 

In [42]:
# we also want a column with just the vectors
vector=music['text'].progress_apply(emotion_score) 

100%|██████████| 44795/44795 [05:00<00:00, 148.97it/s]


In [43]:
# create a column to store the vectors
scored_music['vector']=vector 

In [44]:
# our final dataframe
scored_music.sample(5) 

Unnamed: 0,artist,song,link,text,sadness,joy,love,anger,fear,surprise,vector
18904,XTC,Burning With Optimism's Flames,/x/xtc/burning+with+optimisms+flames_20147983....,never seen her glowing all that bright she's t...,0.247445,0.20568,0.184827,0.165282,0.134163,0.062602,"[0.2474450943119507, 0.20568043655593823, 0.18..."
36848,O.A.R.,Whatever Happened,/o/oar/whatever+happened_20748690.html,tell me i'm dreaming my feet are stone and my ...,0.214803,0.227782,0.097041,0.289696,0.110919,0.059758,"[0.2148025674851398, 0.2277820059531977, 0.097..."
561,Alison Krauss,Shield Of Faith,/a/alison+krauss/shield+of+faith_20006114.html,sometimes i'm battle weary i forget to use my ...,0.187542,0.437938,0.131063,0.119489,0.101856,0.022111,"[0.18754225051755907, 0.43793829990885985, 0.1..."
11659,Michael Jackson,Cinderella Stay Awhile,/m/michael+jackson/cinderella+stay+awhile_2009...,"cinderella, stay awhile you're the one that's ...",0.093331,0.492024,0.261747,0.0647,0.067276,0.020922,"[0.09333091562654525, 0.492023792453446, 0.261..."
3766,Death,Open Casket,/d/death/open+casket_20293085.html,approach the image filled with fear as the ima...,0.35558,0.089183,0.072271,0.128653,0.238958,0.115356,"[0.35557999324820166, 0.08918267886597295, 0.0..."


In [45]:
# use pickle to pickle the variables instead of joblib

In [47]:
# import pickle
# with open('emotion_model.pkl', 'wb') as file:
    # pickle.dump(model, file)
# with open('vectorizer.pkl','wb') as file:
    # pickle.dump(vectorizer,file)
# with open('scored_music.pkl','wb') as file:
    # pickle.dump(scored_music,file)