In [1]:
import pickle
import nltk
import numpy as np
import pandas as pd

In [2]:
# more dependencies
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# NLTK's English stop-word corpus; download() fetches it if it is not already present locally
# NOTE(review): emotion_score below calls nltk.word_tokenize, which also needs NLTK's
# tokenizer data (e.g. 'punkt') -- only 'stopwords' is fetched here; confirm it is installed
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mccal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# set of every English stop word; emotion_score tests each input token for membership in it
stop_words = set(stopwords.words('english')) 

In [5]:
# load the pickled emotion model (used through .predict_proba in emotion_score)
# NOTE: pickle.load can execute arbitrary code -- only open .pkl files from a trusted source
with open('emotion_model.pkl','rb') as file:
    emotion_model = pickle.load(file)

In [6]:
# load the pickled text vectorizer (used through .transform in emotion_score)
# presumably the fitted TfidfVectorizer that emotion_model was trained with -- TODO confirm
# NOTE: pickle.load can execute arbitrary code -- only open .pkl files from a trusted source
with open('vectorizer.pkl','rb') as file:
    vectorizer = pickle.load(file)

In [7]:
# load the pickled song table; the cells below read its `vector` column,
# which holds one emotion-probability list per song
# NOTE: pickle.load can execute arbitrary code -- only open .pkl files from a trusted source
with open('scored_music.pkl','rb') as file:
    scored_music = pickle.load(file)

The following cells will be for tinkering with the cosine similarity search algorithm and figuring out how to make it faster.

In [8]:
# we need the emotion score function

def emotion_score(user_input):
    """Score a piece of text with the emotion model.

    The text is lower-cased, tokenized with NLTK, stripped of English stop
    words, vectorized with the module-level ``vectorizer`` and passed to
    ``emotion_model.predict_proba``.

    Parameters
    ----------
    user_input : str
        Free-form text to score.

    Returns
    -------
    list
        The probabilities predicted for this single input, one value per
        class of ``emotion_model``.
    """
    # lower-case and tokenize, keeping only the words that are not stop words
    kept_words = []
    for token in nltk.word_tokenize(user_input.lower()):
        if token not in stop_words:
            kept_words.append(token)

    # the vectorizer works on a collection of documents, so wrap the
    # re-joined text in a one-element list
    features = vectorizer.transform([' '.join(kept_words)])

    # predict_proba yields one row per document; only one was passed in,
    # so the first row is the answer
    return emotion_model.predict_proba(features).tolist()[0]

We need to rewrite the song similarity function. Since we will be computing the similarity against every song vector at once, the song vectors can be stacked into a single matrix that is already shaped properly.
<div></div>
The only vector that will need to be reshaped is the vectorized user input.

In [9]:
# start by scoring a sample sentence; the result is the user's emotion vector
# (a plain Python list of probabilities) that the rest of the notebook experiments with
user_vector = emotion_score("sample user input written for testing")
user_vector

[0.1920253255547959,
 0.4423441895317889,
 0.10115656642592427,
 0.1433433982582535,
 0.08900427340555789,
 0.03212624682367958]

In [10]:
# each entry of this column is one song's emotion vector, stored as a Python list;
# we want to be able to compute dot products between the user vector and every one of them at once
scored_music.vector

0        [0.03579174590286366, 0.8689770047957636, 0.01...
1        [0.06429972188393214, 0.6377869365833222, 0.15...
2        [0.5374242825027009, 0.31686309768917464, 0.07...
3        [0.13480472056337753, 0.36512886018708607, 0.2...
4        [0.9674123512465452, 0.00520893792719057, 0.01...
                               ...                        
44790    [0.15062559050424443, 0.5088407635755823, 0.05...
44791    [0.438188434704547, 0.1918744712386625, 0.0303...
44792    [0.16985368845236432, 0.41788928089604205, 0.0...
44793    [0.2919489066741162, 0.2975261464479545, 0.074...
44794    [0.3067180083139707, 0.26479157366799827, 0.08...
Name: vector, Length: 44795, dtype: object

In [11]:
# all we need are the values: .values drops the pandas index and leaves
# an object-dtype NumPy array whose elements are still Python lists
scored_music.vector.values

array([list([0.03579174590286366, 0.8689770047957636, 0.019480216983298226, 0.032564944744232945, 0.023263061565039773, 0.01992302600880187]),
       list([0.06429972188393214, 0.6377869365833222, 0.1520119896560907, 0.0530454841641546, 0.06036502570132249, 0.032490842011177856]),
       list([0.5374242825027009, 0.31686309768917464, 0.07247709323584733, 0.02208633081018713, 0.02742763237694463, 0.02372156338514537]),
       ...,
       list([0.16985368845236432, 0.41788928089604205, 0.08971713182181408, 0.13040927733200505, 0.15285628878492602, 0.039274332712848595]),
       list([0.2919489066741162, 0.2975261464479545, 0.07445915237974543, 0.20084836550918847, 0.10241929294489283, 0.03279813604410254]),
       list([0.3067180083139707, 0.26479157366799827, 0.08410894800858743, 0.2173920374978352, 0.0970364581197081, 0.029952974391900266])],
      dtype=object)

In [12]:
# stack the per-song lists row-wise so they form one 2-D numeric matrix
# (one row per song, one column per emotion probability)
song_matrix = np.vstack(scored_music['vector'].to_numpy())
song_matrix

array([[0.03579175, 0.868977  , 0.01948022, 0.03256494, 0.02326306,
        0.01992303],
       [0.06429972, 0.63778694, 0.15201199, 0.05304548, 0.06036503,
        0.03249084],
       [0.53742428, 0.3168631 , 0.07247709, 0.02208633, 0.02742763,
        0.02372156],
       ...,
       [0.16985369, 0.41788928, 0.08971713, 0.13040928, 0.15285629,
        0.03927433],
       [0.29194891, 0.29752615, 0.07445915, 0.20084837, 0.10241929,
        0.03279814],
       [0.30671801, 0.26479157, 0.08410895, 0.21739204, 0.09703646,
        0.02995297]])

In [13]:
# show the user's emotion vector again; it is still a plain Python list at this point
user_vector

[0.1920253255547959,
 0.4423441895317889,
 0.10115656642592427,
 0.1433433982582535,
 0.08900427340555789,
 0.03212624682367958]

In [14]:
# turn the list of probabilities into a 1-D NumPy array so it can be reshaped
user_array = np.asarray(user_vector)
user_array

array([0.19202533, 0.44234419, 0.10115657, 0.1433434 , 0.08900427,
       0.03212625])

In [15]:
# compare the shapes: the user array is 1-D while the song matrix is 2-D
print( 'user array shape: ',user_array.shape)
print('song matrix shape: ',song_matrix.shape)

user array shape:  (6,)
song matrix shape:  (44795, 6)


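The cosine similarity routine takes the dot product of each row of an m×n matrix with each row of a p×q matrix, so n and q (the number of features per row) must be equal. It also expects 2-D inputs, which is why we need to reshape the 1-D user_array from (6,) to (1, 6).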

In [16]:
# reshape to a single row; -1 tells NumPy to infer the column count from the data
shaped_user_array = user_array.reshape(1,-1)
# note: the label says "user_array", but the shape printed is that of the reshaped copy
print( "user_array shape: ", shaped_user_array.shape)

user_array shape:  (1, 6)


Now we can compute the dot products, which are part of the cosine similarity procedure.

In [17]:
# similarity of the single user row against every song row: one row, one column per song
cosine_similarity(shaped_user_array,song_matrix).shape

(1, 44795)

Once we get the cosine similarity array, we need to take its first (and only) row to get a flat array of scores, one per song, which can then be converted to a list and stored in a pandas data frame.

In [18]:
# indexing row 0 drops the leading dimension, leaving a flat array with one score per song
cosine_similarity(shaped_user_array,song_matrix)[0].shape

(44795,)

First, write the function that computes the song similarity using sklearn's cosine_similarity function.