In [1]:
import pickle
import nltk
import numpy as np
import pandas as pd

In [2]:
# more dependencies
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# even more dependencies
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mccal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# this is going to be a set containing all of the english stop words
stop_words = set(stopwords.words('english')) 

In [5]:
with open('emotion_model.pkl','rb') as file:
    emotion_model = pickle.load(file)

In [6]:
with open('vectorizer.pkl','rb') as file:
    vectorizer = pickle.load(file)

In [7]:
with open('scored_music.pkl','rb') as file:
    scored_music = pickle.load(file)

The following cells will be for tinkering with the cosine similarity search algorithm and figuring out how to make it faster.

In [8]:
# we need the emotion score function

def emotion_score(user_input):
    # tokenize the input
    input_tokens = nltk.word_tokenize(user_input.lower()) 
    # filter stopwords out of the input
    filtered_input_tokens = [word for word in input_tokens if word not in stop_words] 
    # re-join the filtered tokens
    input_combined = ' '.join(filtered_input_tokens) 
    # feature extraction
    input_vector = vectorizer.transform([input_combined]) 
    # pull the probabilities from the input
    probabilities = emotion_model.predict_proba(input_vector)
    # return the probabilities
    return probabilities.tolist()[0] 

We need to rewrite the song similarity function.  Since we will be computing the similarity with every song vector at once, it will already be shaped properly.
<div></div>
The only vector that will need to be re-shaped is the vectorized user input

In [9]:
# start by creating a vector from a user's input
user_vector = emotion_score("sample user input written for testing")
user_vector

[0.1920253255547959,
 0.4423441895317889,
 0.10115656642592427,
 0.1433433982582535,
 0.08900427340555789,
 0.03212624682367958]

In [10]:
# we want to be able to compute dot products with this vector and every song vector at once
scored_music.vector

0        [0.03579174590286366, 0.8689770047957636, 0.01...
1        [0.06429972188393214, 0.6377869365833222, 0.15...
2        [0.5374242825027009, 0.31686309768917464, 0.07...
3        [0.13480472056337753, 0.36512886018708607, 0.2...
4        [0.9674123512465452, 0.00520893792719057, 0.01...
                               ...                        
44790    [0.15062559050424443, 0.5088407635755823, 0.05...
44791    [0.438188434704547, 0.1918744712386625, 0.0303...
44792    [0.16985368845236432, 0.41788928089604205, 0.0...
44793    [0.2919489066741162, 0.2975261464479545, 0.074...
44794    [0.3067180083139707, 0.26479157366799827, 0.08...
Name: vector, Length: 44795, dtype: object

In [11]:
# all we need are the values
scored_music.vector.values

array([list([0.03579174590286366, 0.8689770047957636, 0.019480216983298226, 0.032564944744232945, 0.023263061565039773, 0.01992302600880187]),
       list([0.06429972188393214, 0.6377869365833222, 0.1520119896560907, 0.0530454841641546, 0.06036502570132249, 0.032490842011177856]),
       list([0.5374242825027009, 0.31686309768917464, 0.07247709323584733, 0.02208633081018713, 0.02742763237694463, 0.02372156338514537]),
       ...,
       list([0.16985368845236432, 0.41788928089604205, 0.08971713182181408, 0.13040927733200505, 0.15285628878492602, 0.039274332712848595]),
       list([0.2919489066741162, 0.2975261464479545, 0.07445915237974543, 0.20084836550918847, 0.10241929294489283, 0.03279813604410254]),
       list([0.3067180083139707, 0.26479157366799827, 0.08410894800858743, 0.2173920374978352, 0.0970364581197081, 0.029952974391900266])],
      dtype=object)

In [12]:
# but we also need them to be in a matrix
song_matrix = np.vstack(scored_music.vector.values)
song_matrix

array([[0.03579175, 0.868977  , 0.01948022, 0.03256494, 0.02326306,
        0.01992303],
       [0.06429972, 0.63778694, 0.15201199, 0.05304548, 0.06036503,
        0.03249084],
       [0.53742428, 0.3168631 , 0.07247709, 0.02208633, 0.02742763,
        0.02372156],
       ...,
       [0.16985369, 0.41788928, 0.08971713, 0.13040928, 0.15285629,
        0.03927433],
       [0.29194891, 0.29752615, 0.07445915, 0.20084837, 0.10241929,
        0.03279814],
       [0.30671801, 0.26479157, 0.08410895, 0.21739204, 0.09703646,
        0.02995297]])

In [13]:
user_vector

[0.1920253255547959,
 0.4423441895317889,
 0.10115656642592427,
 0.1433433982582535,
 0.08900427340555789,
 0.03212624682367958]

In [14]:
user_array = np.array(user_vector)
user_array

array([0.19202533, 0.44234419, 0.10115657, 0.1433434 , 0.08900427,
       0.03212625])

In [15]:
print( 'user array shape: ',user_array.shape)
print('song matrix shape: ',song_matrix.shape)

user array shape:  (6,)
song matrix shape:  (44795, 6)


In order to do the dot product for an mxn with a pxq matrix, n and q must be equal.  This is why we need to reshape the user_array

In [16]:
shaped_user_array = user_array.reshape(1,-1)
print(shaped_user_array)
print( "user_array shape: ", shaped_user_array.shape)

[[0.19202533 0.44234419 0.10115657 0.1433434  0.08900427 0.03212625]]
user_array shape:  (1, 6)


Now we can do dot products which is part of the cosine similarity procedure.

In [17]:
cosine_similarity(shaped_user_array,song_matrix).shape

(1, 44795)

Once we get the cosine similarity array we need to convert it back to a list so it can be stored in a pandas data frame.

In [18]:
cosine_similarity(shaped_user_array,song_matrix)

array([[0.8813695 , 0.93407444, 0.78258446, ..., 0.99014323, 0.93328253,
        0.90257712]])

In [19]:
array1 = np.array([[1,2,3,4,5,6],[7,8,9,10,11,12],[13,14,15,16,17,18]])
array2 = np.array([[1,2,3,4,5,6]])

print(array1,array1.shape)
print('')
print(array2,array2.shape)

    


[[ 1  2  3  4  5  6]
 [ 7  8  9 10 11 12]
 [13 14 15 16 17 18]] (3, 6)

[[1 2 3 4 5 6]] (1, 6)


In [20]:
array2.reshape(1,-1).shape

(1, 6)

In [21]:
song_matrix.shape

(44795, 6)

In [22]:
user_vector

[0.1920253255547959,
 0.4423441895317889,
 0.10115656642592427,
 0.1433433982582535,
 0.08900427340555789,
 0.03212624682367958]

In [23]:
cosine_similarity(array2,song_matrix).shape

(1, 44795)

In [24]:
array2

array([[1, 2, 3, 4, 5, 6]])

In [25]:
np.transpose(array1)

array([[ 1,  7, 13],
       [ 2,  8, 14],
       [ 3,  9, 15],
       [ 4, 10, 16],
       [ 5, 11, 17],
       [ 6, 12, 18]])

In [26]:
np.dot(array2,np.transpose(array1))

array([[ 91, 217, 343]])

In [27]:
def cosine_similarity_matrices(A, B):
    # Step 1: Compute the dot product of matrix A with matrix B.T (transpose of B)
    dot_product_matrix = np.dot(A, B.T)
    
    # Step 2: Compute the norms of the rows of matrix A and matrix B
    norms_A = np.linalg.norm(A, axis=1)
    norms_B = np.linalg.norm(B, axis=1)
    
    # Step 3: Compute the cosine similarity matrix
    similarity_matrix = dot_product_matrix / np.outer(norms_A, norms_B)
    
    return similarity_matrix

In [28]:
cosine_similarity_matrices(array2,song_matrix)

array([[0.26455028, 0.3951701 , 0.29248176, ..., 0.58227574, 0.57177039,
        0.57481759]])

In [29]:
cosine_similarity(array2,song_matrix)

array([[0.26455028, 0.3951701 , 0.29248176, ..., 0.58227574, 0.57177039,
        0.57481759]])

In [30]:
# lets break down how cosine similarity is computed vector by vector

#for this example lets use array 1 and array 2

array1 = np.array([[1,2,3,4,5,6],[7,8,9,10,11,12],[13,14,15,16,17,18]])
array2 = np.array([[1,2,3,4,5,6]])

print(array1)
print('')
print(array2)

[[ 1  2  3  4  5  6]
 [ 7  8  9 10 11 12]
 [13 14 15 16 17 18]]

[[1 2 3 4 5 6]]


In [31]:
# the first step in computing the cosine similarity is to take the dot product of the two matrices

# currently, the shapes are not compatible
# we have a (1,6) and a (3,6)

In [32]:
# so in order to take the dot product of the two matrices we will need to take the transpose of array1

array1_t  = np.transpose(array1)

print(array1_t)
print('')
print(array1_t.shape)

[[ 1  7 13]
 [ 2  8 14]
 [ 3  9 15]
 [ 4 10 16]
 [ 5 11 17]
 [ 6 12 18]]

(6, 3)


In [33]:
# now that we have the desired shapes we can take the dot product

dot = np.dot(array2,array1_t)
dot

array([[ 91, 217, 343]])

In [34]:
array1

array([[ 1,  2,  3,  4,  5,  6],
       [ 7,  8,  9, 10, 11, 12],
       [13, 14, 15, 16, 17, 18]])

In [35]:
# next we will compute the magnitudes

mag1 = np.linalg.norm(array1,axis=1)
mag2 = np.linalg.norm(array2,axis=1)
print('amg1: ',mag1)
print('')
print('mag2: ',mag2)


amg1:  [ 9.53939201 23.64318084 38.19685851]

mag2:  [9.53939201]


In [36]:
# take the outer product of the matrix
outer_prodcut = np.outer(mag1,mag2)

In [37]:
dot/outer_prodcut

array([[1.        , 2.38461538, 3.76923077],
       [0.40347329, 0.96212862, 1.52078395],
       [0.24974284, 0.59554062, 0.94133841]])

In [39]:
sample_string = "testing to see if my cosine similarity function works or is faster"

In [41]:
# try to write your own cosine similarity function on top of numpy

def cos_sim(v,u):
    # only pass objects if they are (m,n) and (k,n) matrices

    # first take the transpose of vector u so it is compatible for dot product
    ut = np.transpose(u)

    # take the dot product
    dot_product = np.dot(v,ut)

    # compute the magnitudes of each row for each matrix
    v_mag = np.linalg.norm(v,axis=1)
    u_mag = np.linalg.norm(u,axis=1)

    # take the outer product of the two magnitude matrices
    outer_product = np.outer(v_mag,u_mag)

    # the cosine similarity is the dot product over the outer product
    cosine_sim = dot_product/outer_product
    return cosine_sim




In [50]:
def cosine_similarity_search_hank(user_input):
    
    # Convert the user's input into a vector 
    user_vector = emotion_score(user_input)
    
    
    
    # Convert the list of vectors from the scored_music DataFrame into a matrix  
    # we can compute all forty five thousand cosine similarities at once instead of one by one if we use a matrix
    song_vectors = np.vstack(scored_music['vector'].values)
    
    # Calculate cosine similarity for the entire set of song vectors at once
    # the 0 index is to turn the array into a list which can then be stored in a data frame
    cosine_sim = cos_sim(np.array(user_vector).reshape(1,-1), song_vectors)[0]
    
    

    
    
    # Create a DataFrame to store the similarity scores along with song and artist info
    comparison_df = pd.DataFrame({
        'similarity': cosine_sim,
        'track': scored_music['song'],
        'artist': scored_music['artist']
    })
    
    # Sort by similarity in descending order
    sorted_comparison_df = comparison_df.sort_values(by='similarity', ascending=False)
    
    # return the top 10
    return sorted_comparison_df.head(10)

In [56]:
hank_table = cosine_similarity_search_hank(sample_string)

In [52]:
def cosine_similarity_search_sk(user_input):
    
    # Convert the user's input into a vector 
    user_vector = emotion_score(user_input)
    
    
    
    # Convert the list of vectors from the scored_music DataFrame into a matrix  
    # we can compute all forty five thousand cosine similarities at once instead of one by one if we use a matrix
    song_vectors = np.vstack(scored_music['vector'].values)
    
    # Calculate cosine similarity for the entire set of song vectors at once
    # the 0 index is to turn the array into a list which can then be stored in a data frame
    cosine_sim = cosine_similarity(np.array(user_vector).reshape(1,-1), song_vectors)[0]
    
    

    
    
    # Create a DataFrame to store the similarity scores along with song and artist info
    comparison_df = pd.DataFrame({
        'similarity': cosine_sim,
        'track': scored_music['song'],
        'artist': scored_music['artist']
    })
    
    # Sort by similarity in descending order
    sorted_comparison_df = comparison_df.sort_values(by='similarity', ascending=False)
    
    # return the top 10
    return sorted_comparison_df.head(10)

In [57]:
sk_table = cosine_similarity_search_sk(sample_string)

In [54]:
hank_table

Unnamed: 0,similarity,track,artist
29788,0.999423,Real Wild Child,Iggy Pop
37509,0.999315,Won't Be Coming Home (S.I.N.) [Demo],Ozzy Osbourne
6618,0.999262,Rich Man's Spiritual,Gordon Lightfoot
1761,0.999191,Goodbye My Friend,Boney M.
24319,0.999039,Join The Gang,David Bowie
10348,0.998848,Unlucky In Love,Leo Sayer
8607,0.998831,I Guess He'd Rather Be In Colorado,John Denver
35571,0.998807,King Leer,Morrissey
41319,0.99878,Papa Legba,Talking Heads
5005,0.998756,Ain't Nobody's Business If I Do,Eric Clapton


In [55]:
sk_table

Unnamed: 0,similarity,track,artist
29788,0.999423,Real Wild Child,Iggy Pop
37509,0.999315,Won't Be Coming Home (S.I.N.) [Demo],Ozzy Osbourne
6618,0.999262,Rich Man's Spiritual,Gordon Lightfoot
1761,0.999191,Goodbye My Friend,Boney M.
24319,0.999039,Join The Gang,David Bowie
10348,0.998848,Unlucky In Love,Leo Sayer
8607,0.998831,I Guess He'd Rather Be In Colorado,John Denver
35571,0.998807,King Leer,Morrissey
41319,0.99878,Papa Legba,Talking Heads
5005,0.998756,Ain't Nobody's Business If I Do,Eric Clapton
