In [1]:
import pickle
import nltk
import numpy as np
import pandas as pd

In [2]:
# more dependencies
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# even more dependencies
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mccal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# this is going to be a set containing all of the english stop words
stop_words = set(stopwords.words('english')) 

In [5]:
# Load the pickled classifier; emotion_score later calls its predict_proba() method.
# NOTE(review): pickle.load can execute arbitrary code - only load a trusted emotion_model.pkl.
with open('emotion_model.pkl','rb') as file:
    emotion_model = pickle.load(file)

In [6]:
# Load the pickled vectorizer; emotion_score later calls its transform() method.
# NOTE(review): pickle.load can execute arbitrary code - only load a trusted vectorizer.pkl.
with open('vectorizer.pkl','rb') as file:
    vectorizer = pickle.load(file)

In [7]:
# Load the pickled song table; the search functions read its 'vector', 'song' and 'artist' columns.
# NOTE(review): pickle.load can execute arbitrary code - only load a trusted scored_music.pkl.
with open('scored_music.pkl','rb') as file:
    scored_music = pickle.load(file)

The following cells will be for tinkering with the cosine similarity search algorithm and figuring out how to make it faster.

In [8]:
# we need the emotion score function

def emotion_score(user_input):
    """Return the emotion model's class probabilities for a piece of text.

    The text is lower-cased, tokenized, stripped of stop words, re-joined with
    single spaces, vectorized with ``vectorizer`` and passed to
    ``emotion_model.predict_proba``. The first (and only) row of the result is
    returned as a plain Python list.

    NOTE(review): ``nltk.word_tokenize`` needs NLTK tokenizer data installed;
    this notebook only downloads 'stopwords' - confirm the tokenizer data is present.
    """
    # lower-case and split the text into tokens
    tokens = nltk.word_tokenize(user_input.lower())
    # drop every token that is an English stop word
    kept_tokens = [token for token in tokens if token not in stop_words]
    # the vectorizer expects a list of documents, so wrap the re-joined text in a list
    features = vectorizer.transform([' '.join(kept_tokens)])
    # one input document -> one row of probabilities
    return emotion_model.predict_proba(features).tolist()[0]

In [27]:
# Walk through the cosine similarity computation step by step.

# Two example matrices: array1 has a single row, array2 has three rows.
array1 = np.arange(1, 7).reshape(1, 6)
array2 = np.arange(1, 19).reshape(3, 6)

# show both matrices separated by a blank line
print(array1, '', array2, sep='\n')

[[1 2 3 4 5 6]]

[[ 1  2  3  4  5  6]
 [ 7  8  9 10 11 12]
 [13 14 15 16 17 18]]


In [28]:
# the first step in computing the cosine similarity is to take the dot product of the two matrices

# currently, the shapes are not compatible
# we have a (1,6) and a (3,6)

# we need shapes (1,6) and (6,3)

In [29]:
# array2 has to be transposed to shape (6, 3) before taking the dot product with the (1, 6) array1
array2_t = array2.T

# show the transposed matrix and its shape separated by a blank line
print(array2_t, '', array2_t.shape, sep='\n')

[[ 1  7 13]
 [ 2  8 14]
 [ 3  9 15]
 [ 4 10 16]
 [ 5 11 17]
 [ 6 12 18]]

(6, 3)


In [30]:
# the shapes are now (1, 6) and (6, 3), so the matrix product is defined
dot = array1 @ array2_t
dot

array([[ 91, 217, 343]])

In [31]:
array1

array([[1, 2, 3, 4, 5, 6]])

In [37]:
# row-wise Euclidean norms (magnitudes) of each matrix
mag1, mag2 = (np.linalg.norm(matrix, axis=1) for matrix in (array1, array2))

# print both magnitude arrays separated by a blank line
print('mag1: ', mag1, end='\n\n')
print('mag2: ', mag2)


mag1:  [9.53939201]

mag2:  [ 9.53939201 23.64318084 38.19685851]


In [33]:
# take the outer product of the two magnitude vectors (mag1 and mag2)
outer_prodcut = np.outer(mag1,mag2)

In [34]:
dot/outer_prodcut

array([[1.        , 0.96212862, 0.94133841]])

In [35]:
sample_string = "testing to see if my cosine similarity function works or is faster"

In [38]:
# try to write your own cosine similarity function on top of numpy

def cos_sim(v,u):
    """Pairwise cosine similarity between the rows of two 2-D matrices.

    Parameters
    ----------
    v : array-like of shape (m, n)
    u : array-like of shape (k, n)

    Returns
    -------
    numpy.ndarray of shape (m, k)
        Entry ``[i, j]`` is the cosine similarity between ``v[i]`` and ``u[j]``.
        A pair that involves an all-zero row gets similarity 0 instead of
        NaN (0/0), so the result can always be sorted and compared.
    """
    # accept plain lists as well as arrays
    v = np.asarray(v)
    u = np.asarray(u)

    # transpose u so the shapes (m, n) and (n, k) are compatible for the dot product
    dot_product = np.dot(v, np.transpose(u))

    # magnitude (Euclidean norm) of each row of each matrix
    v_mag = np.linalg.norm(v, axis=1)
    u_mag = np.linalg.norm(u, axis=1)

    # the outer product pairs every row magnitude of v with every row magnitude of u
    outer_product = np.outer(v_mag, u_mag)

    # A zero magnitude means the row is all zeros, so its dot products are 0 too.
    # Dividing by 1 there yields 0 rather than 0/0 = NaN plus a RuntimeWarning.
    safe_denominator = np.where(outer_product == 0, 1.0, outer_product)

    # the cosine similarity is the dot product over the product of the magnitudes
    return dot_product / safe_denominator




In [39]:
def cosine_similarity_search(user_input, top_n=10):
    """Rank the songs in ``scored_music`` by cosine similarity to the user's text.

    The similarity is computed with the hand-written ``cos_sim`` function.

    Parameters
    ----------
    user_input : str
        Free text; it is converted to a vector with ``emotion_score``.
    top_n : int, default 10
        Number of best-matching rows to return.

    Returns
    -------
    pandas.DataFrame
        The ``top_n`` rows with the highest similarity, with columns
        ``similarity``, ``track`` and ``artist``, ordered from most to least similar.
    """
    # Convert the user's input into a vector and shape it as a single-row (1, n) matrix
    user_vector = np.array(emotion_score(user_input)).reshape(1, -1)

    # Stack the per-song vectors into one matrix so that every similarity
    # is computed in a single call instead of one song at a time
    song_vectors = np.vstack(scored_music['vector'].values)

    # cos_sim returns a (1, number_of_songs) matrix; [0] takes its only row as a 1-D array
    similarities = cos_sim(user_vector, song_vectors)[0]

    # Store the similarity scores along with song and artist info
    comparison_df = pd.DataFrame({
        'similarity': similarities,
        'track': scored_music['song'],
        'artist': scored_music['artist']
    })

    # Sort by similarity in descending order and keep the best top_n rows
    return comparison_df.sort_values(by='similarity', ascending=False).head(top_n)

In [40]:
hank_table = cosine_similarity_search(sample_string)

In [41]:
def cosine_similarity_search_sk(user_input, top_n=10):
    """Rank the songs in ``scored_music`` by cosine similarity to the user's text.

    The similarity is computed with scikit-learn's ``cosine_similarity``.

    Parameters
    ----------
    user_input : str
        Free text; it is converted to a vector with ``emotion_score``.
    top_n : int, default 10
        Number of best-matching rows to return.

    Returns
    -------
    pandas.DataFrame
        The ``top_n`` rows with the highest similarity, with columns
        ``similarity``, ``track`` and ``artist``, ordered from most to least similar.
    """
    # Convert the user's input into a vector and shape it as a single-row (1, n) matrix
    user_vector = np.array(emotion_score(user_input)).reshape(1, -1)

    # Stack the per-song vectors into one matrix so that every similarity
    # is computed in a single call instead of one song at a time
    song_vectors = np.vstack(scored_music['vector'].values)

    # cosine_similarity returns a (1, number_of_songs) matrix; [0] takes its only row as a 1-D array
    similarities = cosine_similarity(user_vector, song_vectors)[0]

    # Store the similarity scores along with song and artist info
    comparison_df = pd.DataFrame({
        'similarity': similarities,
        'track': scored_music['song'],
        'artist': scored_music['artist']
    })

    # Sort by similarity in descending order and keep the best top_n rows
    return comparison_df.sort_values(by='similarity', ascending=False).head(top_n)

In [42]:
sk_table = cosine_similarity_search_sk(sample_string)

In [43]:
hank_table

Unnamed: 0,similarity,track,artist
29788,0.999423,Real Wild Child,Iggy Pop
37509,0.999315,Won't Be Coming Home (S.I.N.) [Demo],Ozzy Osbourne
6618,0.999262,Rich Man's Spiritual,Gordon Lightfoot
1761,0.999191,Goodbye My Friend,Boney M.
24319,0.999039,Join The Gang,David Bowie
10348,0.998848,Unlucky In Love,Leo Sayer
8607,0.998831,I Guess He'd Rather Be In Colorado,John Denver
35571,0.998807,King Leer,Morrissey
41319,0.99878,Papa Legba,Talking Heads
5005,0.998756,Ain't Nobody's Business If I Do,Eric Clapton


In [44]:
sk_table

Unnamed: 0,similarity,track,artist
29788,0.999423,Real Wild Child,Iggy Pop
37509,0.999315,Won't Be Coming Home (S.I.N.) [Demo],Ozzy Osbourne
6618,0.999262,Rich Man's Spiritual,Gordon Lightfoot
1761,0.999191,Goodbye My Friend,Boney M.
24319,0.999039,Join The Gang,David Bowie
10348,0.998848,Unlucky In Love,Leo Sayer
8607,0.998831,I Guess He'd Rather Be In Colorado,John Denver
35571,0.998807,King Leer,Morrissey
41319,0.99878,Papa Legba,Talking Heads
5005,0.998756,Ain't Nobody's Business If I Do,Eric Clapton


In [48]:
# Tkinter GUI
import tkinter as tk
from tkinter import messagebox
def search_similar_songs():
    """Button callback: search for songs matching the Entry text and list them.

    Reads the module-level ``entry`` widget and writes to the module-level
    ``listbox`` widget. Shows a warning dialog and returns early when the
    input is empty or contains only whitespace.
    """
    # Get the user input from the Entry widget; strip so whitespace-only text counts as empty
    user_input = entry.get().strip()

    if not user_input:
        messagebox.showwarning("Input Error", "Please enter some text.")
        return

    # Perform the search. The notebook defines cosine_similarity_search; the name
    # cosine_similarity_search_hank used here before is not defined anywhere in it,
    # so the callback failed with NameError.
    result = cosine_similarity_search(user_input)

    # Clear the previous results in the Listbox
    listbox.delete(0, tk.END)

    # Add one line per returned row to the Listbox
    for _, row in result.iterrows():
        listbox.insert(tk.END, f"Track: {row['track']} | Artist: {row['artist']} | Similarity: {row['similarity']:.4f}")

# Create the main application window and set its title
root = tk.Tk()
root.title("Emotion Engine")

# Label telling the user what to type
label = tk.Label(root, text="Enter a description or emotion:")
label.pack(pady=10)

# Single-line text box; search_similar_songs() reads its contents with entry.get()
entry = tk.Entry(root, width=50)
entry.pack(pady=10)

# Button that runs search_similar_songs() when clicked
search_button = tk.Button(root, text="Find Similar Songs", command=search_similar_songs)
search_button.pack(pady=10)

# Listbox that search_similar_songs() clears and refills with the search results
listbox = tk.Listbox(root, width=60, height=10)
listbox.pack(pady=10)

# Run the GUI main loop; this call blocks the cell until the window is closed
root.mainloop()