In [1]:
import pickle
import nltk
import numpy as np
import pandas as pd

In [2]:
# more dependencies
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# even more dependencies
from nltk.corpus import stopwords

# nltk.word_tokenize (called inside emotion_score) needs the Punkt tokenizer
# data in addition to the stop-word list. Newer NLTK releases look for
# 'punkt_tab' while older ones look for 'punkt', so request both; an id the
# installed NLTK does not know is expected to be reported and skipped
# rather than raise (default downloader behaviour -- verify on your version).
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource, quiet=True)

# Kept as the last expression so the cell still displays its True/False result.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mccal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# English stop words collected into a set so membership checks during
# token filtering are fast.
stop_words = {word for word in stopwords.words('english')}

In [5]:
# Restore the pickled emotion model from the working directory.
# NOTE: unpickling executes code embedded in the file -- only load pickles
# that come from a trusted source.
with open('emotion_model.pkl', mode='rb') as model_file:
    emotion_model = pickle.load(model_file)

In [6]:
# Restore the pickled vectorizer that emotion_score uses for feature extraction.
# NOTE: unpickling executes code embedded in the file -- only load pickles
# that come from a trusted source.
with open('vectorizer.pkl', mode='rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

In [7]:
# Restore the pickled table of songs; the search code reads its 'vector',
# 'song' and 'artist' columns.
# NOTE: unpickling executes code embedded in the file -- only load pickles
# that come from a trusted source.
with open('scored_music.pkl', mode='rb') as music_file:
    scored_music = pickle.load(music_file)

The following cells experiment with the cosine similarity search algorithm, with the goal of making it faster.

In [8]:
# we need the emotion score function

def emotion_score(user_input):
    """Score a piece of text with the loaded emotion model.

    The text is lower-cased, tokenized with NLTK, stripped of every token
    found in ``stop_words``, re-joined with single spaces and transformed by
    ``vectorizer`` before ``emotion_model.predict_proba`` is called on it.

    Parameters
    ----------
    user_input : str
        Free-form text to score.

    Returns
    -------
    list
        The first (and only) row of the ``predict_proba`` output, converted
        to a plain Python list.
    """
    # normalise case, then split into tokens
    tokens = nltk.word_tokenize(user_input.lower())
    # drop stop words and rebuild a single string for the vectorizer
    cleaned_text = ' '.join(filter(lambda token: token not in stop_words, tokens))
    # the vectorizer expects an iterable of documents, hence the one-item list
    features = vectorizer.transform([cleaned_text])
    # one input document -> one row of probabilities
    return emotion_model.predict_proba(features).tolist()[0]

In [19]:
# try to write your own cosine similarity function on top of numpy

def cos_sim(v,u):
    """Pairwise cosine similarity between the rows of two matrices.

    Parameters
    ----------
    v : array-like, shape (m, n) or (n,)
        First set of row vectors. A 1-D input is treated as a single row.
    u : array-like, shape (k, n) or (n,)
        Second set of row vectors. A 1-D input is treated as a single row.

    Returns
    -------
    numpy.ndarray, shape (m, k)
        Entry ``[i, j]`` is the cosine similarity between ``v[i]`` and
        ``u[j]``. Pairs in which either vector has zero magnitude get a
        similarity of 0 instead of NaN/inf.
    """
    # promote 1-D inputs to a single-row matrix so axis=1 norms are valid
    v = np.atleast_2d(v)
    u = np.atleast_2d(u)

    # (m, n) @ (n, k) -> every row of v dotted with every row of u
    dot_product = np.dot(v, np.transpose(u))

    # magnitude of each row, combined into an (m, k) grid of |v_i| * |u_j|
    v_mag = np.linalg.norm(v, axis=1)
    u_mag = np.linalg.norm(u, axis=1)
    outer_product = np.outer(v_mag, u_mag)

    # divide only where the denominator is non-zero; the remaining entries
    # keep their initial 0 so zero-length vectors do not produce NaN or a
    # divide-by-zero warning
    cosine_sim = np.zeros(
        dot_product.shape, dtype=np.result_type(dot_product, outer_product)
    )
    np.divide(dot_product, outer_product, out=cosine_sim, where=outer_product != 0)
    return cosine_sim




In [20]:
def cosine_similarity_search(user_input, top_n=10):
    """Rank the songs in ``scored_music`` by similarity to a piece of text.

    The text is scored with ``emotion_score`` and compared against every
    entry of ``scored_music['vector']`` in a single ``cos_sim`` call.

    Parameters
    ----------
    user_input : str
        Free-form text describing what the user is looking for.
    top_n : int, optional
        Number of best matches to return. Defaults to 10, the size that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns ``similarity``, ``track`` and ``artist``, sorted by
        ``similarity`` in descending order and limited to ``top_n`` rows.
    """
    # Score the query and shape it as a one-row matrix for cos_sim
    user_vector = np.array(emotion_score(user_input)).reshape(1, -1)

    # Stack the per-song vectors into one matrix so all similarities are
    # computed at once instead of one song at a time
    song_vectors = np.vstack(scored_music['vector'].values)

    # cos_sim returns a (1, n_songs) matrix; [0] takes its only row, giving a
    # 1-D array with one similarity per song
    similarities = cos_sim(user_vector, song_vectors)[0]

    # Pair each similarity with its song and artist
    comparison_df = pd.DataFrame({
        'similarity': similarities,
        'track': scored_music['song'],
        'artist': scored_music['artist']
    })

    # Best matches first, then keep only the requested number of rows
    ranked = comparison_df.sort_values(by='similarity', ascending=False)
    return ranked.head(top_n)

In [31]:
# Run the search for a sample query and keep the returned top matches for inspection.
hank_table = cosine_similarity_search("lovin em and leavin em")