In [1]:
# load in the dependencies
import pickle
import nltk
import numpy as np
import pandas as pd

In [2]:
# more dependencies
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# even more dependencies: the NLTK stop-word corpus, used to filter the user's input text
from nltk.corpus import stopwords
# make sure the stop-word corpus is available locally
# NOTE(review): emotion_score() calls nltk.word_tokenize, which I believe also needs NLTK's
# 'punkt' tokenizer data; only 'stopwords' is downloaded here -- confirm on a fresh environment
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mccal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# this is going to be a set containing all of the english stop words
# (a set rather than a list because emotion_score tests every token for membership in it)
stop_words = set(stopwords.words('english')) 

In [5]:
# load in the pickles

In [6]:
# this pickle holds the serialized emotion detection model that i trained
# emotion_score() calls predict_proba on it
# SECURITY: pickle.load can execute arbitrary code -- only load a file you produced or trust
with open('emotion_model.pkl','rb') as file:
    emotion_model = pickle.load(file)

In [7]:
# this is a TF-IDF vectorizer that was fit with the training data
# emotion_score() uses it to turn cleaned input text into model features
# SECURITY: pickle.load can execute arbitrary code -- only load a file you produced or trust
with open('vectorizer.pkl','rb') as file:
    vectorizer = pickle.load(file)

In [8]:
# this is a data frame that has a vector column that the search algorithm depends on
# the search function reads its 'vector', 'song' and 'artist' columns
# SECURITY: pickle.load can execute arbitrary code -- only load a file you produced or trust
with open('scored_music.pkl','rb') as file:
    scored_music = pickle.load(file)

In [9]:
# next i am going to define the functions that allow for searching

In [10]:
def emotion_score(user_input):
    """Score a piece of text with the pickled emotion model.

    The text is lower-cased, tokenized with NLTK, stripped of English stop
    words, re-joined into one string and run through the fitted vectorizer
    before being passed to ``emotion_model.predict_proba``.

    Parameters
    ----------
    user_input : str
        Raw text to score.

    Returns
    -------
    list
        The first row of the ``predict_proba`` output converted to a plain
        list (presumably one probability per emotion class -- the model
        itself is loaded from a pickle and not visible here).
    """
    # lower-case, tokenize, drop the stop words and glue the survivors back together
    cleaned_text = ' '.join(
        token
        for token in nltk.word_tokenize(user_input.lower())
        if token not in stop_words
    )
    # the vectorizer is handed a one-item list holding the single cleaned document
    features = vectorizer.transform([cleaned_text])
    # only one document was sent, so only the first row of the result is wanted
    class_probabilities = emotion_model.predict_proba(features)
    return class_probabilities.tolist()[0]

In [None]:
def cosine_similarity_search_x(user_input, top_n=10):
    """Rank the songs in ``scored_music`` by similarity to the emotion of ``user_input``.

    The input text is scored with ``emotion_score`` and the resulting vector
    is compared, via cosine similarity, against every entry of the 'vector'
    column of the module-level ``scored_music`` DataFrame.

    Parameters
    ----------
    user_input : str
        Free text to match songs against.
    top_n : int, optional
        Number of best matches to return. Defaults to 10, the value that
        was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns 'similarity', 'track' and 'artist', sorted by 'similarity'
        in descending order and truncated to ``top_n`` rows. Rows keep the
        index labels of ``scored_music``.

    Raises
    ------
    ValueError
        If ``top_n`` is negative, or if ``scored_music`` has no 'vector' column.
    """
    # head() gives a negative count a different meaning (drop rows from the end), so reject it
    if top_n < 0:
        raise ValueError("top_n must be a non-negative integer.")

    # Ensure scored_music DataFrame has a column for vectors
    if 'vector' not in scored_music.columns:
        raise ValueError("scored_music DataFrame must have a 'vector' column containing song vectors.")

    # With no songs there is nothing to rank, and np.vstack cannot stack an
    # empty sequence, so hand back an empty result with the usual columns
    if len(scored_music) == 0:
        return pd.DataFrame({
            'similarity': np.empty(0, dtype=float),
            'track': scored_music['song'],
            'artist': scored_music['artist']
        })

    # Convert the user's input into a single-row matrix
    user_vector = np.asarray(emotion_score(user_input)).reshape(1, -1)

    # Stack the per-song vectors from the scored_music DataFrame into one matrix
    song_vectors = np.vstack(scored_music['vector'].values)

    # Calculate cosine similarity for the entire set of song vectors at once.
    # The result is a (1, n_songs) matrix; [0] selects its only row, giving a
    # 1-D array with one score per song
    cos_sim = cosine_similarity(user_vector, song_vectors)[0]

    # Create a DataFrame to store the similarity scores along with song and artist info
    comparison_df = pd.DataFrame({
        'similarity': cos_sim,
        'track': scored_music['song'],
        'artist': scored_music['artist']
    })

    # Best matches first, keeping only the requested number of rows
    return comparison_df.sort_values(by='similarity', ascending=False).head(top_n)



In [26]:
# example query: run the search on a sample sentence and display the best-matching songs
cosine_similarity_search_x("i placed at least five different bets on thursday night football and not a single one hit.  I probably lost at least forty dollars.")

Unnamed: 0,similarity,track,artist
18945,0.999684,Mr Blue,Yazoo
899,0.999659,Here's Your Letter (Toronto Concert),Avril Lavigne
40579,0.999576,I Hate You,Slayer
26563,0.99957,Now That It's Over,Everclear
8770,0.999426,Without A Shot,John Mellencamp
23509,0.999286,Wee Wee Hours,Chuck Berry
42487,0.99928,Anodyne,Uncle Tupelo
15958,0.999264,Sleeping Single,Roxette
29526,0.999249,Boudicca,Horrible Histories
40280,0.999225,The Wreckers,Rush
