In [2]:
# load in the dependencies
import pickle
import nltk
import numpy as np
import pandas as pd

In [3]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
from nltk.corpus import stopwords
# fetch the english stop word corpus used to filter tokens before vectorizing
# NOTE(review): emotion_score calls nltk.word_tokenize, which presumably also needs the
# 'punkt' tokenizer data to be installed - confirm on a fresh environment
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mccal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# a set containing all of the english stop words; emotion_score tests each token for membership in it
stop_words = set(stopwords.words('english')) 

In [6]:
# load in the pickles

In [7]:
# this pickle holds the serialized emotion detection model that i trained
# NOTE: pickle.load can execute arbitrary code, so only load pickle files that come from a trusted source
with open('emotion_model.pkl','rb') as file:
    emotion_model = pickle.load(file)

In [8]:
# this is a tf-idf vectorizer that was fit with the training data
# emotion_score uses it to turn the cleaned input text into the feature vector the model scores
with open('vectorizer.pkl','rb') as file:
    vectorizer = pickle.load(file)

In [9]:
# this is a data frame that has a vector column that the search algorithm depends on
# the search reads its 'vector', 'song' and 'artist' columns
with open('scored_music.pkl','rb') as file:
    scored_music = pickle.load(file)

In [10]:
# next i am going to define the functions that allow for searching

In [11]:
def emotion_score(user_input):
    """Return the emotion model's class probabilities for a piece of text.

    The text is lower-cased, tokenized, stripped of english stop words,
    re-joined and vectorized with the pickled vectorizer before it is scored.

    Parameters
    ----------
    user_input : str
        Free text to score.

    Returns
    -------
    list
        The first (and only) row of ``emotion_model.predict_proba`` for the
        input, converted to a plain python list.
    """
    # tokenize the lower-cased input
    input_tokens = nltk.word_tokenize(user_input.lower())
    # filter stopwords out of the input
    filtered_input_tokens = [word for word in input_tokens if word not in stop_words]
    # re-join the filtered tokens so the vectorizer receives a single document
    input_combined = ' '.join(filtered_input_tokens)
    # feature extraction
    input_vector = vectorizer.transform([input_combined])
    # score with the model loaded from emotion_model.pkl; the bare name `model`
    # is never defined in this notebook and fails on a fresh kernel
    probabilities = emotion_model.predict_proba(input_vector)
    # only one document was scored, so return its row of probabilities
    return probabilities.tolist()[0]

In [12]:
# write a function that can compute the similarity between two vectors
def song_similarity(u_vec,s_vec):
    """Compute the cosine similarity between a user vector and a song vector.

    Both arguments are converted to numpy arrays and reshaped to a single
    row each; the one similarity value is returned as a scalar.
    """
    # cosine_similarity expects 2-D input, so turn each vector into one row
    song_row, user_row = (np.array(vec).reshape(1, -1) for vec in (s_vec, u_vec))
    # the result is a 1x1 matrix, so pull out its only entry
    return cosine_similarity(song_row, user_row)[0][0]

In [13]:
# let's write a function that does the similarity search

def cosine_similarity_search(user_input):
    """Rank every song in ``scored_music`` by similarity to the user's text.

    Parameters
    ----------
    user_input : str
        Free text describing how the user feels.

    Returns
    -------
    pandas.DataFrame
        One row per song with the columns ``similarity``, ``track`` and
        ``artist``, sorted from most to least similar.
    """
    # turn the user input into an emotion vector
    user_vector = emotion_score(user_input)

    # walk the three columns in parallel, by position; looking rows up with
    # scored_music[col][i] is label-based and breaks (or pairs the wrong rows)
    # whenever the frame's index is not exactly 0..n-1
    comparison = [
        [song_similarity(user_vector, song_vector), song_name, artist_name]
        for song_vector, song_name, artist_name in zip(
            scored_music['vector'], scored_music['song'], scored_music['artist']
        )
    ]

    # convert the comparison list into a data frame so it can be sorted easily
    comp_df = pd.DataFrame(comparison, columns=['similarity','track','artist'])

    # return the songs ordered from most to least similar
    return comp_df.sort_values(by='similarity', ascending=False)


In [14]:
# now we can play with the search function

In [16]:
# example query: returns every scored song ranked from most to least similar to this text
cosine_similarity_search("i placed at least five different bets on thursday night football and not a single one hit.  I probably lost at least forty dollars.")

Unnamed: 0,similarity,track,artist
18945,0.999684,Mr Blue,Yazoo
899,0.999659,Here's Your Letter (Toronto Concert),Avril Lavigne
40579,0.999576,I Hate You,Slayer
26563,0.999570,Now That It's Over,Everclear
8770,0.999426,Without A Shot,John Mellencamp
...,...,...,...
12091,0.064902,Impressed,Natalie Imbruglia
38927,0.064463,Funny How Love Is,Queen
21066,0.064453,Funny Girl,Barbra Streisand
17252,0.064452,Funny Thing,Travis
