# Content based Recomendation System, tested on different word embeddings

In [770]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp
import numpy as np 
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score
import json 
import random

songs = pd.read_csv('spotify_songs.csv')
songs.head()

Unnamed: 0,track_id,track_name,track_artist,lyrics,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,language
0,0017A6SJgTbfQVU2EtsPNo,Pangarap,Barbie's Cradle,Minsan pa Nang ako'y napalingon Hindi ko alam ...,41,1srJQ0njEQgd8w4XSqI4JQ,Trip,2001-01-01,Pinoy Classic Rock,37i9dQZF1DWYDQ8wBxd7xt,...,-10.068,1,0.0236,0.279,0.0117,0.0887,0.566,97.091,235440,tl
1,004s3t0ONYlzxII9PLgU6z,I Feel Alive,Steady Rollin,"The trees, are singing in the wind The sky blu...",28,3z04Lb9Dsilqw68SHt6jLB,Love & Loss,2017-11-21,Hard Rock Workout,3YouF0u7waJnolytf9JCXf,...,-4.739,1,0.0442,0.0117,0.00994,0.347,0.404,135.225,373512,en
2,00chLpzhgVjxs1zKC9UScL,Poison,Bell Biv DeVoe,"NA Yeah, Spyderman and Freeze in full effect U...",0,6oZ6brjB8x3GoeSYdwJdPc,Gold,2005-01-01,"Back in the day - R&B, New Jack Swing, Swingbe...",3a9y4eeCJRmG9p4YKfqYIx,...,-7.504,0,0.216,0.00432,0.00723,0.489,0.65,111.904,262467,en
3,00cqd6ZsSkLZqGMlQCR0Zo,Baby It's Cold Outside (feat. Christina Aguilera),CeeLo Green,I really can't stay Baby it's cold outside I'v...,41,3ssspRe42CXkhPxdc12xcp,CeeLo's Magic Moment,2012-10-29,Christmas Soul,6FZYc2BvF7tColxO8PBShV,...,-5.819,0,0.0341,0.689,0.0,0.0664,0.405,118.593,243067,en
4,00emjlCv9azBN0fzuuyLqy,Dumb Litty,KARD,Get up out of my business You don't keep me fr...,65,7h5X3xhh3peIK9Y0qI5hbK,KARD 2nd Digital Single ‘Dumb Litty’,2019-09-22,K-Party Dance Mix,37i9dQZF1DX4RDXswvP6Mj,...,-1.993,1,0.0409,0.037,0.0,0.138,0.24,130.018,193160,en


## Pre-Processing

In [772]:
## removing non english and duplicates
songs = songs.loc[songs['language'] == "en"]
songs = songs.drop_duplicates(subset=['lyrics'])


## isolating lyrics 
lyrics = list(songs["lyrics"])
track_names = list(songs["track_name"])

In [771]:
## initializing lemmatizer
wl = WordNetLemmatizer()
lyrics_clean = []
i = 0

## cleaning lyrics of songs 
for song in lyrics:
    temp = re.sub('[^a-zA-Z]', ' ', song)
    temp = temp.lower()
    temp = temp.split(" ")
    song_clean = ""
    for word in temp:
        ## lemmatize only useful words
        if len(word) > 3 and not word in set(stopwords.words('english')):
            song_clean += wl.lemmatize(word) + " "
            
    ## list that stores all preprocessed lyrics 
    lyrics_clean.append(song_clean)

KeyboardInterrupt: 

## TFIDF Word Embedding

In [523]:
# creating tfidf vector
tfidf_vectorizer=TfidfVectorizer(use_idf=True) 
tfidf_sparse_matrix = tfidf_vectorizer.fit_transform(lyrics_clean)

In [532]:
## reducing the dimensions 
svd_dimensions = 100
svd = TruncatedSVD(n_components= svd_dimensions, random_state=42)
reduced_tfidf = svd.fit_transform(tfidf_sparse_matrix)

In [779]:
tfidf_dict = {}
## creating a dict with track name being key and its respective vector being the value 
for i in range(len(lyrics_clean)):
    tfidf_dict[songs.iloc[i]["track_name"]] = reduced_tfidf[i].reshape(1, svd_dimensions)
    if songs.iloc[i]["track_name"] == "Hola Beba":
        print("here", i)

## Doc2Vec Word Embedding

In [536]:
## using Doc2Vec to represent lyrics in document vectors 
vec_size = 100
tagged_lyrics = [TaggedDocument(lyrics.split(), [i]) for i, lyrics in enumerate(lyrics_clean)]
doc2vec_matrix = Doc2Vec(tagged_lyrics, vector_size= vec_size, window=2, min_count=1, workers=4)

In [780]:
doc2vec_dict = {}
## creating a dict with track name being key and its respective vector being the value 
for i in range(len(lyrics_clean)):
    doc2vec_dict[songs.iloc[i]["track_name"]] = doc2vec_matrix[i].reshape(1, vec_size)

## Bert pretrained model Word Embedding

In [540]:
from sentence_transformers import SentenceTransformer

## pretrained BERT model that creates sentence embeddings much faster than classical BERT models
small_bert_model = SentenceTransformer('bert-base-nli-mean-tokens')
document_embeddings = small_bert_model.encode(lyrics_clean)


Some weights of the model checkpoint at /Users/marcoliveau/.cache/torch/sentence_transformers/sbert.net_models_bert-base-nli-mean-tokens/0_BERT were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [781]:
sbert_dict = {}
## creating a dict with track name being key and its respective vector being the value 
for i in range(len(lyrics_clean)):
    sbert_dict[songs.iloc[i]["track_name"]] = document_embeddings[i].reshape(1, sbert_reshaper)

In [544]:
sbert_dict[songs.iloc[0]["track_name"]].shape

(1, 768)

## Recommender code

In [628]:
class SongRecommender:
    def __init__(self, playlist, embeddings_dict):
        self.playlist = playlist
        self.embeddings_dict = embeddings_dict
        
    def finding_avg_vector(self):
        vec_list = []
        ## extracting all vectors
        for song in self.playlist:
            vec_list.append(self.embeddings_dict[song])
        ## adding all vectors to a np array
        main_array = vec_list[0]
        for vec in vec_list[1:]:
            np.concatenate((main_array, vec), axis = 0)
        ## finding mean 
        avg_vec = np.array(main_array.mean(0))
        return avg_vec

    def song_weeder(self, recommended_songs):
        final_recommendation = []
        ## collecting only the songs that are NOT in the OG playlist 
        for song in recommended_songs:
            if song[0] not in playlist:
                final_recommendation.append([song[0], song[1], song[2]])
        return final_recommendation

    def n_most_similar(self, user_vec, n):
        similarity_scores = []
        ##finding all similarities 
        for i in range(len(self.embeddings_dict)):
            similarity_scores.append((i, float(cosine_similarity([user_vec], list(self.embeddings_dict.values())[i]))))
        ## sorting scores in descending order, with its respective position 
        sorted_scores = sorted(similarity_scores, key=lambda tup: tup[1], reverse= True)
        recommended_songs = []
        ## collecting names of songs with highest similarity scores, with its respective score 
        for pos, score in sorted_scores[:n*2]:
            recommended_songs.append([songs.iloc[pos]["track_name"], songs.iloc[pos]["track_artist"], score])
        ## removing any recommended songs that already exist in the playlist
        final_recommendation = self.song_weeder(recommended_songs)
        return final_recommendation[:n]

    def print_recommended(self, recommended_songs, n):
        i = 1
        ## printing message for each recommended song 
        for song in recommended_songs[:n]:
            print("Song recommended number", i,  ": "+ song[0], "by -" , song[1], "with a similarity score of ", song[2])
            print("--------------------------------")
            i +=1
        

    

In [629]:
def recommender(playlist, embedding_dict, embedding_name):
    print("WORD EMBEDDING IN USE:", embedding_name)
    print("-----------------------")
    print("PLAYLIST:", playlist)
    print("-----------------------")
    sr = SongRecommender(playlist, embedding_dict)
    user_vec = sr.finding_avg_vector()
    n = round(len(playlist) * 0.4)
    recommended_songs = sr.n_most_similar(user_vec, n)
    sr.print_recommended(recommended_songs, n)

## Testing Recommender with different Word Embeddings

In [None]:
## Creating random playlist for testing 
i= 123
j= 456
k= 789
l = 987
m = 654
n = 321
playlist = [songs.iloc[i]["track_name"], songs.iloc[j]["track_name"], songs.iloc[k]["track_name"], 
            songs.iloc[l]["track_name"], songs.iloc[m]["track_name"], songs.iloc[n]["track_name"]]

In [None]:
recommender(playlist, tfidf_dict, "TFIDF")

In [612]:
recommender(playlist, doc2vec_dict, "DOC2VEC")

WORD EMBEDDING IN USE: DOC2VEC
-----------------------
PLAYLIST: ['April Showers', 'I Lived - Arty Remix', 'We Start Fires', 'You Are My Heart', 'Anywhere', 'Stairway To Heaven']
-----------------------
Song recommended number 1 : Afterlife by - Greyson Chance with a similarity score of  0.6775115728378296
--------------------------------
Song recommended number 2 : Somebody For Me by - Heavy D & The Boyz with a similarity score of  0.6732325553894043
--------------------------------


In [613]:
recommender(playlist, sbert_dict, "SBERT")

WORD EMBEDDING IN USE: SBERT
-----------------------
PLAYLIST: ['April Showers', 'I Lived - Arty Remix', 'We Start Fires', 'You Are My Heart', 'Anywhere', 'Stairway To Heaven']
-----------------------
Song recommended number 1 : Afterlife by - Greyson Chance with a similarity score of  0.7414441704750061
--------------------------------
Song recommended number 2 : DJ Turn It Up by - Yellow Claw with a similarity score of  0.7410348057746887
--------------------------------


## Preprocessing for evaluation data

In [556]:
## loading data from previous notebooks output
a_file = open("data.json", "r")
output = a_file.read()
res = json.loads(output)

In [774]:
## creating new playlists for evaluation, only containing usable songs and playlists of length greater than 10
count = 0
eval_dict = {}
for playlist, song_list in res.items():
    clean_songs = []
    for song in song_list:
        if song in track_names:
            clean_songs.append(song)
    if len(clean_songs) > 9:
        splitter = len(clean_songs) - round(len(clean_songs) * 0.3)
        input_playlist = clean_songs[:splitter]
        eval_data = clean_songs[splitter:]
        eval_dict[playlist] = [input_playlist, eval_data]
        count += 1        

In [749]:
### function to get playlsit of given length
def n_length_playlists(n):
    final_eval = []
    for playlist, song_list in eval_dict.items():
        if len(song_list[0]) + len(song_list[1]) == n:
            final_eval.append(song_list)
    random.shuffle(final_eval)
    return final_eval

In [753]:
o = n_length_playlists(10)
print(o[:2])

[[["I'm Shipping Up To Boston", 'Sympathy For The Devil', 'Uprising', 'Lightning Crashes', "'Till I Collapse", 'Gimme Shelter', 'Smells Like Teen Spirit'], ['In The Air Tonight - 2015 Remastered', 'Born to Run', 'Bad Moon Rising']], [['I Feel It Coming', 'All I Know', 'Six Feet Under', 'Starboy', 'The Only One', 'Needed Me', 'Secrets'], ['LUV', 'Touch It', 'Die For You']]]


## Evaluation functions

In [755]:
from sklearn.metrics import jaccard_score
import math 

## recommender that returns list of track names recommended
def eval_recommender(playlist, embedding_dict, n):
    ## initializing object
    sr = SongRecommender(playlist, embedding_dict)
    ## finding avg vector
    user_vec = sr.finding_avg_vector()
    ## getting most similar songs, with artist and similarity score
    recommended_songs = sr.n_most_similar(user_vec, n)
    y_pred = []
    ## extracting only the song name
    for val in recommended_songs:
        y_pred.append(val[0])
    return y_pred

## getting average accuracy score of a given word embedding model of 69000 playlists
def evaluate_model(embedding_dict, final_eval):
    print("Starting to evaluate")
    cs_sum = 0
    i = 0
    for tupl in final_eval:
        ## extracting "train and test" data
        input_playlist = tupl[0]
        real = tupl[1]
        ## getting avg vector of ytrue
        sr1 = SongRecommender(real, embedding_dict)
        y_true = sr1.finding_avg_vector()
        ## getting recommended songs in list format
        recommended = eval_recommender(input_playlist, embedding_dict, len(real))
        
        ##getting avg vector of recommended songs
        sr2 = SongRecommender(recommended, embedding_dict)
        y_pred = sr2.finding_avg_vector()
        
        ## comparing avg vectors of true and predicted recommended songs
        y_true = np.reshape(y_true, (1, y_true.shape[0]))
        y_pred = np.reshape(y_pred, (1, y_pred.shape[0]))
        
        cs = float(cosine_similarity(y_true, y_pred))
        print("THIS IS COSINE SIMILARTY", cs)
        print("THIS IS I", i)
        if i == 10:
            cs_avg = cs_sum / i
            return cs_avg
        i+=1
        cs_sum += cs
        
    js_avg = js_sum / len(eval_dict)
    return js_avg

def eval_diff_playlist_sizes(sizes, embedding_dict):
    scores = []
    for size in sizes:
        scores.append(evaluate_model(embedding_dict, size))
    return scores


## Evaluating word embeddings 

## Fine tuning TFIDF and Doc2Vec vector dimensions

## Adding Audio features to recommendation (increasing performance?)

In [None]:
##number of songs not in track_names 140002
##number of songs in track_names 2115060