In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob

In [2]:
# Load the raw track table: each row pairs a playlist id (pid) with one track
# and that track's audio features (see the preview below).
df = pd.read_csv('first30.csv')
df.head()

Unnamed: 0,pid,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0,0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,0.904,0.813,4.0,-7.105,0.0,0.121,0.0311,0.00697,0.0471,0.81,125.461
1,0,1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,spotify:album:0z7pVBGOD7HCIB7S8eLkLI,198800,In The Zone,0.774,0.838,5.0,-3.914,0.0,0.114,0.0249,0.025,0.242,0.924,143.04
2,0,2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Crazy In Love,spotify:album:25hVFAxTlDvXbx2X2QkUkE,235933,Dangerously In Love (Alben für die Ewigkeit),0.664,0.758,2.0,-6.583,0.0,0.21,0.00238,0.0,0.0598,0.701,99.259
3,0,3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7,Rock Your Body,spotify:album:6QPkyl04rXwTGlGlcYaRoW,267266,Justified,0.891,0.714,4.0,-6.055,0.0,0.14,0.202,0.000234,0.0521,0.818,100.972
4,0,4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,spotify:album:6NmFmPX56pcLBOFMhIiKvF,227600,Hot Shot,0.853,0.606,0.0,-4.596,1.0,0.0713,0.0561,0.0,0.313,0.654,94.759


In [None]:
# Construct the dictionary: playlist id -> list of its track URIs, in file order.
# A grouped aggregation replaces the row-by-row iterrows() loop, which builds a
# Series object for every row and is very slow on large tables. sort=False keeps
# the playlists in first-appearance order, exactly as the loop did.
dic = df.groupby('pid', sort=False)['track_uri'].agg(list).to_dict()

In [None]:
# Inspect the column names of the loaded table.
df.columns

In [None]:
# Playlist id plus the numeric audio-feature columns that are kept for
# building playlist-level features.
useful_col = [
    'pid',
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]

In [None]:
# keep song features
# NOTE: this rebinds `df` to the reduced table, so columns outside useful_col
# (e.g. `track_uri`) are no longer reachable through `df` after this cell runs.
df = df[useful_col]
df.head()

In [None]:
# Summarize every playlist by the mean, max and min of each audio feature of
# its tracks.
# NOTE(review): 'mode' and 'acousticness' are kept in useful_col but are not
# aggregated here -- confirm that the omission is intentional.
df1 = df.groupby('pid').agg({
    feature: ['mean', 'max', 'min']
    for feature in (
        'danceability',
        'energy',
        'key',
        'loudness',
        'speechiness',
        'instrumentalness',
        'liveness',
        'valence',
        'tempo',
    )
})

In [None]:
# Featurize a playlist by using the features of the songs that are in it;
# preview the resulting per-playlist table.
df1.head()

In [None]:
# Column labels of the aggregated table: one (feature, statistic) pair per column.
df1.columns

In [None]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# create cosine similarity matrix (playlist x playlist, larger = more similar)
# NOTE(review): df1 is passed unscaled here, whereas the Euclidean matrix is
# computed on standardized features -- confirm that this difference is intended.
cos_sim = cosine_similarity(df1)
cos_sim

In [None]:
# create Euclidean distance matrix on standardized playlist features
# NOTE: despite its name, `euc_sim` holds distances (smaller = more similar).
sc = StandardScaler()
df_sc = sc.fit_transform(df1)
euc_sim = euclidean_distances(df_sc)
euc_sim

In [None]:
def recommend_playlist(base, dic, cos_sim, split_ratio, cos=True):
    """Recommend tracks for playlist ``base`` from its nearest-neighbour playlists.

    The first ``int(split_ratio * n)`` entries of ``dic[base]`` (``n`` being the
    number of distinct tracks in that playlist) are treated as the known seed.
    Neighbours are visited from closest to farthest, and their tracks that are
    neither in the seed nor already recommended are collected until ``n``
    tracks have been gathered or the neighbours run out.

    Parameters
    ----------
    base : int
        Playlist id. Used both as a key of ``dic`` and as a row index of
        ``cos_sim``.
    dic : dict
        Maps playlist id -> list of track URIs. Position ``i`` of the matrix is
        looked up as ``dic[i]``, so playlist ids must coincide with matrix
        positions.
    cos_sim : 2-D array-like
        Pairwise playlist matrix: similarities when ``cos`` is true, distances
        otherwise.
    split_ratio : float
        Fraction of the playlist that forms the seed.
    cos : bool, default True
        True  -> larger values mean closer playlists (descending order).
        False -> smaller values mean closer playlists (ascending order).

    Returns
    -------
    list
        At most ``n`` distinct track URIs, ordered by neighbour rank and then
        by position inside the neighbour playlist.
    """
    num_base = len(set(dic[base]))
    ratio = int(split_ratio * num_base)

    # Rank every playlist by closeness to `base`; a stable sort keeps the
    # ranking reproducible when values tie.
    order = np.argsort(np.asarray(cos_sim[base]), kind="stable")
    if cos:
        order = order[::-1]
    # Remove the base playlist by id instead of assuming it sorts first: on a
    # tie the old "drop the first entry" logic could discard a real neighbour
    # and keep `base`, leaking its own held-out tracks into the result.
    order = order[order != base]

    res = []
    # Tracks that must not be recommended: the seed, plus everything already
    # picked (so the same track is never recommended twice).
    seen = set(dic[base][:ratio])
    for p in order:
        if len(res) >= num_base:
            break
        # Walk the neighbour in playlist order so that truncation is
        # deterministic (set iteration order varies between runs).
        for track in dic[p]:
            if track in seen:
                continue
            seen.add(track)
            res.append(track)
            if len(res) >= num_base:
                break
    return res

In [None]:
# demo to recommend songs for playlist 0
# (split_ratio=0.8, neighbours ranked by cosine similarity)
recommend_playlist(0, dic, cos_sim, 0.8)

In [None]:
def eval_playlist(test_tracks, recommended_tracks):
    """Score a recommendation list against the held-out tracks (r-precision).

    Both inputs are reduced to sets, so duplicates do not count twice.

    Parameters
    ----------
    test_tracks : iterable
        Held-out tracks of the playlist.
    recommended_tracks : iterable
        Tracks produced by the recommender.

    Returns
    -------
    float
        Share of the distinct held-out tracks that appear among the
        recommendations; 0.0 when there are no held-out tracks (instead of
        raising ZeroDivisionError).
    """
    test = set(test_tracks)
    if not test:
        # Nothing to recover: report a zero score rather than dividing by zero.
        return 0.0
    pred = set(recommended_tracks)
    return len(test & pred) / len(test)


def recommend_eval(dic_eval, dic, split_ratio, cos_sim, cos=True):
    """Evaluate the recommender on every playlist of ``dic_eval``.

    For each playlist id in ``dic_eval``, recommendations are generated with
    ``recommend_playlist`` (using ``dic`` and ``cos_sim``) and scored with
    ``eval_playlist`` against the tail of the evaluation playlist, i.e. the
    entries from index ``int(split_ratio * len(playlist))`` onward.

    Returns the list of scores in the iteration order of ``dic_eval``.
    """
    scores = []
    for base, tracks in dic_eval.items():
        cutoff = int(split_ratio * len(tracks))
        recommended = recommend_playlist(base, dic, cos_sim, split_ratio, cos)
        scores.append(eval_playlist(tracks[cutoff:], recommended))
    return scores

In [None]:
# loading test data
# NOTE: this rebinds `df1`, which until now held the per-playlist feature table.
df1 = pd.read_csv('songs0.csv')

In [None]:
# construct dictionary for test data: playlist id -> track URIs in file order.
# A grouped aggregation replaces the slow row-by-row iterrows() loop; sort=False
# keeps the playlists in first-appearance order, as the loop did.
dic1 = df1.groupby('pid', sort=False)['track_uri'].agg(list).to_dict()

# cosine similarity

In [None]:
# Per-playlist scores for the test playlists: split_ratio 0.8, neighbours
# ranked by cosine similarity.
res = recommend_eval(dic1, dic, 0.8, cos_sim)

In [None]:
# Show the list of per-playlist scores.
res

In [None]:
# Average score over all evaluated playlists (cosine similarity).
np.mean(res)

In [None]:
# Histogram of the per-playlist scores obtained with cosine similarity.
# (Spelling of the user-facing labels corrected.)
plt.figure(figsize=(10,5))
plt.title('distribution of r-precision using Nearest Neighbor with song features')
plt.xlabel('r-precision')
plt.ylabel('count')
plt.hist(res, bins=80)
plt.show()

In [None]:
# Keep only the non-zero scores.
res1 = np.take(res, np.nonzero(res)[0])

In [None]:
# Histogram of the non-zero scores obtained with cosine similarity.
# (Spelling of the user-facing labels corrected.)
plt.figure(figsize=(8,6))
plt.title('r-precision among the first 31000 playlists with non-zero r-precisions')
plt.xlabel('r-precision')
plt.ylabel('count')
plt.hist(res1, bins=80)
plt.show()

# euclidean distance

In [None]:
# Same evaluation with the Euclidean distance matrix; cos=False makes the
# recommender rank neighbours in ascending order of the matrix values.
res_euc = recommend_eval(dic1, dic, 0.8, euc_sim, False)

In [None]:
# Average score over all evaluated playlists (Euclidean distance).
np.mean(res_euc)

In [None]:
# Histogram of the per-playlist scores obtained with Euclidean distance.
# (Spelling of the user-facing labels corrected.)
plt.figure(figsize=(8,6))
plt.title('r-precision among the first 31000 playlists')
plt.xlabel('r-precision')
plt.ylabel('count')
plt.hist(res_euc, bins=80)
plt.show()

In [None]:
# Keep only the non-zero scores of the Euclidean-distance run.
res2 = np.take(res_euc, np.nonzero(res_euc)[0])

In [None]:
# Histogram of the non-zero scores obtained with Euclidean distance.
# (Spelling of the user-facing labels corrected.)
plt.figure(figsize=(8,6))
plt.title('r-precision among the first 31000 playlists with non-zero r-precisions')
plt.xlabel('r-precision')
plt.ylabel('count')
plt.hist(res2, bins=80)
plt.show()

In [None]:
# Average of the non-zero scores (Euclidean distance).
np.mean(res2)