In [9]:
import pandas as pd
import numpy as np
import os
import datetime
import math
import json
import torch
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from spotlight.interactions import Interactions
from spotlight.factorization.implicit import ImplicitFactorizationModel

from tqdm import tqdm

In [188]:
data_path = path = '/Users/haoxinli/DocumentsLocal/Spotify/mpd.v1/data/'
playlist_fn = os.listdir(data_path)

## Preparing data

CF_baseline_Train = []
CF_baseline_Val = []

counter = 0
for fn_index in range(100):
    with open(data_path+playlist_fn[fn_index]) as f:
        data = json.load(f)

    playlists = data['playlists']

    for playlist in playlists:
        pid = playlist['pid']
        for song in playlist['tracks']:
            track_uri = song['track_uri'].split(':')[2]
            if counter < 80:
                CF_baseline_Train.append([pid,track_uri])
            else:
                CF_baseline_Val.append([pid,track_uri])
    counter += 1

with open("CF_baseline_Train_80000.json", "w") as f:
    data_json = json.dump(CF_baseline_Train,f)
    
with open("CF_baseline_Val_20000.json", "w") as f:
    data_json = json.dump(CF_baseline_Val,f)

In [160]:
with open('CF_baseline_Train_40000.json', 'r') as f:
    data = json.load(f)

users = [user for user, song in data]
item = [song for user, song in data]

## Assign id to each playlist title

song_per_user = {}
for user in users:
    if user not in song_per_user.keys():
        song_per_user[user] = 1
    else:
        song_per_user[user] += 1
# print(len(song_per_user))



pids = {}
num = 0
for user in song_per_user.keys():
    pids[user] = num
    num += 1
# print(len(pids))
    
    
users_transform = []
for user in users:
    users_transform.append(pids[user])
# print(len(users_transform))
    
    
## Assign id to each track id
# appearance of each track
count = {}
for i in item:
    if i not in count.keys():
        count[i] = 1
    else:
        count[i] = count[i] + 1
# print(len(count))
        
        
item_id = {}
id = 0
for i in count.keys():
    item_id[i] = id
    id = id+1
# print(len(item_id))


item_transformed = []
for i in item:
    item_transformed.append(item_id[i])
# print(len(item_transformed))
    

ratings = np.ones(len(item_transformed))

# prepare the interaction matrix
# data = Interactions(np.array(users_transform), np.array(item_transformed), ratings)

In [181]:
with open('CF_baseline_Train_40000.json', 'r') as f:
    data = json.load(f)

## Basic

In [189]:
model = ImplicitFactorizationModel(n_iter = 5)
model.fit(data, verbose = 1)
torch.save(model, 'baseline_model_40000')

In [5]:
model = torch.load('baseline_model_40000')

In [155]:
with open('CF_baseline_Val_10000.json', 'r') as f:
    data = json.load(f)
    
validation = []

playlist_num = data[0][0]
songs = []
for item in data:
    playlist_num_new = item[0]
    if playlist_num_new == playlist_num:
        songs.append(item[1])
        playlist_num = playlist_num_new
    else:
        validation.append(songs)
        songs = []
        songs.append(item[1])
        playlist_num = playlist_num_new


In [68]:
# prepare validation x and y
X_val = []
y_val = []
for i in range(len(validation)):
    playlist = validation[i]
    x, y = train_test_split(playlist, train_size = 0.7)
    X_val.append(x)
    y_val.append(y)
with open("/Users/haoxinli/DocumentsLocal/Spotify/X_validation.json", "w") as f:
    data_json = json.dump(X_val,f)
with open("/Users/haoxinli/DocumentsLocal/Spotify/y_validation.json", "w") as f:
    data_json = json.dump(y_val,f)


In [94]:
# load top 500 tracks
top_500 = pd.read_csv('/Users/haoxinli/DocumentsLocal/Spotify/top_500.csv').iloc[:,1].values

In [111]:
top_100 = {}
for i in range(100):
    track_uri = top_500[i].split(':')[2]
    top_100[track_uri] = i+1

In [120]:
scores = []
users = np.array(range(40000))

with open('/Users/haoxinli/DocumentsLocal/Spotify/X_validation.json', 'r') as f:
    validation = json.load(f)

length = 1000

rec = []

for i in tqdm(range(length)):
    playlist=validation[i]
    a = []
    for item in playlist:
        try:
            a.append(item_id[item])
        except:
            pass
    if len(a) > 1:
    
        dic = {}
        score = np.array([0] * len(model.predict(1)))
        sum_w = 0

        new_user = np.array(a)
        random.shuffle(users)
        users = users[0:1000]
        for i in users:
            s = model.predict(i, np.array(new_user))
            cos_sim = cosine_similarity(np.array([s]),np.array([new_user]))
            sum_cos = sum_cos + cos_sim
            score = cos_sim * model.predict(i) + score
        score = score / sum_cos

        for index in np.argsort(score)[-1:-101:-1]:
            dic[id_to_track[index]] = score[index]
        rec.append(dic)

    else:
        rec.append(top_100)

            
with open('/Users/haoxinli/DocumentsLocal/Spotify/CF_recommended_song.json', 'w') as f:
    json.dump(rec, f)

In [121]:
## Evaluation
with open('/Users/haoxinli/DocumentsLocal/Spotify/y_validation.json', 'r') as f:
    y_validation = json.load(f)
with open('/Users/haoxinli/DocumentsLocal/Spotify/CF_recommended_song.json', 'r') as f:
    recs = json.load(f)

In [115]:
with open('/Users/haoxinli/DocumentsLocal/Spotify/CF_recommended_song.json', 'r') as f:
    recommendation = json.load(f)
    
R_precision_scores = []
NDCG_scores = []

with open('/Users/haoxinli/DocumentsLocal/Spotify/y_validation.json') as json_file: 
    val_Y = json.load(json_file)  
for i in range(500):
    rec = recommendation[i][:100]
    Y = val_Y[i]
    R_precision_scores.append(R_precision(rec,Y))
    NDCG_scores.append(NDCG(rec,Y))

In [116]:
def R_precision(rec, Y):
    return len(intersection(recommendation, Y))/len(Y)

def NDCG(rec, Y):
    IDCG = 0
    for i in range(0,len(Y)):
        if i == 0: IDCG += 1
        else: IDCG += 1/math.log((i+2),2)
    DCG = 0
    for i in range(0,len(rec)):
        if i == 0 and rec[i] in Y: DCG += 1
        elif i > 0 and rec[i] in Y: DCG += 1/math.log((i+2),2)     
    return DCG/IDCG


In [117]:
np.mean(R_precision_scores)

0.03394676679529621

In [107]:
np.mean(NDCG_scores)

0.013247911605909105

## Advanced

In [22]:
audio_features_norm = audio_features_norm.iloc[:,1:]

In [23]:
audio_features_values = audio_features_norm.iloc[:,:-1].values

In [25]:
trackid = audio_features_norm['track_id']

0         spotify:track:0UaMYEvWZi0ZqiDOoHU3YI
1         spotify:track:6I9VzXrHxO9rA9A5euc8Ak
2         spotify:track:0WqIKmW4BTrj3eJFmnCKMv
3         spotify:track:1AWQoqb9bSvzTjaLralEkT
4         spotify:track:1lzr43nnXAijIGYnCT8M8H
                          ...                 
681782    spotify:track:3xLmarzSroQuXbTK44UXhD
681783    spotify:track:3ryw10oCE4NhNbanwrzurQ
681784    spotify:track:1kUoHfeBjoChgOPeBjFELn
681785    spotify:track:37IrFeTPLJs1IKxoBDuN8b
681786    spotify:track:6v0CoPvaaJqWo67GTSPBc1
Name: track_id, Length: 681787, dtype: object

In [30]:
## CF + content
def hybrid(recommendation, validation):
    
    hybrid_rec = []
    for i in tqdm(range(len(recommendation))):
#         print(hybrid_rec)
        dictionary = {}
        rec_playlist=recommendation[i].copy()
        test_val = validation[i].copy()

        for song in rec_playlist[:100]:
            score = 0
            song_pos = np.where(trackid == 'spotify:track:' + song)[0]
            for truth_song in test_val:
                truth_song_pos = np.where(trackid == 'spotify:track:' + truth_song)[0]
                try:
                    score = score + cosine_similarity(audio_features_values[truth_song_pos], audio_features_values[song_pos])[0][0]
                except:
                    pass
            dictionary[song] = score
            dictionary = {k: v for k, v in sorted(dictionary.items(), key=lambda item: item[1], reverse=True)}
        hybrid_rec.append(dictionary)
    return hybrid_rec


In [186]:
with open('/Users/haoxinli/DocumentsLocal/Spotify/CF_recommended_song.json', 'r') as f:
    recommendation = json.load(f)
with open("/Users/haoxinli/DocumentsLocal/Spotify/X_validation.json", "r") as f:
    validation = json.load(f)

hybrid_rec = hybrid(recommendation, validation[:1000])
# with open('/Users/haoxinli/DocumentsLocal/Spotify/hybrid_rec.json', 'w') as f:
#     json.dump(hybrid_rec, f)

In [111]:
with open('/Users/haoxinli/DocumentsLocal/Spotify/hybrid_rec.json', 'r') as f:
    hybrid_rec = json.load(f)

In [185]:
# evaluation
hybrid_recommendation = [list(single_score.keys())[:100] for single_score in hybrid_rec]
R_precision_scores = []
NDCG_scores = []

with open('/Users/haoxinli/DocumentsLocal/Spotify/y_validation.json') as json_file: 
    val_Y = json.load(json_file)  
for i in range(1000):
    rec = hybrid_recommendation[i][:100]
    Y = val_Y[i]
    R_precision_scores.append(R_precision(rec,Y))
    NDCG_scores.append(NDCG(rec,Y))

print(np.mean(R_precision_scores))
print(np.mean(NDCG_scores))

0.033521005525262486
0.012958842807103098
