## Import and format json data
Import the playlists from the JSON files and format them into a ratings data frame. (Get the data from https://owncloud.tuwien.ac.at/index.php/s/A8Wx2TpFr0WznZh)

In [1]:
import pandas as pd
import numpy as np
import json

def _read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return the decoded object.

    The file is opened inside a ``with`` block so its handle is closed as soon
    as parsing finishes, instead of staying open until garbage collection.
    """
    with open(path, encoding="utf8") as handle:
        return json.load(handle)


# load data
# Training playlists.
train = _read_json('data/train.json')

# Dev split: `dev` supplies the per-playlist 'num_holdouts', `dev_trun` supplies
# the tracks that are fed to the models (presumably the truncated playlists --
# TODO confirm against the data description).
dev = _read_json('data/dev.json')
dev_trun = _read_json('data/dev_trun.json')

# Test split, same layout as the dev split.
test = _read_json('data/test.json')
test_trun = _read_json('data/test_trun.json')

# Column layouts shared by the dev and the test frames.
_RATING_COLUMNS = ['pid', 'name', 'track_uri', 'rating']
_TRACK_COLUMNS = ['track_uri', 'track_name', 'artist_uri', 'artist_name', 'album_uri', 'album_name', 'duration_ms']


def _build_rating_frames(train_playlists, extra_playlists):
    """Turn playlists into ratings/track frames for one evaluation split.

    Every (playlist, track) pair becomes one implicit rating of 1.0. The
    training playlists are processed first, then the extra (dev or test)
    playlists, so row order matches a plain sequential pass over both.

    Parameters:
        train_playlists: iterable of playlist dicts with 'pid', 'name' and
            'tracks' (each track a dict holding the ``_TRACK_COLUMNS`` keys).
        extra_playlists: same layout; the playlists to predict for.

    Returns:
        A 5-tuple ``(ratings, extra_ratings, tracks, names_by_track, extra_pids)``:
        ratings of train + extra playlists, ratings of the extra playlists
        only, one row per distinct track (first occurrence wins), a dict
        mapping track_uri to the list of names of all playlists containing
        it (one entry per occurrence), and the distinct pids of the extra
        playlists.

    Raises:
        KeyError: if a playlist or track dict lacks one of the expected keys.
    """
    rating_rows = []
    extra_rows = []
    track_rows = []
    names_by_track = {}
    extra_pids = []

    for is_extra, playlists in ((False, train_playlists), (True, extra_playlists)):
        for playlist in playlists:
            if is_extra:
                extra_pids.append(playlist['pid'])
            for track in playlist['tracks']:
                uri = track['track_uri']
                row = (playlist['pid'], playlist['name'], uri, 1.0)
                rating_rows.append(row)
                if is_extra:
                    extra_rows.append(row)
                if uri in names_by_track:
                    names_by_track[uri].append(playlist['name'])
                else:
                    # First time this track is seen: record its metadata once.
                    names_by_track[uri] = [playlist['name']]
                    track_rows.append([track[column] for column in _TRACK_COLUMNS])

    ratings = pd.DataFrame(rating_rows, columns=_RATING_COLUMNS)
    extra_ratings = pd.DataFrame(extra_rows, columns=_RATING_COLUMNS)
    tracks = pd.DataFrame(track_rows, columns=_TRACK_COLUMNS)
    # list(set(...)) removes repeated pids; note that it does not promise to
    # keep the order in which the playlists were read.
    return ratings, extra_ratings, tracks, names_by_track, list(set(extra_pids))


# create ratings df with training + dev for validation (equivalent to 'ratings_train' data frame in Assignment 4.1)
(ratings_train_dev, ratings_only_dev, tracks_dev,
 track_pnames_dev_dict, dev_playlist_pids) = _build_rating_frames(train['playlists'], dev_trun['playlists'])
print(len(dev_playlist_pids))

# create ratings df for testing
(ratings_train_test, ratings_only_test, tracks_test,
 track_pnames_test_dict, test_playlist_pids) = _build_rating_frames(train['playlists'], test_trun['playlists'])
print(len(test_playlist_pids))

# holdouts ('topN' equivalent): pid -> number of held-out tracks, read from the
# dev / test playlist files.
dev_holdouts = {
    entry['pid']: entry['num_holdouts']
    for entry in dev['playlists']
}
test_holdouts = {
    entry['pid']: entry['num_holdouts']
    for entry in test['playlists']
}

# Preview the track table (alternatives: ratings_train_dev.head(), track_pnames_dev_dict).
tracks_dev.head()

# Open question kept from the original notes: should train and dev be combined?
# Train alone has no missing tracks, so a model cannot be tested on it; combined
# with dev, predictions can be made for the missing tracks of the dev playlists.
# EDIT: combining seems right, and likewise train + test -- otherwise train
# would not be part of the model.


100
100


Unnamed: 0,track_uri,track_name,artist_uri,artist_name,album_uri,album_name,duration_ms
0,spotify:track:2CFGQXM3exBO8U9WGve2GY,Elle me dit,spotify:artist:5MmVJVhhYKQ86izuGHzJYA,MIKA,spotify:album:0zY3JRYHC0rycbFUqGcRpV,The Origin Of Love,216933
1,spotify:track:09TcIuH1ZO7i4vicWKoaN2,Papaoutai,spotify:artist:5j4HeCoUlzhfWtjAfM1acR,Stromae,spotify:album:4WW3ME0M2dUoAK5hmbXsN7,racine carrée,232146
2,spotify:track:3Uyt0WO3wOopnUBCe9BaXl,Sur ma route,spotify:artist:7lMgpN1tEBQKpRoUMKB8iw,Black M,spotify:album:1D2Rs9qcENebbiDR5wk88T,Les yeux plus gros que le monde,252573
3,spotify:track:4h95OrjKWYzVIai1EkvZbt,Ta fête,spotify:artist:5j4HeCoUlzhfWtjAfM1acR,Stromae,spotify:album:4WW3ME0M2dUoAK5hmbXsN7,racine carrée,175426
4,spotify:track:3pCN6X0566rmU6P5ZgFIYu,bâtard,spotify:artist:5j4HeCoUlzhfWtjAfM1acR,Stromae,spotify:album:4WW3ME0M2dUoAK5hmbXsN7,racine carrée,208826


### Add playlist information to the tracks DataFrame

In [2]:
ratings_train = ratings_train_dev

# Drop duplicates: keep only the first row of every track_uri. The index labels
# of the surviving rows are left untouched.
tracks_dev = tracks_dev.drop_duplicates(subset='track_uri', keep='first')

# Add the 'pids' column. Despite its name it holds the space-joined NAMES of all
# playlists that contain the track, taken from track_pnames_dev_dict. Indexing
# the dict directly makes a track without an entry fail with a KeyError that
# names the URI.
tracks_dev = tracks_dev.assign(
    pids=tracks_dev['track_uri'].map(lambda uri: ' '.join(track_pnames_dev_dict[uri]))
)

tracks_dev.head()

Unnamed: 0,track_uri,track_name,artist_uri,artist_name,album_uri,album_name,duration_ms,pids
0,spotify:track:2CFGQXM3exBO8U9WGve2GY,Elle me dit,spotify:artist:5MmVJVhhYKQ86izuGHzJYA,MIKA,spotify:album:0zY3JRYHC0rycbFUqGcRpV,The Origin Of Love,216933,FRANCAIS Playlist 2015 Ben Chansons françaises...
1,spotify:track:09TcIuH1ZO7i4vicWKoaN2,Papaoutai,spotify:artist:5j4HeCoUlzhfWtjAfM1acR,Stromae,spotify:album:4WW3ME0M2dUoAK5hmbXsN7,racine carrée,232146,FRANCAIS EA ( ͡° ͜ʖ ͡°) Long Drives PARTY Ch...
2,spotify:track:3Uyt0WO3wOopnUBCe9BaXl,Sur ma route,spotify:artist:7lMgpN1tEBQKpRoUMKB8iw,Black M,spotify:album:1D2Rs9qcENebbiDR5wk88T,Les yeux plus gros que le monde,252573,FRANCAIS french Chansons françaises
3,spotify:track:4h95OrjKWYzVIai1EkvZbt,Ta fête,spotify:artist:5j4HeCoUlzhfWtjAfM1acR,Stromae,spotify:album:4WW3ME0M2dUoAK5hmbXsN7,racine carrée,175426,FRANCAIS International 🎶🎶 Ya
4,spotify:track:3pCN6X0566rmU6P5ZgFIYu,bâtard,spotify:artist:5j4HeCoUlzhfWtjAfM1acR,Stromae,spotify:album:4WW3ME0M2dUoAK5hmbXsN7,racine carrée,208826,FRANCAIS Long Drives International


In [3]:
ratings_train = ratings_train_test

# Drop duplicates: keep only the first row of every track_uri. The index labels
# of the surviving rows are left untouched.
tracks_test = tracks_test.drop_duplicates(subset='track_uri', keep='first')

# Add the 'pids' column. Despite its name it holds the space-joined NAMES of all
# playlists that contain the track, taken from track_pnames_test_dict.
# The values are derived from the rows of tracks_test itself; looping over the
# dev frame here would label test rows by dev positions and look up URIs that
# the test dict may not contain.
tracks_test = tracks_test.assign(
    pids=tracks_test['track_uri'].map(lambda uri: ' '.join(track_pnames_test_dict[uri]))
)

tracks_test.head()

Unnamed: 0,track_uri,track_name,artist_uri,artist_name,album_uri,album_name,duration_ms,pids
0,spotify:track:2CFGQXM3exBO8U9WGve2GY,Elle me dit,spotify:artist:5MmVJVhhYKQ86izuGHzJYA,MIKA,spotify:album:0zY3JRYHC0rycbFUqGcRpV,The Origin Of Love,216933,FRANCAIS Playlist 2015 Ben Chansons françaises...
1,spotify:track:09TcIuH1ZO7i4vicWKoaN2,Papaoutai,spotify:artist:5j4HeCoUlzhfWtjAfM1acR,Stromae,spotify:album:4WW3ME0M2dUoAK5hmbXsN7,racine carrée,232146,FRANCAIS EA ( ͡° ͜ʖ ͡°) Long Drives PARTY Ch...
2,spotify:track:3Uyt0WO3wOopnUBCe9BaXl,Sur ma route,spotify:artist:7lMgpN1tEBQKpRoUMKB8iw,Black M,spotify:album:1D2Rs9qcENebbiDR5wk88T,Les yeux plus gros que le monde,252573,FRANCAIS french Chansons françaises
3,spotify:track:4h95OrjKWYzVIai1EkvZbt,Ta fête,spotify:artist:5j4HeCoUlzhfWtjAfM1acR,Stromae,spotify:album:4WW3ME0M2dUoAK5hmbXsN7,racine carrée,175426,FRANCAIS International 🎶🎶 Ya
4,spotify:track:3pCN6X0566rmU6P5ZgFIYu,bâtard,spotify:artist:5j4HeCoUlzhfWtjAfM1acR,Stromae,spotify:album:4WW3ME0M2dUoAK5hmbXsN7,racine carrée,208826,FRANCAIS Long Drives International


## Import and build recommenders

In [4]:
%load_ext autoreload
%autoreload 2
from Recommender_CB import ContentBasedRecommender
from Recommender_CF_UU import UUCFRecommender
from Recommender_MF import MFRecommender

Build models

In [5]:

## UU-CF
#uucf = UUCFRecommender()
#uucf.build_model(ratings_train_dev)

In [6]:
## Content-based 
cbr_extended = ContentBasedRecommender('extended')
#%time cbr_extended.build_model(ratings_only_dev, tracks_dev)
%time cbr_extended.build_model(ratings_train_dev, tracks_dev)

cbr_general = ContentBasedRecommender('general')
#%time cbr_general.build_model(ratings_only_dev, tracks_dev)
%time cbr_extended.build_model(ratings_train_dev, tracks_dev)

ok
ok playlist profiles
Wall time: 16min 33s
ok
ok playlist profiles
Wall time: 16min 40s


In [7]:
## sample

#%time cbr_extended_recs = cbr_extended.recommend(498917, topN = dev_holdouts.get(498917))
#print(cbr_extended_recs)

#%time cbr_general_recs = cbr_general.recommend(498917, topN = dev_holdouts.get(498917))
#print(cbr_general_recs)

## Parameter tuning
Parameters are tuned by evaluating the performance of different parameter settings on the train+dev (?) set. The selected settings are then used for the test set.

In [8]:
## Load ground truth

import metrics
# Read the dev ground truth file in one go.
infile = 'data/dev_gt.txt'
with open(infile, 'rt') as f_i:
    raw_text = f_i.read()

# One record per line; surrounding spaces and parentheses are trimmed.
split_pits = [record.strip(' ()') for record in raw_text.strip().split('\n')]

# The first ', '-separated field of a record is the key (kept as a string),
# the remaining fields, stripped of spaces, are its target values.
target_sets = {}
for tup in split_pits:
    k, *fields = tup.split(', ')
    target_sets[k] = [field.strip(' ') for field in fields]
        
#target_sets

In [9]:
## testing one recommendation

#target_set = target_sets.get('498917')

#k = dev_holdouts.get(498917)

#TODO: Evaluate results:
# CB Accuracy
#print(metrics.r_precision(target_set, cbr_extended_recs, k))
#print(metrics.r_precision(target_set, cbr_general_recs, k))
#CB NDCG
#print(metrics.ndcg(target_set, cbr_extended_recs, k))
#print(metrics.ndcg(target_set, cbr_general_recs, k))


#TODO: Parameter tuning
## Content-based 
#cbr = ContentBasedRecommender('extended')

In [None]:
## Running extended and general CB recommendation on a subset of dev playlists and taking the one with the better average

# extended
file_output = []
R_precisions = {}
NDCGs = {}
R_list = []
NDCG_list = []

print("Extended Dev------------------------", flush=True)
for pid in dev_playlist_pids[:5]:  
    %time cbr_extended_recs = cbr_extended.recommend(pid, topN = dev_holdouts.get(pid))
    R_precisions[pid] = metrics.r_precision(target_sets.get(str(pid)), cbr_extended_recs, dev_holdouts.get(pid))
    NDCGs[pid] = metrics.ndcg(target_sets.get(str(pid)), cbr_extended_recs, dev_holdouts.get(pid))
    #--- for average calc
    R_list.append(metrics.r_precision(target_sets.get(str(pid)), cbr_extended_recs, dev_holdouts.get(pid)))
    NDCG_list.append(metrics.ndcg(target_sets.get(str(pid)), cbr_extended_recs, dev_holdouts.get(pid)))
    #---
    to_append_to_file = [pid]
    to_append_to_file.extend([cbr_extended_recs])
    file_output.append(to_append_to_file)
    print('pid: ', pid, 'r_prec: ', R_precisions[pid], ' ', 'NDCG: ', NDCGs[pid], flush=True)

print('Average r_prec: ', sum(R_list) / float(len(R_list)), ' ', 'Average NDCG: ', sum(NDCG_list) / float(len(NDCG_list)), flush=True)
print(file_output, flush=True)


# general
file_output = []
R_precisions = {}
NDCGs = {}
R_list = []
NDCG_list = []

print("General Dev------------------------", flush=True)
for pid in dev_playlist_pids[:5]:  
    %time cbr_general_recs = cbr_general.recommend(pid, topN = dev_holdouts.get(pid))
    R_precisions[pid] = metrics.r_precision(target_sets.get(str(pid)), cbr_general_recs, dev_holdouts.get(pid))
    NDCGs[pid] = metrics.ndcg(target_sets.get(str(pid)), cbr_general_recs, dev_holdouts.get(pid))
    #--- for average calc
    R_list.append(metrics.r_precision(target_sets.get(str(pid)), cbr_general_recs, dev_holdouts.get(pid)))
    NDCG_list.append(metrics.ndcg(target_sets.get(str(pid)), cbr_general_recs, dev_holdouts.get(pid)))
    #---
    to_append_to_file = [pid]
    to_append_to_file.extend([cbr_general_recs])
    file_output.append(to_append_to_file)
    print('pid: ', pid, 'r_prec: ', R_precisions[pid], ' ', 'NDCG: ', NDCGs[pid], flush=True)

print('Average r_prec: ', sum(R_list) / float(len(R_list)), ' ', 'Average NDCG: ', sum(NDCG_list) / float(len(NDCG_list)), flush=True)  
print(file_output, flush=True)


Extended Dev------------------------


## Evaluation on test set and writing to file

In [None]:
## Load ground truth

import metrics
# Read the test ground truth file in one go.
infile = 'data/test_gt.txt'
with open(infile, 'rt') as f_i:
    raw_text = f_i.read()

# One record per line; surrounding spaces and parentheses are trimmed.
split_pits = [record.strip(' ()') for record in raw_text.strip().split('\n')]

# The first ', '-separated field of a record is the key (kept as a string),
# the remaining fields, stripped of spaces, are its target values.
target_sets = {}
for tup in split_pits:
    k, *fields = tup.split(', ')
    target_sets[k] = [field.strip(' ') for field in fields]
        

In [None]:
## Content-based

import  csv

# build model
cbr_extended = ContentBasedRecommender('extended')
cbr_extended.build_model(ratings_train_test, tracks_test)


# recommend for var#to_evaluate playlists in test set
to_evaluate = 10
file_output = []
R_precisions = {}
NDCGs = {}
R_list = []
NDCG_list = []

print("Extended Test------------------------", flush=True)
for pid in test_playlist_pids[:to_evaluate]:  
    %time cbr_extended_recs = cbr_extended.recommend(pid, topN = test_holdouts.get(pid))
    R_precisions[pid] = metrics.r_precision(target_sets.get(str(pid)), cbr_extended_recs, test_holdouts.get(pid))
    NDCGs[pid] = metrics.ndcg(target_sets.get(str(pid)), cbr_extended_recs, test_holdouts.get(pid))
    #--- for average calc
    R_list.append(metrics.r_precision(target_sets.get(str(pid)), cbr_extended_recs, test_holdouts.get(pid)))
    NDCG_list.append(metrics.ndcg(target_sets.get(str(pid)), cbr_extended_recs, test_holdouts.get(pid)))
    #---
    to_append_to_file = [pid]
    to_append_to_file.extend([cbr_extended_recs])
    file_output.append(to_append_to_file)
    print('pid: ', pid, 'r_prec: ', R_precisions[pid], ' ', 'NDCG: ', NDCGs[pid], flush=True)

print('Average r_prec: ', sum(R_list) / float(len(R_list)), ' ', 'Average NDCG: ', sum(NDCG_list) / float(len(NDCG_list)), flush=True)
print(file_output, flush=True)

with open('data/test_cb_extended_recommendations', 'w') as output:
    wr = csv.writer(output)
    wr.writerows(file_output)


# general
file_output = []
R_precisions = {}
NDCGs = {}
R_list = []
NDCG_list = []

print("General Test------------------------", flush=True)
for pid in test_playlist_pids[:to_evaluate]:  
    %time cbr_general_recs = cbr_general.recommend(pid, topN = test_holdouts.get(pid))
    R_precisions[pid] = metrics.r_precision(target_sets.get(str(pid)), cbr_general_recs, k)
    NDCGs[pid] = metrics.ndcg(target_sets.get(str(pid)), cbr_general_recs, k)
    #--- for average calc
    R_list.append(metrics.r_precision(target_sets.get(str(pid)), cbr_general_recs, k))
    NDCG_list.append(metrics.ndcg(target_sets.get(str(pid)), cbr_general_recs, k))
    #---
    to_append_to_file = [pid]
    to_append_to_file.extend([cbr_general_recs])
    file_output.append(to_append_to_file)
    print('pid: ', pid, 'r_prec: ', R_precisions[pid], ' ', 'NDCG: ', NDCGs[pid], flush=True)

print('Average r_prec: ', sum(R_list) / float(len(R_list)), ' ', 'Average NDCG: ', sum(NDCG_list) / float(len(NDCG_list)), flush=True)
print(file_output, flush=True)

with open('data/test_cb_general_recommendations', 'w') as output:
    wr = csv.writer(output)
    wr.writerows(file_output)
    


In [None]:
## UU-CF


# build model
#cbr_extended = ContentBasedRecommender('extended')
#cbr_extended.build_model(ratings_train_test, tracks_test)

# recommend for var#to_evaluate playlists in test set
#to_evaluate = 10
#file_output = []
#R_precisions = {}
#NDCGs = {}
#R_list = []
#NDCG_list[]

#print("Extended")
#for pid in dev_playlist_pids[:to_evaluate]:  
#    %time cbr_extended_recs = cbr_extended.recommend(pid, topN = test_holdouts.get(pid))
#    R_precisions[pid] = metrics.r_precision(target_sets.get(str(pid)), cbr_extended_recs, k)
#    NDCGs[pid] = metrics.ndcg(target_sets.get(str(pid)), cbr_extended_recs, k)
#    #--- for average calc
#    R_list.append(metrics.r_precision(target_sets.get(str(pid)), cbr_extended_recs, k))
#    NDCG_list.append(metrics.ndcg(target_sets.get(str(pid)), cbr_extended_recs, k))
#    #---
#    to_append_to_file = [pid]
#    to_append_to_file.extend([cbr_extended_recs])
#    file_output.append(to_append_to_file)
#    print('pid: ', pid, 'r_prec: ', R_precisions[pid], ' ', 'NDCG: ', NDCGs[pid])

#print('Average r_prec: ', sum(R_list) / float(len(R_list)), ' ', 'Average NDCG: ', sum(NDCG_list) / float(len(NDCG_list)))
#print(file_output)

#with open('data/test_cb_recommendations', 'w') as output:
#    wr = csv.writer(output)
#    wr.writerows(file_output)
    
