## Import and format json data
Import the playlists from the JSON files and format them into ratings data frames. (Get the data from https://owncloud.tuwien.ac.at/index.php/s/A8Wx2TpFr0WznZh)

In [100]:
import pandas as pd
import numpy as np
import json

# load data
def _load_json(path):
    """Parse the UTF-8 encoded JSON file at `path` and return the resulting object.

    The file is opened in a `with` block so the handle is closed as soon as
    parsing finishes instead of being left open for the life of the kernel.
    """
    with open(path, encoding="utf8") as json_file:
        return json.load(json_file)


train = _load_json('data/train.json')

dev = _load_json('data/dev.json')
dev_trun = _load_json('data/dev_trun.json')

test = _load_json('data/test.json')
test_trun = _load_json('data/test_trun.json')

def _build_ratings(train_playlists, extra_playlists):
    """Flatten two playlist collections into ratings/tracks data frames.

    Parameters
    ----------
    train_playlists : iterable of dict
        Playlists whose tracks go into the combined ratings only.
    extra_playlists : iterable of dict
        Playlists whose tracks go into the combined ratings AND into the
        separate "extra only" ratings.

    Each playlist dict is read for 'pid', 'name' and 'tracks'; each track dict
    is read for the keys listed in `track_columns` below.

    Returns
    -------
    (ratings, extra_ratings, tracks, track_pnames)
        ratings       -- DataFrame with one row per (playlist, track) occurrence
                         of both collections, rating fixed to 1.0
        extra_ratings -- same layout, rows of `extra_playlists` only
        tracks        -- DataFrame with one row per distinct track_uri, in
                         first-seen order
        track_pnames  -- dict mapping track_uri to the list of names of the
                         playlists it occurs in (one entry per occurrence)
    """
    rating_columns = ['pid', 'name', 'track_uri', 'rating']
    track_columns = ['track_uri', 'track_name', 'artist_uri', 'artist_name',
                     'album_uri', 'album_name', 'duration_ms']

    rating_rows = []
    extra_rows = []
    track_rows = []
    track_pnames = dict()

    for is_extra, playlists in ((False, train_playlists), (True, extra_playlists)):
        for playlist in playlists:
            for track in playlist['tracks']:
                uri = track['track_uri']
                row = [playlist['pid'], playlist['name'], uri, 1.0]
                rating_rows.append(row)
                if is_extra:
                    extra_rows.append(row)
                if uri not in track_pnames:
                    # first time this track is seen: record its metadata once
                    track_pnames[uri] = []
                    track_rows.append([track[column] for column in track_columns])
                track_pnames[uri].append(playlist['name'])

    ratings = pd.DataFrame(rating_rows, columns = rating_columns)
    extra_ratings = pd.DataFrame(extra_rows, columns = rating_columns)
    tracks = pd.DataFrame(track_rows, columns = track_columns)
    return ratings, extra_ratings, tracks, track_pnames


# create ratings df with training + dev for validation (equivalent to 'ratings_train' data frame in Assignment 4.1)
ratings_train_dev, ratings_only_dev, tracks_dev, track_pnames_dev_dict = _build_ratings(
    train['playlists'], dev_trun['playlists'])

# create ratings df for testing
ratings_train_test, ratings_only_test, tracks_test, track_pnames_test_dict = _build_ratings(
    train['playlists'], test_trun['playlists'])

# Number of held-out tracks per playlist id (plays the role of 'topN').
dev_holdouts = dict((entry['pid'], entry['num_holdouts']) for entry in dev['playlists'])
test_holdouts = dict((entry['pid'], entry['num_holdouts']) for entry in test['playlists'])

# Quick inspection helpers (uncomment one to display it):
#ratings_train_dev.head()
#tracks_dev.head()
#track_pnames_dev_dict

# Open question: do train and dev need to be combined? Train alone doesn't miss
# any tracks, so the model cannot be tested on it. Combining it with dev lets us
# make predictions for the missing tracks in the dev playlists.
# EDIT: combining seems right, and likewise train + test -- otherwise train
# isn't part of the model.


{'spotify:track:2CFGQXM3exBO8U9WGve2GY': ['FRANCAIS',
  'Playlist 2015',
  'Ben',
  'Chansons françaises',
  'Currently'],
 'spotify:track:09TcIuH1ZO7i4vicWKoaN2': ['FRANCAIS',
  'EA',
  ' ( ͡° ͜ʖ ͡°) ',
  'Long Drives',
  'PARTY',
  'Chill',
  'Hype playlist',
  '2014',
  'International',
  'Alvaro',
  'kids party',
  'General',
  'Spring',
  'poppin',
  'Electronic',
  'nutella',
  'Chansons françaises',
  'Around the World',
  'January 2014',
  'Currently',
  'Ya',
  'Dance',
  'Pregame'],
 'spotify:track:3Uyt0WO3wOopnUBCe9BaXl': ['FRANCAIS',
  'french',
  'Chansons françaises'],
 'spotify:track:4h95OrjKWYzVIai1EkvZbt': ['FRANCAIS',
  'International',
  '🎶🎶',
  'Ya'],
 'spotify:track:3pCN6X0566rmU6P5ZgFIYu': ['FRANCAIS',
  'Long Drives',
  'International'],
 'spotify:track:6rfKtMnWCmZnRQuyGe5KM2': ['FRANCAIS', 'MEXICO', "CRUISIN' "],
 'spotify:track:6rksq8tMk1eUQ3gas0NO1v': ['FRANCAIS'],
 'spotify:track:1vELwZM3uBYWd8HrtddD4l': ['FRANCAIS',
  'french',
  'General',
  'Español'],
 's

### Add playlist information to the tracks DataFrame

In [104]:
ratings_train = ratings_train_dev

import collections  # not used below; kept in case other cells rely on this import

# Drop duplicates: keep only the first row for every track_uri.
# (The previous loop looked rows up through an undefined `tracks` frame, which
# raises NameError as soon as a duplicate is actually present.)
tracks_dev = tracks_dev.drop_duplicates(subset='track_uri', keep='first')

# Add a `pids` column holding the space-joined names of all playlists the track
# occurs in, taken from track_pnames_dev_dict. Tracks without an entry get ''.
tracks_dev = tracks_dev.assign(
    pids=tracks_dev['track_uri'].map(
        lambda uri: ' '.join(track_pnames_dev_dict.get(uri, []))
    )
)

tracks_dev.head()

Unnamed: 0,track_uri,track_name,artist_uri,artist_name,album_uri,album_name,duration_ms,pids
0,spotify:track:2CFGQXM3exBO8U9WGve2GY,Elle me dit,spotify:artist:5MmVJVhhYKQ86izuGHzJYA,MIKA,spotify:album:0zY3JRYHC0rycbFUqGcRpV,The Origin Of Love,216933,FRANCAIS Playlist 2015 Ben Chansons françaises...
1,spotify:track:09TcIuH1ZO7i4vicWKoaN2,Papaoutai,spotify:artist:5j4HeCoUlzhfWtjAfM1acR,Stromae,spotify:album:4WW3ME0M2dUoAK5hmbXsN7,racine carrée,232146,FRANCAIS EA ( ͡° ͜ʖ ͡°) Long Drives PARTY Ch...
2,spotify:track:3Uyt0WO3wOopnUBCe9BaXl,Sur ma route,spotify:artist:7lMgpN1tEBQKpRoUMKB8iw,Black M,spotify:album:1D2Rs9qcENebbiDR5wk88T,Les yeux plus gros que le monde,252573,FRANCAIS french Chansons françaises
3,spotify:track:4h95OrjKWYzVIai1EkvZbt,Ta fête,spotify:artist:5j4HeCoUlzhfWtjAfM1acR,Stromae,spotify:album:4WW3ME0M2dUoAK5hmbXsN7,racine carrée,175426,FRANCAIS International 🎶🎶 Ya
4,spotify:track:3pCN6X0566rmU6P5ZgFIYu,bâtard,spotify:artist:5j4HeCoUlzhfWtjAfM1acR,Stromae,spotify:album:4WW3ME0M2dUoAK5hmbXsN7,racine carrée,208826,FRANCAIS Long Drives International


In [110]:
ratings_train = ratings_train_test

import collections  # not used below; kept in case other cells rely on this import

# Drop duplicates: keep only the first row for every track_uri.
tracks_test = tracks_test.drop_duplicates(subset='track_uri', keep='first')

# Add a `pids` column holding the space-joined names of all playlists the track
# occurs in, taken from track_pnames_test_dict. Tracks without an entry get ''.
# The column is derived from tracks_test itself: the previous loop walked
# tracks_dev, so dev-only tracks were looked up in the test dict and rows that
# exist only in tracks_test were never filled.
tracks_test = tracks_test.assign(
    pids=tracks_test['track_uri'].map(
        lambda uri: ' '.join(track_pnames_test_dict.get(uri, []))
    )
)

tracks_test.head()

Unnamed: 0,track_uri,track_name,artist_uri,artist_name,album_uri,album_name,duration_ms,pids
0,spotify:track:2CFGQXM3exBO8U9WGve2GY,Elle me dit,spotify:artist:5MmVJVhhYKQ86izuGHzJYA,MIKA,spotify:album:0zY3JRYHC0rycbFUqGcRpV,The Origin Of Love,216933,FRANCAIS Playlist 2015 Ben Chansons françaises...
1,spotify:track:09TcIuH1ZO7i4vicWKoaN2,Papaoutai,spotify:artist:5j4HeCoUlzhfWtjAfM1acR,Stromae,spotify:album:4WW3ME0M2dUoAK5hmbXsN7,racine carrée,232146,FRANCAIS EA ( ͡° ͜ʖ ͡°) Long Drives PARTY Ch...
2,spotify:track:3Uyt0WO3wOopnUBCe9BaXl,Sur ma route,spotify:artist:7lMgpN1tEBQKpRoUMKB8iw,Black M,spotify:album:1D2Rs9qcENebbiDR5wk88T,Les yeux plus gros que le monde,252573,FRANCAIS french Chansons françaises
3,spotify:track:4h95OrjKWYzVIai1EkvZbt,Ta fête,spotify:artist:5j4HeCoUlzhfWtjAfM1acR,Stromae,spotify:album:4WW3ME0M2dUoAK5hmbXsN7,racine carrée,175426,FRANCAIS International 🎶🎶 Ya
4,spotify:track:3pCN6X0566rmU6P5ZgFIYu,bâtard,spotify:artist:5j4HeCoUlzhfWtjAfM1acR,Stromae,spotify:album:4WW3ME0M2dUoAK5hmbXsN7,racine carrée,208826,FRANCAIS Long Drives International


## Import and test recommenders

In [105]:
%load_ext autoreload
%autoreload 2
from Recommender_CB import ContentBasedRecommender
from Recommender_CF_UU import UUCFRecommender
from Recommender_MF import MFRecommender

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Smoke test of the collaborative recommenders on a single dev playlist.
# `holdouts` was never defined; the per-playlist hold-out counts live in
# dev_holdouts, which the other cells index with the integer pid.
# NOTE(review): the pid is passed to recommend() as an int here, matching the
# ContentBasedRecommender calls -- confirm MFRecommender / UUCFRecommender
# expect the same type.
# NOTE(review): the recorded run of this cell stopped with
# "AttributeError: 'DataFrame' object has no attribute 'movieId'", so these
# recommenders presumably still expect different column names than the
# ['pid', 'name', 'track_uri', 'rating'] frame built above -- verify in the
# recommender modules.

## Matrix Fac.
mfr = MFRecommender()
mfr.build_model(ratings_train_dev)

recs = mfr.recommend(498917, topN = dev_holdouts[498917])
print(recs)


## UU-CF
uucf = UUCFRecommender()
uucf.build_model(ratings_train_dev)

recs = uucf.recommend(498917, topN = dev_holdouts[498917])
print(recs)

AttributeError: 'DataFrame' object has no attribute 'movieId'

In [123]:
## Content-based 
cbr_extended = ContentBasedRecommender('extended')
%time cbr_extended.build_model(ratings_only_dev, tracks_dev)
#cbr_extended.build_model(ratings_train_dev, tracks)

cbr_general = ContentBasedRecommender('general')
%time cbr_general.build_model(ratings_only_dev, tracks_dev)
#cbr_extended.build_model(ratings_train_dev, tracks)

ok
ok playlist profiles
Wall time: 27.7 s
ok
ok playlist profiles
Wall time: 26.5 s


In [107]:
# Recommend as many tracks for dev playlist 498917 as were held out from it,
# once per content-based model.
cbr_extended_recs = cbr_extended.recommend(498917, topN = dev_holdouts.get(498917))
print(cbr_extended_recs)  # was `cb_extended_recs`, an undefined name

cbr_general_recs = cbr_general.recommend(498917, topN = dev_holdouts.get(498917))
print(cbr_general_recs)

['spotify:track:0pBLfQ5JBjh12H6DGZjMwM', 'spotify:track:6ZANrVuAMp2rpjhfbOuJly', 'spotify:track:7pJgjBf82BrUQ3z7HdQvW1', 'spotify:track:37IToWWYUc9nuH3ijz4tqV', 'spotify:track:3CJvmtWw2bJsudbAC5uCQk', 'spotify:track:3IMXLQk09YmvNb2tdT4aWk', 'spotify:track:3viZcRlE38APncWJ5VGM4q', 'spotify:track:29DwxnpFOLlm1fKZOitwAA', 'spotify:track:3LuJPqTyPuVfm9VemXTWHo', 'spotify:track:30lgD1UuHczwlxa7NZFeSQ', 'spotify:track:5P3yUXUC9rZPJPNmYGKEAz', 'spotify:track:3CRDbSIZ4r5MsZ0YwxuEkn', 'spotify:track:7qxjGHW485TL8ciwkHD5MK', 'spotify:track:6GmkJJMe9U1tEcrJ3Hq3A1', 'spotify:track:40sRwFjfrRnaTdYPJIg4CP', 'spotify:track:4Oyl6oYSNeeZZP0OAxPVaU', 'spotify:track:57yL3161hUMuw06zzzUCHi', 'spotify:track:7i9763l5SSfOnqZ35VOcfy', 'spotify:track:22OqansuvEMQu9W0EfiGTI', 'spotify:track:5U2m76qAMdr3gXI0g6ptlG', 'spotify:track:4dzOxPZFYX77xpMwv4EhnO', 'spotify:track:5Nhsc59aScz67qNXA0y9Mn', 'spotify:track:4hAhyE1iL07EIUJnqw2bsV', 'spotify:track:1lbWbnWiEbAya5FlCzfsrq', 'spotify:track:2E26jtkc5BONr50yOwyupV',

## Parameter tuning
Parameters are tuned by evaluating the performance of different parameter settings on the train+dev (?) set. The best-performing settings are then used for the test set.

In [118]:
import metrics
infile = 'data/dev_gt.txt'

# Read the whole file, then clean every line by stripping surrounding blanks
# and parentheses.
with open(infile, 'rt') as f_i:
    raw_text = f_i.read()
split_pits = [row.strip(' ()') for row in raw_text.strip().split('\n')]

# Each cleaned line is split on ', ': the first field becomes the (string) key,
# the remaining fields -- stripped of blanks -- become its list of values.
target_sets = dict()
for tup in split_pits:
    k, *remaining_fields = tup.split(', ')
    target_sets[k] = [field.strip(' ') for field in remaining_fields]

#target_sets

In [122]:
# Ground truth (keyed by the pid as a string) and hold-out count (keyed by the
# pid as an int) for the playlist under inspection.
target_set = target_sets.get('498917')
k = dev_holdouts.get(498917)

#TODO: Evaluate results:
# Printed order: R-precision for 'extended' then 'general', followed by NDCG
# for 'extended' then 'general'.
for metric in (metrics.r_precision, metrics.ndcg):
    for model_recs in (cbr_extended_recs, cbr_general_recs):
        print(metric(target_set, model_recs, k))


#TODO: Parameter tuning
## Content-based
cbr = ContentBasedRecommender('extended')

0.06
0.06
0.11120472615632955
0.11157986515839943


## Evaluation on test set 

In [111]:
import metrics
infile = 'data/test_gt.txt'

# Read the whole file, then clean every line by stripping surrounding blanks
# and parentheses.
with open(infile, 'rt') as f_i:
    raw_text = f_i.read()
split_pits = [row.strip(' ()') for row in raw_text.strip().split('\n')]

# Each cleaned line is split on ', ': the first field becomes the (string) key,
# the remaining fields -- stripped of blanks -- become its list of values.
target_sets = dict()
for tup in split_pits:
    k, *remaining_fields = tup.split(', ')
    target_sets[k] = [field.strip(' ') for field in remaining_fields]

#target_sets

In [113]:
#TODO: evaluation on test set
# Ground truth for the inspected playlist (string key, as parsed from the file).
target_set = target_sets.get('498917')

# Content-based
# Refit the `cbr` recommender on train + test ratings with the test track
# table, then ask for as many tracks as were held out from the playlist.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt after
# build_model printed "ok" -- presumably this fit is slow on the combined frame.
cbr.build_model(ratings_train_test, tracks_test)
cb_recs = cbr.recommend(498917, topN=test_holdouts.get(498917))

metrics.r_precision(target_set, cb_recs)

ok


KeyboardInterrupt: 