## Import and format json data
Import the playlists from the JSON files and format them into ratings data frames. (Get the data from https://owncloud.tuwien.ac.at/index.php/s/A8Wx2TpFr0WznZh)

In [57]:
import pandas as pd
import numpy as np
import json

# Column layouts shared by the data frames built in this cell.
RATING_COLUMNS = ['pid', 'name', 'track_uri', 'rating']
TRACK_COLUMNS = ['track_uri', 'track_name', 'artist_uri', 'artist_name', 'album_uri', 'album_name', 'duration_ms']


def load_json(path):
    """Parse the UTF-8 encoded JSON file at `path` and return its content.

    The file is opened in a `with` block so the handle is closed as soon as
    parsing has finished.
    """
    with open(path, encoding="utf8") as json_file:
        return json.load(json_file)


def playlist_ratings(dataset):
    """Return one `[pid, name, track_uri, 1.0]` row per track of every playlist in `dataset`.

    Every (playlist, track) pair gets the implicit rating 1.0. Rows keep the
    order of `dataset['playlists']` and of each playlist's `tracks` list.
    """
    return [
        [playlist['pid'], playlist['name'], track['track_uri'], 1.0]
        for playlist in dataset['playlists']
        for track in playlist['tracks']
    ]


# load data
train = load_json('data/train.json')

dev = load_json('data/dev.json')
dev_trun = load_json('data/dev_trun.json')

test = load_json('data/test.json')
test_trun = load_json('data/test_trun.json')

# Flatten each split once and reuse the rows for every frame that needs them.
train_rows = playlist_ratings(train)
dev_rows = playlist_ratings(dev_trun)
test_rows = playlist_ratings(test_trun)

# Why train is combined with dev / test: train alone does not miss any tracks,
# so the model cannot be tested on it. Combining it with dev allows predictions
# for the missing tracks of the dev playlists; train + test is combined for the
# same reason. Otherwise, train would not be part of the model.

# create ratings df with training + dev for validation (equivalent to 'ratings_train' data frame in Assignment 4.1)
ratings_train_dev = pd.DataFrame(train_rows + dev_rows, columns = RATING_COLUMNS)
ratings_only_dev = pd.DataFrame(dev_rows, columns = RATING_COLUMNS)

# Track metadata, one row per distinct track_uri in first-seen order.
# Only the tracks of the truncated dev playlists are collected here; tracks
# that occur exclusively in train are not part of `tracks`.
seen = set()
track_list = []
for playlist in dev_trun['playlists']:
    for track in playlist['tracks']:
        if track['track_uri'] not in seen:
            seen.add(track['track_uri'])
            track_list.append([track[column] for column in TRACK_COLUMNS])

tracks = pd.DataFrame(track_list, columns = TRACK_COLUMNS)

# create ratings df for testing
ratings_train_test = pd.DataFrame(train_rows + test_rows, columns = RATING_COLUMNS)

# holdouts ('topN' equivalent): pid -> number of held-out tracks
dev_holdouts = {playlist['pid']:playlist['num_holdouts'] for playlist in dev['playlists']}
test_holdouts = {playlist['pid']:playlist['num_holdouts'] for playlist in test['playlists']}

#ratings_train_dev.head()
tracks.head()


Unnamed: 0,track_uri,track_name,artist_uri,artist_name,album_uri,album_name,duration_ms
0,spotify:track:1kgLlA9D73mZ7XtjLP7Kzu,Admission To Your Party,spotify:artist:4iiQabGKtS2RtTKpVkrVTw,Smallpools,spotify:album:59xqFRG2IgFTsZtQ73yIp6,LOVETAP!,178266
1,spotify:track:2uV2tyFZ0Eex2Lsc8shIfN,Karaoke,spotify:artist:4iiQabGKtS2RtTKpVkrVTw,Smallpools,spotify:album:59xqFRG2IgFTsZtQ73yIp6,LOVETAP!,209933
2,spotify:track:2Z8WuEywRWYTKe1NybPQEW,Ride,spotify:artist:3YQKmKGau1PzlVlkL1iodx,Twenty One Pilots,spotify:album:3cQO7jp5S9qLBoIVtbkSM1,Blurryface,214506
3,spotify:track:3E2Zh20GDCR9B1EYjfXWyv,Weak,spotify:artist:6s22t5Y3prQHyaHWUN1R1C,AJR,spotify:album:7LACXphpLTluKLFqHIZ1Qq,The Click,201160
4,spotify:track:7KxhSJOYiqCDclXDBNlFSZ,Kill Em With Kindness,spotify:artist:0C8ZW7ezQVs4URX5aX7Kqx,Selena Gomez,spotify:album:3Kbuu2tHsIbplFUkB7a5oE,Revival,217906


### Add playlist information to the tracks DataFrame

In [58]:
# Ratings frame used below to look up the playlist names of each track.
# Only the truncated dev playlists are active; uncomment the second line
# to use train + dev instead.
ratings_train = ratings_only_dev
#ratings_train = ratings_train_dev

In [76]:
# Keep a single row per track; the first occurrence (and its index label) wins.
tracks = tracks.drop_duplicates(subset='track_uri', keep='first')

# For every track, join the names of all playlists in `ratings_train` that
# contain it, separated by single spaces. One grouped pass over the ratings
# replaces filtering the whole ratings frame once per track.
names_by_track = ratings_train.groupby('track_uri')['name'].agg(' '.join)

# NOTE: despite its name, the `pids` column holds playlist *names*, not
# playlist ids. The column name is kept so existing consumers keep working.
# Tracks that appear in no playlist of `ratings_train` get an empty string.
tracks = tracks.assign(pids=tracks['track_uri'].map(names_by_track).fillna(''))

tracks.head()

Unnamed: 0,track_uri,track_name,artist_uri,artist_name,album_uri,album_name,duration_ms,pids
0,spotify:track:1kgLlA9D73mZ7XtjLP7Kzu,Admission To Your Party,spotify:artist:4iiQabGKtS2RtTKpVkrVTw,Smallpools,spotify:album:59xqFRG2IgFTsZtQ73yIp6,LOVETAP!,178266,Partyy
1,spotify:track:2uV2tyFZ0Eex2Lsc8shIfN,Karaoke,spotify:artist:4iiQabGKtS2RtTKpVkrVTw,Smallpools,spotify:album:59xqFRG2IgFTsZtQ73yIp6,LOVETAP!,209933,Partyy
2,spotify:track:2Z8WuEywRWYTKe1NybPQEW,Ride,spotify:artist:3YQKmKGau1PzlVlkL1iodx,Twenty One Pilots,spotify:album:3cQO7jp5S9qLBoIVtbkSM1,Blurryface,214506,Partyy life
3,spotify:track:3E2Zh20GDCR9B1EYjfXWyv,Weak,spotify:artist:6s22t5Y3prQHyaHWUN1R1C,AJR,spotify:album:7LACXphpLTluKLFqHIZ1Qq,The Click,201160,Partyy
4,spotify:track:7KxhSJOYiqCDclXDBNlFSZ,Kill Em With Kindness,spotify:artist:0C8ZW7ezQVs4URX5aX7Kqx,Selena Gomez,spotify:album:3Kbuu2tHsIbplFUkB7a5oE,Revival,217906,Partyy PLANE


## Import and test recommenders

In [77]:
%load_ext autoreload
%autoreload 2
from Recommender_CB import ContentBasedRecommender
from Recommender_CF_UU import UUCFRecommender
from Recommender_MF import MFRecommender

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Smoke-test both collaborative recommenders on one dev playlist.
# The pid is an int here, matching the lookups into `dev_holdouts` used for
# the content-based recommenders later in the notebook.
example_pid = 498917
# Recommend as many tracks as were held out for this playlist.
example_top_n = dev_holdouts[example_pid]

# NOTE(review): the recorded run of this cell failed with
# "AttributeError: 'DataFrame' object has no attribute 'movieId'".
# Presumably the recommender modules still expect different column names
# than ['pid', 'name', 'track_uri', 'rating'] - confirm in Recommender_MF
# and Recommender_CF_UU.

## Matrix Fac.
mfr = MFRecommender()
mfr.build_model(ratings_train_dev)

recs = mfr.recommend(example_pid, topN = example_top_n)
print(recs)


## UU-CF
uucf = UUCFRecommender()
uucf.build_model(ratings_train_dev)

recs = uucf.recommend(example_pid, topN = example_top_n)
print(recs)

AttributeError: 'DataFrame' object has no attribute 'movieId'

In [88]:
## Content-based
# Two content-based recommenders are built on the same inputs; they differ
# only in the mode string passed to the constructor ('extended' vs 'general').
# Both are built on the truncated dev ratings only; the commented lines are
# the train + dev alternative.
cbr_extended = ContentBasedRecommender('extended')
cbr_extended.build_model(ratings_only_dev, tracks)
#cbr_extended.build_model(ratings_train_dev, tracks)

cbr_general = ContentBasedRecommender('general')
cbr_general.build_model(ratings_only_dev, tracks)
#cbr_general.build_model(ratings_train_dev, tracks)

ok
ok playlist profiles
ok
ok playlist profiles


In [89]:
# Recommend as many tracks as were held out for dev playlist 498917,
# once per content-based variant.
cbr_extended_recs = cbr_extended.recommend(498917, topN = dev_holdouts.get(498917))
# Print the list computed above (the previous name `cb_extended_recs` was
# never defined and raised a NameError on a fresh kernel).
print(cbr_extended_recs)

cbr_general_recs = cbr_general.recommend(498917, topN = dev_holdouts.get(498917))
print(cbr_general_recs)

['spotify:track:0pBLfQ5JBjh12H6DGZjMwM', 'spotify:track:6ZANrVuAMp2rpjhfbOuJly', 'spotify:track:7pJgjBf82BrUQ3z7HdQvW1', 'spotify:track:37IToWWYUc9nuH3ijz4tqV', 'spotify:track:3CJvmtWw2bJsudbAC5uCQk', 'spotify:track:3IMXLQk09YmvNb2tdT4aWk', 'spotify:track:3viZcRlE38APncWJ5VGM4q', 'spotify:track:29DwxnpFOLlm1fKZOitwAA', 'spotify:track:3LuJPqTyPuVfm9VemXTWHo', 'spotify:track:30lgD1UuHczwlxa7NZFeSQ', 'spotify:track:5P3yUXUC9rZPJPNmYGKEAz', 'spotify:track:3CRDbSIZ4r5MsZ0YwxuEkn', 'spotify:track:7qxjGHW485TL8ciwkHD5MK', 'spotify:track:6GmkJJMe9U1tEcrJ3Hq3A1', 'spotify:track:40sRwFjfrRnaTdYPJIg4CP', 'spotify:track:4Oyl6oYSNeeZZP0OAxPVaU', 'spotify:track:57yL3161hUMuw06zzzUCHi', 'spotify:track:7i9763l5SSfOnqZ35VOcfy', 'spotify:track:22OqansuvEMQu9W0EfiGTI', 'spotify:track:5U2m76qAMdr3gXI0g6ptlG', 'spotify:track:4dzOxPZFYX77xpMwv4EhnO', 'spotify:track:5Nhsc59aScz67qNXA0y9Mn', 'spotify:track:4hAhyE1iL07EIUJnqw2bsV', 'spotify:track:1lbWbnWiEbAya5FlCzfsrq', 'spotify:track:2E26jtkc5BONr50yOwyupV',

## Parameter tuning
Parameters are tuned by evaluating the performance of different parameter settings on the train+dev set (TODO: confirm that train+dev is the right split for tuning). The best-performing settings are then used for the test set.

In [90]:
import metrics
infile = 'data/dev_gt.txt'

# Parse the dev ground truth into {pid (str): [held-out entries (str), ...]}.
# Each non-empty line is split on ', ' after surrounding spaces and
# parentheses are removed: the first field becomes the key, the remaining
# fields become the value list.
target_sets = dict()
with open(infile, 'rt') as f_i:
    for line in f_i:
        record = line.strip().strip(' ()')
        if not record:
            # Skip blank lines (and an empty file) instead of storing a bogus '' key.
            continue
        pid, *entries = record.split(', ')
        target_sets[pid] = [entry.strip(' ') for entry in entries]

#target_sets

In [91]:
# Ground-truth keys are strings (parsed from text), holdout keys are ints.
target_set = target_sets.get('498917')
num_holdouts = dev_holdouts.get(498917)

#TODO: Evaluate results:
# R-precision of the extended variant first, then the general variant.
for candidate_recs in (cbr_extended_recs, cbr_general_recs):
    print(metrics.r_precision(target_set, candidate_recs, num_holdouts))

#TODO: Parameter tuning
## Content-based
cbr = ContentBasedRecommender('extended')

0.0
0.0


## Evaluation on test set 

In [33]:
import metrics
infile = 'data/test_gt.txt'

# Parse the test ground truth into {pid (str): [held-out entries (str), ...]}.
# NOTE: this rebinds `target_sets`, so the dev ground truth loaded earlier in
# the notebook is no longer available under that name after this cell.
# Each non-empty line is split on ', ' after surrounding spaces and
# parentheses are removed: the first field becomes the key, the remaining
# fields become the value list.
target_sets = dict()
with open(infile, 'rt') as f_i:
    for line in f_i:
        record = line.strip().strip(' ()')
        if not record:
            # Skip blank lines (and an empty file) instead of storing a bogus '' key.
            continue
        pid, *entries = record.split(', ')
        target_sets[pid] = [entry.strip(' ') for entry in entries]

#target_sets

In [None]:
#TODO: evaluation on test set (currently a single-playlist check only)


# Content-based
# Use a playlist that is guaranteed to exist in the test split rather than a
# hard-coded pid.
test_pid = next(iter(test_holdouts))
test_top_n = test_holdouts[test_pid]

# NOTE(review): `tracks` (and its playlist-name column) was built from the
# truncated dev playlists only - confirm that this is sufficient for a model
# built on train + test ratings.
cbr.build_model(ratings_train_test, tracks)
cb_recs = cbr.recommend(test_pid, topN = test_top_n)

# Ground-truth keys are strings (parsed from text), so convert the pid.
# `target_set` is refreshed here; it would otherwise still hold the dev
# playlist's ground truth from the parameter-tuning section.
target_set = target_sets.get(str(test_pid))
metrics.r_precision(target_set, cb_recs, test_top_n)