## Sklearn Model
VAEs are not going to be reasonable. Testing out some sklearn models as baselines.

In [10]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import spotipy
import pickle

pd.options.mode.chained_assignment = None

import sklearn
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential
from keras.layers import *

### Data Input

Example entry in a playlist file:

```
"info": {
        "generated_on": "2017-12-03 08:41:42.057563", 
        "slice": "0-999", 
        "version": "v1"
    }, 
"playlists": [
    {
        "name": "Throwbacks", 
        "collaborative": "false", 
        "pid": 0, 
        "modified_at": 1493424000, 
        "num_tracks": 52, 
        "num_albums": 47, 
        "num_followers": 1, 
        "tracks": [
            {
                "pos": 0, 
                "artist_name": "Missy Elliott", 
                "track_uri": "spotify:track:0UaMYEvWZi0ZqiDOoHU3YI", 
                "artist_uri": "spotify:artist:2wIVse2owClT7go1WT98tk", 
                "track_name": "Lose Control (feat. Ciara & Fat Man Scoop)", 
                "album_uri": "spotify:album:6vV5UrXcfyQD1wu4Qo2I9K", 
                "duration_ms": 226863, 
                "album_name": "The Cookbook"
            }, 
            ...
         ],
        "num_edits": 6, 
        "duration_ms": 11532414, 
        "num_artists": 37
     }, 
```

In [2]:
data_path = 'large_files/'

In [15]:
f_start = 0
f_end = 999
num_files = 10
counter = 0
K = 25

track_codes = {}
track_counts = {}
playlists = []
uri_input, uri_expected = [], []

for i in range(num_files) : 
    with open(data_path + '/mpd.slice.{}-{}.json'.format(f_start, f_end)) as f : 
        data = json.load(f)
        
    for playlist in data['playlists']: 
        playlist_dict = playlist.copy()
        playlist_dict.pop('tracks', None)
        
        input_, expected = [], []
        for k, song in enumerate(playlist['tracks']):
            track_name  = song['track_name']
            track_uri = song['track_uri']
            
            if track_uri not in track_counts:
                track_counts[track_uri] = 0
                
            if track_uri not in track_codes:
                track_codes[track_uri] = counter
                counter += 1
                
            track_counts[track_uri] += 1
            
            if k < K:
                input_.append(track_uri)
            else:
                expected.append(track_uri)
        
        last_song = playlist['tracks'][-1]['track_uri']
        playlist_dict['last_song'] = track_codes[last_song]
        
        playlists.append(playlist_dict)
        uri_input.append(input_)
        uri_expected.append(expected)
            
    print ("done loading file", i)             
    f_start += 1000
    f_end += 1000

done loading file 0
done loading file 1
done loading file 2
done loading file 3
done loading file 4
done loading file 5
done loading file 6
done loading file 7
done loading file 8
done loading file 9


In [4]:
playlist_df = pd.DataFrame(playlists)
playlist_df = playlist_df.drop(['description', 'collaborative'], axis=1)

enc = sklearn.preprocessing.LabelBinarizer()
last_songs = enc.fit_transform(playlist_df['last_song'])

train_df, test_df, y_train, y_test = train_test_split(playlist_df, last_songs, test_size=0.2, random_state=836)
X_train = train_df.drop(['name', 'pid', 'last_song'], axis=1)
X_test = test_df.drop(['name', 'pid', 'last_song'], axis=1)

In [5]:
uris = list(track_codes.keys())
spotify = spotipy.Spotify(auth='BQCS0mGrYXNtoVfeXJtjJC64CGcH1D_hsZrpTbVt2x4SUB5O9T978bBs20xuJSiw3lRyFWoz80sMqD1A2tQP5kOQPTMBbrsMg5tn5Wi4DnI9l_LgLdfGzmrdDC83Yiok4kuayNzrUcCC3hHzZHUVlyWCk9taB7z2eo3ssw')

keys_to_remove = ["duration_ms", "type", "id", "uri", "track_href", "analysis_url"]

start = 0
audio_features = []
while start < len(uris):
    try:
        response = spotify.audio_features(uris[start:(100+start)])
        small_response = [{key:track[key] for key in track.keys() - keys_to_remove} for track in response]
        audio_features.extend(small_response)
    except AttributeError:
        print("AttributeError")
        print("Start:", start)
    start += 100
    if start % 1000 == 0: print(start)

audio_df = pd.DataFrame(audio_features)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
AttributeError
Start: 115800
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
142000
143000
144000
145000
146000
147000
148000
149000
150000
151000
152000
153000
154000
1

In [12]:
with open('audio_features.pickle', 'wb') as handle:
    pickle.dump(audio_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

### KMeans Clustering

#### Clustering songs together

In [19]:
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(uri_input, uri_expected, test_size=.2, random_state=431)

In [7]:
# Scale the features in audio_df to mean=0 and variance=1
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
audio_scaled = ss.fit_transform(audio_df)

In [22]:
# Cluster songs and build dict with list of URIs for each cluster
n_clusters = 20
km_songs = KMeans(n_clusters=n_clusters)
song_clusters = km_songs.fit_predict(audio_scaled)

cluster_to_songs, song_to_cluster = {}, {}
for i, cluster_num in enumerate(song_clusters):
    if cluster_num not in cluster_to_songs:
        cluster_to_songs[cluster_num] = []
        
    cluster_to_songs[cluster_num].append(uris[i])
    song_to_cluster[uris[i]] = cluster_num

In [23]:
class ClusterModel:
    def __init__(self, cluster_to_song, song_to_cluster, n_clusters=20, K=25):
        self.name = 'cluster_model'
        self.n_clusters = n_clusters
        self.cluster_to_song = cluster_to_song
        self.song_to_cluster = song_to_cluster
        self.K = K
        
    def fit(self, X, y):
        pass
    
    def predict(self, X):
        predictions = []
        for playlist in X:
            clusters = [self.song_to_cluster[song] for song in playlist if song in self.song_to_cluster]
            unique, counts = np.unique(clusters, return_counts=True)
            max_cluster_id = unique[np.argmax(counts)]
            max_cluster = self.cluster_to_song[max_cluster_id]
            predicted = np.random.choice(max_cluster, size=500, replace=False)
            predictions.append(predicted)
        return predictions

In [24]:
cm = ClusterModel(cluster_to_songs, song_to_cluster)
cm.fit(X_train_c, y_train_c)
cm_output = cm.predict(X_test_c)

In [45]:
# Evaluation with the metrics
r_prec = RPrecision().evaluate(cm_output, y_test_c)
ndcg = NDCG().evaluate(cm_output, y_test_c)
rsc = RSC().evaluate(cm_output, y_test_c)
print(r_prec, ndcg, rsc)

0.00580806560947 0.00290581392781 32.437


### Keras Model (Dense)

In [None]:
hidden_size = 50
input_dim = X_train.shape[1]
output_dim = y_train.shape[1]

In [None]:
model1 = Sequential()

model1.add(Dense(hidden_size, input_dim=input_dim, activation='relu'))
model1.add(Dense(hidden_size, activation='relu'))
model1.add(Dense(output_dim, activation='linear'))

model1.compile(optimizer='adam', loss='mae', metrics=['acc'])

In [None]:
model1_history = model1.fit(X_train, y_train, batch_size=32, 
                            epochs=10, verbose=1, 
                            shuffle = True, validation_split=0.2)

In [None]:
model1.evaluate(X_test, y_test)

### Evaluation of Models

In [44]:
from math import log2

class Evaluator():
    """Superclass for evaluation functions"""
    
    def __init__(self, name):
        self.name = name
        
    def evaluate(self, output, expected):
        """
        Output will be the output of the model for some list of playlists
        - Shape of (# playlists, 500)

        Expected will be the held out songs from each playlist
        - List of lists of various sizes

        Note: Each "song" will be the unique spotify uri of a song
        """
        raise NotImplementedError

        
class RPrecision(Evaluator):
    """
    R-precision measures the number of held out songs correctly 
        retrieved by the model output 
    """
    def __init__(self):
        Evaluator.__init__(self, 'R-Precision')
        
    def evaluate(self, output, expected):
    
        def rprec_one(output_, expected_):
            expected_size = len(expected_)
            common_set = set(output_).intersection(set(expected_))
            common_size = len(common_set)
            if expected_size == 0 or common_size == 0:
                return 0.0
            return common_size / expected_size
        
        return np.mean([rprec_one(out, exp) for (out, exp) in zip(output, expected)])

    
class NDCG(Evaluator):
    """
    Normalized discounted cumulative gain also takes into 
        account how the system ordered the suggestions
    """
    def __init__(self):
        Evaluator.__init__(self, 'NDCG')
        
    def evaluate(self, output, expected):
        
        def ndcg_one(output_, expected_):
            dcg, idcg = 0.0, 0.0
            
            if len(output_) == 0 or len(expected_) == 0:
                return 0.0

            for i in range(len(output_)):
                # Prediction DCG
                if output_[i] in expected_:
                    if i == 0:
                        dcg += 1.0
                    else:
                        dcg += 1.0 / log2(i + 2.0)

                if i < len(expected_):
                    if i == 0:
                        idcg += 1.0
                    else:
                        idcg += 1.0 / log2(i + 2.0)
            
            return dcg / idcg
        
        return np.mean([ndcg_one(out, exp) for (out, exp) in zip(output, expected)])
        
        
class RSC(Evaluator):
    """
    Recommended Song Clicks measures how many times a user
    would have to click through the suggestions to find a song that 
    was a ground truth song
    """
    def __init__(self):
        Evaluator.__init__(self, 'RSC')
        
    def evaluate(self, output, expected):
        
        def rsc_one(output_, expected_):
            if len(output_) == 0 or len(expected_) == 0:
                return 0.0
            
            output_len = len(output_)
            for i in range(output_len):
                if output_[i] in expected_:
                    return i//10
            return 51
        
        return np.mean([rsc_one(out, exp) for (out, exp) in zip(output, expected)])
    