## Variational Autoencoders Testing
Trying out the feasibility of variational autoencoders for music recommender systems.

In [125]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

pd.options.mode.chained_assignment = None

import sklearn
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential
from keras.layers import *

### Data Input

Example entry in a playlist file:

```
"info": {
        "generated_on": "2017-12-03 08:41:42.057563", 
        "slice": "0-999", 
        "version": "v1"
    }, 
"playlists": [
    {
        "name": "Throwbacks", 
        "collaborative": "false", 
        "pid": 0, 
        "modified_at": 1493424000, 
        "num_tracks": 52, 
        "num_albums": 47, 
        "num_followers": 1, 
        "tracks": [
            {
                "pos": 0, 
                "artist_name": "Missy Elliott", 
                "track_uri": "spotify:track:0UaMYEvWZi0ZqiDOoHU3YI", 
                "artist_uri": "spotify:artist:2wIVse2owClT7go1WT98tk", 
                "track_name": "Lose Control (feat. Ciara & Fat Man Scoop)", 
                "album_uri": "spotify:album:6vV5UrXcfyQD1wu4Qo2I9K", 
                "duration_ms": 226863, 
                "album_name": "The Cookbook"
            }, 
            ...
         ],
        "num_edits": 6, 
        "duration_ms": 11532414, 
        "num_artists": 37
     }, 
```

In [126]:
data_path = 'large_files/'

In [127]:
f_start = 0
f_end = 999
num_files = 10
counter = 0

track_codes = {}
track_counts = {}
playlists = []

for i in range(num_files) : 
    with open(data_path + '/mpd.slice.{}-{}.json'.format(f_start, f_end)) as f : 
        data = json.load(f)
        
    for playlist in data['playlists'] : 
        playlist_dict = playlist.copy()
        playlist_dict.pop('tracks', None)
        
        for song in playlist['tracks'] : 
            track_name  = song['track_name']
            track_uri = song['track_uri']
            
            if track_name not in track_counts:
                track_counts[track_name] = 0
                
            if track_name not in track_codes:
                track_codes[track_name] = counter
                counter += 1
                
            track_counts[track_name] += 1
        
        last_song = playlist['tracks'][-1]['track_name']
        playlist_dict['last_song'] = track_codes[last_song]
#         playlist_dict['last_song'] = last_song
        
        playlists.append(playlist_dict)
            
    print ("done loading file", i)             
    f_start += 1000
    f_end += 1000

done loading file 0
done loading file 1
done loading file 2
done loading file 3
done loading file 4
done loading file 5
done loading file 6
done loading file 7
done loading file 8
done loading file 9


In [152]:
playlist_df = pd.DataFrame(playlists)
playlist_df = playlist_df.drop(['description', 'collaborative'], axis=1)

enc = sklearn.preprocessing.LabelBinarizer()
last_songs = enc.fit_transform(playlist_df['last_song'])

train_df, test_df, y_train, y_test = train_test_split(playlist_df, last_songs, test_size=0.2, random_state=836)
X_train = train_df.drop(['name', 'pid', 'last_song'], axis=1)
X_test = test_df.drop(['name', 'pid', 'last_song'], axis=1)

### KMeans Clustering

In [153]:
km = KMeans(n_clusters=10)
clusters =km.fit_predict(X_train)
X_train['cluster'] = clusters
X_test['cluster'] = km.predict(X_test)

display(X_train.head())

Unnamed: 0,duration_ms,modified_at,num_albums,num_artists,num_edits,num_followers,num_tracks,cluster
9753,9898463,1429488000,39,35,4,1,41,1
6528,6846312,1459555200,28,19,12,1,30,3
4290,11655830,1507680000,44,31,45,1,50,4
22,9077152,1509408000,39,37,15,2,42,4
3878,13706751,1473033600,33,11,5,5,67,3


### Word Embedding (For names)

In [None]:
names = {}
count = 0
for name in train_df['name']:
    if name not in names:
        names[name] = count
        count += 1
print(len(names))

In [None]:
embed_model = Sequential()
embed_model.add(Embedding(5000, 16, input_length=1))
embed_model.compile('rmsprop', 'mse')

In [None]:
name_ids = train_df['name'].apply(lambda n: names[n])
embeddings = embed_model.predict(name_ids)
embeddings.shape

### Keras Model (Dense)

In [154]:
hidden_size = 50
input_dim = X_train.shape[1]
output_dim = y_train.shape[1]

In [169]:
model1 = Sequential()

model1.add(Dense(hidden_size, input_dim=input_dim, activation='relu'))
model1.add(Dense(hidden_size, activation='relu'))
model1.add(Dense(output_dim, activation='softmax'))

model1.compile(optimizer='adam', loss='mae', metrics=['acc'])

TypeError: softmax() got an unexpected keyword argument 'axis'

In [162]:
model1_history = model1.fit(X_train, y_train, batch_size=32, 
                            epochs=10, verbose=1, 
                            shuffle = True, validation_split=0.2)

Train on 6400 samples, validate on 1600 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [163]:
model1.evaluate(X_test, y_test)



[0.00024550651805475355, 0.0]

### RNN/LSTM attempt

In [None]:
model = Sequential()
model.add(Embedding(output_dim, hidden_size, input_length=input_len))
model.add(LSTM(hidden_size, return_sequences=True))
#model.add(LSTM(hidden_size, return_sequences=True))
#model.add(Dropout(0.2))
model.add(TimeDistributed(Dense(output_dim, activation='softmax')))

### Evaluation of Models

In [172]:
from math import log2

class Evaluator():
    """Superclass for evaluation functions"""
    
    def __init__(self, name):
        self.name = name
        
    def evaluate(self, output, expected):
        """
        Output will be the output of the model for some list of playlists
        - Shape of (# playlists, 500)

        Expected will be the held out songs from each playlist
        - List of lists of various sizes

        Note: Each "song" will be the unique spotify uri of a song
        """
        raise NotImplementedError

        
def RPrecision(Evaluator):
    """
    R-precision measures the number of held out songs correctly 
        retrieved by the model output 
    """
    def __init__(self):
        Evaluator.__init__(self, 'R-Precision')
        
    def evaluate(self, output, expected):
    
        def rprec_one(output_, expected_):
            expected_size = len(expected_)
            common_set = set(output_).intersection(set(expected_))
            common_size = len(common_set)
            return common_size / expected_size
        
        return np.mean([rprec_one(out, exp) for (out, exp) in zip(output, expected)])

    
def NDCG(Evaluator):
    """
    Normalized discounted cumulative gain also takes into 
        account how the system ordered the suggestions
    """
    def __init__(self):
        Evaluator.__init__(self, 'NDCG')
        
    def evaluate(self, output, expected):
        
        def ndcg_one(output_, expected_):
            dcg, idcg = 0.0, 0.0

            for i in range(len(output_)):
                # Prediction DCG
                if output_[i] in expected_:
                    if i == 0:
                        dcg += 1.0
                    else:
                        dcg += 1.0 / log2(i + 2.0)

                if i < len(expected_):
                    if i == 0:
                        idcg += 1.0
                    else:
                        idcg += 1.0 / log(i + 2.0)

            return dcg / idcg
        
        return np.mean([ndcg_one(out, exp) for (out, exp) in zip(output, expected)])
        
        
def RSC(Evaluator):
    """
    Recommended Song Clicks measures how many times a user
    would have to click through the suggestions to find a song that 
    was a ground truth song
    """
    def __init__():
        Evaluator.__init__(self, 'RSC')
        
    def evaluate(self, output, expected):
        
        def rsc_one(output_, expected_):   
            output_len = len(output)
            for i in range(output_len):
                if output[i] in expected:
                    return i//10
            return 51
        
        return np.mean([rsc_one(out, exp) for (out, exp) in zip(output, expected)])
    