## Variational Autoencoders Testing
Exploring whether variational autoencoders are a feasible basis for a music recommender system.

In [66]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Turn off pandas' chained-assignment check for the whole notebook, so
# assignments on derived frames do not emit SettingWithCopyWarning.
pd.options.mode.chained_assignment = None

import sklearn
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential
from keras.layers import *

### Data Input

Example entry in a playlist file:

```
"info": {
        "generated_on": "2017-12-03 08:41:42.057563", 
        "slice": "0-999", 
        "version": "v1"
    }, 
"playlists": [
    {
        "name": "Throwbacks", 
        "collaborative": "false", 
        "pid": 0, 
        "modified_at": 1493424000, 
        "num_tracks": 52, 
        "num_albums": 47, 
        "num_followers": 1, 
        "tracks": [
            {
                "pos": 0, 
                "artist_name": "Missy Elliott", 
                "track_uri": "spotify:track:0UaMYEvWZi0ZqiDOoHU3YI", 
                "artist_uri": "spotify:artist:2wIVse2owClT7go1WT98tk", 
                "track_name": "Lose Control (feat. Ciara & Fat Man Scoop)", 
                "album_uri": "spotify:album:6vV5UrXcfyQD1wu4Qo2I9K", 
                "duration_ms": 226863, 
                "album_name": "The Cookbook"
            }, 
            ...
         ],
        "num_edits": 6, 
        "duration_ms": 11532414, 
        "num_artists": 37
     }, 
```

In [2]:
# Directory that holds the playlist slice files (mpd.slice.<start>-<end>.json).
data_path = "large_files/"

In [22]:
# Load the first `num_files` playlist slices and collect
#   * playlists    -- one dict of playlist-level metadata per playlist
#                     (everything except the nested 'tracks' list)
#   * track_counts -- how many times each track name occurs across all playlists
slice_size = 1000   # playlists per slice file: 0-999, 1000-1999, ...
num_files = 10

track_counts = {}
playlists = []

for i in range(num_files):
    # Derive the slice bounds from the loop index instead of mutating counters,
    # so the file name for slice i never depends on earlier iterations.
    f_start = i * slice_size
    f_end = f_start + slice_size - 1

    # Strip a trailing '/' from data_path so the joined path has a single separator.
    slice_path = '{}/mpd.slice.{}-{}.json'.format(data_path.rstrip('/'), f_start, f_end)

    # JSON text is UTF-8; be explicit so the platform default encoding cannot
    # corrupt (or fail on) non-ASCII playlist and track names.
    with open(slice_path, encoding='utf-8') as f:
        data = json.load(f)

    for playlist in data['playlists']:
        # Keep only the playlist-level fields; the track list is tallied below.
        playlist_dict = {key: value for key, value in playlist.items() if key != 'tracks'}

        for song in playlist['tracks']:
            # NOTE(review): counts are keyed by track *name*, so different tracks
            # that share a title are counted together. Key on song['track_uri']
            # instead if per-track counts are what is wanted.
            track_name = song['track_name']
            track_counts[track_name] = track_counts.get(track_name, 0) + 1

        playlists.append(playlist_dict)

    print("done loading file", i)

done loading file 0
done loading file 1
done loading file 2
done loading file 3
done loading file 4
done loading file 5
done loading file 6
done loading file 7
done loading file 8
done loading file 9


In [75]:
# Build the playlist-level feature table and split it 80/20 into train and test.
playlist_df = pd.DataFrame(playlists)
# Remove the 'description' and 'collaborative' columns; they are not used as features.
playlist_df = playlist_df.drop(columns=['description', 'collaborative'])

train_df, test_df = train_test_split(playlist_df, test_size=0.2, random_state=836)

# Feature matrices: everything except the playlist name and id.
X_train = train_df.drop(columns=['name', 'pid'])
# Fix: the test features must come from the held-out split. Previously this was
# built from train_df, which made X_test a copy of the training rows.
X_test = test_df.drop(columns=['name', 'pid'])

### KMeans Clustering

In [76]:
# Assign every training playlist to one of 10 KMeans clusters.
#
# Fit on the feature columns only: dropping an existing 'cluster' column keeps
# the cell idempotent, so re-running it does not feed the previous labels back in.
cluster_features = X_train.drop(columns='cluster', errors='ignore')

# Fixed seed so the cluster labels are reproducible across runs.
# NOTE(review): the features are unscaled, so large-magnitude columns such as
# modified_at and duration_ms dominate the distances; consider standardising first.
clusters = KMeans(n_clusters=10, random_state=836).fit_predict(cluster_features)
X_train['cluster'] = clusters

display(X_train.head())

Unnamed: 0,duration_ms,modified_at,num_albums,num_artists,num_edits,num_followers,num_tracks,cluster
9753,9898463,1429488000,39,35,4,1,41,2
6528,6846312,1459555200,28,19,12,1,30,9
4290,11655830,1507680000,44,31,45,1,50,0
22,9077152,1509408000,39,37,15,2,42,0
3878,13706751,1473033600,33,11,5,5,67,9


### Word Embedding (For names)

In [89]:
# Vocabulary of playlist names: each distinct name in the training split gets
# the next integer id, in order of first appearance.
names = {
    playlist_name: name_id
    for name_id, playlist_name in enumerate(dict.fromkeys(train_df['name']))
}
count = len(names)
print(count)

4997


In [86]:
# Single-layer model that maps a playlist-name id to a 16-dimensional vector.
#
# The vocabulary size is taken from `names` rather than hard-coded, so every id
# produced by the vocabulary is a valid embedding index even when the number of
# distinct names changes (e.g. when more slice files are loaded).
# NOTE(review): no fit() call for this model appears in the visible cells, so
# predict() returns the layer's initial (untrained) weights.
embed_model = Sequential()
embed_model.add(Embedding(len(names), 16, input_length=1))
embed_model.compile('rmsprop', 'mse')

In [87]:
# Look up the vocabulary id of every training playlist name, then run the ids
# through the embedding model to get one vector per playlist.
name_ids = train_df['name'].apply(names.__getitem__)
embeddings = embed_model.predict(name_ids)
embeddings.shape

(8000, 1, 16)

### Keras Model (Dense)

In [49]:
# Sizes for the dense network defined below.
hidden_size = 50                # units in each hidden layer
# Fix: this assignment had no right-hand side (a SyntaxError). The network is
# fit on X_train, so its input width is the number of columns in that frame
# (including the 'cluster' column when the KMeans cell has been run).
input_dim = X_train.shape[1]
output_dim = 100                # units in the output layer

In [14]:
# Feed-forward network: two ReLU hidden layers of `hidden_size` units followed
# by a linear output layer of `output_dim` units, trained with Adam on MAE.
model1 = Sequential([
    Dense(hidden_size, input_dim=input_dim, activation='relu'),
    Dense(hidden_size, activation='relu'),
    Dense(output_dim, activation='linear'),
])

model1.compile(loss='mean_absolute_error', optimizer='adam')

In [16]:
# Train the dense model for 10 epochs in shuffled mini-batches of 32, holding
# out 20% of the rows passed in for validation; the returned History object is
# kept in model1_history.
# NOTE(review): `y_train` is not defined in any visible cell -- a target matrix
# must be built before this cell can run. TODO confirm what the target should be.
model1_history = model1.fit(X_train, y_train, batch_size=32, 
                            epochs=10, verbose=1, 
                            shuffle = True, validation_split=0.2)

NameError: name 'X_train' is not defined

### RNN/LSTM attempt

In [None]:
# Sequence model: embedding -> LSTM returning the full sequence -> a softmax
# over `output_dim` classes applied at every timestep.
# NOTE(review): `input_len` is not defined in the visible cells; it has to be
# set before this cell can run.
model = Sequential([
    Embedding(output_dim, hidden_size, input_length=input_len),
    LSTM(hidden_size, return_sequences=True),
    # Optional second recurrent layer and dropout, currently disabled:
    # LSTM(hidden_size, return_sequences=True),
    # Dropout(0.2),
    TimeDistributed(Dense(output_dim, activation='softmax')),
])