In [1]:
import pandas as pd
import numpy as np
import json
import os

# constants
DATAFRAME_PATH = 'dataframes'

### Create the playlists dataframe from raw JSON data

In [None]:
# pre-load the pre-prepared all_tracks dataframe
all_tracks_df = pd.read_hdf(DATAFRAME_PATH + '/all_tracks.h5', 'tracks')
null_tracks = all_tracks_df[all_tracks_df.isnull().any(1)]['tracks']

playlist_track_info = []

def load_playlists(path):
    for file in os.listdir("raw_data"):
        if not file.endswith(".json"):
            continue
        with open("raw_data/"+file) as f:
            js_slice = json.load(f)
            for playlist in js_slice['playlists']:
                for track in playlist['tracks']:
                    if track['track_uri'] not in null_tracks:
                        playlist_track_info.append([track['track_uri'], playlist['pid'], track['pos']])
                playlist.pop('tracks')
                yield playlist
        print(f"Added playlists from {file}")

main_playlist_df = pd.DataFrame(load_playlists(DATAFRAME_PATH + '/playlists.h5'))
playlist_track_info_df = pd.DataFrame(playlist_track_info, columns=['track_uri', 'pid', 'pos'])
playlist_track_info_df.set_index('pid')

print('Storing h5 files...')
main_playlist_df.to_hdf(DATAFRAME_PATH + '/playlists.h5', 'playlists')
playlist_track_info_df.to_hdf(DATAFRAME_PATH + '/playlist_track_info.h5', 'track_info')
print('Stored files as h5')

### Read the playlists, tracks and track_info dataframes

In [2]:
playlists_df = pd.read_hdf(DATAFRAME_PATH + '/playlists.h5', 'playlists')
tracks_df = pd.read_hdf(DATAFRAME_PATH + '/clean_tracks.h5', 'tracks')
track_info_df = pd.read_hdf(DATAFRAME_PATH + '/playlist_track_info.h5', 'track_info')

## Building the train and test sets

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(playlists_df, test_size=0.2)

In [4]:
# Modifying the test set
X_train_track_info = track_info_df[track_info_df['pid'].isin(X_train['pid'].values)]
X_test_track_info = track_info_df[track_info_df['pid'].isin(X_test['pid'].values)]

# Keep only tracks that are in the training set in the test set 
X_train_track_uris = pd.unique(X_train_track_info['track_uri'])
X_test_track_info_ground_truth = X_test_track_info[X_test_track_info['track_uri'].isin(X_train_track_uris)]

In [5]:
# some assertions on the ground truth set
assert X_test_track_info_ground_truth[X_test_track_info_ground_truth['track_uri'].isin(X_train_track_info['track_uri']) == False].empty

In [6]:
# This cell can take long to run ~10mins
# if .h5 files are available, use those

challenge_set_list = []
new_test_set = []

def frac_to_sample(playlist_tracks):
    if playlist_tracks.size >= 2:
        return 0.5
    else:
        return None
        
def build_challenge_set():
    for pid in X_test['pid']:
        playlist_tracks = X_test_track_info_ground_truth[X_test_track_info_ground_truth['pid'] == pid]
        frac = frac_to_sample(playlist_tracks)
        if frac is not None and frac > 0:
            random_sample_df = playlist_tracks.sample(frac = frac, random_state=1)
            num_sample = random_sample_df.size
            new_test_set.append(X_test[X_test['pid'] == pid].values.flatten().tolist() + [num_sample, playlist_tracks.size-num_sample])
            for index, track in random_sample_df.iterrows():
                challenge_set_list.append([track['track_uri'], track['pid'], track['pos']])
    
build_challenge_set()

In [7]:
X_challenge = pd.DataFrame(new_test_set, columns=list(X_test.columns) + ['num_of_seeds', 'num_withheld'])
X_challenge_track_info = pd.DataFrame(challenge_set_list, columns=list(X_test_track_info_ground_truth.columns))

# Write challenge set as hdf
X_challenge.to_hdf(DATAFRAME_PATH + '/challenge_playlists.h5', 'playlists')
X_challenge_track_info.to_hdf(DATAFRAME_PATH + '/challenge_playlist_track_info.h5', 'track_info')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['name', 'collaborative', 'description'], dtype='object')]

  pytables.to_hdf(


In [8]:
# Load challenge dataframes
X_challenge_set = pd.read_hdf(DATAFRAME_PATH + '/challenge_playlists.h5', 'playlists')
X_challenge_set_track_info = pd.read_hdf(DATAFRAME_PATH + '/challenge_playlist_track_info.h5', 'track_info')

In [10]:
# some assertions on the challenge set
assert X_challenge_set[X_challenge_set_track_info['pid'].isin(X_challenge_set['pid']) == False].empty
assert X_challenge_set[X_challenge_set_track_info['track_uri'].isin(X_train_track_info['track_uri']) == False].empty
assert X_challenge_set_track_info.size < X_test_track_info_ground_truth.size

  assert X_challenge_set[X_challenge_set_track_info['pid'].isin(X_challenge_set['pid']) == False].empty
  assert X_challenge_set[X_challenge_set_track_info['track_uri'].isin(X_train_track_info['track_uri']) == False].empty


## Building the model

In [34]:
import sklearn
import sklearn.neighbors

In [36]:
# hyperparameters
num_neighbours = 10

In [44]:
knn_clf = sklearn.neighbors.NearestNeighbors(n_neighbors=num_neighbours)

In [None]:
selected_features = ['danceability', 'energy', 'key', 'loudness',
                     'speechiness', 'acousticness', 'instrumentalness',
                     'liveness', 'valence', 'tempo']

knn_features = X_train_track_info[selected_features]

In [72]:
print(len(X_train_track_uris))
print(len(tracks_df))
# Missing some 28 tracks? TODO
X_train_features = tracks_df[tracks_df['tracks'].isin(X_train_track_uris)]

998580
1119279


In [74]:
knn_clf.fit(X_train_features[selected_features])

NearestNeighbors(n_neighbors=10)

In [81]:
#knn_clf.n_samples_fit_
#distances, indices = nbrs.kneighbors(X_)
#X_test
print(len(X_test_track_info_ground_truth))
X_test_track_uris = pd.unique(X_test_track_info_ground_truth['track_uri'])
len(X_test_track_uris)

X_test_features = tracks_df[tracks_df['tracks'].isin(X_test_track_uris)]

3194719


In [87]:
distances, indices = knn_clf.kneighbors(X_test_features[selected_features])

(333855, 10) [[     0 292891 176330 ... 581714 239403 347886]
 [     1 984947  42402 ... 715173 888030  71906]
 [     2 969105 752148 ... 778203 419381 903070]
 ...
 [992240 949387 960771 ... 980322  11230 855147]
 [993719 537833 292970 ... 239337 848717 197747]
 [994555 895194 501835 ... 355070 736843 120775]]


In [134]:
#X_test_features.loc[0]
#X_train_features.iloc[indices[0]].index
test_idx = [0, 292891, 176330, 785063, 347886]
X_train_features.reset_index().iloc[indices[1]]

Unnamed: 0,index,tracks,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
1,1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,0.774,0.838,5.0,-3.914,0.0,0.114,0.0249,0.025,0.242,0.924,143.04,audio_features,6I9VzXrHxO9rA9A5euc8Ak,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,https://api.spotify.com/v1/tracks/6I9VzXrHxO9r...,https://api.spotify.com/v1/audio-analysis/6I9V...,198800.0,4.0
984947,1102206,spotify:track:6fEBlVQSTYx6Ej0QhXPyb0,0.742,0.737,5.0,-3.992,1.0,0.0558,0.136,0.0,0.0908,0.896,142.99,audio_features,6fEBlVQSTYx6Ej0QhXPyb0,spotify:track:6fEBlVQSTYx6Ej0QhXPyb0,https://api.spotify.com/v1/tracks/6fEBlVQSTYx6...,https://api.spotify.com/v1/audio-analysis/6fEB...,185147.0,4.0
42402,43054,spotify:track:65QVpgBIutEZlpDCi6AH2F,0.64,0.941,5.0,-4.008,0.0,0.0893,0.0123,0.0,0.305,0.761,142.905,audio_features,65QVpgBIutEZlpDCi6AH2F,spotify:track:65QVpgBIutEZlpDCi6AH2F,https://api.spotify.com/v1/tracks/65QVpgBIutEZ...,https://api.spotify.com/v1/audio-analysis/65QV...,164880.0,4.0
367151,387126,spotify:track:5hZ2i7qmmDCzlDRtxoE5oh,0.722,0.891,5.0,-4.067,1.0,0.0529,0.204,0.00124,0.105,0.928,142.917,audio_features,5hZ2i7qmmDCzlDRtxoE5oh,spotify:track:5hZ2i7qmmDCzlDRtxoE5oh,https://api.spotify.com/v1/tracks/5hZ2i7qmmDCz...,https://api.spotify.com/v1/audio-analysis/5hZ2...,244605.0,4.0
465678,495793,spotify:track:7k6ZQrCSD98mFIDRatLyCj,0.538,0.972,5.0,-3.68,0.0,0.0398,0.000735,0.0,0.147,0.807,143.031,audio_features,7k6ZQrCSD98mFIDRatLyCj,spotify:track:7k6ZQrCSD98mFIDRatLyCj,https://api.spotify.com/v1/tracks/7k6ZQrCSD98m...,https://api.spotify.com/v1/audio-analysis/7k6Z...,208453.0,4.0
262562,273368,spotify:track:4c6gceS55MEjEXz4FI0yet,0.665,0.795,5.0,-4.222,1.0,0.0868,0.0125,1.6e-05,0.147,0.705,142.895,audio_features,4c6gceS55MEjEXz4FI0yet,spotify:track:4c6gceS55MEjEXz4FI0yet,https://api.spotify.com/v1/tracks/4c6gceS55MEj...,https://api.spotify.com/v1/audio-analysis/4c6g...,130293.0,4.0
408774,432890,spotify:track:0KP6ZRIx0ncKnzlm44S1s0,0.633,0.653,5.0,-3.966,1.0,0.0419,0.254,0.0,0.0731,0.722,143.133,audio_features,0KP6ZRIx0ncKnzlm44S1s0,spotify:track:0KP6ZRIx0ncKnzlm44S1s0,https://api.spotify.com/v1/tracks/0KP6ZRIx0ncK...,https://api.spotify.com/v1/audio-analysis/0KP6...,229893.0,4.0
715173,779830,spotify:track:4FjtXTWHozZLfxfMJLAmyA,0.591,0.733,5.0,-3.754,1.0,0.0286,0.271,0.000343,0.129,0.692,142.978,audio_features,4FjtXTWHozZLfxfMJLAmyA,spotify:track:4FjtXTWHozZLfxfMJLAmyA,https://api.spotify.com/v1/tracks/4FjtXTWHozZL...,https://api.spotify.com/v1/audio-analysis/4Fjt...,198413.0,4.0
888030,984732,spotify:track:0LFjHJnsZd1B6KPnvTmIVy,0.536,0.785,5.0,-4.08,1.0,0.0481,0.205,0.000736,0.15,0.649,143.14,audio_features,0LFjHJnsZd1B6KPnvTmIVy,spotify:track:0LFjHJnsZd1B6KPnvTmIVy,https://api.spotify.com/v1/tracks/0LFjHJnsZd1B...,https://api.spotify.com/v1/audio-analysis/0LFj...,270173.0,4.0
71906,73295,spotify:track:7hfLxeKcjVkaw2tgw5IH9Q,0.67,0.84,5.0,-3.913,0.0,0.0513,0.2,0.0,0.421,0.553,142.98,audio_features,7hfLxeKcjVkaw2tgw5IH9Q,spotify:track:7hfLxeKcjVkaw2tgw5IH9Q,https://api.spotify.com/v1/tracks/7hfLxeKcjVka...,https://api.spotify.com/v1/audio-analysis/7hfL...,232413.0,4.0


In [143]:
X_challenge_set[X_challenge_set['pid'] == 106052]
#X_test_track_info_ground_truth[X_test_track_info_ground_truth['pid'] == ]

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,num_edits,duration_ms,num_artists,description,num_of_seeds,num_withheld
0,Woo,False,106052,1433808000,184,172,2,7,45496690,145,,276,276


In [133]:
X_test_features.iloc[:2]

Unnamed: 0,tracks,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,0.904,0.813,4.0,-7.105,0.0,0.121,0.0311,0.00697,0.0471,0.81,125.461,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864.0,4.0
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,0.774,0.838,5.0,-3.914,0.0,0.114,0.0249,0.025,0.242,0.924,143.04,audio_features,6I9VzXrHxO9rA9A5euc8Ak,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,https://api.spotify.com/v1/tracks/6I9VzXrHxO9r...,https://api.spotify.com/v1/audio-analysis/6I9V...,198800.0,4.0
