In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import os
from sklearn.model_selection import train_test_split
import sklearn
import sklearn.neighbors

# constants
RAW_DATA_PATH = 'raw_data'
DATAFRAME_PATH = 'dataframes'


### Create the playlists dataframe from raw JSON data

In [2]:
def make_playlist_dfs(path):
    playlists = []
    playlist_tracks = []
    for file in tqdm(os.listdir(path)):
        if not file.endswith(".json"):
            continue
        with open(f'{path}/{file}') as f:
            js_slice = json.load(f)
            for playlist in js_slice['playlists']:
                for track in playlist['tracks']:
                    playlist_tracks.append([track['track_uri'], playlist['pid'], track['pos']])
                playlist.pop('tracks')
                playlists.append(playlist)
    playlists_df = pd.DataFrame(playlists)
    playlist_tracks_df = pd.DataFrame(playlist_tracks, columns=['track_uri', 'pid', 'pos'])
    playlist_tracks_df.set_index('pid')
    print('Storing h5 files...')
    playlists_df.to_hdf(DATAFRAME_PATH + '/playlists.h5', 'playlists')
    playlist_tracks_df.to_hdf(DATAFRAME_PATH + '/playlist_tracks.h5', 'playlist_tracks')
    print('Stored files as h5')

make_playlist_dfs(RAW_DATA_PATH);

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00,  2.50it/s]


Storing h5 files...


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['name', 'collaborative', 'description'], dtype='object')]

  pytables.to_hdf(


Stored files as h5


### Read the playlists, tracks and track_info dataframes

In [3]:
playlists_df = pd.read_hdf(DATAFRAME_PATH + '/playlists.h5', 'playlists')
playlist_tracks_df = pd.read_hdf(DATAFRAME_PATH + '/playlist_tracks.h5', 'playlist_tracks')
tracks_df = pd.read_hdf(DATAFRAME_PATH + '/tracks.h5', 'tracks')

## Building the train and test sets

In [4]:
train_playlists, test_playlists = train_test_split(playlists_df, test_size=0.2)

# Modifying the test set
train_playlist_tracks = playlist_tracks_df[playlist_tracks_df['pid'].isin(train_playlists['pid'].values)].reset_index(drop=True)
test_playlist_tracks = playlist_tracks_df[playlist_tracks_df['pid'].isin(test_playlists['pid'].values)].reset_index(drop=True)

train_tracks = tracks_df[tracks_df['track_uri'].isin(train_playlist_tracks['track_uri'].values)].drop_duplicates(subset=['track_uri'],ignore_index=True)
test_tracks = tracks_df[tracks_df['track_uri'].isin(test_playlist_tracks['track_uri'].values)].drop_duplicates(subset=['track_uri'],ignore_index=True)

# Keep only tracks that are in the training set in the test set
def filter_playlist_tracks(all_tracks, tracks):
    track_uris = pd.unique(all_tracks['track_uri'])
    return tracks[tracks['track_uri'].isin(track_uris)].reset_index(drop=True)

test_playlist_tracks_ground_truth = filter_playlist_tracks(train_playlist_tracks, test_playlist_tracks)

In [5]:
# some assertions on the ground truth set
assert test_playlist_tracks_ground_truth[test_playlist_tracks_ground_truth['track_uri'].isin(train_tracks['track_uri']) == False].empty

In [6]:
# This cell can take long to run ~10mins
# if .h5 files are available, use those
def frac_to_sample(playlist_tracks):
    if playlist_tracks.size >= 2:
        return 0.5
    else:
        return None
        
def build_challenge_set():
    challenge_set_list = []
    new_test_set = []
    for pid in tqdm(test_playlists['pid']):
        playlist_tracks = test_playlist_tracks_ground_truth[test_playlist_tracks_ground_truth['pid'] == pid]
        frac = frac_to_sample(playlist_tracks)
        if frac is not None and frac > 0:
            random_sample_df = playlist_tracks.sample(frac = frac, random_state=1)
            num_sample = random_sample_df.size
            new_test_set.append(test_playlists[test_playlists['pid'] == pid].values.flatten().tolist() + [num_sample, playlist_tracks.size-num_sample])
            for index, track in random_sample_df.iterrows():
                challenge_set_list.append([track['track_uri'], track['pid'], track['pos']])
    _test = pd.DataFrame(new_test_set, columns=list(test_playlists.columns) + ['num_of_seeds', 'num_withheld'])
    _tracks = pd.DataFrame(challenge_set_list, columns=list(test_playlist_tracks_ground_truth.columns))
    return _test, _tracks

In [12]:
test_playlists_with_seed_info, test_tracks_incomplete = build_challenge_set()
# Write challenge set as hdf
test_playlists.to_hdf(DATAFRAME_PATH + '/test_playlists.h5', 'test_playlists')
test_tracks_incomplete.to_hdf(DATAFRAME_PATH + '/test_tracks_incomplete.h5', 'test_tracks_incomplete')

100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:08<00:00, 225.24it/s]
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['name', 'collaborative', 'description'], dtype='object')]

  pytables.to_hdf(


In [13]:
# Load test dataframes
test_playlists = pd.read_hdf(DATAFRAME_PATH + '/test_playlists.h5', 'test_playlists')
test_tracks_incomplete = pd.read_hdf(DATAFRAME_PATH + '/test_tracks_incomplete.h5', 'test_tracks_incomplete')

In [15]:
# some assertions on the test set
assert test_playlists[test_tracks_incomplete['pid'].isin(test_playlists['pid']) == False].empty
assert test_playlists[test_tracks_incomplete['track_uri'].isin(train_tracks['track_uri']) == False].empty
assert test_tracks_incomplete.size < test_playlist_tracks_ground_truth.size

  assert test_playlists[test_tracks_incomplete['pid'].isin(test_playlists['pid']) == False].empty
  assert test_playlists[test_tracks_incomplete['track_uri'].isin(train_tracks['track_uri']) == False].empty


## Building the model

In [16]:
# hyperparameters
NUM_NEIGHBORS = 10
SELECTED_FEATURES = ['danceability', 'energy', 'key', 'loudness',
                     'speechiness', 'acousticness', 'instrumentalness',
                     'liveness', 'valence', 'tempo']

In [None]:
knn_clf = sklearn.neighbors.NearestNeighbors(n_neighbors=NUM_NEIGHBORS)
knn_clf.fit(train_tracks[SELECTED_FEATURES])
distances, indices = knn_clf.kneighbors(test_tracks_incomplete[SELECTED_FEATURES])

In [None]:
#X_test_features.loc[0]
#X_train_features.iloc[indices[0]].index
test_idx = [0, 292891, 176330, 785063, 347886]
X_train_features.reset_index().iloc[indices[1]]

In [None]:
X_challenge_set[X_challenge_set['pid'] == 106052]
#X_test_track_info_ground_truth[X_test_track_info_ground_truth['pid'] == ]

In [None]:
X_test_features.iloc[:2]

In [None]:
tracks_df = tracks_df.rename(columns={'tracks':'track_uri'})
X_challenge_track_info_features = X_challenge_track_info.merge(tracks_df, how='outer', on='track_uri').dropna()
X_challenge_track_info_features.head()

In [None]:
# function to read playlist from challenge set


list_mean = []

for _, playlist in X_challenge_set.iterrows():
    tracks_with_features_in_playlist = X_challenge_track_info_features[X_challenge_track_info_features['pid'] == playlist['pid']]
    mean = tracks_with_features_in_playlist[selected_features].mean()
    mean['pid'] = playlist['pid']
#     print(mean)
    list_mean.append(mean)
    # store for all pids
    
playlists_mean_features = pd.DataFrame(data=list_mean , columns=[*selected_features, 'pid'])

In [None]:
distances, indices = knn_clf.kneighbors(tracks_with_features_in_playlist[selected_features])

In [None]:
print(distances)
print(indices)
X_train_features.reset_index().iloc[indices[0]]

In [None]:
distances, indices = knn_clf.kneighbors(tracks_with_features_in_playlist[selected_features].mean())

In [None]:
tracks_with_features_in_playlist[selected_features].mean()

In [None]:
tracks_with_features_in_playlist

In [None]:
playlists_mean_features.info()