In [1]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import os
import sklearn
import sklearn.neighbors
import warnings
warnings.filterwarnings('ignore')

# constants
RAW_DATA_PATH = 'raw_data'
DATAFRAME_PATH = 'dataframes'

### Create the playlists dataframe from raw JSON data

In [2]:
def make_playlist_dfs(path):
    playlists = []
    playlist_tracks = []
    for file in tqdm(os.listdir(path)):
        if not file.endswith(".json"):
            continue
        with open(f'{path}/{file}') as f:
            js_slice = json.load(f)
            for playlist in js_slice['playlists']:
                for track in playlist['tracks']:
                    playlist_tracks.append([track['track_uri'], playlist['pid'], track['pos']])
                playlist.pop('tracks')
                playlists.append(playlist)
    playlists_df = pd.DataFrame(playlists)
    playlist_tracks_df = pd.DataFrame(playlist_tracks, columns=['track_uri', 'pid', 'pos'])
    playlist_tracks_df.set_index('pid')
    print('Storing h5 files...')
    playlists_df.to_hdf(DATAFRAME_PATH + '/playlists.h5', 'playlists')
    playlist_tracks_df.to_hdf(DATAFRAME_PATH + '/playlist_tracks.h5', 'playlist_tracks')
    print('Stored files as h5')

make_playlist_dfs(RAW_DATA_PATH);

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:03<00:00,  3.05it/s]


Storing h5 files...
Stored files as h5


### Read the playlists, tracks and track_info dataframes

In [3]:
playlists_df = pd.read_hdf(DATAFRAME_PATH + '/playlists.h5', 'playlists')
playlist_tracks_df = pd.read_hdf(DATAFRAME_PATH + '/playlist_tracks.h5', 'playlist_tracks')
tracks_df = pd.read_hdf(DATAFRAME_PATH + '/tracks.h5', 'tracks')

## Building the train and test sets

In [4]:
train_playlists, test_playlists = train_test_split(playlists_df, test_size=0.2)

# Modifying the test set
train_playlist_tracks = playlist_tracks_df[playlist_tracks_df['pid'].isin(train_playlists['pid'].values)].reset_index(drop=True)
test_playlist_tracks = playlist_tracks_df[playlist_tracks_df['pid'].isin(test_playlists['pid'].values)].reset_index(drop=True)

train_tracks = tracks_df[tracks_df['track_uri'].isin(train_playlist_tracks['track_uri'].values)].drop_duplicates(subset=['track_uri'],ignore_index=True)
test_tracks = tracks_df[tracks_df['track_uri'].isin(test_playlist_tracks['track_uri'].values)].drop_duplicates(subset=['track_uri'],ignore_index=True)

# Keep only tracks that are in the training set in the test set
def filter_playlist_tracks(all_tracks, tracks):
    track_uris = pd.unique(all_tracks['track_uri'])
    return tracks[tracks['track_uri'].isin(track_uris)].reset_index(drop=True)

test_playlist_tracks_ground_truth = filter_playlist_tracks(train_playlist_tracks, test_playlist_tracks)

In [5]:
# some assertions on the ground truth set
assert test_playlist_tracks_ground_truth[test_playlist_tracks_ground_truth['track_uri'].isin(train_tracks['track_uri']) == False].empty

In [6]:
# This cell can take long to run ~10mins
# if .h5 files are available, use those
def frac_to_sample(playlist_tracks):
    if playlist_tracks.size >= 2:
        return 0.5
    else:
        return None
        
def build_challenge_set():
    challenge_set_list = []
    new_test_set = []
    for pid in tqdm(test_playlists['pid']):
        playlist_tracks = test_playlist_tracks_ground_truth[test_playlist_tracks_ground_truth['pid'] == pid]
        frac = frac_to_sample(playlist_tracks)
        if frac is not None and frac > 0:
            random_sample_df = playlist_tracks.sample(frac = frac, random_state=1)
            num_sample = random_sample_df.size
            new_test_set.append(test_playlists[test_playlists['pid'] == pid].values.flatten().tolist() + [num_sample, playlist_tracks.size-num_sample])
            for index, track in random_sample_df.iterrows():
                challenge_set_list.append([track['track_uri'], track['pid'], track['pos']])
    _test = pd.DataFrame(new_test_set, columns=list(test_playlists.columns) + ['num_of_seeds', 'num_withheld'])
    _tracks = pd.DataFrame(challenge_set_list, columns=list(test_playlist_tracks_ground_truth.columns))
    return _test, _tracks

In [7]:
test_playlists_with_seed_info, test_playlist_tracks_incomplete = build_challenge_set()
test_tracks_incomplete = test_tracks[test_tracks['track_uri'].isin(test_playlist_tracks_incomplete['track_uri'])].reset_index(drop=True)

# Write challenge set as hdf
test_playlists_with_seed_info.to_hdf(DATAFRAME_PATH + '/test_playlists_with_seed_info.h5', 'test_playlists_with_seed_info')
test_playlist_tracks_incomplete.to_hdf(DATAFRAME_PATH + '/test_playlist_tracks_incomplete.h5', 'test_playlist_tracks_incomplete')
test_tracks_incomplete.to_hdf(DATAFRAME_PATH + '/test_tracks_incomplete.h5', 'test_tracks_incomplete')

100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:10<00:00, 187.32it/s]


In [8]:
# Load test dataframes
test_playlists_with_seed_info = pd.read_hdf(DATAFRAME_PATH + '/test_playlists_with_seed_info.h5', 'test_playlists_with_seed_info')
test_playlist_tracks_incomplete = pd.read_hdf(DATAFRAME_PATH + '/test_playlist_tracks_incomplete.h5', 'test_playlist_tracks_incomplete')

In [9]:
# some assertions on the test set
assert test_playlists_with_seed_info[test_playlist_tracks_incomplete['pid'].isin(test_playlists_with_seed_info['pid']) == False].empty
assert test_playlists_with_seed_info[test_playlist_tracks_incomplete['track_uri'].isin(train_tracks['track_uri']) == False].empty
assert test_playlist_tracks_incomplete.size < test_playlist_tracks_ground_truth.size

## Building the model

In [10]:
# hyperparameters
NUM_NEIGHBORS = max(test_playlists_with_seed_info['num_withheld'])
SELECTED_FEATURES = ['danceability', 'energy', 'key', 'loudness',
                     'speechiness', 'acousticness', 'instrumentalness',
                     'liveness', 'valence', 'tempo']

In [14]:
knn_clf = sklearn.neighbors.NearestNeighbors(n_neighbors=NUM_NEIGHBORS)
knn_clf.fit(train_tracks[SELECTED_FEATURES])
distances, indices = knn_clf.kneighbors(test_tracks_incomplete[SELECTED_FEATURES])

In [19]:
test_playlist_tracks_incomplete_with_features = test_playlist_tracks_incomplete.merge(test_tracks_incomplete, how='outer', on='track_uri').dropna()
test_playlist_tracks_incomplete_with_features.head()

Unnamed: 0,track_uri,pid,pos,danceability,energy,key,loudness,mode,speechiness,acousticness,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,spotify:track:6zsk6uF3MxfIeHPlubKBvR,5644,68,0.797,0.75,0,-9.369,1,0.247,0.533,...,0.095,0.74,177.87,audio_features,6zsk6uF3MxfIeHPlubKBvR,spotify:track:6zsk6uF3MxfIeHPlubKBvR,https://api.spotify.com/v1/tracks/6zsk6uF3MxfI...,https://api.spotify.com/v1/audio-analysis/6zsk...,211120,4
1,spotify:track:6zsk6uF3MxfIeHPlubKBvR,5558,63,0.797,0.75,0,-9.369,1,0.247,0.533,...,0.095,0.74,177.87,audio_features,6zsk6uF3MxfIeHPlubKBvR,spotify:track:6zsk6uF3MxfIeHPlubKBvR,https://api.spotify.com/v1/tracks/6zsk6uF3MxfI...,https://api.spotify.com/v1/audio-analysis/6zsk...,211120,4
2,spotify:track:6zsk6uF3MxfIeHPlubKBvR,844,142,0.797,0.75,0,-9.369,1,0.247,0.533,...,0.095,0.74,177.87,audio_features,6zsk6uF3MxfIeHPlubKBvR,spotify:track:6zsk6uF3MxfIeHPlubKBvR,https://api.spotify.com/v1/tracks/6zsk6uF3MxfI...,https://api.spotify.com/v1/audio-analysis/6zsk...,211120,4
3,spotify:track:6zsk6uF3MxfIeHPlubKBvR,342,47,0.797,0.75,0,-9.369,1,0.247,0.533,...,0.095,0.74,177.87,audio_features,6zsk6uF3MxfIeHPlubKBvR,spotify:track:6zsk6uF3MxfIeHPlubKBvR,https://api.spotify.com/v1/tracks/6zsk6uF3MxfI...,https://api.spotify.com/v1/audio-analysis/6zsk...,211120,4
4,spotify:track:6zsk6uF3MxfIeHPlubKBvR,9576,17,0.797,0.75,0,-9.369,1,0.247,0.533,...,0.095,0.74,177.87,audio_features,6zsk6uF3MxfIeHPlubKBvR,spotify:track:6zsk6uF3MxfIeHPlubKBvR,https://api.spotify.com/v1/tracks/6zsk6uF3MxfI...,https://api.spotify.com/v1/audio-analysis/6zsk...,211120,4


In [35]:
# read playlist from challenge set
def calc_mean_playlist_features():
    list_mean = []
    #X_challenge_track_info_features[['pid',*selected_features]].group_by('pid').mean()
    for _, playlist in test_playlists_with_seed_info.iterrows():
        tracks_with_features_in_playlist = test_playlist_tracks_incomplete_with_features[test_playlist_tracks_incomplete_with_features['pid'] == playlist['pid']]
        mean = tracks_with_features_in_playlist[SELECTED_FEATURES].mean()
        mean['pid'] = playlist['pid']
        list_mean.append(mean) # store for all pids

    return pd.DataFrame(data=list_mean , columns=[*SELECTED_FEATURES, 'pid']).astype({'pid': int})

In [58]:
mean_playlist_features = test_playlist_tracks_incomplete_with_features[['pid',*SELECTED_FEATURES]].groupby('pid').mean().reset_index()
mean_playlist_features

Unnamed: 0,pid,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,3,0.581037,0.645333,4.111111,-8.720815,0.065704,0.275367,0.124497,0.170807,0.573337,117.653926
1,11,0.617848,0.594696,5.456522,-8.329609,0.084787,0.260596,0.243333,0.177602,0.453624,123.320587
2,19,0.479148,0.510593,4.888889,-9.603852,0.033681,0.511817,0.119397,0.160241,0.455104,117.400889
3,33,0.484500,0.598500,6.500000,-8.976167,0.051400,0.514167,0.002115,0.196167,0.695333,114.480167
4,41,0.563311,0.626511,5.411111,-6.231756,0.042593,0.230737,0.000277,0.162614,0.461698,120.524789
...,...,...,...,...,...,...,...,...,...,...,...
1978,9960,0.684654,0.701192,4.961538,-5.615000,0.094665,0.070044,0.000041,0.175696,0.461769,130.218038
1979,9970,0.695000,0.564571,4.785714,-10.693929,0.071471,0.385321,0.142991,0.143671,0.712857,98.089786
1980,9978,0.530700,0.336192,4.920000,-12.233380,0.047510,0.648688,0.111851,0.129834,0.315592,121.374860
1981,9980,0.707556,0.627778,6.148148,-6.474296,0.158470,0.110569,0.000263,0.209619,0.456937,115.237111


In [59]:
distances, indices = knn_clf.kneighbors(mean_playlist_features[SELECTED_FEATURES])

In [123]:
#print(distances[0])
#print(indices.shape)
#print(mean_playlist_features.loc[0])
#display(test_playlist_tracks_incomplete_with_features[test_playlist_tracks_incomplete_with_features['pid']==3])
#train_tracks.iloc[indices[0]]
#lookup = test_playlists_with_seed_info[['pid', 'num_withheld']]

#test_playlists_with_seed_info['pid'].isin(mean_playlist_features['pid'])
#display(test_playlists_with_seed_info)
something = mean_playlist_features.merge(test_playlists_with_seed_info, how='inner', on='pid')
sup = []
i=0
for index, row in something.iterrows():
    #todo maybe add distances later
    thing = train_tracks['track_uri'].iloc[indices[index]].iloc[:row['num_withheld']]
    sup.append(thing.values)
    i+=1
    
    if i==2:
        break

pd.DataFrame(sup) 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,125,126,127,128,129,130,131,132,133,134
0,spotify:track:0iXOe8tegtl30gIZfMNYJm,spotify:track:0ueePVrHQcNzHvKoqZk0If,spotify:track:11u7702Enw75RstlLzMIiu,spotify:track:1MboRagxib1PjprlE2iSyk,spotify:track:2425UlXvkIkAEJAtAd8ROm,spotify:track:0wEJG9mNTaexgrOt3Nh7QZ,spotify:track:70PR7KIOySkZ7KI5I8MM4n,spotify:track:504Ip38tLo9RbbSPIuThPB,spotify:track:5DSr1NMci58MEgS20vivSL,spotify:track:3ductzlSZ2xDOFpcJAK3gG,...,,,,,,,,,,
1,spotify:track:3g3LzMTxOoEZavij70rabA,spotify:track:3dP0pLbg9OfVwssDjp9aT0,spotify:track:3TXakfjtRwGXdQ6P3Lpi6t,spotify:track:5QiLLm0cS987RmgEA2FiF5,spotify:track:4VXwma08ch2u58EbcG1qlQ,spotify:track:4tCokCmOWOtKM2YxPCW1yd,spotify:track:37sNZIpr9A4meOYzXPnxxn,spotify:track:1bdXMstfxFWYSkEFTnJMoN,spotify:track:32eBMN4ScbkNMTIbidX81s,spotify:track:1B7rsWK4bD7UYhAVQq6Wbq,...,spotify:track:1kMOJBmpfWD9fir6st9uqS,spotify:track:7cE3gxNuHlSyY4g4ngpd13,spotify:track:04F1slSLfciNXAPZgdXtmW,spotify:track:3nkhgDddggi8CQZfINxrq9,spotify:track:3VT4cd1McNmex0VRTnU8vr,spotify:track:1KEjzrDvdFH0H0xxOjJqHl,spotify:track:6LlD5wT5aaVc87PzuuGRyq,spotify:track:1KkOD1eTIZfLG90kUdt2jh,spotify:track:69DK6q7i8W4gMd9CvuCvML,spotify:track:6RS9q1LmNfYfjifXUZfcFD


In [None]:
distances, indices = knn_clf.kneighbors(tracks_with_features_in_playlist[selected_features].mean())

In [None]:
tracks_with_features_in_playlist[selected_features].mean()

In [None]:
tracks_with_features_in_playlist

In [None]:
playlists_mean_features.info()