In [1]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import os
import sklearn
import sklearn.neighbors
import warnings
warnings.filterwarnings('ignore')

# constants
RAW_DATA_PATH = 'raw_data'
DATAFRAME_PATH = 'dataframes'

### Create the playlists dataframe from raw JSON data

In [2]:
def make_playlist_dfs(path):
    playlists = []
    playlist_tracks = []
    for file in tqdm(os.listdir(path)):
        if not file.endswith(".json"):
            continue
        with open(f'{path}/{file}') as f:
            js_slice = json.load(f)
            for playlist in js_slice['playlists']:
                for track in playlist['tracks']:
                    playlist_tracks.append([track['track_uri'], playlist['pid'], track['pos']])
                playlist.pop('tracks')
                playlists.append(playlist)
    playlists_df = pd.DataFrame(playlists)
    playlist_tracks_df = pd.DataFrame(playlist_tracks, columns=['track_uri', 'pid', 'pos'])
    playlist_tracks_df.set_index('pid')
    print('Storing h5 files...')
    playlists_df.to_hdf(DATAFRAME_PATH + '/playlists.h5', 'playlists')
    playlist_tracks_df.to_hdf(DATAFRAME_PATH + '/playlist_tracks.h5', 'playlist_tracks')
    print('Stored files as h5')

make_playlist_dfs(RAW_DATA_PATH);

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00,  2.49it/s]


Storing h5 files...
Stored files as h5


### Read the playlists, tracks and track_info dataframes

In [3]:
playlists_df = pd.read_hdf(DATAFRAME_PATH + '/playlists.h5', 'playlists')
playlist_tracks_df = pd.read_hdf(DATAFRAME_PATH + '/playlist_tracks.h5', 'playlist_tracks')
tracks_df = pd.read_hdf(DATAFRAME_PATH + '/tracks.h5', 'tracks')

## Building the train and test sets

In [4]:
train_playlists, test_playlists = train_test_split(playlists_df, test_size=0.2)

# Modifying the test set
train_playlist_tracks = playlist_tracks_df[playlist_tracks_df['pid'].isin(train_playlists['pid'].values)].reset_index(drop=True)
test_playlist_tracks = playlist_tracks_df[playlist_tracks_df['pid'].isin(test_playlists['pid'].values)].reset_index(drop=True)

train_tracks = tracks_df[tracks_df['track_uri'].isin(train_playlist_tracks['track_uri'].values)].drop_duplicates(subset=['track_uri'],ignore_index=True)
test_tracks = tracks_df[tracks_df['track_uri'].isin(test_playlist_tracks['track_uri'].values)].drop_duplicates(subset=['track_uri'],ignore_index=True)

# Keep only tracks that are in the training set in the test set
def filter_playlist_tracks(all_tracks, tracks):
    track_uris = pd.unique(all_tracks['track_uri'])
    return tracks[tracks['track_uri'].isin(track_uris)].reset_index(drop=True)

test_playlist_tracks_ground_truth = filter_playlist_tracks(train_playlist_tracks, test_playlist_tracks)

In [5]:
# some assertions on the ground truth set
assert test_playlist_tracks_ground_truth[test_playlist_tracks_ground_truth['track_uri'].isin(train_tracks['track_uri']) == False].empty

In [6]:
# This cell can take long to run ~10mins
# if .h5 files are available, use those
def frac_to_sample(playlist_tracks):
    if playlist_tracks.size >= 2:
        return 0.5
    else:
        return None
        
def build_challenge_set():
    challenge_set_list = []
    new_test_set = []
    for pid in tqdm(test_playlists['pid']):
        playlist_tracks = test_playlist_tracks_ground_truth[test_playlist_tracks_ground_truth['pid'] == pid]
        frac = frac_to_sample(playlist_tracks)
        if frac is not None and frac > 0:
            random_sample_df = playlist_tracks.sample(frac = frac, random_state=1)
            num_sample = random_sample_df.size
            new_test_set.append(test_playlists[test_playlists['pid'] == pid].values.flatten().tolist() + [num_sample, playlist_tracks.size-num_sample])
            for index, track in random_sample_df.iterrows():
                challenge_set_list.append([track['track_uri'], track['pid'], track['pos']])
    _test = pd.DataFrame(new_test_set, columns=list(test_playlists.columns) + ['num_of_seeds', 'num_withheld'])
    _tracks = pd.DataFrame(challenge_set_list, columns=list(test_playlist_tracks_ground_truth.columns))
    return _test, _tracks

In [7]:
test_playlists_with_seed_info, test_playlist_tracks_incomplete = build_challenge_set()
test_tracks_incomplete = test_tracks[test_tracks['track_uri'].isin(test_playlist_tracks_incomplete['track_uri'])].reset_index(drop=True)

# Write challenge set as hdf
test_playlists_with_seed_info.to_hdf(DATAFRAME_PATH + '/test_playlists_with_seed_info.h5', 'test_playlists_with_seed_info')
test_playlist_tracks_incomplete.to_hdf(DATAFRAME_PATH + '/test_playlist_tracks_incomplete.h5', 'test_playlist_tracks_incomplete')
test_tracks_incomplete.to_hdf(DATAFRAME_PATH + '/test_tracks_incomplete.h5', 'test_tracks_incomplete')

100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:08<00:00, 227.69it/s]


In [8]:
# Load test dataframes
test_playlists_with_seed_info = pd.read_hdf(DATAFRAME_PATH + '/test_playlists_with_seed_info.h5', 'test_playlists_with_seed_info')
test_playlist_tracks_incomplete = pd.read_hdf(DATAFRAME_PATH + '/test_playlist_tracks_incomplete.h5', 'test_playlist_tracks_incomplete')

In [9]:
# some assertions on the test set
assert test_playlists_with_seed_info[test_playlist_tracks_incomplete['pid'].isin(test_playlists_with_seed_info['pid']) == False].empty
assert test_playlists_with_seed_info[test_playlist_tracks_incomplete['track_uri'].isin(train_tracks['track_uri']) == False].empty
assert test_playlist_tracks_incomplete.size < test_playlist_tracks_ground_truth.size

## Building the model

In [10]:
# hyperparameters
NUM_NEIGHBORS = max(test_playlists_with_seed_info['num_withheld'])
SELECTED_FEATURES = ['danceability', 'energy', 'key', 'loudness',
                     'speechiness', 'acousticness', 'instrumentalness',
                     'liveness', 'valence', 'tempo']

In [11]:
knn_clf = sklearn.neighbors.NearestNeighbors(n_neighbors=NUM_NEIGHBORS)
knn_clf.fit(train_tracks[SELECTED_FEATURES])
distances, indices = knn_clf.kneighbors(test_tracks_incomplete[SELECTED_FEATURES])

In [12]:
test_playlist_tracks_incomplete_with_features = test_playlist_tracks_incomplete.merge(test_tracks_incomplete, how='outer', on='track_uri').dropna()
test_playlist_tracks_incomplete_with_features.head()

Unnamed: 0,track_uri,pid,pos,danceability,energy,key,loudness,mode,speechiness,acousticness,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,spotify:track:3cfOd4CMv2snFaKAnMdnvK,3644,63,0.731,0.867,11,-5.881,1,0.032,0.0395,...,0.0861,0.776,104.019,audio_features,3cfOd4CMv2snFaKAnMdnvK,spotify:track:3cfOd4CMv2snFaKAnMdnvK,https://api.spotify.com/v1/tracks/3cfOd4CMv2sn...,https://api.spotify.com/v1/audio-analysis/3cfO...,200373,4
1,spotify:track:3cfOd4CMv2snFaKAnMdnvK,7115,104,0.731,0.867,11,-5.881,1,0.032,0.0395,...,0.0861,0.776,104.019,audio_features,3cfOd4CMv2snFaKAnMdnvK,spotify:track:3cfOd4CMv2snFaKAnMdnvK,https://api.spotify.com/v1/tracks/3cfOd4CMv2sn...,https://api.spotify.com/v1/audio-analysis/3cfO...,200373,4
2,spotify:track:3cfOd4CMv2snFaKAnMdnvK,9609,23,0.731,0.867,11,-5.881,1,0.032,0.0395,...,0.0861,0.776,104.019,audio_features,3cfOd4CMv2snFaKAnMdnvK,spotify:track:3cfOd4CMv2snFaKAnMdnvK,https://api.spotify.com/v1/tracks/3cfOd4CMv2sn...,https://api.spotify.com/v1/audio-analysis/3cfO...,200373,4
3,spotify:track:3cfOd4CMv2snFaKAnMdnvK,9665,38,0.731,0.867,11,-5.881,1,0.032,0.0395,...,0.0861,0.776,104.019,audio_features,3cfOd4CMv2snFaKAnMdnvK,spotify:track:3cfOd4CMv2snFaKAnMdnvK,https://api.spotify.com/v1/tracks/3cfOd4CMv2sn...,https://api.spotify.com/v1/audio-analysis/3cfO...,200373,4
4,spotify:track:3cfOd4CMv2snFaKAnMdnvK,3391,48,0.731,0.867,11,-5.881,1,0.032,0.0395,...,0.0861,0.776,104.019,audio_features,3cfOd4CMv2snFaKAnMdnvK,spotify:track:3cfOd4CMv2snFaKAnMdnvK,https://api.spotify.com/v1/tracks/3cfOd4CMv2sn...,https://api.spotify.com/v1/audio-analysis/3cfO...,200373,4


In [13]:
# read playlist from challenge set
def calc_mean_playlist_features():
    list_mean = []
    #X_challenge_track_info_features[['pid',*selected_features]].group_by('pid').mean()
    for _, playlist in test_playlists_with_seed_info.iterrows():
        tracks_with_features_in_playlist = test_playlist_tracks_incomplete_with_features[test_playlist_tracks_incomplete_with_features['pid'] == playlist['pid']]
        mean = tracks_with_features_in_playlist[SELECTED_FEATURES].mean()
        mean['pid'] = playlist['pid']
        list_mean.append(mean) # store for all pids

    return pd.DataFrame(data=list_mean , columns=[*SELECTED_FEATURES, 'pid']).astype({'pid': int})

In [14]:
mean_playlist_features = test_playlist_tracks_incomplete_with_features[['pid',*SELECTED_FEATURES]].groupby('pid').mean().reset_index()
mean_playlist_features

Unnamed: 0,pid,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,7,0.654400,0.610000,3.933333,-7.210133,0.104107,0.358913,0.035608,0.106713,0.546267,128.114333
1,8,0.546955,0.692545,4.772727,-5.963318,0.044318,0.183301,0.027980,0.163373,0.562864,125.180864
2,9,0.622875,0.642500,5.625000,-7.329625,0.065475,0.288013,0.001783,0.205550,0.556125,115.524375
3,12,0.608000,0.666250,4.250000,-7.345750,0.072200,0.286125,0.000051,0.119475,0.595750,138.397500
4,13,0.504167,0.486333,7.000000,-7.126667,0.046650,0.484300,0.000176,0.134917,0.327833,135.033000
...,...,...,...,...,...,...,...,...,...,...,...
1981,9984,0.565346,0.450346,4.576923,-9.328038,0.053262,0.535023,0.002838,0.157385,0.357619,122.219154
1982,9991,0.407260,0.831600,3.700000,-5.928300,0.052650,0.023999,0.161257,0.184050,0.538800,119.667600
1983,9992,0.561227,0.652409,5.681818,-6.749864,0.061923,0.172514,0.064375,0.126218,0.504091,133.686455
1984,9997,0.515667,0.930000,3.666667,-5.271500,0.101367,0.143809,0.000000,0.162100,0.665500,142.275167


In [15]:
distances, indices = knn_clf.kneighbors(mean_playlist_features[SELECTED_FEATURES])

In [43]:
#print(distances[0])
#print(indices.shape)
#print(mean_playlist_features.loc[0])
#display(test_playlist_tracks_incomplete_with_features[test_playlist_tracks_incomplete_with_features['pid']==3])
#train_tracks.iloc[indices[0]]
#lookup = test_playlists_with_seed_info[['pid', 'num_withheld']]

#test_playlists_with_seed_info['pid'].isin(mean_playlist_features['pid'])
#display(test_playlists_with_seed_info)
test_mean_playlist_merge = mean_playlist_features.merge(test_playlists_with_seed_info, how='inner', on='pid')
def get_predicted_tracks():
    for index, row in test_mean_playlist_merge.iterrows():
        #todo maybe add distances later
        predicted_tracks = train_tracks['track_uri'].iloc[indices[index]].iloc[:row['num_withheld']]
        for pos, predicted_track in enumerate(predicted_tracks):
            yield predicted_track, row['pid'],pos
predicted_test_playlist_tracks_df = pd.DataFrame(get_predicted_tracks(), columns =['track_uri', 'pid', 'pos'])
predicted_test_playlist_tracks_df

Unnamed: 0,track_uri,pid,pos
0,spotify:track:3M5RtAgwNqvcvOE3JiDd1R,7,0
1,spotify:track:1KTZw6NMt0rSIPYKYQNEWd,7,1
2,spotify:track:3mTL7NOIjGuSlrOTQUm48k,7,2
3,spotify:track:0jExzGwfnZ1vTuCVB7w3vM,7,3
4,spotify:track:2mAbcNoNnqX10r0rO9RWQ2,7,4
...,...,...,...
161347,spotify:track:4v33q9Cc7tO6be7MsiKpSQ,9998,13
161348,spotify:track:6UjnmHibyptIa3IEck0Hrd,9998,14
161349,spotify:track:4xo7Fc7BSRHYJVFC1Upde2,9998,15
161350,spotify:track:5t4tY8hfO0AjslnDyBqTNw,9998,16


In [None]:
distances, indices = knn_clf.kneighbors(tracks_with_features_in_playlist[selected_features].mean())

In [None]:
tracks_with_features_in_playlist[selected_features].mean()

In [None]:
tracks_with_features_in_playlist

In [None]:
playlists_mean_features.info()