In [2]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import os
import sklearn
import sklearn.neighbors
import warnings
warnings.filterwarnings('ignore')

# constants
RAW_DATA_PATH = 'raw_data'
DATAFRAME_PATH = 'dataframes'

### Create the playlists dataframe from raw JSON data

In [3]:
def make_playlist_dfs(path):
    playlists = []
    playlist_tracks = []
    for file in tqdm(os.listdir(path)):
        if not file.endswith(".json"):
            continue
        with open(f'{path}/{file}') as f:
            js_slice = json.load(f)
            for playlist in js_slice['playlists']:
                for track in playlist['tracks']:
                    playlist_tracks.append([track['track_uri'], playlist['pid'], track['pos']])
                playlist.pop('tracks')
                playlists.append(playlist)
    playlists_df = pd.DataFrame(playlists)
    playlist_tracks_df = pd.DataFrame(playlist_tracks, columns=['track_uri', 'pid', 'pos'])
    playlist_tracks_df.set_index('pid')
    print('Storing h5 files...')
    playlists_df.to_hdf(DATAFRAME_PATH + '/playlists.h5', 'playlists')
    playlist_tracks_df.to_hdf(DATAFRAME_PATH + '/playlist_tracks.h5', 'playlist_tracks')
    print('Stored files as h5')

make_playlist_dfs(RAW_DATA_PATH);

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:06<00:00,  1.74it/s]


Storing h5 files...
Stored files as h5


### Read the playlists, tracks and track_info dataframes

In [4]:
playlists_df = pd.read_hdf(DATAFRAME_PATH + '/playlists.h5', 'playlists')
playlist_tracks_df = pd.read_hdf(DATAFRAME_PATH + '/playlist_tracks.h5', 'playlist_tracks')
tracks_df = pd.read_hdf(DATAFRAME_PATH + '/tracks.h5', 'tracks')

## Building the train and test sets

In [5]:
train_playlists, test_playlists = train_test_split(playlists_df, test_size=0.2)

# Modifying the test set
train_playlist_tracks = playlist_tracks_df[playlist_tracks_df['pid'].isin(train_playlists['pid'].values)].reset_index(drop=True)
test_playlist_tracks = playlist_tracks_df[playlist_tracks_df['pid'].isin(test_playlists['pid'].values)].reset_index(drop=True)

train_tracks = tracks_df[tracks_df['track_uri'].isin(train_playlist_tracks['track_uri'].values)].drop_duplicates(subset=['track_uri'],ignore_index=True)
test_tracks = tracks_df[tracks_df['track_uri'].isin(test_playlist_tracks['track_uri'].values)].drop_duplicates(subset=['track_uri'],ignore_index=True)

# Keep only tracks that are in the training set in the test set
def filter_playlist_tracks(all_tracks, tracks):
    track_uris = pd.unique(all_tracks['track_uri'])
    return tracks[tracks['track_uri'].isin(track_uris)].reset_index(drop=True)

test_playlist_tracks_ground_truth = filter_playlist_tracks(train_playlist_tracks, test_playlist_tracks)

In [6]:
# some assertions on the ground truth set
assert test_playlist_tracks_ground_truth[test_playlist_tracks_ground_truth['track_uri'].isin(train_tracks['track_uri']) == False].empty

In [7]:
# This cell can take long to run ~10mins
# if .h5 files are available, use those
def frac_to_sample(playlist_tracks):
    if playlist_tracks.size >= 2:
        return 0.5
    else:
        return None
        
def build_challenge_set():
    challenge_set_list = []
    new_test_set = []
    for pid in tqdm(test_playlists['pid']):
        playlist_tracks = test_playlist_tracks_ground_truth[test_playlist_tracks_ground_truth['pid'] == pid]
        frac = frac_to_sample(playlist_tracks)
        if frac is not None and frac > 0:
            random_sample_df = playlist_tracks.sample(frac = frac, random_state=1)
            num_sample = random_sample_df.size
            new_test_set.append(test_playlists[test_playlists['pid'] == pid].values.flatten().tolist() + [num_sample, playlist_tracks.size-num_sample])
            for index, track in random_sample_df.iterrows():
                challenge_set_list.append([track['track_uri'], track['pid'], track['pos']])
    _test = pd.DataFrame(new_test_set, columns=list(test_playlists.columns) + ['num_of_seeds', 'num_withheld'])
    _tracks = pd.DataFrame(challenge_set_list, columns=list(test_playlist_tracks_ground_truth.columns))
    return _test, _tracks

In [8]:
test_playlists_with_seed_info, test_playlist_tracks_incomplete = build_challenge_set()
test_tracks_incomplete = test_tracks[test_tracks['track_uri'].isin(test_playlist_tracks_incomplete['track_uri'])].reset_index(drop=True)

# Write challenge set as hdf
test_playlists_with_seed_info.to_hdf(DATAFRAME_PATH + '/test_playlists_with_seed_info.h5', 'test_playlists_with_seed_info')
test_playlist_tracks_incomplete.to_hdf(DATAFRAME_PATH + '/test_playlist_tracks_incomplete.h5', 'test_playlist_tracks_incomplete')
test_tracks_incomplete.to_hdf(DATAFRAME_PATH + '/test_tracks_incomplete.h5', 'test_tracks_incomplete')

100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:11<00:00, 180.15it/s]


In [9]:
# Load test dataframes
test_playlists_with_seed_info = pd.read_hdf(DATAFRAME_PATH + '/test_playlists_with_seed_info.h5', 'test_playlists_with_seed_info')
test_playlist_tracks_incomplete = pd.read_hdf(DATAFRAME_PATH + '/test_playlist_tracks_incomplete.h5', 'test_playlist_tracks_incomplete')

In [10]:
# some assertions on the test set
assert test_playlists_with_seed_info[test_playlist_tracks_incomplete['pid'].isin(test_playlists_with_seed_info['pid']) == False].empty
assert test_playlists_with_seed_info[test_playlist_tracks_incomplete['track_uri'].isin(train_tracks['track_uri']) == False].empty
assert test_playlist_tracks_incomplete.size < test_playlist_tracks_ground_truth.size

## Building the model

In [11]:
# hyperparameters
NUM_NEIGHBORS = max(test_playlists_with_seed_info['num_withheld'])
SELECTED_FEATURES = ['danceability', 'energy', 'key', 'loudness',
                     'speechiness', 'acousticness', 'instrumentalness',
                     'liveness', 'valence', 'tempo']

In [12]:
knn_clf = sklearn.neighbors.NearestNeighbors(n_neighbors=NUM_NEIGHBORS)
knn_clf.fit(train_tracks[SELECTED_FEATURES])
distances, indices = knn_clf.kneighbors(test_tracks_incomplete[SELECTED_FEATURES])

In [13]:
test_playlist_tracks_incomplete_with_features = test_playlist_tracks_incomplete.merge(test_tracks_incomplete, how='outer', on='track_uri').dropna()
test_playlist_tracks_incomplete_with_features.head()

Unnamed: 0,track_uri,pid,pos,danceability,energy,key,loudness,mode,speechiness,acousticness,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,spotify:track:6Ymvlzom4TQeoKqAWsZRD8,8471,34,0.554,0.794,11,-5.417,1,0.035,0.0697,...,0.63,0.79,176.01,audio_features,6Ymvlzom4TQeoKqAWsZRD8,spotify:track:6Ymvlzom4TQeoKqAWsZRD8,https://api.spotify.com/v1/tracks/6Ymvlzom4TQe...,https://api.spotify.com/v1/audio-analysis/6Ymv...,213827,4
1,spotify:track:6Ymvlzom4TQeoKqAWsZRD8,4510,120,0.554,0.794,11,-5.417,1,0.035,0.0697,...,0.63,0.79,176.01,audio_features,6Ymvlzom4TQeoKqAWsZRD8,spotify:track:6Ymvlzom4TQeoKqAWsZRD8,https://api.spotify.com/v1/tracks/6Ymvlzom4TQe...,https://api.spotify.com/v1/audio-analysis/6Ymv...,213827,4
2,spotify:track:6Ymvlzom4TQeoKqAWsZRD8,1828,23,0.554,0.794,11,-5.417,1,0.035,0.0697,...,0.63,0.79,176.01,audio_features,6Ymvlzom4TQeoKqAWsZRD8,spotify:track:6Ymvlzom4TQeoKqAWsZRD8,https://api.spotify.com/v1/tracks/6Ymvlzom4TQe...,https://api.spotify.com/v1/audio-analysis/6Ymv...,213827,4
3,spotify:track:6Ymvlzom4TQeoKqAWsZRD8,9998,3,0.554,0.794,11,-5.417,1,0.035,0.0697,...,0.63,0.79,176.01,audio_features,6Ymvlzom4TQeoKqAWsZRD8,spotify:track:6Ymvlzom4TQeoKqAWsZRD8,https://api.spotify.com/v1/tracks/6Ymvlzom4TQe...,https://api.spotify.com/v1/audio-analysis/6Ymv...,213827,4
4,spotify:track:6Ymvlzom4TQeoKqAWsZRD8,4571,45,0.554,0.794,11,-5.417,1,0.035,0.0697,...,0.63,0.79,176.01,audio_features,6Ymvlzom4TQeoKqAWsZRD8,spotify:track:6Ymvlzom4TQeoKqAWsZRD8,https://api.spotify.com/v1/tracks/6Ymvlzom4TQe...,https://api.spotify.com/v1/audio-analysis/6Ymv...,213827,4


In [14]:
# read playlist from challenge set
def calc_mean_playlist_features():
    list_mean = []
    #X_challenge_track_info_features[['pid',*selected_features]].group_by('pid').mean()
    for _, playlist in test_playlists_with_seed_info.iterrows():
        tracks_with_features_in_playlist = test_playlist_tracks_incomplete_with_features[test_playlist_tracks_incomplete_with_features['pid'] == playlist['pid']]
        mean = tracks_with_features_in_playlist[SELECTED_FEATURES].mean()
        mean['pid'] = playlist['pid']
        list_mean.append(mean) # store for all pids

    return pd.DataFrame(data=list_mean , columns=[*SELECTED_FEATURES, 'pid']).astype({'pid': int})

In [15]:
mean_playlist_features = test_playlist_tracks_incomplete_with_features[['pid',*SELECTED_FEATURES]].groupby('pid').mean().reset_index()
mean_playlist_features

Unnamed: 0,pid,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,2,0.709875,0.730000,4.625000,-4.944438,0.109181,0.229241,0.000052,0.155244,0.597500,115.518750
1,13,0.415200,0.379400,9.000000,-7.839800,0.040580,0.611960,0.000206,0.088660,0.322400,128.684600
2,17,0.579389,0.771389,4.666667,-5.536611,0.057969,0.135930,0.000583,0.163344,0.529333,127.628778
3,25,0.662842,0.803579,5.368421,-4.281316,0.105605,0.136782,0.111428,0.205332,0.461895,126.492000
4,29,0.717450,0.627500,5.050000,-5.903400,0.089425,0.134584,0.000382,0.119005,0.570200,112.432500
...,...,...,...,...,...,...,...,...,...,...,...
1977,9990,0.501235,0.923706,4.352941,-3.972941,0.116071,0.008339,0.000365,0.282365,0.366971,125.157824
1978,9993,0.634222,0.673500,5.611111,-6.111722,0.112689,0.211804,0.020001,0.131339,0.487278,116.146778
1979,9994,0.417176,0.603912,5.647059,-6.046059,0.036329,0.215632,0.001169,0.144547,0.283588,133.744294
1980,9995,0.645187,0.777187,4.312500,-5.401187,0.279000,0.157517,0.001185,0.229294,0.459275,122.796250


In [16]:
distances, indices = knn_clf.kneighbors(mean_playlist_features[SELECTED_FEATURES])

In [17]:
#print(distances[0])
#print(indices.shape)
#print(mean_playlist_features.loc[0])
#display(test_playlist_tracks_incomplete_with_features[test_playlist_tracks_incomplete_with_features['pid']==3])
#train_tracks.iloc[indices[0]]
#lookup = test_playlists_with_seed_info[['pid', 'num_withheld']]

#test_playlists_with_seed_info['pid'].isin(mean_playlist_features['pid'])
#display(test_playlists_with_seed_info)
test_mean_playlist_merge = mean_playlist_features.merge(test_playlists_with_seed_info, how='inner', on='pid')
def get_predicted_tracks():
    for index, row in test_mean_playlist_merge.iterrows():
        #todo maybe add distances later
        predicted_tracks = train_tracks['track_uri'].iloc[indices[index]].iloc[:row['num_withheld']]
        for pos, predicted_track in enumerate(predicted_tracks):
            yield predicted_track, row['pid'],pos
predicted_test_playlist_tracks_df = pd.DataFrame(get_predicted_tracks(), columns =['track_uri', 'pid', 'pos'])
predicted_test_playlist_tracks_df

Unnamed: 0,track_uri,pid,pos
0,spotify:track:6huUiyerxlQbOYztjSwkGj,2,0
1,spotify:track:4vlL7HTI5eVGBYOb1w7Goi,2,1
2,spotify:track:3aO3kPR7GlZnhb3vGtwytz,2,2
3,spotify:track:0MA08kUVpOjW8DoWwds7Gn,2,3
4,spotify:track:44zTVdOJfcmY8eVdO5cG8A,2,4
...,...,...,...
167050,spotify:track:4v33q9Cc7tO6be7MsiKpSQ,9998,13
167051,spotify:track:6UjnmHibyptIa3IEck0Hrd,9998,14
167052,spotify:track:4xo7Fc7BSRHYJVFC1Upde2,9998,15
167053,spotify:track:5t4tY8hfO0AjslnDyBqTNw,9998,16


In [18]:
distances, indices = knn_clf.kneighbors(tracks_with_features_in_playlist[selected_features].mean())

NameError: name 'tracks_with_features_in_playlist' is not defined

In [None]:
tracks_with_features_in_playlist[selected_features].mean()

In [None]:
tracks_with_features_in_playlist

In [None]:
playlists_mean_features.info()