In [1]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import os
import sklearn
import sklearn.neighbors
from pathlib import Path
from pandas.api.types import CategoricalDtype
from operator import itemgetter
import matplotlib.pyplot as plt

from keras import backend as K
from keras import initializers
from keras.models import Sequential, Model, load_model, save_model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, Multiply, Reshape, Flatten, Concatenate
from keras.optimizers import Adam
from keras.regularizers import l2

import warnings
warnings.filterwarnings('ignore')

# constants
RAW_DATA_PATH = Path('raw_data/')
DATAFRAME_PATH = Path('dataframes/')
MODEL_PATH = Path('model/ncf_model.h5')
TOTAL_TRACKS = 50
NUM_WITHHELD = 25
N_NEIGHBORS = 25
SELECTED_TRACK_FEATURES = ['danceability', 'energy', 'key', 'loudness',
                     'speechiness', 'acousticness', 'instrumentalness',
                     'liveness', 'valence', 'tempo']

In [2]:
# read tracks.h5 which contains tracks with features like danceability, loudness ,... etc
# set type of track_uri to category, to convert string to an int unique id
# rename track_uri to tid and sort values based on tid
tracks_features_df = pd.read_hdf(DATAFRAME_PATH / 'tracks.h5', 'tracks')
cat_type = CategoricalDtype(categories=tracks_features_df.track_uri, ordered=True)
tracks_features_df.track_uri = tracks_features_df.track_uri.astype(cat_type)
tracks_features_df = tracks_features_df.rename(columns={'track_uri':'tid'})
tracks_features_df = tracks_features_df[['tid',*SELECTED_TRACK_FEATURES]].sort_values('tid').reset_index(drop=True)
tracks_features_df.head(5)

Unnamed: 0,tid,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,spotify:track:3uvsVUrAaGQJCTEUR1S3Sx,0.523,0.422,3,-10.385,0.027,0.783,0.00514,0.0898,0.145,129.876
1,spotify:track:0heE5tAAaDQmnGhVDImPl2,0.493,0.594,8,-4.404,0.0378,0.256,0.0,0.0759,0.304,123.751
2,spotify:track:3omXshBamrREltcf24gYDC,0.468,0.692,2,-4.015,0.0295,0.0202,2e-06,0.523,0.561,78.009
3,spotify:track:6TYWE19e35N7Bn5heHwyY6,0.553,0.564,9,-7.072,0.0418,0.039,0.0,0.318,0.31,149.953
4,spotify:track:1xznGGDReH1oQq0xzbwXa3,0.791,0.619,1,-5.886,0.0532,0.00784,0.00423,0.351,0.371,103.989


In [3]:
# read 10000 playlists from the official spoitfy 1M playlists
# for each playlist, get the tracks and their positions
def make_playlist_dfs(path):
    playlists = []
    playlist_tracks = []
    for file in tqdm(os.listdir(path)):
        if not file.endswith(".json"):
            continue
        with open(path/file) as f:
            js_slice = json.load(f)
            for playlist in js_slice['playlists']:
                if playlist['num_tracks'] > TOTAL_TRACKS:
                    sorted_tracks = sorted(playlist['tracks'], key=itemgetter('pos')) 
                    for track in sorted_tracks[:TOTAL_TRACKS]:
                        yield track['track_uri'], playlist['pid'], track['pos'], track['artist_name'], track['track_name'],playlist['name']
def get_tracks_info(path):
    track_info = []
    playlist_tracks = []
    for row in make_playlist_dfs(path):
        track_info.append([*row[:2],*row[3:]])
        playlist_tracks.append(row[:3])
    p_tracks_df = pd.DataFrame(playlist_tracks,columns = ['tid','pid','pos'])
    p_tracks_df.tid = p_tracks_df.tid.astype(cat_type)
    t_info_df = pd.DataFrame(track_info,columns=['tid','pid','artist_name','track_name','playlist_name']).drop_duplicates(subset=['tid'])
    t_info_df.tid = t_info_df.tid.astype(cat_type)
    return p_tracks_df, t_info_df
        

playlist_tracks_df, tracks_info_df = get_tracks_info(RAW_DATA_PATH)
playlist_tracks_df

100%|██████████| 12/12 [00:02<00:00,  4.37it/s]


Unnamed: 0,tid,pid,pos
0,spotify:track:3uvsVUrAaGQJCTEUR1S3Sx,7000,0
1,spotify:track:0heE5tAAaDQmnGhVDImPl2,7000,1
2,spotify:track:3omXshBamrREltcf24gYDC,7000,2
3,spotify:track:6TYWE19e35N7Bn5heHwyY6,7000,3
4,spotify:track:1xznGGDReH1oQq0xzbwXa3,7000,4
...,...,...,...
245345,spotify:track:0WQiDwKJclirSYG9v5tayI,4998,45
245346,spotify:track:3n69hLUdIsSa1WlRmjMZlW,4998,46
245347,spotify:track:3ZMv9EzGoteNi5Qnx0KpEO,4998,47
245348,spotify:track:3AA8xNhDC0MpqwkGX3EP5V,4998,48


In [4]:
all_playlist_ids = playlist_tracks_df.pid.unique()
train_pids, test_pids = train_test_split(all_playlist_ids,random_state=0, test_size=0.20)
print(f'total playlists: {len(all_playlist_ids)}')
print(f'train playlists: {len(train_pids)}')
print(f'test playlists: {len(test_pids)}')
# 1. Get tracks that are only from the training playlists
# 2. Get the track features(danceability,loudness) for each of these training tracks
# 3. Make a test set that only includes tracks from the training set
train_playlist_tracks_df = playlist_tracks_df.query('pid in @train_pids')

train_tracks_features_df = tracks_features_df.query('tid in @train_playlist_tracks_df.tid')
test_playlist_tracks_df = playlist_tracks_df.query('pid in @test_pids and tid in @train_playlist_tracks_df.tid')
print()
print(f'total tracks in train playlists: {len(train_playlist_tracks_df)}')
print(f'unique tracks in train playlists: {len(train_tracks_features_df)}')


# 1. Get the first NUM_WITHHELD tracks for each playlist in test
# 2. Get the track features for these with held tracks
# 3. Compute mean features by grouping the tracks from incomplete playlists
test_playlist_tracks_incomplete_df = test_playlist_tracks_df.groupby('pid').head(NUM_WITHHELD)
test_tracks_incomplete_features_df = test_playlist_tracks_incomplete_df.merge(tracks_features_df,how='inner',on='tid')
test_playlist_incomplete_features = test_tracks_incomplete_features_df[['pid',*SELECTED_TRACK_FEATURES]].groupby('pid',as_index=False).mean()
print()
print(f'total tracks in test playlists: {len(test_playlist_tracks_df)}')
print(f'total tracks in incomplete test playlists: {len(test_playlist_tracks_incomplete_df)}')
print(f'total incomplete test playlists: {len(test_playlist_incomplete_features)}')

total playlists: 4907
train playlists: 3925
test playlists: 982

total tracks in train playlists: 196250
unique tracks in train playlists: 71782

total tracks in test playlists: 36534
total tracks in incomplete test playlists: 22919
total incomplete test playlists: 975


In [5]:
tracks_features_df

Unnamed: 0,tid,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,spotify:track:3uvsVUrAaGQJCTEUR1S3Sx,0.523,0.422,3,-10.385,0.0270,0.78300,0.005140,0.0898,0.1450,129.876
1,spotify:track:0heE5tAAaDQmnGhVDImPl2,0.493,0.594,8,-4.404,0.0378,0.25600,0.000000,0.0759,0.3040,123.751
2,spotify:track:3omXshBamrREltcf24gYDC,0.468,0.692,2,-4.015,0.0295,0.02020,0.000002,0.5230,0.5610,78.009
3,spotify:track:6TYWE19e35N7Bn5heHwyY6,0.553,0.564,9,-7.072,0.0418,0.03900,0.000000,0.3180,0.3100,149.953
4,spotify:track:1xznGGDReH1oQq0xzbwXa3,0.791,0.619,1,-5.886,0.0532,0.00784,0.004230,0.3510,0.3710,103.989
...,...,...,...,...,...,...,...,...,...,...,...
170084,spotify:track:62FaMvOq3vLJTBjb5RgxiO,0.327,0.115,2,-13.907,0.0328,0.97800,0.000020,0.1180,0.0874,95.697
170085,spotify:track:7rWF4voHsXhZ3lx1BgtIjq,0.298,0.164,2,-14.639,0.0382,0.94700,0.306000,0.1390,0.1010,79.968
170086,spotify:track:5YUeOSawPzseEvCvi1HbrE,0.112,0.157,7,-15.906,0.0337,0.72300,0.900000,0.2120,0.0864,92.197
170087,spotify:track:4aAmDp1AR1Fp1UCpWuvy2d,0.169,0.209,4,-14.995,0.0362,0.42000,0.051900,0.0985,0.1020,78.355


In [6]:
# KNN model that will find 25 nearest neighbors to the current playlist
knn_clf = sklearn.neighbors.NearestNeighbors(n_neighbors=NUM_WITHHELD)
knn_clf.fit(train_tracks_features_df[SELECTED_TRACK_FEATURES])
distances, indices = knn_clf.kneighbors(test_playlist_incomplete_features[SELECTED_TRACK_FEATURES])

In [7]:
# for each test playlist, get the 25 next nearest predicted tracks and add them to a table for evaluation
def get_predicted_playlist_tracks():
    for index, row in test_playlist_incomplete_features.iterrows():
            predicted_tracks = train_tracks_features_df['tid'].iloc[indices[index]]
            for pos, predicted_track in enumerate(predicted_tracks):
                yield predicted_track, int(row['pid']),pos

test_predicted_playlist_tracks_df = pd.DataFrame(get_predicted_playlist_tracks(), columns =['tid', 'pid', 'pos'])
test_predicted_playlist_tracks_df.tid = test_predicted_playlist_tracks_df.tid.astype(cat_type)

In [8]:
# coode to get 
one_pid = test_playlist_incomplete_features.sample(1)
one_pid

Unnamed: 0,pid,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
390,3930,0.755737,0.734053,6.631579,-4.476789,0.078037,0.405316,5.1e-05,0.210658,0.937421,125.154158


In [9]:
tracks_in_one_playlist_df = test_predicted_playlist_tracks_df.query('pid in @one_pid.pid')
tracks_info_in_one_playlist_df = tracks_info_df.query('tid in @tracks_in_one_playlist_df.tid')
print('recommendations for ',tracks_info_df.query('pid in @one_pid.pid')['playlist_name'].head(1).values)
tracks_info_in_one_playlist_df = tracks_info_in_one_playlist_df[['artist_name','track_name']]
tracks_info_in_one_playlist_df['distances'] = distances[one_pid.index].reshape(-1,1)
tracks_info_in_one_playlist_df

recommendations for  ['Corridos']


Unnamed: 0,artist_name,track_name,distances
434,Avicii,Hey Brother,0.560394
2133,Haley Reinhart,Free,0.64041
3335,Daddy Yankee,Limbo,0.668841
3777,Atlas Genius,If So,0.669923
5949,Flo Rida,Right Round - feat. Ke$ha,0.673171
7079,David Guetta,Would I Lie To You - Club Mix,0.684014
14361,Cash Cash,Take Me Home (feat. Bebe Rexha) - Acoustic,0.711207
15981,Swedish House Mafia,Miami 2 Ibiza,0.737179
16697,Tujamo,Dr. Who! - UK Radio Edit,0.790602
17491,Pitbull,We Are One (Ole Ola) [The Official 2014 FIFA W...,0.794522


In [10]:
class Evaluator:
    def __init__(self,predicted_playlist_tracks,true_playlist_tracks):
        self.predicted_playlist_tracks = predicted_playlist_tracks
        self.true_playlist_tracks = true_playlist_tracks    

    def evaluate(self):
        predicted_playlist_tracks = self.predicted_playlist_tracks
        true_playlist_tracks = self.true_playlist_tracks
        
        r_precision_list = []
        ndcg_list = []
        song_clicks_list = []
        
        pid_list = true_playlist_tracks.pid.unique()
        
        def get_metrics():
            for pid in tqdm(pid_list):
                predictions = predicted_playlist_tracks.query('pid in @pid_list').tid
                truth = true_playlist_tracks.query('pid in @pid_list').tid
                yield (pid , self.r_precision(predictions,truth),self.ndcg(predictions,truth),self.song_clicks(predictions,truth))
        metrics = pd.DataFrame(get_metrics(),columns=['pid','r_precision','ndcg','songs_click'])
        
        return metrics[['r_precision','ndcg','songs_click']].mean()
         
    def r_precision(self,predictions,truth,n_tracks = N_NEIGHBORS):
        truth_set = set(truth)
        prediction_set = set(predictions[:n_tracks])
        intersect = prediction_set.intersection(truth_set)
        return float(len(intersect)) / len(truth_set)
            
    def ndcg(self,predictions,truth,n_tracks = N_NEIGHBORS):
        predictions = list(predictions[:n_tracks])
        truth = list(truth)   
        score = [float(element in truth) for element in predictions]    
        dcg  = np.sum(score / np.log2(1 + np.arange(1, len(score) + 1)))     
        ones = np.ones([1,len(truth)])
        idcg = np.sum(ones / np.log2(1 + np.arange(1, len(truth) + 1)))
        return (dcg / idcg)
    
    def song_clicks(self,predictions,truth,n_tracks = N_NEIGHBORS):
        predictions = predictions[:n_tracks]
        i = set(predictions).intersection(set(truth))
        for index, t in enumerate(predictions):
            for track in i:
                if t == track:
                    return float(int(index / 10))              
        return float(n_tracks / 10.0 + 1)

In [11]:
model_eval = Evaluator(test_predicted_playlist_tracks_df,test_playlist_tracks_df)
model_eval.evaluate()

100%|██████████| 975/975 [00:35<00:00, 27.21it/s]


r_precision    0.000389
ndcg           0.000985
songs_click    0.000000
dtype: float64

## Neural Network (NeuMF)

In [14]:
from scipy.sparse import dok_matrix

In [15]:
total_playlist_tracks_df = train_playlist_tracks_df.append(test_playlist_tracks_incomplete_df)
unique_tracks = total_playlist_tracks_df['tid'].unique()
total_cat_type = CategoricalDtype(categories=unique_tracks, ordered=True)
total_playlist_tracks_df['tid'] = total_playlist_tracks_df.tid.astype(total_cat_type)
total_playlist_tracks_df['cat_codes'] = total_playlist_tracks_df['tid'].cat.codes

dok_mat_n_rows = total_playlist_tracks_df.shape[0]
dok_mat_n_cols = len(unique_tracks)

dok_mat_rows = total_playlist_tracks_df['pid']
dok_mat_cols = total_playlist_tracks_df['cat_codes']

# Make a dictionary key sparse matrix
dok_mat = dok_matrix((dok_mat_n_rows, dok_mat_n_cols))

# TODO vectorize later if possible
for (pid, cat_code) in tqdm(zip(dok_mat_rows, dok_mat_cols)):
    dok_mat[pid, cat_code] = 1.0

219169it [00:02, 82154.01it/s]


In [None]:
# Hyperparameters
embedding_dim = 8
num_epochs = 2
learning_rate = 0.001
batch_size = 200
rand_negative_fill_in = 4
layer_sizes = [64, 32, 16, 8]

In [None]:
def model_factory(n_playlists, n_tracks):
    pl_input = Input(shape=(1,), dtype='int32', name='pl_input')
    tr_input = Input(shape=(1,), dtype='int32', name='tr_input')
    
    pl_mf_embedding = Embedding(input_dim=n_playlists, output_dim=embedding_dim,
                        name='pl_mf_embedding',
                        embeddings_initializer='RandomNormal',
                        embeddings_regularizer=l2(0), input_length=1)
    tr_mf_embedding = Embedding(input_dim=n_tracks, output_dim=embedding_dim,
                        name='tr_mf_embedding',
                        embeddings_initializer='RandomNormal',
                        embeddings_regularizer=l2(0), input_length=1)
    pl_mlp_embedding = Embedding(input_dim=n_playlists, output_dim=int(layer_sizes[0]/2),
                         name='pl_mlp_embedding',
                         embeddings_initializer='RandomNormal',
                         embeddings_regularizer=l2(0), 
                         input_length=1)
    tr_mlp_embedding = Embedding(input_dim=n_tracks, output_dim=int(layer_sizes[0]/2),
                         name='tr_mlp_embedding',
                         embeddings_initializer='RandomNormal',
                         embeddings_regularizer=l2(0), 
                         input_length=1)
    
    pl_mf_hidden = Flatten()(pl_mf_embedding(pl_input))
    tr_mf_hidden = Flatten()(tr_mf_embedding(tr_input))
    
    concat_mf_hidden = Multiply()([pl_mf_hidden, tr_mf_hidden])
    
    pl_mlp_hidden = Flatten()(pl_mlp_embedding(pl_input))
    tr_mlp_hidden = Flatten()(tr_mlp_embedding(tr_input))
    
    concat_mlp_hidden = Concatenate()([pl_mlp_hidden, tr_mlp_hidden])
    
    mlp = concat_mlp_hidden
    
    for i in range(1,len(layer_sizes)):
        layer = Dense(layer_sizes[i],
                      activity_regularizer=l2(0),
                      activation='relu',
                      name='layer%d' % i)
        mlp = layer(mlp)

    concat_final = Concatenate()([concat_mf_hidden, mlp])
    final_dense = Dense(1, activation='sigmoid', 
                   kernel_initializer='lecun_uniform',name='result')

    model = Model(inputs=[pl_input,tr_input], outputs=final_dense(concat_final))

    return model

In [None]:
def make_train_data():
    pl_input, tr_input, recs = [], [], []
    for (pl, tr) in dok_mat.keys():
        pl_input.append(pl)
        tr_input.append(tr)
        recs.append(1.0)
        for t in range(rand_negative_fill_in):
            rand_num = np.random.randint(dok_mat.shape[1])
            while(pl, rand_num) in dok_mat.keys():
                rand_num = np.random.randint(dok_mat.shape[1])
            pl_input.append(pl)
            tr_input.append(rand_num)
            recs.append(1.0)
    return pl_input, tr_input, recs

model = model_factory(dok_mat.shape[0], dok_mat.shape[1])
model.compile(optimizer=Adam(lr=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
playlist_input, track_inputs, recs = make_train_data()
# training
hist = model.fit([np.array(playlist_input), np.array(track_inputs)], np.array(recs), 
                 batch_size=batch_size, epochs=num_epochs, verbose=1, shuffle=True)
model.save(MODEL_PATH, overwrite=True)

In [17]:
model = load_model(MODEL_PATH) #🥵👌 

## Make recommendations using the NeuMF model

In [18]:
from sklearn.cluster import KMeans

playlist_for_test = np.random.choice(test_playlist_tracks_df['pid'].tolist())
ground_truth = test_playlist_tracks_df[test_playlist_tracks_df['pid'] == playlist_for_test]

pl_mlp_embedding_weights = (next(iter(filter(lambda x: x.name == 'pl_mlp_embedding', model.layers))).get_weights())[0]
playlist_embedding_weight_matrix = pl_mlp_embedding_weights
chosen_playlist_vector = playlist_embedding_weight_matrix[playlist_for_test,:]

print("Fitting a KMeans model with 100 clusters to the embedding weight matrix for playlists")
km_model = KMeans(n_clusters=100, random_state=0, verbose=0).fit(playlist_embedding_weight_matrix)

chosen_playlist_vector = np.reshape(chosen_playlist_vector, (1,32))
playlist_predictor = km_model.predict(chosen_playlist_vector)
playlist_labels = km_model.labels_

similar_playlists = []
for pid, playlist_label in enumerate(playlist_labels):
    if playlist_label == playlist_predictor:
        similar_playlists.append(pid)
print(f'other playlists in cluster: {len(similar_playlists)}')

Fitting a KMeans model with 100 clusters to the embedding weight matrix for playlists
other playlists in cluster: 2678


In [19]:
from collections import OrderedDict
tracks = []
for pid in similar_playlists:
    tracks += list(total_playlist_tracks_df[total_playlist_tracks_df['pid'] == pid]['cat_codes'])
print(f'other tracks from similar_playlists in cluster: {len(tracks)}') 

tracks = list(OrderedDict.fromkeys(tracks))

pids_pred = np.full(len(tracks), playlist_for_test, dtype='int32')
tracks_pred = np.array(tracks, dtype='int32')

results = np.ravel(model.predict([pids_pred, tracks_pred], batch_size=100, verbose=0))
print("Retrieved predictions from trained model...")

other tracks from similar_playlists in cluster: 104026
Retrieved predictions from trained model...


In [20]:
unique_playlist_tracks_df = total_playlist_tracks_df.drop_duplicates(subset=['cat_codes'])
unique_playlist_tracks_df = unique_playlist_tracks_df.sort_values(by=['cat_codes'], ascending=False)

unique_tids_by_cat_codes = np.array(unique_playlist_tracks_df['tid'])
unique_result_merged = unique_playlist_tracks_df.merge(tracks_info_df, on='tid')
other_features_by_cat_codes = np.array([(row['track_name'],row['artist_name']) for i, row in unique_result_merged.iterrows()])

def get_results_df():
    for i, probability in enumerate(tqdm(results)):
        tid = unique_tids_by_cat_codes[i]
        other_features = other_features_by_cat_codes[i]
        yield probability, tid, *other_features

results_df = pd.DataFrame(get_results_df(), index=range(len(results)), columns=['probability', 'tid','track_name', 'artist_name'])
results_df = results_df.sort_values(by=['probability'], ascending=False)

nmf_preds = results_df.head(NUM_WITHHELD)
nmf_preds

100%|██████████| 44085/44085 [00:00<00:00, 318971.00it/s]


Unnamed: 0,probability,tid,track_name,artist_name
0,1.0,spotify:track:32j6GRjz9o5GdcTfrcGTIA,Freek'n You - Radio Edit,Jodeci
29377,1.0,spotify:track:1mOYGMxuUDZjKhJVJyweIg,Uber Everywhere,Trill Sammy
29369,1.0,spotify:track:31ZKfEHJ7ccpxu8tNaFNmL,Hallelujah,MacKenzie Bourg
29370,1.0,spotify:track:5s5rL8jO1ylUXKtqDyt8L4,Rain Clouds,The Arcadian Wild
29371,1.0,spotify:track:4Fr7GdYRNHWBIG0uDAdTvp,Oceans,Gardiner Sisters
29372,1.0,spotify:track:4Ej5jSHuUcfUNselqiV1qV,We Could Be Anything (Black & White),Jay Ollero
29373,1.0,spotify:track:3rRzWgGuywjiXtDoA0OwWK,Be Still,Canyon City
29374,1.0,spotify:track:7tdNx5FzKE06nwIp3YYJsP,Two Of Us,Mike Edel
29375,1.0,spotify:track:6ey6o0CyYoRbU71ChOF6zi,Famous Flower of Manhattan,The Avett Brothers
29376,1.0,spotify:track:0jJAEoWFh5TGSqbQpQZFVt,Effort,James Spaite


In [21]:
def predictions_nmf():
    for idx,row in nmf_preds.iterrows():
        yield (row['tid'], playlist_for_test, idx)

predictions_nmf_df = pd.DataFrame(predictions_nmf(), columns=['tid','pid', 'pos'])
nmf_model_eval = Evaluator(predictions_nmf_df, ground_truth.iloc[25:])
nmf_model_eval.evaluate()

100%|██████████| 1/1 [00:00<00:00, 176.02it/s]


r_precision    0.0
ndcg           0.0
songs_click    3.5
dtype: float64

In [22]:
test_playlist_tracks_incomplete_df.query('pid == @playlist_for_test').merge(tracks_info_df, on='tid')

Unnamed: 0,tid,pid_x,pos,pid_y,artist_name,track_name,playlist_name
0,spotify:track:4WRRCuJYRq7gUKiupxVH9W,456,0,8230,Chris Stapleton,Nobody To Blame,❤️
1,spotify:track:3qtZwBkPpK2Q8fcYRHm4AX,456,1,7011,Frankie Ballard,It All Started with a Beer,country
2,spotify:track:37C5IX9aM3qoPHeuuU7ByA,456,2,8358,Jordan Gray,Stay the Night,Yes
3,spotify:track:3y1t2sEahs8idFz2tiYNPO,456,3,7010,Cole Swindell,Let Me See Ya Girl,Country
4,spotify:track:4pdoeoOQSu6DNznlfNc5FP,456,4,3524,Tim McGraw,Top Of The World,❤️
5,spotify:track:17Vry8h1fUTYg8NL6pHhem,456,6,7010,Chris Lane,Fix,Country
6,spotify:track:751wKDM4y5zurzhSxcUxqP,456,7,7010,Chase Bryant,Little Bit of You,Country
7,spotify:track:1jNNHFZmRGXZFHlil5uhei,456,9,7010,Brett Eldredge,Drunk On Your Love,Country
8,spotify:track:5kNe7PE09d6Kvw5pAsx23n,456,10,7010,Thomas Rhett,Die A Happy Man,Country
9,spotify:track:0kdiUZDcLIFw0Buo9r0Q4z,456,11,7371,LANCO,Greatest Love Story,country


In [23]:
ground_truth.merge(tracks_info_df, on='tid')

Unnamed: 0,tid,pid_x,pos,pid_y,artist_name,track_name,playlist_name
0,spotify:track:4WRRCuJYRq7gUKiupxVH9W,456,0,8230,Chris Stapleton,Nobody To Blame,❤️
1,spotify:track:3qtZwBkPpK2Q8fcYRHm4AX,456,1,7011,Frankie Ballard,It All Started with a Beer,country
2,spotify:track:37C5IX9aM3qoPHeuuU7ByA,456,2,8358,Jordan Gray,Stay the Night,Yes
3,spotify:track:3y1t2sEahs8idFz2tiYNPO,456,3,7010,Cole Swindell,Let Me See Ya Girl,Country
4,spotify:track:4pdoeoOQSu6DNznlfNc5FP,456,4,3524,Tim McGraw,Top Of The World,❤️
5,spotify:track:17Vry8h1fUTYg8NL6pHhem,456,6,7010,Chris Lane,Fix,Country
6,spotify:track:751wKDM4y5zurzhSxcUxqP,456,7,7010,Chase Bryant,Little Bit of You,Country
7,spotify:track:1jNNHFZmRGXZFHlil5uhei,456,9,7010,Brett Eldredge,Drunk On Your Love,Country
8,spotify:track:5kNe7PE09d6Kvw5pAsx23n,456,10,7010,Thomas Rhett,Die A Happy Man,Country
9,spotify:track:0kdiUZDcLIFw0Buo9r0Q4z,456,11,7371,LANCO,Greatest Love Story,country
