In [1]:
import import_ipynb

In [2]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
import scipy.sparse as sps

from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from category_encoders import TargetEncoder
from time import time

#own functions
from evaluation import DCG
from evaluation import nDCG
from evaluation import R_Precision

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

%matplotlib inline

importing Jupyter notebook from evaluation.ipynb
DCG = 0.5
IDCG = 1.0
nDCG = 0.5


# Define functions to look up similar items

In [3]:
def similar_items(track_uri, top_n, similarity_matrix):
    """
    Return the tracks most similar to a seed track, without the seed itself.

    Parameters
    ----------
    track_uri : uri of the seed track; must be a key of the notebook-level
        dict ``D_track_uri_to_id``.
    top_n : maximum number of neighbours to return.
    similarity_matrix : 2-D feature matrix whose row ``i`` belongs to the
        track ``D_track_id_to_uri[i]``.

    Returns
    -------
    dict mapping track uri -> cosine similarity to the seed, filled in order
    of decreasing similarity.

    Raises
    ------
    KeyError if ``track_uri`` is not in ``D_track_uri_to_id``.
    """
    idx = D_track_uri_to_id[track_uri]
    # One row of similarities: the seed against every row of the matrix.
    scores = cosine_similarity(similarity_matrix[idx:idx+1,:], similarity_matrix)[0]
    ranked = np.argsort(scores)[::-1]
    # Drop the seed by its row index instead of assuming it sorts first:
    # rows that tie with the seed may be ranked ahead of it, in which case
    # skipping rank 0 would discard a real neighbour and return the seed.
    ranked = ranked[ranked != idx][:top_n]
    return {D_track_id_to_uri[el]: scores[el] for el in ranked}

In [4]:
def similar_items_with_description(track_uri, top_n, similarity_matrix):
    """
    Describe the seed track and its nearest neighbours.

    Ranks every row of ``similarity_matrix`` by cosine similarity to the row
    of ``track_uri`` and returns the ``D_desc`` entries of the first
    ``top_n + 1`` rows (the seed is expected to be the first one). Prints a
    banner and the elapsed time.
    """
    print('CB audio features - first tracks returned is the seed track')
    started = time()
    seed_row = D_track_uri_to_id[track_uri]
    scores = cosine_similarity(similarity_matrix[seed_row:seed_row+1,:], similarity_matrix)
    # Row indices ordered from most to least similar.
    best_first = np.fliplr(scores.argsort())[0]
    described = {}
    for row in best_first[:top_n+1]:
        uri = D_track_id_to_uri[row]
        described[uri] = D_desc[uri]
    print(f'{time()-started:0.2f}sec')
    return list(described.values())

In [5]:
def similar_items_with_description_external(track_uri, top_n):
    """
    Describe the seed track and its nearest neighbours using the
    notebook-level feature matrix ``X_transformed``.

    Same output and printed messages as ``similar_items_with_description``;
    the only difference is that the matrix is not passed in but read from
    the global ``X_transformed`` at call time.
    """
    return similar_items_with_description(track_uri, top_n, X_transformed)

In [6]:
import operator

def predict_and_evaluate_top_n(pid,top_n):
    """
    Recommend up to ``top_n`` tracks for playlist ``pid`` and score them
    against the playlist's hold-out tracks.

    Every training track of the playlist is used as a seed for
    ``similar_items``; the candidates of all seeds are merged (keeping each
    candidate's best similarity) and ranked by similarity.

    return
    (1) L_pred : list of predicted track uris, best first
    (2) ground_truth : track uris in the hold_out (column 2 of ev_set_arr)
    (3) R_Prec : R_Precision of the first len(ground_truth) predictions
    (4) NDCG : element [1] of nDCG(res)
    (5) res : 0/1 flag per prediction, 1 if the uri is in ground_truth

    Raises ValueError when the playlist has no training track with audio
    features (nothing to seed the recommendation with).
    """
    train_array_track_ids = track_id_array[M[pid].toarray()[0].astype(bool)]

    # The columns of M are numbered with D_tracks (ids built from the training
    # data), not with the row index of the audio-feature table, so the ids
    # must be translated back with D_tracks_reverse. Seeds that have no row
    # in the audio-feature table cannot be looked up and are skipped.
    seed_uris = [D_tracks_reverse[el] for el in train_array_track_ids]
    seed_uris = [uri for uri in seed_uris if uri in D_track_uri_to_id]
    if not seed_uris:
        raise ValueError(f'playlist {pid} has no training track with audio features')

    # Ask each seed for twice its share of top_n, but for at least one track:
    # without the floor a playlist with more seeds than top_n got 0 per seed
    # and therefore no predictions at all.
    topn_n_by_track = max(1, int(top_n/len(seed_uris))*2)

    D_pred={}
    for seed_uri in seed_uris:
        for uri, score in similar_items(seed_uri,topn_n_by_track,X_transformed).items():
            # A candidate proposed by several seeds keeps its highest score
            # instead of whichever seed happened to come last.
            if score > D_pred.get(uri, float('-inf')):
                D_pred[uri] = score

    L_pred = sorted(D_pred, key=D_pred.get, reverse=True)[:top_n]
    ground_truth = ev_set_arr[ev_set_arr[:,0]==pid][:,2]
    R_Prec = R_Precision(L_pred[:len(ground_truth)],ground_truth)
    # Set lookup instead of rebuilding a list for every prediction.
    ground_truth_set = set(ground_truth)
    res = [int(el in ground_truth_set) for el in L_pred]
    NDCG = nDCG(res)[1]
    return L_pred, ground_truth, R_Prec, NDCG, res

# Load data

In [7]:
# Track uri -> description list; the sample output below shows
# [track name, artist name, album name].
# JSON is read explicitly as UTF-8 so that loading does not depend on the
# platform's default encoding.
with open('../data-processed/full-data/track_descriptions.json', encoding='utf-8') as json_file:
    D_desc = json.load(json_file)

D_desc['spotify:track:0UaMYEvWZi0ZqiDOoHU3YI']

['Lose Control (feat. Ciara & Fat Man Scoop)', 'Missy Elliott', 'The Cookbook']

In [8]:
# Track uri -> [artist uri, album uri] (see the sample output below).
# JSON is read explicitly as UTF-8 so that loading does not depend on the
# platform's default encoding.
with open('../data-processed/full-data/track_artist_album.json', encoding='utf-8') as json_file:
    D_album_artist = json.load(json_file)

D_album_artist['spotify:track:0UaMYEvWZi0ZqiDOoHU3YI']

['spotify:artist:2wIVse2owClT7go1WT98tk',
 'spotify:album:6vV5UrXcfyQD1wu4Qo2I9K']

In [9]:
# Audio-feature table; each row carries a track `uri` plus its audio features.
path = '../data-processed/full-data/audio-features-combined.csv'
data = pd.read_csv(path)

In [10]:
# Give every track an integer id equal to its row position. Guarded so that
# re-running this cell does not insert a second id column and shift the
# positions of the feature columns.
if 'track_id' not in data.columns:
    data = data.reset_index().rename(columns = {'index':'track_id'})

# Look up artist and album per uri. Series.map handles one value per row,
# whereas DataFrame.apply(axis=1) builds a full row object for each of the
# rows; a uri missing from D_album_artist still raises KeyError.
data['artist_uri'] = data['uri'].map(lambda uri: D_album_artist[uri][0])
data['album_uri'] = data['uri'].map(lambda uri: D_album_artist[uri][1])

In [11]:
# data.head()

In [12]:
# Lookup tables between a track uri and its row id in `data`.
# min() only matters if a key occurs more than once; it then keeps the
# smallest value for that key.
D_track_uri_to_id = data.groupby('uri')['track_id'].min().to_dict()
D_track_id_to_uri = data.groupby('track_id')['uri'].min().to_dict()

In [13]:
# Sanity check: equal sizes mean there are as many distinct uris as ids.
len(D_track_uri_to_id), len(D_track_id_to_uri)

(2262190, 2262190)

In [14]:
# Preview the table including the added track_id, artist_uri and album_uri columns.
data.head()

Unnamed: 0,track_id,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,artist_uri,album_uri
0,0,spotify:track:5OSaZxhNj307YpTh7Qp8Xy,0.716,0.421,7,-11.56,0,0.0275,0.649,0.00207,0.0936,0.207,99.078,378440,4,spotify:artist:1vSHzGHsVOCrgPSCmKNimP,spotify:album:0oA1WetYmmrxkyjxz92yJg
1,1,spotify:track:4zytmsfZ7HtdXj3VLECcu2,0.141,0.441,5,-12.893,1,0.0892,0.7,0.687,0.128,0.0725,76.044,331000,4,spotify:artist:6gHYtzSvIIWvoUY2q2V3Rj,spotify:album:3c1gpGKfahKOFXtM7eIcFO
2,2,spotify:track:6lOWUS7iBVEw6ieJqxH17B,0.799,0.893,8,-5.496,1,0.0635,1.8e-05,0.914,0.362,0.529,128.009,469397,4,spotify:artist:7kxOVclB0zQamtBR0syCrg,spotify:album:0mX83KZvGWqBOvI4nIZ23H
3,3,spotify:track:3OSL6hJ9DoRGwr9OSEVrRS,0.789,0.548,4,-7.167,1,0.0418,0.707,0.000934,0.0616,0.593,117.713,153893,4,spotify:artist:2sxmKe3CUrWnx7eoXMhOlW,spotify:album:4hXCM8vqLJnlFcuHoH3zVP
4,4,spotify:track:6x0bgGOKckFsesVf7yPWJq,0.728,0.67,3,-7.912,0,0.102,0.157,0.000797,0.108,0.0647,119.963,196000,4,spotify:artist:6PyeXqjH8OMGnt1IOhWgrQ,spotify:album:1pjNAADvPDurRS42fqxN4k


# Plot data

In [15]:
# data.iloc[:,1:-1].hist(figsize=(14,9))
# uncomment
# plt.show()

In [16]:
# fig, ax = plt.subplots(ncols=4, nrows=3, figsize=(14,9))

# column = data.columns[2:14]

#uncomment
# for i in range(3):
#     for j in range(4):
#         sns.boxplot(data=data[column[i*4+j]], palette="Set1", ax=ax[i,j]).set_title(column[i*4+j])

# Standardize data 

In [17]:
# The 11 audio features, 'danceability' through 'tempo'. Selected by label
# (both ends inclusive) so the selection does not silently move if a column
# is added in front of them.
X = data.loc[:, 'danceability':'tempo']

In [18]:
# Column order of the feature matrix; selected by label (both ends
# inclusive) rather than by position so it stays correct if leading
# columns are added.
data.loc[:, 'danceability':'tempo'].columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo'],
      dtype='object')

In [19]:
# Standardize the selected feature columns with StandardScaler.
scaler = StandardScaler()
X_transformed = scaler.fit_transform(X)
# fit() returns the scaler itself, so `transformer` is the same fitted object.
transformer = scaler

In [20]:
# uncomment
# pd.DataFrame(X_transformed).hist(figsize=(14,9))
# plt.show()

# Target encode album_uri and artist_uri

In [21]:
# (number of tracks, number of standardized audio features)
X_transformed.shape

(2262190, 11)

In [22]:
# Standardized features as a DataFrame so that encoded columns can be added
# by name. Column labels are taken by label slice (both ends inclusive)
# instead of by position, which would shift if leading columns were added.
df_X_transformed = pd.DataFrame(X_transformed, columns=data.loc[:, 'danceability':'tempo'].columns)
# df_X_transformed.head()

In [23]:
# Same shape as the scaled array it was built from.
df_X_transformed.shape

(2262190, 11)

In [24]:
# Column labels carried over from the audio-feature table.
df_X_transformed.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo'],
      dtype='object')

# Encode album_uri

In [25]:
# Features used as targets when encoding album_uri (one encoded column each).
cols = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

In [26]:
# Attach the album uri to the scaled features (removed again before the
# final feature matrix is built).
df_X_transformed['album_uri'] = data.album_uri
# NOTE: `X` is reused here; from now on it is the (n, 1) array of album uris
# fed to the encoder, no longer the raw feature table.
X = data['album_uri'].values.reshape(-1,1)

In [27]:
# For every audio feature, turn the album uri into a number by target
# encoding it against that feature (category_encoders' TargetEncoder; see its
# documentation for the exact statistic and smoothing). Each pass adds one
# column named album_uri_<feature>_enc.
enc = TargetEncoder()
for col in cols:
    # Target: the standardized feature column, reshaped to (n, 1).
    y = df_X_transformed[col].values.reshape(-1,1)  
    # X holds the album uris; the encoder is re-fitted for every feature.
    df_X_transformed[f'album_uri_{col}_enc'] = enc.fit_transform(X, y)

In [28]:
# df_X_transformed.head()

# Encode artist_uri

In [29]:
# Features used as targets when encoding artist_uri (one encoded column each).
cols = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

In [30]:
# Attach the artist uri to the scaled features (removed again before the
# final feature matrix is built).
df_X_transformed['artist_uri'] = data.artist_uri
# NOTE: `X` is reused again; it is now the (n, 1) array of artist uris.
X = data['artist_uri'].values.reshape(-1,1)

In [31]:
# For every audio feature, turn the artist uri into a number by target
# encoding it against that feature (category_encoders' TargetEncoder; see its
# documentation for the exact statistic and smoothing). Each pass adds one
# column named artist_uri_<feature>_enc.
enc = TargetEncoder()
for col in cols:
    # Target: the standardized feature column, reshaped to (n, 1).
    y = df_X_transformed[col].values.reshape(-1,1) 
    # X holds the artist uris; the encoder is re-fitted for every feature.
    df_X_transformed[f'artist_uri_{col}_enc'] = enc.fit_transform(X, y)

In [32]:
# df_X_transformed.head()

In [33]:
# Final numeric feature matrix: everything except the two raw uri columns.
# Replaces the earlier, audio-features-only X_transformed.
X_transformed = df_X_transformed.drop(['artist_uri', 'album_uri'], axis='columns').to_numpy()

# Perform PCA - recommendations can be made either on all features or on the dimensionality-reduced features, by passing X_transformed or X_transformed_pca

In [34]:
# Columns: the scaled audio features plus one album and one artist encoding per feature.
X_transformed.shape

(2262190, 33)

In [35]:
# Let PCA pick the number of components itself (n_components='mle').
# fit() returns the estimator, so construction and fitting can be chained.
pca = PCA(n_components='mle').fit(X_transformed)

In [36]:
# print(pca.explained_variance_ratio_)

In [37]:
# Project the features onto the components of the 'mle' fit.
# NOTE: X_transformed_pca is assigned again after the 15-component fit
# further down, which replaces this result.
X_transformed_pca = pca.transform(X_transformed)

> Let us run PCA with 15 components

In [38]:
# Refit with a fixed number of 15 components and show how much variance
# each of them explains.
pca = PCA(n_components=15).fit(X_transformed)
print(pca.explained_variance_ratio_)

[0.31535805 0.11036058 0.09556642 0.0776244  0.056072   0.05212492
 0.04761578 0.04211778 0.02992701 0.02492938 0.02194849 0.01280654
 0.0125796  0.01206579 0.01069708]


In [39]:
# Share of the total variance retained by the fitted components.
print(f'Explained variance = {pca.explained_variance_ratio_.sum():0.2f}')

Explained variance = 0.92


In [40]:
# Project the features onto the 15 fitted components; pass this matrix instead
# of X_transformed to the lookup functions to recommend on the reduced space.
X_transformed_pca = pca.transform(X_transformed)

# Find similar tracks

<img src="tenor.gif">

In [41]:
# Example: the seed track followed by its 10 nearest tracks on the full feature matrix.
similar_items_with_description('spotify:track:3ZOEytgrvLwQaqXreDs2Jx',10,X_transformed)

CB audio features - first tracks returned is the seed track
0.95sec


[["Can't Stop", 'Red Hot Chili Peppers', 'By The Way'],
 ['Did I Let You Know', 'Red Hot Chili Peppers', "I'm With You"],
 ['Forget Me Now', 'Against The Current', 'In Our Bones'],
 ['Warlocks', 'Red Hot Chili Peppers', 'Stadium Arcadium'],
 ['Miracle', 'Ghost Beach', 'Blonde'],
 ['Halo', 'The Pass', 'Melt'],
 ['Children Of The Gun', 'Drowning Pool', 'Drowning Pool'],
 ['Vegas', 'Semi Precious Weapons', 'Aviation'],
 ["You Can't Put Your Arms Around What's Already Gone",
  'Roxette',
  'Have A Nice Day'],
 ['Lackluster', 'Caskitt', 'This Machine Kills Sadness'],
 ['Here With You', 'Sing It Loud', 'Everything Collide']]

In [42]:
# similar_items_with_description('spotify:track:3DK6m7It6Pw857FcQftMds',10,X_transformed)

In [43]:
# similar_items_with_description('spotify:track:3XplJgPz8VjbDzbGwGgZdq',15,X_transformed)

In [44]:
# similar_items_with_description('spotify:track:3jagJCUbdqhDSPuxP8cAqF',15,X_transformed)

In [45]:
# similar_items_with_description('spotify:track:79XrkTOfV1AqySNjVlygpW',15,X_transformed)

In [46]:
# similar_items_with_description('spotify:track:4X5f3vT8MRuXF68pfjNte5',15,X_transformed)

In [47]:
# similar_items_with_description('spotify:track:5fwP61Nmo5xXYIjO9XEWqb',15,X_transformed)

# Playlist recommendations

### Load 'train' data

In [48]:
# Training interactions; the columns used below are pid, track_uri and pos_rating.
file_path = '../data-processed/full-data/pid-track-pos-rating-train-data.csv'

In [49]:
# Load the training interactions into `data2`.
data2 = pd.read_csv(file_path)

In [50]:
# Number the training tracks 0..n-1 in order of first appearance. These ids
# index the columns of the playlist/track matrix built below; they are
# independent of the row ids of the audio-feature table.
tracks = list(data2.track_uri.unique())
D_tracks = {track_uri: track_id for track_id, track_uri in enumerate(tracks)}
D_tracks_reverse = {track_id: track_uri for track_uri, track_id in D_tracks.items()}

data2['track_id'] = data2.track_uri.map(D_tracks)

In [56]:
# Ground truth for evaluation: the held-out tracks of each evaluation playlist.
evaluation_set = pd.read_csv('../data-processed/full-data/evaluation-pids-ground-truth.csv')
# Tracks that never occur in the training data have no id and map to NaN.
evaluation_set['track_id'] = evaluation_set['track_uri'].map(D_tracks)
# Built as one chain of new frames, so no value is assigned into a filtered
# slice (which can trigger pandas' SettingWithCopyWarning).
ev_set = (
    evaluation_set
    .loc[evaluation_set['hold_out'] == 1, ['pid','track_id','track_uri','hold_out']]
    .dropna(subset=['track_id'])
    .astype({'track_id': int})
)
evaluation_pids = list(ev_set.pid.unique())
# Column order matters downstream: 0 = pid, 1 = track_id, 2 = track_uri, 3 = hold_out.
ev_set_arr = ev_set.to_numpy()

In [57]:
# Sparse playlist x track matrix: row = pid, column = track_id (D_tracks
# numbering), value = pos_rating.
# NOTE(review): repeated (pid, track_id) pairs are presumably summed by scipy
# when the matrix is built - confirm if the raw rating value matters.
M = sps.csr_matrix((data2.pos_rating, (data2.pid, data2.track_id)))

In [58]:
# All column indices of M; indexed with a boolean row mask to obtain the
# track ids present in a playlist.
track_id_array = np.arange(M.shape[1])

In [None]:
# Score the recommender on a random sample of evaluation playlists and write
# one row per playlist (R-Precision, nDCG) to ../evaluation/.
n=1000
sample_seed = 42  # fixed so that a re-run evaluates the same playlists
time0=time()
# random.sample raises ValueError when asked for more items than exist.
n = min(n, len(evaluation_pids))
# A private Random instance keeps the global random state untouched.
epid = random.Random(sample_seed).sample(evaluation_pids,n)
evaluation_result = {}
for i,pid in enumerate(epid):
    evaluation = predict_and_evaluate_top_n(pid,500)
    # Positions 2 and 3 of the returned tuple are R_Prec and NDCG.
    evaluation_result[pid] = [evaluation[2],evaluation[3]]
    if i%10==1:
        print(time() - time0)
# One row per pid; also works when no playlist was evaluated.
df = (
    pd.DataFrame.from_dict(evaluation_result, orient='index', columns=['R-Precision','nDCG'])
    .rename_axis('pid')
    .reset_index()
)
df['rating'] = 'na'
model = 'CB_audio_features'
df['model'] = model
df.to_csv(f'../evaluation/{model}_{n}.csv', index = False)

print(time() - time0)

In [None]:
# Summary statistics of the per-playlist metrics.
df.describe()