# IV- Mise en place d'un système de recommandation

## Imports

In [20]:
import pandas as pd
import numpy as np
import pickle 

from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Root of the data folder, relative to this notebook (keeps its trailing slash).
DATA_DIR = '../data/'
# Sub-folder that holds the preprocessed artefacts read below.
DATA_PROCESSED_DIR = f'{DATA_DIR}processed/'

## Partie 1 - Load data

In [4]:
def load_pkl(pkl_file):
    """Read one pickled object from ``pkl_file`` and return it.

    The file is opened in binary mode and closed again before returning.

    NOTE: unpickling can execute arbitrary code, so only call this on files
    produced by a trusted source.
    """
    with open(pkl_file, 'rb') as handle:
        return pickle.load(handle)

In [26]:
# --- Track metadata: keep only the two columns used for display ---
csv_file_metadata = DATA_PROCESSED_DIR + 'metadata.csv'
df_metadata = pd.read_csv(csv_file_metadata)[['track_name', 'artist_name']]

# --- Features X ---
# load_pkl unpickles the file: only point it at trusted files.
pkl_file_X = DATA_PROCESSED_DIR + 'data_X.pkl' # X
df_X = load_pkl(pkl_file_X)

# --- Target y (wrapped in a DataFrame so it can be concatenated column-wise) ---
pkl_file_y = DATA_PROCESSED_DIR + 'data_y.pkl' # y
df_y = pd.DataFrame(load_pkl(pkl_file_y))

# Features and target side by side; concat(axis=1) aligns the two on their index.
df_xy = pd.concat([df_X, df_y], axis=1)

# The index labels of df_xy are used as row *positions* into df_metadata, so this
# is only correct if those labels are the original row numbers of metadata.csv.
# NOTE(review): in the recorded outputs, artists that look like comedians and
# Mozart are labelled 'Rock', which suggests metadata and features may be
# misaligned -- verify how data_X.pkl / metadata.csv were indexed when written.
df = pd.concat([df_metadata.iloc[df_xy.index.values], df_xy], axis=1).reset_index(drop=True)
df

Unnamed: 0,track_name,artist_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,genre
0,Step Up,The Cheetah Girls,0.714286,0.074497,0.514427,0.036654,0.879877,0.000000,0.090909,0.073583,0.906392,1.0,0.043103,0.515119,0.75,0.336694,Electronic
1,Wide Open,The Chemical Brothers,0.744898,0.000728,0.791128,0.090401,0.703698,0.885312,0.000000,0.063765,0.773646,1.0,0.038739,0.467137,0.75,0.862903,Electronic
2,Flutes,Hot Chip,0.755102,0.019377,0.595177,0.063311,0.697692,0.000000,0.181818,0.323887,0.792554,1.0,0.120471,0.430578,0.75,0.479839,Electronic
3,Big Girls Cry - Odesza Remix,Sia,0.775510,0.042770,0.792205,0.048370,0.811808,0.000001,0.545455,0.090081,0.785479,0.0,0.016805,0.397972,0.75,0.868952,Electronic
4,Just Got Paid (feat. French Montana) - M-22 Remix,Sigala,0.724490,0.086043,0.551034,0.046592,0.763759,0.000003,0.090909,0.082186,0.883661,1.0,0.037866,0.563509,0.75,0.433468,Electronic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35534,Who Tweeted? (uncooked),Doug Benson,0.612245,0.124497,0.615633,0.042062,0.688682,0.000048,0.181818,0.147773,0.768197,1.0,0.004365,0.440658,0.75,0.900202,Rock
35535,What's in a Name,Comedian Bob Marley,0.571429,0.203814,0.599483,0.051806,0.401389,0.000000,0.363636,0.074899,0.681705,0.0,0.030663,0.261384,0.75,0.677419,Rock
35536,Smashed Pennies,Paul F. Tompkins,0.510204,0.033634,0.361542,0.051346,0.682676,0.000232,0.909091,0.298583,0.724362,1.0,0.096464,0.323079,0.75,0.363911,Rock
35537,"""JAY"" WALKING",Eddie Griffin,0.591837,0.138553,0.645780,0.020947,0.537528,0.000000,0.545455,0.045749,0.814574,0.0,0.008839,0.476109,0.75,0.894153,Rock


## Partie 2 - Système de recommandation item-based

In [19]:
# Keep only the model inputs: drop the two descriptive columns by name rather
# than by position, so a change in column order cannot silently drop a feature.
df_features = df.drop(columns=['track_name', 'artist_name'])

# Get one hot encoding of column 'genre'.
# dtype is pinned so the dummies are 0/1 integers on every pandas version
# (pandas >= 2.0 returns booleans by default, which would turn the mixed
# float/bool frame into an object array when converted with to_numpy()).
one_hot = pd.get_dummies(df_features['genre'], dtype='uint8')

# Drop column 'genre' as it is now encoded
df_features = df_features.drop('genre', axis = 1)

# Join the encoded df (same index, so rows stay aligned)
df_features = df_features.join(one_hot)
df_features

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,Classical,Electronic,Hip-Hop,Jazz,Rock
0,0.714286,0.074497,0.514427,0.036654,0.879877,0.000000,0.090909,0.073583,0.906392,1.0,0.043103,0.515119,0.75,0.336694,0,1,0,0,0
1,0.744898,0.000728,0.791128,0.090401,0.703698,0.885312,0.000000,0.063765,0.773646,1.0,0.038739,0.467137,0.75,0.862903,0,1,0,0,0
2,0.755102,0.019377,0.595177,0.063311,0.697692,0.000000,0.181818,0.323887,0.792554,1.0,0.120471,0.430578,0.75,0.479839,0,1,0,0,0
3,0.775510,0.042770,0.792205,0.048370,0.811808,0.000001,0.545455,0.090081,0.785479,0.0,0.016805,0.397972,0.75,0.868952,0,1,0,0,0
4,0.724490,0.086043,0.551034,0.046592,0.763759,0.000003,0.090909,0.082186,0.883661,1.0,0.037866,0.563509,0.75,0.433468,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35534,0.612245,0.124497,0.615633,0.042062,0.688682,0.000048,0.181818,0.147773,0.768197,1.0,0.004365,0.440658,0.75,0.900202,0,0,0,0,1
35535,0.571429,0.203814,0.599483,0.051806,0.401389,0.000000,0.363636,0.074899,0.681705,0.0,0.030663,0.261384,0.75,0.677419,0,0,0,0,1
35536,0.510204,0.033634,0.361542,0.051346,0.682676,0.000232,0.909091,0.298583,0.724362,1.0,0.096464,0.323079,0.75,0.363911,0,0,0,0,1
35537,0.591837,0.138553,0.645780,0.020947,0.537528,0.000000,0.545455,0.045749,0.814574,0.0,0.008839,0.476109,0.75,0.894153,0,0,0,0,1


In [24]:
# Every column except the first one goes into the similarity computation.
# NOTE(review): track_name / artist_name were already removed when df_features
# was built, so the first column here is 'popularity' -- confirm that leaving
# popularity out of the similarity is intended.
features_matrix = df_features.iloc[:, 1:].to_numpy()

# Dense pairwise cosine similarity between all rows (n_rows x n_rows).
# With ~35k rows this is a very large array (on the order of 10 GB in float64).
cos_sim = cosine_similarity(features_matrix)
cos_sim

array([[1.        , 0.89475889, 0.98319509, ..., 0.70457551, 0.5980607 ,
        0.72745293],
       [0.89475889, 1.        , 0.90695539, ..., 0.62475612, 0.60603839,
        0.71632234],
       [0.98319509, 0.90695539, 1.        , ..., 0.72363507, 0.61304608,
        0.73581325],
       ...,
       [0.70457551, 0.62475612, 0.72363507, ..., 1.        , 0.82298845,
        0.83941275],
       [0.5980607 , 0.60603839, 0.61304608, ..., 0.82298845, 1.        ,
        0.84065408],
       [0.72745293, 0.71632234, 0.73581325, ..., 0.83941275, 0.84065408,
        1.        ]])

In [25]:
# Tabular view of the similarity matrix: row/column labels are the row
# positions of the tracks in df.
df_cos_sim = pd.DataFrame(data=cos_sim)
df_cos_sim

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35529,35530,35531,35532,35533,35534,35535,35536,35537,35538
0,1.000000,0.894759,0.983195,0.831677,0.997165,0.978095,0.748883,0.976021,0.829610,0.965793,...,0.734454,0.711513,0.728577,0.743797,0.760411,0.757312,0.570494,0.704576,0.598061,0.727453
1,0.894759,1.000000,0.906955,0.810543,0.907489,0.934461,0.802256,0.859297,0.810995,0.880803,...,0.665742,0.649880,0.647305,0.682295,0.686718,0.744279,0.577899,0.624756,0.606038,0.716322
2,0.983195,0.906955,1.000000,0.849099,0.988111,0.980907,0.770406,0.968813,0.846897,0.972306,...,0.749334,0.743679,0.736225,0.755406,0.735922,0.770328,0.588313,0.723635,0.613046,0.735813
3,0.831677,0.810543,0.849099,1.000000,0.841725,0.840114,0.920175,0.842020,0.999764,0.875071,...,0.681960,0.669174,0.645623,0.670361,0.587498,0.673921,0.718441,0.617587,0.764812,0.631038
4,0.997165,0.907489,0.988111,0.841725,1.000000,0.980953,0.756388,0.973037,0.840278,0.963825,...,0.737731,0.712957,0.726954,0.742660,0.760680,0.769693,0.584719,0.703194,0.614016,0.742988
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35534,0.757312,0.744279,0.770328,0.673921,0.769693,0.756392,0.574216,0.734445,0.672631,0.762232,...,0.922753,0.914261,0.913952,0.950414,0.941840,1.000000,0.875229,0.905965,0.878203,0.967048
35535,0.570494,0.577899,0.588313,0.718441,0.584719,0.584677,0.654967,0.577810,0.718556,0.609234,...,0.842711,0.832080,0.829388,0.854601,0.794902,0.875229,1.000000,0.815997,0.986377,0.856525
35536,0.704576,0.624756,0.723635,0.617587,0.703194,0.689790,0.624274,0.762474,0.614481,0.765708,...,0.986897,0.988342,0.993417,0.984891,0.903307,0.905965,0.815997,1.000000,0.822988,0.839413
35537,0.598061,0.606038,0.613046,0.764812,0.614016,0.601021,0.699408,0.615592,0.764626,0.641285,...,0.866134,0.847367,0.840747,0.861162,0.806072,0.878203,0.986377,0.822988,1.000000,0.840654


In [48]:
def get_recommendations(track_name, df, cosine_sim_matrix, topN=None, threshold=None):
    """Return the tracks most similar to ``track_name``.

    Parameters
    ----------
    track_name : str
        Value looked up in ``df['track_name']``. If several rows match, the
        first one is used as the target.
    df : pandas.DataFrame
        One row per track, in the same row order as ``cosine_sim_matrix``.
        Must contain a ``'track_name'`` column. Any index is accepted: rows
        are addressed by position.
    cosine_sim_matrix : array-like
        Square pairwise similarity matrix; ``cosine_sim_matrix[i]`` holds the
        similarities of row ``i`` to every row.
    topN : int, optional
        Number of most similar tracks to return. Takes precedence over
        ``threshold`` when both are given.
    threshold : float, optional
        Return every track whose similarity is strictly greater than this
        value. Only used when ``topN`` is None.

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``df`` sorted by decreasing similarity, with a
        leading ``'track_name_target'`` column, a trailing
        ``'similarity_score'`` column and a fresh 0..n-1 index.

    Raises
    ------
    IndexError
        If ``track_name`` is not present in ``df['track_name']``.
    ValueError
        If neither ``topN`` nor ``threshold`` is given, or ``topN`` is negative.
    """
    # Positional index of the target track (not its index label), so the
    # lookup stays consistent with the matrix rows and with df.iloc below.
    matches = np.flatnonzero((df['track_name'] == track_name).to_numpy())
    if matches.size == 0:
        raise IndexError(f"track_name {track_name!r} not found in df['track_name']")
    target_pos = int(matches[0])

    if topN is None and threshold is None:
        raise ValueError(f"Please select recommendation type: topN (integer in [1, {df.shape[0]-1}]) or threshold (float in [-1, 1])")
    if topN is not None and topN < 0:
        raise ValueError(f"topN must be a non-negative integer, got {topN}")

    # Pairwise similarity scores of the target against every track
    scores = np.asarray(cosine_sim_matrix[target_pos])

    # Rank by decreasing similarity; the stable sort keeps tied scores in
    # ascending row order.
    ranked = np.argsort(-scores, kind='stable')

    # Remove the target itself explicitly. Dropping "the first ranked item"
    # is wrong when another track ties with it (e.g. a duplicate with an
    # identical feature vector): the target would then recommend itself.
    ranked = ranked[ranked != target_pos]

    if topN is not None:
        # N most similar tracks
        ranked = ranked[:topN]
    else:
        # Tracks with a score strictly greater than the threshold
        ranked = ranked[scores[ranked] > threshold]

    df_top = df.iloc[ranked].copy()
    df_top['similarity_score'] = scores[ranked]
    df_top.insert(0, 'track_name_target', track_name)

    return df_top.reset_index(drop=True)

In [49]:
# Query parameters
track_name_target = 'Smashed Pennies'
topN = 10
threshold = 0.9

# Columns shown in the recommendation table
reco_columns = ['track_name', 'artist_name', 'popularity', 'genre']

df_reco = get_recommendations(
    track_name=track_name_target,
    df=df[reco_columns],
    cosine_sim_matrix=cos_sim,
    topN=topN,
    # get_recommendations checks topN first, so threshold has no effect
    # while topN is set; pass topN=None to use the threshold instead.
    threshold=threshold,
)
df_reco

Unnamed: 0,track_name_target,track_name,artist_name,popularity,genre,similarity_score
0,Smashed Pennies,Good Thing Go,Quinn XCII,0.540816,Rock,0.994948
1,Smashed Pennies,Crazy Cult,Maria Bamford,0.602041,Rock,0.993417
2,Smashed Pennies,Jump Out The Window,Big Sean,0.510204,Rock,0.993364
3,Smashed Pennies,RONDO,6ix9ine,0.520408,Rock,0.992711
4,Smashed Pennies,"Requiem, K. 626: Dies irae",Wolfgang Amadeus Mozart,0.744898,Rock,0.992569
5,Smashed Pennies,Loca,Alvaro Soler,0.55102,Rock,0.992479
6,Smashed Pennies,Jefe (feat. Meek Mill),T.I.,0.591837,Rock,0.992359
7,Smashed Pennies,How to Dune,John Heffron,0.612245,Rock,0.992163
8,Smashed Pennies,Me Likey,Trippie Redd,0.591837,Rock,0.99202
9,Smashed Pennies,"Symphony in D Major, K. 385 ""Haffner-Sinfonie""...",Wolfgang Amadeus Mozart,0.744898,Rock,0.991703
