In [1]:
import pandas as pd
import numpy as np 
import os

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import NearestNeighbors

In [2]:
dataset = pd.read_csv(os.getcwd() + '/spotify_songs.csv')
dataset.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,63rPSO264uRjW1X5E6cWv6,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,1HoSmj2eLcsrR0vE9gThr4,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,1nqYsOef1yKKuGOVchbsk6,Call You Mine - The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,7m7vv9wlQ4i0LFuJiE2zsQ,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052


In [3]:
print(dataset.columns)

Index(['track_id', 'track_name', 'track_artist', 'track_popularity',
       'track_album_id', 'track_album_name', 'track_album_release_date',
       'playlist_name', 'playlist_id', 'playlist_genre', 'playlist_subgenre',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms'],
      dtype='object')


In [4]:
categorical_features = ['track_artist',  
                        'track_album_name', 
                        'playlist_name', 
                        'playlist_genre', 
                        'playlist_subgenre']

numeric_features = ['track_popularity',
                    'danceability', 
                    'energy', 
                    'key', 
                    'loudness', 
                    'mode', 
                    'speechiness',
                    'acousticness', 
                    'instrumentalness', 
                    'liveness', 
                    'valence', 
                    'tempo']

test_categorical = [
                    'playlist_genre',
                    'playlist_subgenre']

In [5]:
encoder = OneHotEncoder()
# Changed to use test_categorical, a set of variables that doesn't have the 10k+ entry problem mainly for testing
encoded_categorical = encoder.fit_transform(dataset[test_categorical]).toarray()
encoded_categorical_df = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out(test_categorical))

encoded_categorical_df.head()

Unnamed: 0,playlist_genre_edm,playlist_genre_latin,playlist_genre_pop,playlist_genre_r&b,playlist_genre_rap,playlist_genre_rock,playlist_subgenre_album rock,playlist_subgenre_big room,playlist_subgenre_classic rock,playlist_subgenre_dance pop,...,playlist_subgenre_new jack swing,playlist_subgenre_permanent wave,playlist_subgenre_pop edm,playlist_subgenre_post-teen pop,playlist_subgenre_progressive electro house,playlist_subgenre_reggaeton,playlist_subgenre_southern hip hop,playlist_subgenre_trap,playlist_subgenre_tropical,playlist_subgenre_urban contemporary
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
scaler = StandardScaler()
scaled_numerical = scaler.fit_transform(dataset[numeric_features])
scaled_numerical_df = pd.DataFrame(scaled_numerical, columns=numeric_features)

scaled_numerical_df.head()

Unnamed: 0,track_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.941531,0.642049,1.201614,0.1732,1.367123,0.876177,-0.481362,-0.333898,-0.377953,-0.80923,0.031908,0.042927
1,0.981557,0.490412,0.643317,1.557627,0.585766,0.876177,-0.688642,-0.46867,-0.359177,1.081061,0.782522,-0.777198
2,1.101635,0.138889,1.284529,-1.211227,1.10009,-1.141322,-0.324422,-0.436799,-0.377849,-0.519562,0.439384,0.116227
3,0.701374,0.435271,1.279002,0.450085,0.984309,0.876177,-0.050024,-0.667642,-0.377911,0.089582,-1.001795,0.039953
4,1.061609,-0.033426,0.742815,-1.211227,0.685151,0.876177,-0.70246,-0.432701,-0.377953,-0.692585,0.919777,0.115037


In [7]:
processed_features_df = pd.concat([scaled_numerical_df, encoded_categorical_df], axis=1)
processed_features_df.head()

Unnamed: 0,track_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,playlist_subgenre_new jack swing,playlist_subgenre_permanent wave,playlist_subgenre_pop edm,playlist_subgenre_post-teen pop,playlist_subgenre_progressive electro house,playlist_subgenre_reggaeton,playlist_subgenre_southern hip hop,playlist_subgenre_trap,playlist_subgenre_tropical,playlist_subgenre_urban contemporary
0,0.941531,0.642049,1.201614,0.1732,1.367123,0.876177,-0.481362,-0.333898,-0.377953,-0.80923,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.981557,0.490412,0.643317,1.557627,0.585766,0.876177,-0.688642,-0.46867,-0.359177,1.081061,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.101635,0.138889,1.284529,-1.211227,1.10009,-1.141322,-0.324422,-0.436799,-0.377849,-0.519562,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.701374,0.435271,1.279002,0.450085,0.984309,0.876177,-0.050024,-0.667642,-0.377911,0.089582,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.061609,-0.033426,0.742815,-1.211227,0.685151,0.876177,-0.70246,-0.432701,-0.377953,-0.692585,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

X, y = processed_features_df, dataset['playlist_genre']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

knn = KNeighborsClassifier(n_neighbors=15)

knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9759439707673568
              precision    recall  f1-score   support

         edm       0.95      0.99      0.97       583
       latin       0.98      0.96      0.97       529
         pop       1.00      0.96      0.98       524
         r&b       0.99      0.98      0.99       569
         rap       0.96      0.98      0.97       580
        rock       0.98      0.98      0.98       499

    accuracy                           0.98      3284
   macro avg       0.98      0.98      0.98      3284
weighted avg       0.98      0.98      0.98      3284



In [20]:
def recommend_top_5_songs(input_song, model, feature_data, original_data, numerical_features, test_categorical, scaler, encoder, top_n=5):
    """
    Function that returns the N top songs recommended based on similarity to input song.
    
    input_song: dictionary containing teh features of the input song.
    model: the trained machine learning model
    feature_data: the dataframe containing the scaled and encoded of the processed features.
    original_data: the original dataset containing the song information.
    numerical_features: list of numerical feature names
    test_categorical: list of categorical feature names
    scaler: fitted scaler for numerical features
    encoder: fitted encoder for categorical features
    top_n: nb of recommendations to return (default is 5)
    """
    
    # convert input_song into dataframe for prediction
    input_song_df = pd.DataFrame([input_song])
    
    # features scaled and encoded like the training data
    input_song_scaled = scaler.transform(input_song_df[numerical_features])
    input_song_encoded = encoder.transform(input_song_df[test_categorical]).toarray()
    
    #combine scaled and encoded features
    processed_input_song = np.concatenate([input_song_scaled, input_song_encoded], axis=1)
    
    #predict the 5 nearest neighbors
    distances, indices = model.kneighbors(processed_input_song, n_neighbors=top_n)
    
    #get top n recommended songs
    recommended_songs = original_data.iloc[indices[0]]
    
    return recommended_songs

#example input song
input_song = {
    "track_popularity": 80,
    "danceability": 0.7,
    "energy": 0.8,
    "key": 5,
    "loudness": -5.0,
    "mode": 1,
    "speechiness": 0.05,
    "acousticness": 0.2,
    "instrumentalness": 0.0,
    "liveness": 0.1,
    "valence": 0.9,
    "tempo": 120.0,
    "playlist_genre": "pop",
    "playlist_subgenre": "dance pop"
}
top_recommended_songs = recommend_top_5_songs(
    input_song=input_song,
    model=knn,
    feature_data=processed_features_df,
    original_data=dataset,
    numerical_features=numeric_features,
    test_categorical=test_categorical,
    scaler=scaler,
    encoder=encoder,
    top_n=5
)
print(top_recommended_songs)

recommended_songs = top_recommended_songs[['track_name', 'track_artist']]
print("Recommended Songs:")
for i, row in recommended_songs.iterrows():
    print(f"{i + 1}. {row['track_name']} by {row['track_artist']}")

                     track_id                 track_name   track_artist  \
21559  6CeeoAi5qUa4ZjQo1S7Ho7              Somebody Else    Isac Elliot   
27699  23L5CiUhw2jV1OIMwthR3S        In the Name of Love  Martin Garrix   
24570  2MYtYwqRutASW0oVTYPuEX  She's Playing Hard To Get        Hi-Five   
28084  6qLkMLUIkdtX34gHF9sPFL                After Hours           M-22   
1094   6UFivO2zqqPFPoQYsEMuCc                       Bags         Clairo   

       track_popularity          track_album_id     track_album_name  \
21559                53  3MPdrq8TeWZkkzZ2jf7w8V        Somebody Else   
27699                80  1FOJ5IXGXe8dl0cXvCU6wK  In the Name of Love   
24570                42  1fAaBEaUkL23BjFTk0KN5P     Keep It Goin' On   
28084                68  1LD7wfM8EYjricnmu1EpWQ          After Hours   
1094                 70  4kkVGtCqE2NiAKosri9Rnd             Immunity   

      track_album_release_date  \
21559               2019-11-08   
27699               2016-07-29   
24570         



In [21]:
def recommend_top_5_songs_multiple(input_songs, model, feature_data, original_data, numerical_features, test_categorical, scaler, encoder, top_n=5):
    """
    Function to recommend the top N songs based on similarity to the averaged attributes of multiple input songs.
    """
    
    # convert input songs into a df
    input_songs_df = pd.DataFrame(input_songs)

    # calculate average of numerical features
    averaged_numerical = input_songs_df[numerical_features].mean().to_dict()

    # take the most frequent categorical features (genre / subgenre)
    averaged_categorical = {col: input_songs_df[col].mode()[0] for col in test_categorical}

    # combine numerical and categorical features into a single input
    averaged_input_song = {**averaged_numerical, **averaged_categorical}

    # Use the existing recommend function with the averaged attributes
    recommended_songs = recommend_top_5_songs(
        input_song=averaged_input_song,
        model=model,
        feature_data=feature_data,
        original_data=original_data,
        numerical_features=numerical_features,
        test_categorical=test_categorical,
        scaler=scaler,
        encoder=encoder,
        top_n=top_n,
    )

    return recommended_songs


In [22]:
# list of songs
input_songs = [
    {
        "track_popularity": 80,
        "danceability": 0.7,
        "energy": 0.8,
        "key": 5,
        "loudness": -5.0,
        "mode": 1,
        "speechiness": 0.05,
        "acousticness": 0.2,
        "instrumentalness": 0.0,
        "liveness": 0.1,
        "valence": 0.9,
        "tempo": 120.0,
        "playlist_genre": "pop",
        "playlist_subgenre": "dance pop"
    },
    {
        "track_popularity": 60,
        "danceability": 0.6,
        "energy": 0.7,
        "key": 3,
        "loudness": -7.0,
        "mode": 0,
        "speechiness": 0.04,
        "acousticness": 0.3,
        "instrumentalness": 0.1,
        "liveness": 0.2,
        "valence": 0.8,
        "tempo": 115.0,
        "playlist_genre": "pop",
        "playlist_subgenre": "indie pop"
    }
]

# recommend songs based on the averaged attributes
recommended_songs = recommend_top_5_songs_multiple(
    input_songs=input_songs,
    model=knn,
    feature_data=processed_features_df,
    original_data=dataset,
    numerical_features=numeric_features,
    test_categorical=test_categorical,
    scaler=scaler,
    encoder=encoder,
    top_n=5
)

print("Recommended Songs:")
for i, row in recommended_songs.iterrows():
    print(f"{i + 1}. {row['track_name']} by {row['track_artist']}")


Recommended Songs:
17416. Anywhere - Willy William Remix by Rita Ora
25857. Whatever by Jill Scott
16871. El Niágra En Bicicleta - En Vivo Estadio Olímpico De República Dominicana/2012 by Juan Luis Guerra 4.40
5793. Kaam 25 - Sacred Games by DIVINE
2427. Year 3000 by Jonas Brothers


