In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import euclidean

# Loading dataset
songs_df = pd.read_csv("cleaned_data/songs_cleaned.csv")

# Selecting attributes/features to be included in similarity calculation 
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 
            'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']

# Normalize features (0-1 range)
scaler = MinMaxScaler()
songs_df_scaled = songs_df.copy()
songs_df_scaled[features] = scaler.fit_transform(songs_df[features])

In [2]:
songs_df["track_name"].value_counts()

track_name
Heaven                    6
You                       6
One                       5
Stay                      5
Changes                   5
                         ..
There Was A Time          1
Since I Don't Have You    1
Civil War                 1
Double Talkin' Jive       1
If The World              1
Name: count, Length: 8107, dtype: int64

In [3]:
# Altering function to take a playlist an input 


# Function that takes single input song and outputs similar songs using Euclidean distance
def recommend_songs_from_playlist(playlist, top_n=5):

    if len(playlist) < 1:
        return "Playlist empty"
    
    # Making sure all songs in playlist are in database
    for song in playlist:
        if song not in songs_df_scaled['track_name'].values:
            return f"Song {song} not found in dataset."

    # Getting average feature vector for the playlist
    playlist_vector = songs_df_scaled[songs_df_scaled['track_name'].isin(playlist)][features].mean().values

    # Removing playlist songs from song df (so the system doesn't recommend the input songs)
    other_songs_df_scaled = songs_df_scaled[~songs_df_scaled['track_name'].isin(playlist)].copy()

    # Computing distances to all other songs
    other_songs_df_scaled.loc[:, 'distance'] = other_songs_df_scaled[features].apply(lambda x: euclidean(playlist_vector, x), axis=1)

    # Getting top N closest songs 
    recommendations = other_songs_df_scaled.sort_values(by='distance')[1:top_n+1]

    return recommendations[['track_name', 'track_artist', 'distance']]


In [7]:
# Example(s)

# NOTE: both examples outputted two different versions of a song; when we clean, 
# should we be sorting out duplicate song/artist combos? or is it all right? 

# Other NOTE: should we be inputting more than song names in playlists? 
# Multiple versions of "Here Comes the Sun", for example

# playlist1 = ["higher love", "sweet dreams (are made of this)"]
playlist1 = ["Clarity"]
print(recommend_songs_from_playlist(playlist1, top_n=5))


# playlist2 = ["lose yourself to dance", "stole the show", "barbie girl"]
# print(recommend_songs_from_playlist(playlist2, top_n=5))


                                   track_name  track_artist  distance
8447                Deal Breaker - Dyro Remix       LOOPERS  0.080988
8366                              Born To Run      Afrojack  0.103496
8388                      People Of The Night   Dash Berlin  0.110972
8134  You Are Like Nobody Else - Original Mix  Swanky Tunes  0.111028
8480                                If I Fall   Cole Plante  0.118417
