In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import euclidean

# Loading dataset
songs_df = pd.read_csv("cleaned_data/songs_cleaned.csv")

# Selecting attributes/features to be included in similarity calculation 
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 
            'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']

# Normalize features (0-1 range)
scaler = MinMaxScaler()
songs_df_scaled = songs_df.copy()
songs_df_scaled[features] = scaler.fit_transform(songs_df[features])

In [8]:
songs_df["track_name"].value_counts()

track_name
Poison                         22
Breathe                        21
Alive                          20
Forever                        20
Stay                           19
                               ..
Do That To Me One More Time     1
Goodbye Girl                    1
Lotta Love                      1
Too Late To Turn Back Now       1
It Doesn't Matter               1
Name: count, Length: 13352, dtype: int64

In [9]:
# Altering function to take a playlist an input 


# Function that takes single input song and outputs similar songs using Euclidean distance
def recommend_songs_from_playlist(playlist, top_n=5):

    if len(playlist) < 1:
        return "Playlist empty"
    
    # Making sure all songs in playlist are in database
    for song in playlist:
        if song not in songs_df_scaled['track_name'].values:
            return f"Song {song} not found in dataset."

    # Getting average feature vector for the playlist
    playlist_vector = songs_df_scaled[songs_df_scaled['track_name'].isin(playlist)][features].mean().values

    # Removing playlist songs from song df (so the system doesn't recommend the input songs)
    other_songs_df_scaled = songs_df_scaled[~songs_df_scaled['track_name'].isin(playlist)].copy()

    # Computing distances to all other songs
    other_songs_df_scaled.loc[:, 'distance'] = other_songs_df_scaled[features].apply(lambda x: euclidean(playlist_vector, x), axis=1)

    # Getting top N closest songs 
    recommendations = other_songs_df_scaled.sort_values(by='distance')[1:top_n+1]

    return recommendations[['track_name', 'track_artist', 'distance']]


In [10]:
songs_df.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,6oJ6le65B3SEqPwMRNXWjY,Higher Love,Kygo,87,4wquJImu8RtyEuDtIAsfcE,Higher Love,2019-06-28,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,8,-7.159,1,0.0324,0.0154,6e-06,0.101,0.404,103.952,228267
1,2tilX6FUl0IwFgiOYjuc6V,Unlove You,Armin van Buuren,62,6k63cIUizpLu0h5hhIg8LG,Unlove You,2019-10-18,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,3,-3.66,0,0.0597,0.0303,3.3e-05,0.143,0.663,116.892,150160
2,1ppOGANOzMT01Cxh9jx4hN,Cool Kids,Olympis,67,1ZmJJdNN2BAnVqoiU4xtZe,Cool Kids,2019-11-01,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,8,-11.297,1,0.0511,0.415,3.1e-05,0.13,0.119,126.1,139048
3,05CwHjIk71RXVU40boRMnR,Call You Mine,The Chainsmokers,39,1ONuDpN0a3zhCUyKCgtuzK,World War Joy,2019-05-31,Dance Pop,37i9dQZF1DWZQaaqNMbbXa,pop,...,7,-5.59,1,0.0289,0.225,0.0,0.414,0.501,104.003,217653
4,5icOoE6VgqFKohjWWNp0Ac,Here With Me,Marshmello,83,6NHS3hV16MZyfcp0nSHdrd,Here With Me,2019-03-08,Dance Pop,37i9dQZF1DWZQaaqNMbbXa,pop,...,5,-3.933,0,0.0439,0.0623,0.0,0.156,0.181,99.961,156347


In [12]:
# Example(s)

# NOTE: both examples outputted two different versions of a song; when we clean, 
# should we be sorting out duplicate song/artist combos? or is it all right? 

# Other NOTE: should we be inputting more than song names in playlists? 
# Multiple versions of "Here Comes the Sun", for example

playlist1 = ["Higher Love", "Sweet Dreams (Are Made of This)"]
print(recommend_songs_from_playlist(playlist1, top_n=5))

# playlist2 = ["lose yourself to dance", "stole the show", "barbie girl"]
# print(recommend_songs_from_playlist(playlist2, top_n=5))


                 track_name    track_artist  distance
3356   Stay a Little Longer     Lostboycrow  0.108815
1307              Paparazzi       Lady Gaga  0.110252
14382                  2 On         Tinashe  0.113623
10588            No Diggity  Campsite Dream  0.123341
2941             No Diggity  Campsite Dream  0.123341
