In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

## Read the csv files

In [2]:
df1 = pd.read_csv("song_data.csv")
df2 = pd.read_csv("song_info.csv")
df = pd.merge(df1,df2,on='song_name')
df = df.drop_duplicates(subset=['song_name','artist_name']).reset_index()
df = df.drop('index',axis=1)

In [3]:
df.head()

Unnamed: 0,song_name,song_popularity,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,artist_name,album_names,playlist
0,Boulevard of Broken Dreams,73,262333,0.00552,0.496,0.682,2.9e-05,8,0.0589,-4.095,1,0.0294,167.06,4,0.474,Green Day,Greatest Hits: God's Favorite Band,00s Rock Anthems
1,In The End,66,216933,0.0103,0.542,0.853,0.0,3,0.108,-6.407,0,0.0498,105.256,4,0.37,Linkin Park,Hybrid Theory,00s Rock Anthems
2,Seven Nation Army,76,231733,0.00817,0.737,0.463,0.447,0,0.255,-7.828,1,0.0792,123.881,4,0.324,The White Stripes,Elephant,00s Rock Anthems
3,Seven Nation Army,76,231733,0.00817,0.737,0.463,0.447,0,0.255,-7.828,1,0.0792,123.881,4,0.324,Zella Day,Seven Nation Army,Acoustic Covers
4,By The Way,74,216933,0.0264,0.451,0.97,0.00355,0,0.102,-4.938,1,0.107,122.444,4,0.198,Red Hot Chili Peppers,By The Way (Deluxe Version),00s Rock Anthems


## Normalizing data

In [4]:
minmaxscaler = MinMaxScaler()
minmaxscaled = minmaxscaler.fit_transform(df[['song_popularity', 'acousticness',
    'danceability', 'energy', 'instrumentalness', 'key', 'liveness',
    'loudness', 'audio_mode', 'speechiness', 'tempo','audio_valence']])
songs_normalized = pd.DataFrame(minmaxscaled, columns=[['song_popularity', 'acousticness',
    'danceability', 'energy', 'instrumentalness', 'key', 'liveness',
    'loudness', 'audio_mode', 'speechiness', 'tempo','audio_valence']])

songs_normalized.insert(0, 'song_name', df['song_name'])
songs_normalized.insert(1, 'artist_name', df['artist_name'])


In [5]:
songs_normalized.head()

Unnamed: 0,song_name,artist_name,song_popularity,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,audio_valence
0,Boulevard of Broken Dreams,Green Day,0.73,0.005541,0.502533,0.682342,2.9e-05,0.727273,0.04825,0.859242,1.0,0.031243,0.689425,0.481707
1,In The End,Linkin Park,0.66,0.01034,0.549139,0.853697,0.0,0.272727,0.098655,0.801948,0.0,0.052922,0.434371,0.376016
2,Seven Nation Army,The White Stripes,0.76,0.008202,0.746707,0.462888,0.448345,0.0,0.249564,0.766734,1.0,0.084166,0.511233,0.329268
3,Seven Nation Army,Zella Day,0.76,0.008202,0.746707,0.462888,0.448345,0.0,0.249564,0.766734,1.0,0.084166,0.511233,0.329268
4,By The Way,Red Hot Chili Peppers,0.74,0.026505,0.45694,0.97094,0.003561,0.0,0.092496,0.838352,1.0,0.113709,0.505303,0.20122


In [6]:
song_features=songs_normalized.set_index("song_name")
song_features.drop(["artist_name"],axis=1,inplace=True)
song_features.head()

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


Unnamed: 0_level_0,song_popularity,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,audio_valence
song_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"(Boulevard of Broken Dreams,)",0.73,0.005541,0.502533,0.682342,2.9e-05,0.727273,0.04825,0.859242,1.0,0.031243,0.689425,0.481707
"(In The End,)",0.66,0.01034,0.549139,0.853697,0.0,0.272727,0.098655,0.801948,0.0,0.052922,0.434371,0.376016
"(Seven Nation Army,)",0.76,0.008202,0.746707,0.462888,0.448345,0.0,0.249564,0.766734,1.0,0.084166,0.511233,0.329268
"(Seven Nation Army,)",0.76,0.008202,0.746707,0.462888,0.448345,0.0,0.249564,0.766734,1.0,0.084166,0.511233,0.329268
"(By The Way,)",0.74,0.026505,0.45694,0.97094,0.003561,0.0,0.092496,0.838352,1.0,0.113709,0.505303,0.20122


In [30]:
song_features_csr = csr_matrix(song_features.values)

model_nn = NearestNeighbors(metric='cosine',algorithm='brute')
model_nn.fit(song_features_csr)

temp = song_features.copy()
temp.reset_index(inplace=True)
songsearch = 'Despacito (Featuring Daddy Yankee)'

song_index = df[df['song_name']==songsearch].index.values
song = songs_normalized.iloc[song_index[0]]


distances,indices = model_nn.kneighbors(X = song_features.iloc[song_index,:].values.reshape(1,-1), n_neighbors=6)



for i in range(0, len(distances.flatten())):
    if i == 0:
        print("Recommendations for ",song['song_name'],"are: ")
    else:
        rec_song = songs_normalized.iloc[indices.flatten()[i]]
        print(i,": ",rec_song['song_name'],' by ',rec_song['artist_name'],"| distance= ",distances.flatten()[i])

Recommendations for  Despacito (Featuring Daddy Yankee) are: 
1 :  Genie in a Bottle  by  Christina Aguilera | distance=  0.0019276815943389813
2 :  Happy  by  Pharrell Williams | distance=  0.0038959460126180012
3 :  Bad Boy  by  Red Velvet | distance=  0.003912874233403918
4 :  Waiting On the World to Change  by  John Mayer | distance=  0.005950806272670528
5 :  Corazón partio  by  Alejandro Sanz | distance=  0.006017553689370736
