In [9]:
from numpy import linalg
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize, scale, Normalizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv('spotify_songs_final_filtered.csv')
df.shape

(455285, 24)

In [3]:
df.columns

Index(['track_id', 'artist', 'song_name', 'popularity', 'album', 'isrc',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
       'time_signature'],
      dtype='object')

In [4]:
df.iloc[:2][:1]

Unnamed: 0,track_id,artist,song_name,popularity,album,isrc,danceability,energy,key,loudness,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,4YYCyU5iK3l71ksslONDAd,''Sugar Boy'' Crawford,Jock-A-Mo,4,"Louisiana Jazz, Rhythm & Blues - From the Swam...",USMC15350776,0.54,0.587,7,-7.982,...,0.245,0.948,168.823,audio_features,4YYCyU5iK3l71ksslONDAd,spotify:track:4YYCyU5iK3l71ksslONDAd,https://api.spotify.com/v1/tracks/4YYCyU5iK3l7...,https://api.spotify.com/v1/audio-analysis/4YYC...,148800,4


In [43]:
#preprocess
feat = df[['popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']].copy()
feat = normalize(feat, norm='l2')
feat.shape

(455285, 12)

In [15]:
linalg.norm(feat[0] - feat[1:], axis=1)

array([0.16037241, 0.05252121, 0.0463082 , ..., 0.02480705, 0.30151119,
       0.28076958])

In [16]:
linalg.norm(feat[0] - feat[1])

0.16037240670161668

In [70]:
#weights
cats = ['popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
#user_inputs = [1,2,3,4,5,1,2,3,4,5,1,2]
user_inputs = [1,1,1,1,1,1,1,1,1,1,1,1]
#user_inputs = 1/(np.array(user_inputs)**(2)) not the relationship we want
weights = dict(zip(cats, user_inputs))
weights

{'acousticness': 1,
 'danceability': 1,
 'energy': 1,
 'instrumentalness': 1,
 'key': 1,
 'liveness': 1,
 'loudness': 1,
 'mode': 1,
 'popularity': 1,
 'speechiness': 1,
 'tempo': 1,
 'valence': 1}

In [44]:
feat = pd.DataFrame(feat, columns=weights)
feat

Unnamed: 0,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.023639,0.003191,0.003469,0.041368,-0.047171,0.005910,0.000351,0.004172,0.000000e+00,0.001448,0.005602,0.997696
1,0.182105,0.004811,0.004522,0.053114,-0.059184,0.007588,0.000414,0.005698,2.837798e-08,0.000511,0.006837,0.979966
2,0.072938,0.003005,0.003793,0.058351,-0.050663,0.007294,0.000607,0.006389,0.000000e+00,0.000579,0.002786,0.994275
3,0.060733,0.003894,0.004517,0.068324,-0.042801,0.007592,0.000686,0.005762,4.995264e-06,0.000758,0.002961,0.994824
4,0.073607,0.004837,0.005174,0.094638,-0.082766,0.010515,0.001157,0.008297,4.626757e-07,0.001514,0.001840,0.989211
...,...,...,...,...,...,...,...,...,...,...,...,...
455280,0.281057,0.001569,0.002658,0.011711,-0.209177,0.000000,0.000429,0.010153,2.810573e-05,0.001276,0.000432,0.936483
455281,0.372378,0.006621,0.005888,0.023274,-0.097808,0.011637,0.000712,0.007040,2.955754e-06,0.001338,0.006668,0.922451
455282,0.027125,0.005633,0.005063,0.054250,-0.065516,0.009042,0.000424,0.000002,7.839153e-03,0.000889,0.005705,0.995889
455283,0.317293,0.003133,0.009092,0.000000,-0.027000,0.009915,0.000796,0.000011,1.576549e-07,0.002578,0.003282,0.947833


In [71]:
test = user_inputs * feat
test

Unnamed: 0,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.023639,0.003191,0.003469,0.041368,-0.047171,0.005910,0.000351,0.004172,0.000000e+00,0.001448,0.005602,0.997696
1,0.182105,0.004811,0.004522,0.053114,-0.059184,0.007588,0.000414,0.005698,2.837798e-08,0.000511,0.006837,0.979966
2,0.072938,0.003005,0.003793,0.058351,-0.050663,0.007294,0.000607,0.006389,0.000000e+00,0.000579,0.002786,0.994275
3,0.060733,0.003894,0.004517,0.068324,-0.042801,0.007592,0.000686,0.005762,4.995264e-06,0.000758,0.002961,0.994824
4,0.073607,0.004837,0.005174,0.094638,-0.082766,0.010515,0.001157,0.008297,4.626757e-07,0.001514,0.001840,0.989211
...,...,...,...,...,...,...,...,...,...,...,...,...
455280,0.281057,0.001569,0.002658,0.011711,-0.209177,0.000000,0.000429,0.010153,2.810573e-05,0.001276,0.000432,0.936483
455281,0.372378,0.006621,0.005888,0.023274,-0.097808,0.011637,0.000712,0.007040,2.955754e-06,0.001338,0.006668,0.922451
455282,0.027125,0.005633,0.005063,0.054250,-0.065516,0.009042,0.000424,0.000002,7.839153e-03,0.000889,0.005705,0.995889
455283,0.317293,0.003133,0.009092,0.000000,-0.027000,0.009915,0.000796,0.000011,1.576549e-07,0.002578,0.003282,0.947833


In [72]:
seed_index = 0
test['scores']= linalg.norm(test.iloc[seed_index] - test, axis=1)
test['scores']

0         0.000000
1         0.160372
2         0.052521
3         0.046308
4         0.082050
            ...   
455280    0.311829
455281    0.360877
455282    0.024807
455283    0.301511
455284    0.280770
Name: scores, Length: 455285, dtype: float64

In [73]:
rec = test.sort_values("scores").index[:25]
df.iloc[rec][['song_name', 'artist']+cats]

Unnamed: 0,song_name,artist,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Jock-A-Mo,''Sugar Boy'' Crawford,4,0.54,0.587,7,-7.982,1,0.0594,0.706,0.0,0.245,0.948,168.823
384398,Girl In the Blue Velvet Band,The Del McCoury Band,4,0.528,0.496,7,-7.778,1,0.0312,0.634,0.000742,0.33,0.82,165.206
280240,Lovers Jamboree,Nick Lowe,4,0.424,0.901,7,-8.429,1,0.0331,0.517,0.00153,0.429,0.807,173.444
406792,O-o-h Child,The Spinners,4,0.473,0.686,8,-9.314,1,0.074,0.722,0.0,0.0883,0.921,187.793
118060,The Sun Is Shining - Alternate Take,Elmore James,4,0.317,0.796,7,-7.914,1,0.0682,0.595,0.0296,0.587,0.716,174.535
291706,Darlin',Pat Mears,4,0.493,0.82,7,-8.218,1,0.0574,0.121,0.122,0.16,0.839,173.421
237432,The Truth,Lloyd Price,4,0.585,0.653,8,-8.763,1,0.11,0.931,0.142,0.125,0.93,179.117
17321,Limbo - Rudy Van Gelder Edition,Andrew Hill,4,0.375,0.628,7,-8.584,1,0.0518,0.547,0.00159,0.0771,0.46,175.373
398325,Love-In,The Morning Glories,3,0.496,0.771,5,-5.612,1,0.0333,0.614,0.00252,0.138,0.692,120.072
305880,One More 'Fore I Die,Preservation Hall Jazz Band,4,0.557,0.599,7,-8.612,1,0.0408,0.81,0.0145,0.133,0.808,167.436


In [69]:
weights

{'acousticness': 3,
 'danceability': 2,
 'energy': 3,
 'instrumentalness': 4,
 'key': 4,
 'liveness': 5,
 'loudness': 5,
 'mode': 1,
 'popularity': 1,
 'speechiness': 2,
 'tempo': 2,
 'valence': 1}

In [68]:
test.iloc[rec]

Unnamed: 0,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,scores
0,0.023639,0.006382,0.010407,0.165472,-0.235857,0.00591,0.000702,0.012517,0.0,0.007239,0.005602,1.995391,0.0
384398,0.024156,0.006377,0.008986,0.169089,-0.234852,0.006039,0.000377,0.011486,1.79234e-05,0.009964,0.004952,1.995319,0.008732
271383,0.027115,0.005545,0.00968,0.162687,-0.234914,0.006779,0.000485,0.008663,0.001819386,0.009422,0.001607,1.995319,0.0136
189583,0.029701,0.00322,0.008322,0.166327,-0.232235,0.00594,0.000396,0.01292,0.0,0.003653,0.00183,1.995158,0.016646
398325,0.024926,0.008242,0.019218,0.166174,-0.233142,0.008309,0.000553,0.015305,8.375174e-05,0.005733,0.00575,1.995286,0.017872
187237,0.01669,0.008178,0.016923,0.166898,-0.237078,0.008345,0.000596,0.015021,2.17301e-05,0.00509,0.004356,1.995564,0.018648
87996,0.030288,0.00699,0.010867,0.169612,-0.23461,0.006058,0.00039,0.005052,2.544178e-06,0.006663,0.004798,1.994987,0.019011
191835,0.017418,0.005016,0.014091,0.162566,-0.239001,0.0,0.000446,0.008709,0.0005945259,0.005864,0.00382,1.995703,0.019556
280240,0.023009,0.004878,0.015548,0.161063,-0.242429,0.005752,0.000381,0.008922,3.520384e-05,0.012339,0.004642,1.995391,0.019877
240503,0.017147,0.004572,0.009482,0.160036,-0.241626,0.005716,0.000528,0.010802,1.634654e-07,0.003115,0.003024,1.995695,0.02019
