In [2]:
from numpy import linalg
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize, scale, Normalizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [3]:
df = pd.read_csv('spotify_songs_final_filtered.csv')
df.shape

(455285, 24)

In [4]:
df.columns

Index(['track_id', 'artist', 'song_name', 'popularity', 'album', 'isrc',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
       'time_signature'],
      dtype='object')

In [5]:
df.iloc[:2][:1]

Unnamed: 0,track_id,artist,song_name,popularity,album,isrc,danceability,energy,key,loudness,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,4YYCyU5iK3l71ksslONDAd,''Sugar Boy'' Crawford,Jock-A-Mo,4,"Louisiana Jazz, Rhythm & Blues - From the Swam...",USMC15350776,0.54,0.587,7,-7.982,...,0.245,0.948,168.823,audio_features,4YYCyU5iK3l71ksslONDAd,spotify:track:4YYCyU5iK3l71ksslONDAd,https://api.spotify.com/v1/tracks/4YYCyU5iK3l7...,https://api.spotify.com/v1/audio-analysis/4YYC...,148800,4


In [6]:
#preprocess
feat = df[['popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']].copy()
feat = normalize(feat, norm='l2')
feat.shape

(455285, 12)

In [9]:
#weights
cats = ['popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
user_inputs = [1,2,3,4,5,1,2,3,4,5,1,2]
#user_inputs = [1,1,1,1,1,1,1,1,1,1,1,1]
#user_inputs = 1/(np.array(user_inputs)**(2)) not the relationship we want
#user_inputs = np.array(user_inputs)
weights = dict(zip(cats, user_inputs))
weights

{'acousticness': 3,
 'danceability': 2,
 'energy': 3,
 'instrumentalness': 4,
 'key': 4,
 'liveness': 5,
 'loudness': 5,
 'mode': 1,
 'popularity': 1,
 'speechiness': 2,
 'tempo': 2,
 'valence': 1}

In [10]:
feat = pd.DataFrame(feat, columns=weights)
feat.head()

Unnamed: 0,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.023639,0.003191,0.003469,0.041368,-0.047171,0.00591,0.000351,0.004172,0.0,0.001448,0.005602,0.997696
1,0.182105,0.004811,0.004522,0.053114,-0.059184,0.007588,0.000414,0.005698,2.837798e-08,0.000511,0.006837,0.979966
2,0.072938,0.003005,0.003793,0.058351,-0.050663,0.007294,0.000607,0.006389,0.0,0.000579,0.002786,0.994275
3,0.060733,0.003894,0.004517,0.068324,-0.042801,0.007592,0.000686,0.005762,4.995264e-06,0.000758,0.002961,0.994824
4,0.073607,0.004837,0.005174,0.094638,-0.082766,0.010515,0.001157,0.008297,4.626757e-07,0.001514,0.00184,0.989211


In [29]:
#scaled
test = user_inputs * feat
test.head()

Unnamed: 0,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.023639,0.006382,0.010407,0.165472,-0.235857,0.00591,0.000702,0.012517,0.0,0.007239,0.005602,1.995391
1,0.182105,0.009621,0.013567,0.212455,-0.29592,0.007588,0.000829,0.017095,1.135119e-07,0.002553,0.006837,1.959932
2,0.072938,0.00601,0.011378,0.233403,-0.253315,0.007294,0.001214,0.019168,0.0,0.002896,0.002786,1.98855
3,0.060733,0.007789,0.013551,0.273297,-0.214007,0.007592,0.001371,0.017286,1.998105e-05,0.003792,0.002961,1.989648
4,0.073607,0.009674,0.015521,0.378553,-0.413832,0.010515,0.002313,0.02489,1.850703e-06,0.007571,0.00184,1.978422


### PCA, KMeans

In [47]:
#test is the normalized df
kmeans = KMeans(n_clusters = 20, random_state=0, algorithm='full').fit(test)

In [48]:
#compress to only two main components
pca = PCA(n_components=2)
pca_df = pca.fit(test.T)
pca_df = pd.DataFrame(pca_df.components_.T, columns=['component_1', 'component_2'])

In [49]:
#add cluster assignments
pca_df['cluster']=kmeans.labels_
pca_df.cluster.value_counts()

0     52245
19    47724
11    45163
2     41830
10    35580
5     31531
13    30439
4     28664
16    26821
6     21032
9     19328
12    17485
8     16609
1     12301
7      9831
17     7749
18     7209
14     3030
15      444
3       270
Name: cluster, dtype: int64

In [100]:
#data to share with group
#pca_df.to_csv('full_data_set_pca.csv', index=False)
cluster_pca = pca_df.loc[pca_df['cluster']==12].copy()
cluster_pca.to_csv('cluster_data_set_pca.csv', index=False)

In [98]:
seed_index = 25 #this is the index of whatever seed song we put in
test['cluster']=kmeans.labels_

#do we want scores to only be calculated for the seed song's cluster? I think so
cluster = test.iloc[seed_index]['cluster']
cluster_df = test.loc[test['cluster']==cluster].copy()
cluster_df['scores']= linalg.norm(cluster_df.loc[cluster_df.index==seed_index].to_numpy() - cluster_df, axis=1)
#test['scores']= linalg.norm(test.iloc[seed_index] - test, axis=1)

cluster_df.reset_index(inplace=True, drop=False)
rec = cluster_df.sort_values("scores").index[:25]
rec = cluster_df.iloc[rec]
rec.set_index('index')
df.iloc[rec.index][['song_name', 'artist']+cats]

Unnamed: 0,song_name,artist,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
2,Brigadoon / Vendors' Calls / MacConnachy Square,'Brigadoon' 2017 New York City Center Ensemble,10,0.412,0.52,8,-6.946,1,0.0832,0.876,0.0,0.0794,0.382,136.317
10525,Tango Della Carlina,Alberto Baldan Bembo,0,0.573,0.465,2,-11.694,1,0.0528,0.447,0.872,0.222,0.621,118.555
400,I Guess It's Christmas Time,*NSYNC,5,0.467,0.526,6,-6.214,1,0.024,0.144,0.0,0.176,0.257,146.56
175,Feel Good,(Hed) P.E.,32,0.685,0.717,6,-7.477,0,0.215,0.00303,0.000665,0.0388,0.773,90.86
8830,Danny Boy - Remastered,Al Hibbler,0,0.216,0.212,0,-11.042,1,0.0308,0.827,0.009,0.334,0.317,85.73
4361,The Outsider - Commentary,A Perfect Circle,4,0.74,0.335,9,-15.032,1,0.941,0.561,0.0,0.277,0.83,83.899
11415,Out Of Love - Ruhde Remix,Alessia Cara,19,0.668,0.533,7,-8.78,1,0.0324,0.606,0.0009,0.171,0.198,127.995
4557,Separate Ways (Worlds Apart),A Skylit Drive,32,0.462,0.95,4,-4.944,0,0.101,0.000341,6e-06,0.146,0.345,157.923
11105,Bienvenido A La Vida - Remastered,Aleks Syntek,13,0.586,0.855,7,-6.785,1,0.0338,0.357,0.000661,0.0444,0.418,106.669
12278,no space,Alextbh,44,0.883,0.451,11,-5.214,0,0.0902,0.0707,0.0,0.517,0.772,109.995
