In [119]:
import pandas as pd
import numpy as np 
import os

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import NearestNeighbors

In [120]:
# Read the Spotify track table from the CSV located in the current working
# directory, then preview its first rows.
csv_location = os.getcwd() + '/spotify_songs.csv'
dataset = pd.read_csv(csv_location)
dataset.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,63rPSO264uRjW1X5E6cWv6,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,1HoSmj2eLcsrR0vE9gThr4,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,1nqYsOef1yKKuGOVchbsk6,Call You Mine - The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,7m7vv9wlQ4i0LFuJiE2zsQ,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052


In [121]:
# List every column label of the loaded table.
column_index = dataset.columns
print(column_index)

Index(['track_id', 'track_name', 'track_artist', 'track_popularity',
       'track_album_id', 'track_album_name', 'track_album_release_date',
       'playlist_name', 'playlist_id', 'playlist_genre', 'playlist_subgenre',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms'],
      dtype='object')


In [122]:
# Text-valued columns that are candidates for categorical encoding.
categorical_features = [
    'track_artist',
    'track_album_name',
    'playlist_name',
    'playlist_genre',
    'playlist_subgenre',
]

# Columns that are standardised as numbers before clustering.
numeric_features = [
    'track_popularity',
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]

# Reduced set of categorical columns used for one-hot encoding while testing;
# it leaves out the columns with a very large number of distinct values.
test_categorical = [
    'playlist_genre',
    'playlist_subgenre',
]

In [138]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder implementation for the high-dimensional categorical variables.
# It fully captures the dataset, but results in very high inertia and will
# require many more KMeans clusters.
label_encoded_columns = ['track_artist', 'track_name', 'track_album_name']

# Maps each column name to its array of integer codes; a fresh encoder is
# fitted for every column.
encoded_labels = {
    name: LabelEncoder().fit_transform(dataset[name])
    for name in label_encoded_columns
}

In [155]:
# Category Encoders can create a link between columns in the dataset to reduce
# the dimensionality of the artists.
# Not certain which column to link together, so the sketch below stays disabled.
# NOTE(review): the sketch reads dataset['target_artist'], which is not among
# the columns printed for this dataset (the artist column is 'track_artist').

# import category_encoders as ce

# category_encoder = ce.TargetEncoder(cols=['track_artist'])
# artist_encoded = category_encoder.fit_transform(dataset['target_artist'])


# PCA implementation for track_artist.
# Efficient, but struggles to capture the whole dataset of artists.
# Also much harder to explain, as it reduces from 10k+ artists.
from sklearn.decomposition import PCA

# Dense one-hot matrix with one column per distinct artist.
encoder = OneHotEncoder(sparse_output=False)
artist_encoded = encoder.fit_transform(dataset[['track_artist']])

# random_state is pinned so the projection is reproducible: for an input this
# wide PCA may pick a randomized SVD solver, and without a seed the component
# values drift slightly from one run to the next.
pca = PCA(n_components=100, random_state=42)
artist_reduced = pca.fit_transform(artist_encoded)

# Want a high cumulative explained variance. If it is low, there are not enough
# components to capture the dataset.
print(f"Cumulative explained variance: {pca.explained_variance_ratio_.sum()}")

# One column per retained component, named PCA_1 .. PCA_<n_components>.
artist_reduced_df = pd.DataFrame(
    artist_reduced,
    columns=[f'PCA_{i+1}' for i in range(artist_reduced.shape[1])],
)
artist_reduced_df.head()

Cumulative explained variance: 0.15821540671594533


Unnamed: 0,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6,PCA_7,PCA_8,PCA_9,PCA_10,...,PCA_91,PCA_92,PCA_93,PCA_94,PCA_95,PCA_96,PCA_97,PCA_98,PCA_99,PCA_100
0,-0.007359,-0.00709,-0.007279,-0.007324,-0.006998,-0.008953,-0.008362,-0.010411,-0.009665,-0.013765,...,0.000357,0.001068,0.001331,-0.000534,-0.00011,0.000584,-0.000343,0.000374,0.000824,-2.6e-05
1,-0.007523,-0.00731,-0.007561,-0.007702,-0.007453,-0.009576,-0.009129,-0.011464,-0.011168,-0.016096,...,0.000351,0.001016,0.001256,-0.000495,-0.00011,0.000567,-0.00033,0.000357,0.000785,-3.1e-05
2,-0.005078,-0.004393,-0.004111,-0.003656,-0.003105,-0.003829,-0.003032,-0.003601,-0.002521,-0.003594,...,-0.077328,0.026546,0.060656,0.002681,-0.069282,-0.037028,0.020013,-0.044548,-0.061735,0.116505
3,-0.018005,-0.037657,0.995126,0.022412,0.010801,0.011815,0.006589,0.007032,0.003668,0.004728,...,0.000155,0.000445,0.000551,-0.000222,-4.9e-05,0.000248,-0.00015,0.000168,0.000348,-2.1e-05
4,-0.004823,-0.004118,-0.003841,-0.003362,-0.002836,-0.003495,-0.002771,-0.003254,-0.002298,-0.003057,...,-0.006554,-0.007592,-0.005682,0.000393,-0.00321,-0.001607,0.006236,-0.008258,-0.044407,-0.012254


In [143]:
# One-hot encode test_categorical only: a set of variables that doesn't have
# the 10k+ entry problem, used mainly for testing.
encoder = OneHotEncoder()
onehot_matrix = encoder.fit_transform(dataset[test_categorical])
encoded_categorical = onehot_matrix.toarray()  # convert to a dense array
onehot_column_names = encoder.get_feature_names_out(test_categorical)
encoded_categorical_df = pd.DataFrame(encoded_categorical, columns=onehot_column_names)

encoded_categorical_df.head()

Unnamed: 0,playlist_genre_edm,playlist_genre_latin,playlist_genre_pop,playlist_genre_r&b,playlist_genre_rap,playlist_genre_rock,playlist_subgenre_album rock,playlist_subgenre_big room,playlist_subgenre_classic rock,playlist_subgenre_dance pop,...,playlist_subgenre_new jack swing,playlist_subgenre_permanent wave,playlist_subgenre_pop edm,playlist_subgenre_post-teen pop,playlist_subgenre_progressive electro house,playlist_subgenre_reggaeton,playlist_subgenre_southern hip hop,playlist_subgenre_trap,playlist_subgenre_tropical,playlist_subgenre_urban contemporary
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [144]:
# Standardise the numeric columns and wrap the result in a DataFrame that keeps
# the original column names.
numeric_values = dataset[numeric_features]
scaler = StandardScaler()
scaled_numerical = scaler.fit_transform(numeric_values)
scaled_numerical_df = pd.DataFrame(data=scaled_numerical, columns=numeric_features)

scaled_numerical_df.head()

Unnamed: 0,track_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.941531,0.642049,1.201614,0.1732,1.367123,0.876177,-0.481362,-0.333898,-0.377953,-0.80923,0.031908,0.042927
1,0.981557,0.490412,0.643317,1.557627,0.585766,0.876177,-0.688642,-0.46867,-0.359177,1.081061,0.782522,-0.777198
2,1.101635,0.138889,1.284529,-1.211227,1.10009,-1.141322,-0.324422,-0.436799,-0.377849,-0.519562,0.439384,0.116227
3,0.701374,0.435271,1.279002,0.450085,0.984309,0.876177,-0.050024,-0.667642,-0.377911,0.089582,-1.001795,0.039953
4,1.061609,-0.033426,0.742815,-1.211227,0.685151,0.876177,-0.70246,-0.432701,-0.377953,-0.692585,0.919777,0.115037


In [145]:
# Place the artist PCA components, the scaled numeric columns and the one-hot
# genre columns side by side to form the clustering input.
feature_blocks = [artist_reduced_df, scaled_numerical_df, encoded_categorical_df]
processed_features_df = pd.concat(feature_blocks, axis="columns")
processed_features_df.head()

Unnamed: 0,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6,PCA_7,PCA_8,PCA_9,PCA_10,...,playlist_subgenre_new jack swing,playlist_subgenre_permanent wave,playlist_subgenre_pop edm,playlist_subgenre_post-teen pop,playlist_subgenre_progressive electro house,playlist_subgenre_reggaeton,playlist_subgenre_southern hip hop,playlist_subgenre_trap,playlist_subgenre_tropical,playlist_subgenre_urban contemporary
0,-0.007358,-0.007089,-0.007272,-0.007317,-0.00698,-0.008917,-0.008452,-0.010365,-0.009694,-0.013551,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.007523,-0.007312,-0.007554,-0.007719,-0.007418,-0.009581,-0.00912,-0.011466,-0.011251,-0.016087,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.00508,-0.004395,-0.004117,-0.003692,-0.003087,-0.003822,-0.002904,-0.003667,-0.002466,-0.003717,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.018005,-0.037657,0.995126,0.022412,0.010803,0.011817,0.006588,0.007035,0.003668,0.004732,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.004823,-0.004119,-0.003843,-0.003355,-0.00284,-0.003489,-0.002793,-0.003244,-0.002244,-0.003097,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [154]:
# from sklearn.cluster import MiniBatchKMeans
# from sklearn.pipeline import make_pipeline
# from sklearn.decomposition import TruncatedSVD
# from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Cluster the combined feature matrix into 30 groups.
# n_init=10 requests ten centroid initialisations; random_state pins those
# random initialisations so the labels and the inertia are the same on every
# run instead of changing each time the cell is executed.
kmeans = KMeans(n_clusters=30, n_init=10, random_state=42)

kmeans.fit(processed_features_df)
labels = kmeans.labels_  # cluster index assigned to each row
print(f"Cluster Labels: {labels}")
print(kmeans.inertia_)  # inertia of the fitted model

Cluster Labels: [ 4  4 25 ...  3  7 21]
235868.00527484887
