In [2]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from joblib import dump, load
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the raw track table and key it by the "id" column.
filePath = "data/data.csv"
spotify_df = pd.read_csv(filePath).set_index("id")

# Preview the first five rows.
spotify_df.head()

Unnamed: 0_level_0,acousticness,artists,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0cS0A1fUEUd1EW3FcF8AEI,0.991,['Mamie Smith'],0.598,168333,0.224,0,0.000522,5,0.379,-12.628,0,Keep A Song In Your Soul,12,1920,0.0936,149.976,0.634,1920
0hbkKFIJm7Z05H8Zl9w30f,0.643,"[""Screamin' Jay Hawkins""]",0.852,150200,0.517,0,0.0264,5,0.0809,-7.261,0,I Put A Spell On You,7,1920-01-05,0.0534,86.889,0.95,1920
11m7laMUgmOKqI3oYzuhne,0.993,['Mamie Smith'],0.647,163827,0.186,0,1.8e-05,0,0.519,-12.098,1,Golfing Papa,4,1920,0.174,97.6,0.689,1920
19Lc5SfJJ5O1oaxY0fpwfh,0.000173,['Oscar Velazquez'],0.73,422087,0.798,0,0.801,2,0.128,-7.311,1,True House Music - Xavier Santos & Carlos Gomi...,17,1920-01-01,0.0425,127.997,0.0422,1920
2hJjbsLCytGsnAHfdsLejp,0.295,['Mixe'],0.704,165224,0.707,1,0.000246,10,0.402,-6.036,0,Xuniverxe,2,1920-10-01,0.0768,122.076,0.299,1920


In [4]:
# Numeric audio-feature columns that feed the scaler and the clustering model.
spotify_df_features = spotify_df.loc[
    :,
    [
        "acousticness",
        "danceability",
        "duration_ms",
        "energy",
        "instrumentalness",
        "key",
        "liveness",
        "loudness",
        "speechiness",
        "tempo",
        "valence",
    ],
]

In [5]:
# Standardise every feature column. Fitting and transforming happen in one
# call; `spotify_scaler` is the same fitted object as `scaler` and is the one
# exported later in the notebook.
scaler = StandardScaler()
spotify_df_scaled = scaler.fit_transform(spotify_df_features)
spotify_scaler = scaler

In [6]:
# Toggle for the model-selection cells below (cluster-count sweep, SSE and
# silhouette plots, knee detection). Those cells are skipped while this is False.
run_analysis = False 

In [7]:
# Hyper-parameters shared by every KMeans instance created in this notebook.
kmeans_kwargs = dict(
    init="k-means++",
    n_init=10,
    max_iter=300,
    random_state=42,
)

if run_analysis:
    # Candidate cluster counts: n_cluster_start up to, but not including,
    # n_cluster_end.
    n_cluster_start, n_cluster_end = 2, 25

    # One entry per candidate count, in sweep order.
    sse, silhouette_coefficients = [], []
    for k in range(n_cluster_start, n_cluster_end):
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(spotify_df_scaled)
        sse.append(kmeans.inertia_)
        silhouette_coefficients.append(
            silhouette_score(spotify_df_scaled, kmeans.labels_)
        )

In [8]:
if run_analysis:
    # Elbow plot: KMeans inertia (SSE) for each candidate cluster count.
    cluster_counts = range(n_cluster_start, n_cluster_end)
    fig, ax = plt.subplots()
    ax.plot(cluster_counts, sse)
    ax.set_xticks(cluster_counts)
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel("SSE")
    plt.show()

In [9]:
if run_analysis:
    # Locate the elbow of the (convex, decreasing) SSE curve over the swept
    # cluster counts.
    kl = KneeLocator(range(n_cluster_start, n_cluster_end), sse, curve="convex", direction="decreasing")
    # A bare expression nested inside an `if` block is not echoed by Jupyter
    # (only a trailing top-level expression is), so report the value explicitly.
    print(f"Elbow at k = {kl.elbow}")

In [10]:
if run_analysis:
    # Silhouette coefficient for each candidate cluster count.
    cluster_counts = range(n_cluster_start, n_cluster_end)
    fig, ax = plt.subplots()
    ax.plot(cluster_counts, silhouette_coefficients)
    ax.set_xticks(cluster_counts)
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel("Silhouette Coefficient")
    plt.show()

In [11]:
# Final model: the cluster count (8) is hard-coded here; per the original note
# it was taken from the elbow analysis.
kmeans = KMeans(n_clusters=8, **kmeans_kwargs).fit(spotify_df_scaled)
kmeans

KMeans(random_state=42)

In [14]:
# Inspect the fitted centroids. The model was fitted on the standardised
# feature matrix, so these coordinates are in scaled units, not raw ones.
kmeans.cluster_centers_

array([[-2.32853061e-01, -2.42400786e-01,  1.52261197e-01,
         5.35387905e-01, -1.45934194e-01, -1.47997300e-02,
         2.82257502e+00,  3.26535004e-01, -1.61513531e-02,
         9.10415137e-02,  3.15840858e-02],
       [ 1.05136118e+00, -9.47743376e-01,  1.95347833e-01,
        -1.12005365e+00,  1.87158471e+00, -6.73146026e-02,
        -2.55240481e-01, -1.38180066e+00, -2.92207516e-01,
        -5.04995042e-01, -9.30132635e-01],
       [-1.03179305e+00, -3.78117009e-01,  1.43696795e-01,
         1.00310703e+00,  5.21922691e-02,  2.15839026e-03,
        -1.29872725e-01,  7.45893965e-01, -1.73898560e-01,
         7.02488065e-01, -3.79724620e-01],
       [ 6.67359034e-02,  8.12011474e-01, -5.11619301e-01,
        -8.79737446e-01, -5.61223919e-01, -4.38370055e-03,
         3.94658673e-01, -1.35896170e+00,  4.38950247e+00,
        -3.20705052e-01,  6.61749873e-02],
       [ 7.78248852e-01,  4.00795177e-01, -3.20940500e-01,
        -3.95192778e-01, -7.22576133e-02, -2.82858642e-02,
  

In [17]:
# Assign each row to a cluster, using the same scaled matrix the model was
# fitted on.
labels = kmeans.predict(spotify_df_scaled)

In [18]:
# Attach the cluster assignment as a new "label" column and display the frame.
spotify_df = spotify_df.assign(label=labels)
spotify_df

Unnamed: 0_level_0,acousticness,artists,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0cS0A1fUEUd1EW3FcF8AEI,0.991000,['Mamie Smith'],0.598,168333,0.224,0,0.000522,5,0.3790,-12.628,0,Keep A Song In Your Soul,12,1920,0.0936,149.976,0.6340,1920,4
0hbkKFIJm7Z05H8Zl9w30f,0.643000,"[""Screamin' Jay Hawkins""]",0.852,150200,0.517,0,0.026400,5,0.0809,-7.261,0,I Put A Spell On You,7,1920-01-05,0.0534,86.889,0.9500,1920,7
11m7laMUgmOKqI3oYzuhne,0.993000,['Mamie Smith'],0.647,163827,0.186,0,0.000018,0,0.5190,-12.098,1,Golfing Papa,4,1920,0.1740,97.600,0.6890,1920,4
19Lc5SfJJ5O1oaxY0fpwfh,0.000173,['Oscar Velazquez'],0.730,422087,0.798,0,0.801000,2,0.1280,-7.311,1,True House Music - Xavier Santos & Carlos Gomi...,17,1920-01-01,0.0425,127.997,0.0422,1920,2
2hJjbsLCytGsnAHfdsLejp,0.295000,['Mixe'],0.704,165224,0.707,1,0.000246,10,0.4020,-6.036,0,Xuniverxe,2,1920-10-01,0.0768,122.076,0.2990,1920,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46LhBf6TvYjZU2SMvGZAbn,0.009170,"['DJ Combo', 'Sander-7', 'Tony T']",0.792,147615,0.866,0,0.000060,6,0.1780,-5.089,0,The One,0,2020-12-25,0.0356,125.972,0.1860,2020,2
7tue2Wemjd0FZzRtDrQFZd,0.795000,['Alessia Cara'],0.429,144720,0.211,0,0.000000,4,0.1960,-11.665,1,A Little More,0,2021-01-22,0.0360,94.710,0.2280,2021,5
48Qj61hOdYmUCFJbpQ29Ob,0.806000,['Roger Fly'],0.671,218147,0.589,0,0.920000,4,0.1130,-12.393,0,Together,0,2020-12-09,0.0282,108.058,0.7140,2020,4
1gcyHQpBQ1lfXGdhZmWrHP,0.920000,['Taylor Swift'],0.462,244000,0.240,1,0.000000,0,0.1130,-12.077,1,champagne problems,69,2021-01-07,0.0377,171.319,0.3200,2021,5


In [20]:
# Write the labelled frame (index included) next to the input data.
spotify_df.to_csv(path_or_buf="data/spotify_data_labeled.csv")

In [13]:
# Persist the fitted scaler and clustering model so they can be reloaded
# without re-running the notebook. `Path`, `dump` and `load` come from the
# import cell at the top of the notebook.

# joblib's dump() does not create missing directories; make sure the target
# folder exists so a fresh checkout does not fail with FileNotFoundError.
Path("model").mkdir(parents=True, exist_ok=True)

# export scaler
dump(spotify_scaler, "model/scaler.joblib")
# export model
dump(kmeans, "model/kmeans.joblib")

['model/kmeans.joblib']