In [1]:
# Import required libraries and dependencies
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:

# Load the data into a Pandas DataFrame

df_music_genre = pd.read_csv(
    "Resources/test_data.csv")

# Display sample data

df_music_genre.head(10)

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,YG,2RM4jf1Xa9zPgMGRDiht8O,"Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj",0.00582,0.743,238373,0.339,0.0,1,0.0812,-7.678,1,0.409,203.927,4,0.118,15
1,YG,1tHDG53xJNGsItRA3vfVgs,BAND DRUM (feat. A$AP Rocky),0.0244,0.846,214800,0.557,0.0,8,0.286,-7.259,1,0.457,159.009,4,0.371,0
2,R3HAB,6Wosx2euFPMT14UXiWudMy,Radio Silence,0.025,0.603,138913,0.723,0.0,9,0.0824,-5.89,0,0.0454,114.966,4,0.382,56
3,Chris Cooq,3J2Jpw61sO7l6Hc7qdYV91,Lactose,0.0294,0.8,125381,0.579,0.912,5,0.0994,-12.118,0,0.0701,123.003,4,0.641,0
4,Chris Cooq,2jbYvQCyPgX3CdmAzeVeuS,Same - Original mix,3.5e-05,0.783,124016,0.792,0.878,7,0.0332,-10.277,1,0.0661,120.047,4,0.928,0
5,Curbo,26Y1lX7ZOpw9Ql3gGAlqLK,Debauchery - Original mix,0.00115,0.81,124016,0.417,0.919,9,0.106,-10.783,0,0.0793,120.025,4,0.837,0
6,Bingo Play,5eIyK73BrxHLnly4F9PWqg,Grandma - Original mix,0.000539,0.819,132742,0.72,0.863,4,0.0727,-8.895,0,0.151,124.003,4,0.934,0
7,G Herbo,13Mf2ZBpfNkgWJowvM5hXh,Bon appétit,0.115,0.885,181838,0.348,0.0,9,0.107,-12.569,1,0.451,142.111,4,0.18,0
8,34 Feet,7BQaRTHk44DkMhIVNcXy2D,Among - Original mix,5.8e-05,0.74,124016,0.472,0.847,8,0.0959,-9.008,1,0.0551,120.034,4,0.622,0
9,Chris Cooq,049RxG2laEl9U1PGYeIqLV,Hazard - Original mix,8.1e-05,0.813,132742,0.731,0.91,11,0.0727,-8.932,1,0.0697,124.031,4,0.944,0


In [3]:
# Drop unncessary columns

df_music_genre = df_music_genre.drop(['artist_name', 'track_id', 'track_name', 'duration_ms', ], axis=1)
df_music_genre.head()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,0.00582,0.743,0.339,0.0,1,0.0812,-7.678,1,0.409,203.927,4,0.118,15
1,0.0244,0.846,0.557,0.0,8,0.286,-7.259,1,0.457,159.009,4,0.371,0
2,0.025,0.603,0.723,0.0,9,0.0824,-5.89,0,0.0454,114.966,4,0.382,56
3,0.0294,0.8,0.579,0.912,5,0.0994,-12.118,0,0.0701,123.003,4,0.641,0
4,3.5e-05,0.783,0.792,0.878,7,0.0332,-10.277,1,0.0661,120.047,4,0.928,0


In [4]:
# Convert data type and drop NA values on tempo column

df_music_genre["tempo"] = pd.to_numeric(df_music_genre["tempo"], errors="coerce")
df_music_genre.dropna(subset=["tempo"], inplace=True)

In [5]:
# check data types
df_music_genre.dtypes


acousticness        float64
danceability        float64
energy              float64
instrumentalness    float64
key                   int64
liveness            float64
loudness            float64
mode                  int64
speechiness         float64
tempo               float64
time_signature        int64
valence             float64
popularity            int64
dtype: object

In [6]:
## Use PCA to reduce number of variables

In [7]:
# PCA
from sklearn.decomposition import PCA

In [8]:
pca = PCA(n_components = 2)

In [9]:
genre_pca = pca.fit_transform(df_music_genre)

genre_pca[0:5]

array([[ 83.95435345, -13.14370576],
       [ 38.49472907, -25.8142529 ],
       [ -2.84622507,  32.20928459],
       [  2.31836264, -24.44941577],
       [ -0.53240818, -24.14729703]])

In [10]:
#PCA explained vairance ratio (retained )
pca.explained_variance_ratio_

array([0.67374313, 0.28796162])

In [11]:
#Create PCA DataFrame
genre_pca_df = pd.DataFrame(
    genre_pca,
    columns=["PCA1", "PCA2"]
)

genre_pca_df.head()

Unnamed: 0,PCA1,PCA2
0,83.954353,-13.143706
1,38.494729,-25.814253
2,-2.846225,32.209285
3,2.318363,-24.449416
4,-0.532408,-24.147297


In [16]:
# Incorportate PCA DataFrame into elbow method
inertia = []

k = list(range(1,11))

for i in k:
    k_model = KMeans(n_clusters=i, random_state = 99)
    k_model.fit(genre_pca_df)
    inertia.append(k_model.inertia_)

elbow_data = {"k": k, "inertia": inertia}

df_elbow = pd.DataFrame(elbow_data)

df_elbow.head()

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Unnamed: 0,k,inertia
0,1,170265500.0
1,2,93111890.0
2,3,70746380.0
3,4,49962540.0
4,5,39391410.0


In [17]:
df_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k
)

In [18]:
# Define the model with 5 clusters
model = KMeans(n_clusters=5, random_state = 99)

# Fit the model
model.fit(genre_pca_df)

# Make predictions
k_5 = model.predict(genre_pca_df)

# Create a copy of the preprocessed data
df_music_genre_predictions = genre_pca_df.copy()

# Add a genre column with the labels
df_music_genre_predictions['music_genre'] = k_5

  super()._check_params_vs_input(X, default_n_init=10)


In [19]:
print(df_music_genre_predictions)

             PCA1       PCA2  music_genre
0       83.954353 -13.143706            2
1       38.494729 -25.814253            2
2       -2.846225  32.209285            4
3        2.318363 -24.449416            1
4       -0.532408 -24.147297            1
...           ...        ...          ...
130658  13.282325  32.471465            4
130659 -23.929170  36.603690            0
130660  38.528865  20.969547            4
130661   1.753003  25.666355            4
130662  12.365979  30.681446            4

[130663 rows x 3 columns]


In [20]:
df_music_genre_predictions.hvplot.scatter(
    x = "PCA1",
    y = "PCA2",
    by = "music_genre"
)

In [21]:
#append music_genre to orginal dataset to illustrate result



In [22]:
# Define the model with 5 clusters
model = KMeans(n_clusters=5, random_state = 99)

# Fit the model
model.fit(df_music_genre)

# Make predictions
k_5 = model.predict(df_music_genre)

# Create a copy of the preprocessed data
df_music_genre_predictions = df_music_genre.copy()

# Add a genre column with the labels
df_music_genre_predictions['music_genre'] = k_5

  super()._check_params_vs_input(X, default_n_init=10)


In [23]:
df_music_genre_predictions.hvplot.scatter(
    x = "popularity",
    y = "instrumentalness",
    by = "music_genre"
)