In [1]:
# Import required libraries and dependencies
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:

# Load the data into a Pandas DataFrame

df_music_genre = pd.read_csv(
    "Resources/test_data.csv")

# Display sample data

df_music_genre.head(10)

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,YG,2RM4jf1Xa9zPgMGRDiht8O,"Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj",0.00582,0.743,238373,0.339,0.0,1,0.0812,-7.678,1,0.409,203.927,4,0.118,15
1,YG,1tHDG53xJNGsItRA3vfVgs,BAND DRUM (feat. A$AP Rocky),0.0244,0.846,214800,0.557,0.0,8,0.286,-7.259,1,0.457,159.009,4,0.371,0
2,R3HAB,6Wosx2euFPMT14UXiWudMy,Radio Silence,0.025,0.603,138913,0.723,0.0,9,0.0824,-5.89,0,0.0454,114.966,4,0.382,56
3,Chris Cooq,3J2Jpw61sO7l6Hc7qdYV91,Lactose,0.0294,0.8,125381,0.579,0.912,5,0.0994,-12.118,0,0.0701,123.003,4,0.641,0
4,Chris Cooq,2jbYvQCyPgX3CdmAzeVeuS,Same - Original mix,3.5e-05,0.783,124016,0.792,0.878,7,0.0332,-10.277,1,0.0661,120.047,4,0.928,0
5,Curbo,26Y1lX7ZOpw9Ql3gGAlqLK,Debauchery - Original mix,0.00115,0.81,124016,0.417,0.919,9,0.106,-10.783,0,0.0793,120.025,4,0.837,0
6,Bingo Play,5eIyK73BrxHLnly4F9PWqg,Grandma - Original mix,0.000539,0.819,132742,0.72,0.863,4,0.0727,-8.895,0,0.151,124.003,4,0.934,0
7,G Herbo,13Mf2ZBpfNkgWJowvM5hXh,Bon appétit,0.115,0.885,181838,0.348,0.0,9,0.107,-12.569,1,0.451,142.111,4,0.18,0
8,34 Feet,7BQaRTHk44DkMhIVNcXy2D,Among - Original mix,5.8e-05,0.74,124016,0.472,0.847,8,0.0959,-9.008,1,0.0551,120.034,4,0.622,0
9,Chris Cooq,049RxG2laEl9U1PGYeIqLV,Hazard - Original mix,8.1e-05,0.813,132742,0.731,0.91,11,0.0727,-8.932,1,0.0697,124.031,4,0.944,0


In [3]:
# Drop unncessary columns

df_music_genre = df_music_genre.drop(['artist_name', 'track_id', 'track_name', 'duration_ms', ], axis=1)
df_music_genre.head()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,0.00582,0.743,0.339,0.0,1,0.0812,-7.678,1,0.409,203.927,4,0.118,15
1,0.0244,0.846,0.557,0.0,8,0.286,-7.259,1,0.457,159.009,4,0.371,0
2,0.025,0.603,0.723,0.0,9,0.0824,-5.89,0,0.0454,114.966,4,0.382,56
3,0.0294,0.8,0.579,0.912,5,0.0994,-12.118,0,0.0701,123.003,4,0.641,0
4,3.5e-05,0.783,0.792,0.878,7,0.0332,-10.277,1,0.0661,120.047,4,0.928,0


In [4]:
# Convert data type and drop NA values on tempo column

df_music_genre["tempo"] = pd.to_numeric(df_music_genre["tempo"], errors="coerce")
df_music_genre.dropna(subset=["tempo"], inplace=True)

In [5]:
# check data types
df_music_genre.dtypes


acousticness        float64
danceability        float64
energy              float64
instrumentalness    float64
key                   int64
liveness            float64
loudness            float64
mode                  int64
speechiness         float64
tempo               float64
time_signature        int64
valence             float64
popularity            int64
dtype: object

In [6]:
# Scale numeric columns
df_music_genre_scaled = StandardScaler().fit_transform(df_music_genre)

# Create a DataFrame with the scaled data
df_music_genre_transformed = pd.DataFrame(df_music_genre_scaled)

#Display sample data
df_music_genre_transformed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-0.97408,0.849826,-0.884312,-0.621709,-1.174649,-0.677783,0.350838,0.803395,2.388752,2.800232,0.235251,-1.24144,-0.46715
1,-0.920325,1.391714,-0.046852,-0.621709,0.768345,0.543208,0.414862,0.803395,2.774832,1.310884,0.235251,-0.264902,-1.228065
2,-0.918589,0.113279,0.590848,-0.621709,1.045915,-0.670628,0.62405,-1.244718,-0.535807,-0.14945,0.235251,-0.222443,1.612683
3,-0.905859,1.149706,0.037663,1.909328,-0.064367,-0.569277,-0.32761,-1.244718,-0.337136,0.117033,0.235251,0.777254,-1.228065
4,-0.990817,1.060268,0.855916,1.814969,0.490774,-0.963952,-0.046299,0.803395,-0.36931,0.01902,0.235251,1.885026,-1.228065


In [7]:
inertia = []

k = list(range(1,11))

In [8]:
for i in k:
    k_model = KMeans(n_clusters=i, random_state = 99)
    k_model.fit(df_music_genre_transformed)
    inertia.append(k_model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [9]:
elbow_data = {"k": k, "inertia": inertia}

df_elbow = pd.DataFrame(elbow_data)

df_elbow.head()

Unnamed: 0,k,inertia
0,1,1698619.0
1,2,1381696.0
2,3,1273733.0
3,4,1202424.0
4,5,1140412.0


In [10]:
df_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k
)

In [11]:
# Define the model with 8 clusters
model = KMeans(n_clusters=8, random_state = 99)

# Fit the model
model.fit(df_music_genre_transformed)

# Make predictions
k_8 = model.predict(df_music_genre_transformed)

# Create a copy of the preprocessed data
df_music_genre_transformed = df_music_genre.copy()

# Add a genre column with the labels
df_music_genre_transformed['music_genre'] = k_8

  super()._check_params_vs_input(X, default_n_init=10)


In [12]:
# Plot the clusters
df_music_genre_transformed.hvplot.scatter(
    x="popularity",
    y = "valence",
    by = "music_genre"
)