In [1]:
# libraries
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import adjusted_rand_score, silhouette_score


In [None]:
# import data
original_data = pd.read_csv('csv_outputs/cleaned_spotify.csv')
print(original_data["track_genre"].unique())

data = original_data

genre = data["track_genre"]
print(genre.unique())
genre.value_counts().describe()

['acoustic' 'afrobeat' 'alt-rock' 'alternative' 'ambient' 'anime'
 'black-metal' 'bluegrass' 'blues' 'brazil' 'breakbeat' 'british'
 'cantopop' 'chicago-house' 'children' 'chill' 'classical' 'club' 'comedy'
 'country' 'dance' 'dancehall' 'death-metal' 'deep-house' 'detroit-techno'
 'disco' 'disney' 'drum-and-bass' 'dub' 'dubstep' 'edm' 'electro'
 'electronic' 'emo' 'folk' 'forro' 'french' 'funk' 'garage' 'german'
 'gospel' 'goth' 'grindcore' 'groove' 'grunge' 'guitar' 'happy'
 'hard-rock' 'hardcore' 'hardstyle' 'heavy-metal' 'hip-hop' 'honky-tonk'
 'house' 'idm' 'indian' 'indie-pop' 'indie' 'industrial' 'iranian'
 'j-dance' 'j-idol' 'j-pop' 'j-rock' 'jazz' 'k-pop' 'kids' 'latin'
 'latino' 'malay' 'mandopop' 'metal' 'metalcore' 'minimal-techno' 'mpb'
 'new-age' 'opera' 'pagode' 'party' 'piano' 'pop-film' 'pop' 'power-pop'
 'progressive-house' 'psych-rock' 'punk-rock' 'punk' 'r-n-b' 'reggae'
 'reggaeton' 'rock-n-roll' 'rock' 'rockabilly' 'romance' 'sad' 'salsa'
 'samba' 'sertanejo' 'show

count     114.000000
mean      999.991228
std         0.093659
min       999.000000
25%      1000.000000
50%      1000.000000
75%      1000.000000
max      1000.000000
Name: count, dtype: float64

Note that the data is very evenly distributed when it comes to genre. This may imply that the sampling is not random, and was stratified via the genre subpopulations. This is likely not proportional, which may lead to bias since underrepresented genres are now equally represented with overrepresented genres. Since we have no way to recover original proportions without using external data, we will have to be weary of the results.

In [3]:
# drop text features
string_columns = ['track_id', 'artists', 'album_name', 'track_name']
categorical_columns = ['key', 'mode', 'time_signature']
response_column = 'track_genre'
one_hot_response_columns = [
    x
    for x
    in data[response_column].unique()
]

Y = data[response_column]
X = data.drop(columns=[response_column, *string_columns])
X = pd.get_dummies(X, columns=[*categorical_columns])

In [4]:
X.head()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,...,key_9,key_10,key_11,mode_0,mode_1,time_signature_0,time_signature_1,time_signature_3,time_signature_4,time_signature_5
0,73,230666,False,0.676,0.461,-6.746,0.143,0.0322,1e-06,0.358,...,False,False,False,True,False,False,False,False,True,False
1,55,149610,False,0.42,0.166,-17.235,0.0763,0.924,6e-06,0.101,...,False,False,False,False,True,False,False,False,True,False
2,57,210826,False,0.438,0.359,-9.734,0.0557,0.21,0.0,0.117,...,False,False,False,False,True,False,False,False,True,False
3,71,201933,False,0.266,0.0596,-18.515,0.0363,0.905,7.1e-05,0.132,...,False,False,False,False,True,False,False,True,False,False
4,82,198853,False,0.618,0.443,-9.681,0.0526,0.469,0.0,0.0829,...,False,False,False,False,True,False,False,False,True,False


In [5]:
Y.head()

0    acoustic
1    acoustic
2    acoustic
3    acoustic
4    acoustic
Name: track_genre, dtype: object

# Clustering

We aim to predict genre. We first start to see if there is any obvious clustering.

In [6]:
# sample the training set to speed up hierarchical clustering
sample_size = 0.3
hierarchical_sample_split = StratifiedShuffleSplit(
    n_splits=1,
    test_size=1 - sample_size,
    random_state=42
)
for train_index, _ in hierarchical_sample_split.split(X, Y):
    X_sample = X.iloc[train_index].copy()
    Y_sample = Y.iloc[train_index].copy()
    data_sample = data.iloc[train_index].copy()


In [7]:
n_clusters = 10

kmeans = KMeans(
    n_clusters=n_clusters,
    random_state=0
)
hierarchical = AgglomerativeClustering(
    n_clusters=n_clusters,
    metric='euclidean',
    linkage='ward'
)

hierarchical.fit(X_sample)
kmeans.fit(X_sample)

In [8]:
X_sample["hcluster"] = hierarchical.labels_
X_sample["kcluster"] = kmeans.labels_

In [None]:
# Analyze
hclust_kclust_ari = adjusted_rand_score(X_sample["hcluster"], X_sample["kcluster"])
hclust_y_ari = adjusted_rand_score(X_sample["hcluster"], Y_sample)
kclust_y_ari = adjusted_rand_score(X_sample["kcluster"], Y_sample)

hclust_silhouette = silhouette_score(X_sample, X_sample["hcluster"])
kclust_silhouette = silhouette_score(X_sample, X_sample["kcluster"])

print(f'''
Hierarchical clustering vs KMeans clustering ARI: {hclust_kclust_ari}
Hierarchical clustering vs true labels ARI: {hclust_y_ari}
KMeans clustering vs true labels ARI: {kclust_y_ari}

Hierarchical clustering silhouette score: {hclust_silhouette}
KMeans clustering silhouette score: {kclust_silhouette}
''')


Hierarchical clustering vs KMeans clustering ARI: 0.8463937424905956
Hierarchical clustering vs true labels ARI: 0.00488757812620067
KMeans clustering vs true labels ARI: 0.004976857519320513

Hierarchical clustering silhouette score: 0.5021809562696851
KMeans clustering silhouette score: 0.5173787557708299



In [12]:
pd.set_option('display.max_columns', None)
X_sample.groupby("hcluster").count()

Unnamed: 0_level_0,popularity,duration_ms,explicit,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,mode_0,mode_1,time_signature_0,time_signature_1,time_signature_3,time_signature_4,time_signature_5,kcluster
hcluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
0,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471,9471
1,970,970,970,970,970,970,970,970,970,970,970,970,970,970,970,970,970,970,970,970,970,970,970,970,970,970,970,970,970,970,970,970
2,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746,2746
3,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270,8270
4,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42
5,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686,9686
6,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991,2991
7,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
8,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
9,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13


# Results

The clustering did ok, with silhouette scores for both methods being ~0.5. However, the adjusted rand index when compared with track genre was negligible, meaning that the clusters do not at all correspond to genre. 