In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, MeanShift, Birch, DBSCAN, AffinityPropagation, AgglomerativeClustering, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.model_selection import train_test_split
from collections import Counter

# Load the pre-processed Spotify track table.
data = pd.read_csv('spotify_data/processed_data.csv')

# Feature matrix used for clustering: the table minus the columns listed here.
train = data.drop(['genres', 'name', 'key', 'mode', 'explicit', 'duration_ms', 'acousticness'], axis=1)

genres = data['genres']
song_names = data['name']
# Titles used as seeds by the recommendation cell at the end of the notebook.
query = ['Bohemian Rhapsody', 'Baby', '13 Preludes, Op. 32: No. 9 in A Major: Allegro moderato']

# Fail fast when a query title is absent from the data; otherwise the
# rejection-sampling loop below could never terminate.
known_titles = set(song_names)
missing_queries = [title for title in query if title not in known_titles]
if missing_queries:
    raise ValueError(f'Query songs not found in dataset: {missing_queries}')

# Seeded generator so Restart & Run All draws the same subsample. A
# RandomState *instance* (not an int) is passed so that every retry advances
# the generator and produces a different split.
sample_rng = np.random.RandomState(0)

# Draw 10% subsamples until one of them contains every query title.
song_names_train = pd.Series([], dtype=str)
while not pd.Series(query, name='name').isin(song_names_train).all():
    X_train, _, genres_train, _, song_names_train, _ = train_test_split(
        train, genres, song_names, test_size=0.9, random_state=sample_rng)

# The scaler's statistics come from the full feature table; only the
# subsample is transformed and clustered.
scaler = StandardScaler()
scaler.fit(train)
train_scale = scaler.transform(X_train)
# Plain lists so later cells can use list.index() and positional lookups that
# line up with the rows of train_scale.
genres_train = list(genres_train)
song_names_train = list(song_names_train)

In [2]:
# Display names of the clustering algorithms, in the order their results are
# appended by the cells that follow.
models = [
    'KMeans',
    'Mean Shift',
    'BIRCH',
    'DBSCAN',
    'Affinity Propagation',
    'Agglomerative Clustering',
    'Spectral Clustering',
    'Gaussian Mixtures',
]

# Positional result stores (entry k belongs to models[k]): predicted labels,
# silhouette, Calinski-Harabasz and Davies-Bouldin scores.
model_preds, s_scores, ch_scores, db_scores = [], [], [], []

# KMeans

In [3]:
# Partition the scaled sample into 5 clusters.
# random_state is pinned (matching the Affinity Propagation and Spectral
# Clustering cells) so the centroid initialisation is reproducible.
# n_init=10 is spelled out because the library default for this parameter
# differs between scikit-learn releases.
model = KMeans(n_clusters=5, n_init=10, random_state=0)
y_pred = model.fit_predict(train_scale)

# Results are stored positionally alongside `models`, so this cell is meant to
# run once, in notebook order.
model_preds.append(y_pred)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# Mean Shift

In [4]:
# Mean Shift with library-default settings on the scaled sample.
model = MeanShift()
y_pred = model.fit_predict(train_scale)

# Record labels first, then each validity score in its own list
# (silhouette, Calinski-Harabasz, Davies-Bouldin - same order as before).
model_preds.append(y_pred)
for score_list, metric in (
    (s_scores, silhouette_score),
    (ch_scores, calinski_harabasz_score),
    (db_scores, davies_bouldin_score),
):
    score_list.append(metric(train_scale, y_pred))

# BIRCH

In [5]:
# BIRCH reduced to 6 final clusters.
model = Birch(n_clusters=6)
y_pred = model.fit_predict(train_scale)
model_preds.append(y_pred)

# Pair every score list with its metric and evaluate them in the original
# order: silhouette, Calinski-Harabasz, Davies-Bouldin.
for score_list, metric in zip(
    (s_scores, ch_scores, db_scores),
    (silhouette_score, calinski_harabasz_score, davies_bouldin_score),
):
    score_list.append(metric(train_scale, y_pred))

# DBSCAN

In [6]:
# Density-based clustering; samples DBSCAN cannot assign get the label -1.
model = DBSCAN(eps=1.0, min_samples=30)
predictions = model.fit_predict(train_scale)

# Split the sample once into unassigned points (label -1) and clustered points.
is_outlier = predictions == -1
outliers = train_scale[is_outlier]
clustered = train_scale[~is_outlier]
y_pred = predictions[~is_outlier]

# The full label vector (including -1) is stored for later lookups, while the
# validity scores are computed on the clustered points only.
model_preds.append(predictions)
for score_list, metric in (
    (s_scores, silhouette_score),
    (ch_scores, calinski_harabasz_score),
    (db_scores, davies_bouldin_score),
):
    score_list.append(metric(clustered, y_pred))

# Affinity Propagation

In [7]:
# Affinity Propagation with a fixed seed; the number of clusters is chosen by
# the algorithm rather than passed in.
model = AffinityPropagation(random_state=0)
y_pred = model.fit_predict(train_scale)
model_preds.append(y_pred)

# Same three internal validity metrics as the other algorithms.
silhouette = silhouette_score(train_scale, y_pred)
s_scores.append(silhouette)
calinski_harabasz = calinski_harabasz_score(train_scale, y_pred)
ch_scores.append(calinski_harabasz)
davies_bouldin = davies_bouldin_score(train_scale, y_pred)
db_scores.append(davies_bouldin)

# Agglomerative Clustering

In [8]:
# Hierarchical (agglomerative) clustering cut at 8 clusters.
model = AgglomerativeClustering(n_clusters=8)
y_pred = model.fit_predict(train_scale)

# Store the labels, then one score per metric in the shared result lists.
model_preds.append(y_pred)
for score_list, metric in zip(
    (s_scores, ch_scores, db_scores),
    (silhouette_score, calinski_harabasz_score, davies_bouldin_score),
):
    score_list.append(metric(train_scale, y_pred))

# Spectral Clustering

In [9]:
# Spectral clustering into 5 groups, seeded, with the 'discretize'
# label-assignment strategy.
model = SpectralClustering(
    n_clusters=5,
    assign_labels='discretize',
    random_state=0,
)
y_pred = model.fit_predict(train_scale)

model_preds.append(y_pred)
# Evaluate silhouette, Calinski-Harabasz and Davies-Bouldin in that order.
for score_list, metric in (
    (s_scores, silhouette_score),
    (ch_scores, calinski_harabasz_score),
    (db_scores, davies_bouldin_score),
):
    score_list.append(metric(train_scale, y_pred))

# Gaussian Mixtures

In [10]:
# Fit a 6-component Gaussian mixture and assign every song to a component.
# random_state is pinned (matching the Affinity Propagation and Spectral
# Clustering cells) so the EM initialisation, and therefore the labels and
# scores below, are reproducible across runs.
model = GaussianMixture(n_components=6, random_state=0)
y_pred = model.fit_predict(train_scale)

# Results are stored positionally alongside `models`, so this cell is meant to
# run once, in notebook order.
model_preds.append(y_pred)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# Comparison

In [11]:
# Print the three validity scores for each algorithm; the score lists are
# indexed by the algorithm's position in `models`.
for i, model_name in enumerate(models):
    print(model_name)
    silhouette = s_scores[i]
    print(f'\tSilhouette Score: {silhouette}')
    calinski_harabasz = ch_scores[i]
    print(f'\tCalinski Harabasz Score: {calinski_harabasz}')
    davies_bouldin = db_scores[i]
    print(f'\tDavies Bouldin Score: {davies_bouldin}')

KMeans
	Silhouette Score: 0.19246702255641956
	Calinski Harabasz Score: 2545.8789432427466
	Davies Bouldin Score: 1.568223793073898
Mean Shift
	Silhouette Score: 0.4953017433245083
	Calinski Harabasz Score: 536.1416740187333
	Davies Bouldin Score: 0.9308533997840107
BIRCH
	Silhouette Score: 0.11697865531141073
	Calinski Harabasz Score: 1896.3469840311488
	Davies Bouldin Score: 1.7532199301266258
DBSCAN
	Silhouette Score: 0.4278191021011381
	Calinski Harabasz Score: 2002.0215615208458
	Davies Bouldin Score: 0.7356272827882732
Affinity Propagation
	Silhouette Score: 0.09641763878136464
	Calinski Harabasz Score: 224.9612246007948
	Davies Bouldin Score: 1.6134302313500088
Agglomerative Clustering
	Silhouette Score: 0.11514069817209811
	Calinski Harabasz Score: 1700.6387540005126
	Davies Bouldin Score: 1.609020988095418
Spectral Clustering
	Silhouette Score: 0.25250526938682266
	Calinski Harabasz Score: 839.8967051575856
	Davies Bouldin Score: 1.2660111721802199
Gaussian Mixtures
	Silhouett

# Recommendations

In [12]:
# Maximum number of recommendations printed per query song and model.
recs_shown = 10

for i in range(len(models)):
    print(models[i])
    # Cluster labels produced by this model, aligned row-for-row with
    # train_scale. For DBSCAN the stored labels include -1, so a query
    # labelled -1 is matched against the other points labelled -1.
    labels = np.asarray(model_preds[i])
    for song in query:
        # list.index returns the first sampled row carrying this title.
        song_index = song_names_train.index(song)
        values = train_scale[song_index]
        label = labels[song_index]
        # Euclidean distance from the query song to every sampled song.
        dists = np.sqrt(((train_scale - values) ** 2).sum(axis=1))
        # Only songs sharing the query's label are candidates.
        # (np.inf, not np.Inf: the capitalised alias was removed in NumPy 2.0.)
        dists[labels != label] = np.inf
        # Exclude the query itself explicitly instead of assuming it sorts
        # first; a duplicate row at distance 0 could otherwise take its place
        # and let the query show up among its own recommendations.
        dists[song_index] = np.inf
        ranked = np.argsort(dists)
        # Keep real candidates only, so a small cluster yields fewer
        # recommendations rather than padding with songs from other clusters.
        recs = ranked[np.isfinite(dists[ranked])][:recs_shown]
        print(f'\tRecommendations Based On {song}, genres = {genres_train[song_index]}')
        for rec in recs:
            print(f'\t\t{song_names_train[rec]}, genres = {genres_train[rec]}')

KMeans
	Recommendations Based On Bohemian Rhapsody, genres = ['modern', 'pop', 'rock']
		Donde Estés, Con Quien Estés, genres = ['latin', 'pop']
		The Joke, genres = ['folk', 'indie', 'modern', 'pop', 'rock']
		This Ain't A Love Song, genres = ['metal', 'rock']
		Get It Right (Glee Cast Version), genres = ['pop']
		Beautiful, genres = ['dance', 'pop', 'rap']
		Falling in Love at a Coffee Shop, genres = ['pop']
		Drowning, genres = ['contemporary', 'country', 'modern', 'rock']
		Three Cheers For Five Years, genres = ['pop', 'punk', 'rock']
		I Ain't Going Nowhere Baby, genres = ['contemporary', 'country']
		Como Han Pasado los Años (with Julio Iglesias), genres = ['latin', 'pop']
	Recommendations Based On Baby, genres = ['alternative']
		Good Enough, genres = ['alternative', 'grunge', 'metal']
		Hand-Made, genres = ['indie', 'modern', 'rock']
		Kailangan Ko'y Ikaw, genres = ['classic', 'dance', 'pop', 'rock']
		Cold, genres = ['mellow gold', 'pop', 'rock', 'soft', 'wave']
		We Will Danc