In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, MeanShift, Birch, DBSCAN, AffinityPropagation, AgglomerativeClustering, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.model_selection import train_test_split
from collections import Counter

data = pd.read_csv('spotify_data/processed_data.csv')

train = data.drop(['genres', 'name', 'key', 'mode', 'explicit', 'duration_ms'], axis=1)

genres = data['genres']
song_names = data['name']
query = ['Bohemian Rhapsody', 'Baby', '13 Preludes, Op. 32: No. 9 in A Major: Allegro moderato']

song_names_train = pd.Series([], dtype=str)
while not pd.Series(query, name='name').isin(song_names_train).all():
    X_train, _, genres_train, _, song_names_train, _ = train_test_split(train, genres, song_names, test_size=0.9) 

scaler = StandardScaler()
scaler.fit(train)
train_scale = scaler.transform(X_train)
genres_train = list(genres_train)
song_names_train = list(song_names_train)

In [2]:
models = ['KMeans', 'Mean Shift', 'BIRCH', 'DBSCAN', 'Affinity Propagation', 'Agglomerative Clustering', 'Spectral Clustering', 'Gaussian Mixtures']
model_preds = []
s_scores = []
ch_scores = []
db_scores = []

# KMeans

In [3]:
model = KMeans(n_clusters=5)
y_pred = model.fit_predict(train_scale)

model_preds.append(y_pred)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# Mean Shift

In [4]:
model = MeanShift()
y_pred = model.fit_predict(train_scale)

model_preds.append(y_pred)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# BIRCH

In [5]:
model = Birch(n_clusters=6)
y_pred = model.fit_predict(train_scale)

model_preds.append(y_pred)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# DBSCAN

In [6]:
model = DBSCAN(eps=1.1, min_samples=30)
predictions = model.fit_predict(train_scale)
y_pred = predictions[predictions != -1]
clustered = train_scale[predictions != -1]
outliers = train_scale[predictions == -1]

model_preds.append(predictions)
s_scores.append(silhouette_score(clustered, y_pred))
ch_scores.append(calinski_harabasz_score(clustered, y_pred))
db_scores.append(davies_bouldin_score(clustered, y_pred))

# Affinity Propagation

In [7]:
model = AffinityPropagation(random_state=0)
y_pred = model.fit_predict(train_scale)

model_preds.append(y_pred)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# Agglomerative Clustering

In [8]:
model = AgglomerativeClustering(n_clusters=8)
y_pred = model.fit_predict(train_scale)

model_preds.append(y_pred)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# Spectral Clustering

In [9]:
model = SpectralClustering(assign_labels='discretize', n_clusters=5, random_state=0)
y_pred = model.fit_predict(train_scale)

model_preds.append(y_pred)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# Gaussian Mixtures

In [10]:
model = GaussianMixture(n_components=6)
y_pred = model.fit_predict(train_scale)

model_preds.append(y_pred)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# Comparison

In [11]:
for i in range(len(models)):
    print(models[i])
    print(f'\tSilhouette Score: {s_scores[i]}')
    print(f'\tCalinski Harabasz Score: {ch_scores[i]}')
    print(f'\tDavies Bouldin Score: {db_scores[i]}')

KMeans
	Silhouette Score: 0.1589666117619654
	Calinski Harabasz Score: 2707.7357025284164
	Davies Bouldin Score: 1.7999845058766095
Mean Shift
	Silhouette Score: 0.3353460244716669
	Calinski Harabasz Score: 218.34914860198083
	Davies Bouldin Score: 1.7993264197456846
BIRCH
	Silhouette Score: 0.12110591132386934
	Calinski Harabasz Score: 1963.3641738392346
	Davies Bouldin Score: 1.971576610391206
DBSCAN
	Silhouette Score: 0.3808851056971384
	Calinski Harabasz Score: 2017.006058216331
	Davies Bouldin Score: 0.8352845457496735
Affinity Propagation
	Silhouette Score: 0.09253461111625143
	Calinski Harabasz Score: 212.86266780713402
	Davies Bouldin Score: 1.6513249976529818
Agglomerative Clustering
	Silhouette Score: 0.08958472421176161
	Calinski Harabasz Score: 1725.9693885415609
	Davies Bouldin Score: 1.8601780431213153
Spectral Clustering
	Silhouette Score: 0.25588614056313874
	Calinski Harabasz Score: 833.8494413700105
	Davies Bouldin Score: 1.4872743811449018
Gaussian Mixtures
	Silhouet

# Recommendations

In [12]:
recs_shown = 10

for i in range(len(models)):
    print(models[i])
    for song in query:
        song_index = song_names_train.index(song)
        values = train_scale[song_index]
        label = model_preds[i][song_index]
        dists = np.sqrt(((train_scale - values) ** 2).sum(axis=1))
        dists[model_preds[i] != label] = np.Inf
        recs = np.argsort(dists)[1:recs_shown + 1]
        print(f'\tRecommendations Based On {song}, genres = {genres_train[song_index]}')
        for rec in recs:
            print(f'\t\t{song_names_train[rec]}, genres = {genres_train[rec]}')

KMeans
	Recommendations Based On Bohemian Rhapsody, genres = ['modern', 'pop', 'rock']
		Look After You, genres = ['modern', 'pop', 'rock']
		East To West, genres = ['alternative', 'rock']
		Bored, genres = ['alternative', 'grunge', 'indie', 'metal', 'modern', 'rap', 'rock']
		It Ends Tonight, genres = ['grunge', 'modern', 'pop', 'punk', 'rock']
		Born To Die, genres = ['art', 'pop']
		Until We Meet Again, genres = ['dance', 'pop']
		Princess of China, genres = ['pop', 'wave']
		Mi soledad y yo, genres = ['latin', 'pop', 'rock']
		Eventually, genres = ['psychedelic']
		I Feel Pretty / Unpretty (Glee Cast Version), genres = ['pop']
	Recommendations Based On Baby, genres = ['funk', 'rock']
		Blood Of Eden, genres = ['album', 'art', 'classic', 'dance', 'mellow gold', 'pop', 'rock', 'soft', 'wave']
		Skulls, genres = ['alternative', 'pop', 'rock']
		Namaste - Alternate Mix / No Vocals, genres = ['alternative', 'hip hop', 'rap', 'rock']
		You're My Latest, My Greatest Inspiration, genres = 