In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, MeanShift, Birch, DBSCAN, AffinityPropagation, AgglomerativeClustering, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.model_selection import train_test_split
from collections import Counter

data = pd.read_csv('spotify_data/processed_data.csv')

train = data.drop(['genres', 'name'], axis=1)

genres = data['genres']
song_names = data['name']
query = ['Bohemian Rhapsody', 'Baby', '13 Preludes, Op. 32: No. 9 in A Major: Allegro moderato']

song_names_train = pd.Series([], dtype=str)
while not pd.Series(query, name='name').isin(song_names_train).all():
    X_train, _, genres_train, _, song_names_train, _ = train_test_split(train, genres, song_names, test_size=0.9) 

scaler = StandardScaler()
scaler.fit(train)
train_scale = scaler.transform(X_train)
genres_train = list(genres_train)
song_names_train = list(song_names_train)

In [2]:
models = ['KMeans', 'Mean Shift', 'BIRCH', 'DBSCAN', 'Affinity Propagation', 'Agglomerative Clustering', 'Spectral Clustering', 'Gaussian Mixtures']
model_preds = []
s_scores = []
ch_scores = []
db_scores = []

# KMeans

In [3]:
model = KMeans(n_clusters=5)
y_pred = model.fit_predict(train_scale)

model_preds.append(y_pred)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# Mean Shift

In [4]:
model = MeanShift()
y_pred = model.fit_predict(train_scale)

model_preds.append(y_pred)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# BIRCH

In [5]:
model = Birch(n_clusters=6)
y_pred = model.fit_predict(train_scale)

model_preds.append(y_pred)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# DBSCAN

In [6]:
model = DBSCAN(eps=2, min_samples=100)
predictions = model.fit_predict(train_scale)
y_pred = predictions[predictions != -1]
clustered = train_scale[predictions != -1]
outliers = train_scale[predictions == -1]

model_preds.append(predictions)
s_scores.append(silhouette_score(clustered, y_pred))
ch_scores.append(calinski_harabasz_score(clustered, y_pred))
db_scores.append(davies_bouldin_score(clustered, y_pred))

# Affinity Propagation

In [7]:
model = AffinityPropagation(random_state=0)
y_pred = model.fit_predict(train_scale)

model_preds.append(y_pred)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# Agglomerative Clustering

In [8]:
model = AgglomerativeClustering(n_clusters=8)
y_pred = model.fit_predict(train_scale)

model_preds.append(y_pred)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# Spectral Clustering

In [9]:
model = SpectralClustering(assign_labels='discretize', n_clusters=5, random_state=0)
y_pred = model.fit_predict(train_scale)

model_preds.append(y_pred)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# Gaussian Mixtures

In [10]:
model = GaussianMixture(n_components=6)
y_pred = model.fit_predict(train_scale)

model_preds.append(y_pred)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# Comparison

In [11]:
for i in range(len(models)):
    print(models[i])
    print(f'\tSilhouette Score: {s_scores[i]}')
    print(f'\tCalinski Harabasz Score: {ch_scores[i]}')
    print(f'\tDavies Bouldin Score: {db_scores[i]}')

KMeans
	Silhouette Score: 0.15345898200660582
	Calinski Harabasz Score: 1968.1689939266016
	Davies Bouldin Score: 1.8711545161266645
Mean Shift
	Silhouette Score: 0.38753720227926863
	Calinski Harabasz Score: 79.70983647376192
	Davies Bouldin Score: 0.9827896211755102
BIRCH
	Silhouette Score: 0.126736217224024
	Calinski Harabasz Score: 1480.0301333281748
	Davies Bouldin Score: 2.1586460524901403
DBSCAN
	Silhouette Score: 0.12248537581060082
	Calinski Harabasz Score: 982.4936556716705
	Davies Bouldin Score: 2.3660689732080593
Affinity Propagation
	Silhouette Score: 0.07835572148166468
	Calinski Harabasz Score: 126.21890033715945
	Davies Bouldin Score: 1.7775410658962905
Agglomerative Clustering
	Silhouette Score: 0.12002625841994694
	Calinski Harabasz Score: 1294.1380050098787
	Davies Bouldin Score: 1.9867371947796364
Spectral Clustering
	Silhouette Score: 0.2427430871421291
	Calinski Harabasz Score: 455.780323194039
	Davies Bouldin Score: 0.9964163726031356
Gaussian Mixtures
	Silhouett

# Recommendations

In [12]:
recs_shown = 10

for i in range(len(models)):
    print(models[i])
    for song in query:
        song_index = song_names_train.index(song)
        values = train_scale[song_index]
        label = model_preds[i][song_index]
        dists = np.sqrt(((train_scale - values) ** 2).sum(axis=1))
        dists[model_preds[i] != label] = np.Inf
        recs = np.argsort(dists)[1:recs_shown + 1]
        print(f'\tRecommendations Based On {song}, genres = {genres_train[song_index]}')
        for rec in recs:
            print(f'\t\t{song_names_train[rec]}, genres = {genres_train[rec]}')

KMeans
	Recommendations Based On Bohemian Rhapsody, genres = ['modern', 'pop', 'rock']
		Goodbye to You, genres = ['dance', 'pop', 'rock']
		Solitude, genres = ['hard', 'metal', 'rock']
		A Quién Quiero Mentirle, genres = ['latin', 'modern', 'pop']
		Flying Without Wings, genres = ['dance', 'pop']
		Three Cheers For Five Years, genres = ['pop', 'punk', 'rock']
		El Anillo del Capitán Beto, genres = ['alternative', 'funk', 'indie', 'latin', 'pop', 'rock']
		Don't Let Me Down (feat. Daya) - Illenium Remix, genres = ['dance', 'pop']
		Free As A Bird - Anthology 1 Version, genres = ['classic', 'psychedelic', 'rock']
		Big Eyed Fish, genres = ['pop', 'rock']
		We're All In This Together (Graduation Mix) - Original Version, genres = ['dance', 'pop']
	Recommendations Based On Baby, genres = ['rock', 'soul']
		Come And Get These Memories, genres = ['classic', 'funk', 'pop', 'soul']
		Gospel Plow, genres = ['album', 'classic', 'country', 'folk', 'mellow gold', 'rock', 'roots', 'singer songwrite