In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN, AffinityPropagation, AgglomerativeClustering, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.model_selection import train_test_split
from collections import Counter

data = pd.read_csv('spotify_data/processed_data.csv')

train = data.drop(['genres', 'name'], axis=1)

genres = data['genres']
song_names = data['name']
query = ['Bohemian Rhapsody', 'Baby']

song_names_train = pd.Series([], dtype=str)
while not pd.Series(query, name='name').isin(song_names_train).all():
    X_train, _, genres_train, _, song_names_train, _ = train_test_split(train, genres, song_names, test_size=0.9) 

scaler = StandardScaler()
scaler.fit(train)
train_scale = scaler.transform(X_train)
genres_train = list(genres_train)
song_names_train = list(song_names_train)

In [2]:
models = ['DBSCAN', 'Affinity Propagation', 'Agglomerative Clustering', 'Spectral Clustering', 'Gaussian Mixtures']
model_preds = []
s_scores = []
ch_scores = []
db_scores = []

# DBSCAN

In [3]:
model = DBSCAN(eps=2, min_samples=100)
predictions = model.fit_predict(train_scale)
y_pred = predictions[predictions != -1]
clustered = train_scale[predictions != -1]
outliers = train_scale[predictions == -1]

model_preds.append(predictions)
s_scores.append(silhouette_score(clustered, y_pred))
ch_scores.append(calinski_harabasz_score(clustered, y_pred))
db_scores.append(davies_bouldin_score(clustered, y_pred))

# Affinity Propagation

In [4]:
model = AffinityPropagation(random_state=0)
y_pred = model.fit_predict(train_scale)

model_preds.append(y_pred)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# Agglomerative Clustering

In [5]:
model = AgglomerativeClustering(n_clusters=8)
y_pred = model.fit_predict(train_scale)

model_preds.append(y_pred)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# Spectral Clustering

In [6]:
model = SpectralClustering(assign_labels='discretize', n_clusters=5, random_state=0)
y_pred = model.fit_predict(train_scale)

model_preds.append(y_pred)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# Gaussian Mixtures

In [7]:
model = GaussianMixture(n_components=6)
y_pred = model.fit_predict(train_scale)

model_preds.append(y_pred)
s_scores.append(silhouette_score(train_scale, y_pred))
ch_scores.append(calinski_harabasz_score(train_scale, y_pred))
db_scores.append(davies_bouldin_score(train_scale, y_pred))

# Comparison

In [8]:
for i in range(len(models)):
    print(models[i])
    print(f'\tSilhouette Score: {s_scores[i]}')
    print(f'\tCalinski Harabasz Score: {ch_scores[i]}')
    print(f'\tDavies Bouldin Score: {db_scores[i]}')

DBSCAN
	Silhouette Score: 0.12790638039831206
	Calinski Harabasz Score: 983.841312562201
	Davies Bouldin Score: 2.38596581303545
Affinity Propagation
	Silhouette Score: 0.07597156380318049
	Calinski Harabasz Score: 121.60870806257314
	Davies Bouldin Score: 1.8093309905878492
Agglomerative Clustering
	Silhouette Score: 0.11611405883426464
	Calinski Harabasz Score: 1278.5006739330572
	Davies Bouldin Score: 1.9220176133814513
Spectral Clustering
	Silhouette Score: 0.24589185166106559
	Calinski Harabasz Score: 524.074345659219
	Davies Bouldin Score: 1.8059335255652775
Gaussian Mixtures
	Silhouette Score: 0.04465889874169526
	Calinski Harabasz Score: 904.6123410309085
	Davies Bouldin Score: 4.034299099698008


# Recommendations

In [11]:
recs_shown = 10

for i in range(len(models)):
    print(models[i])
    for song in query:
        song_index = song_names_train.index(song)
        values = train_scale[song_index]
        label = model_preds[i][song_index]
        dists = np.sqrt(((train_scale - values) ** 2).sum(axis=1))
        dists[model_preds[i] != label] = np.Inf
        recs = np.argsort(dists)[1:recs_shown + 1]
        print(f'\tRecommendations Based On {song}, genres = {genres_train[song_index]}')
        for rec in recs:
            print(f'\t\t{song_names_train[rec]}, genres = {genres_train[rec]}')

DBSCAN
	Recommendations Based On Bohemian Rhapsody, genres = ['rock']
		Bohemian Rhapsody - 2011 Mix, genres = ['rock']
		Siempre Estás Allí, genres = ['latin', 'metal', 'rock']
		White Man, genres = ['rock']
		Cathedral, genres = ['album', 'art', 'blues', 'classic', 'country', 'folk', 'mellow gold', 'psychedelic', 'rock', 'roots', 'soft', 'traditional']
		Russians, genres = ['rock', 'soft', 'wave']
		The Prophet's Song - Remastered 2011, genres = ['rock']
		Rudy, genres = ['album', 'art', 'classic', 'folk', 'mellow gold', 'rock', 'soft']
		Someday My Prince Will Come, genres = ['bebop', 'bop', 'contemporary', 'cool', 'hard', 'jazz']
		Almost Goodbye, genres = ['country', 'rock']
		老情歌, genres = ['classic', 'pop']
	Recommendations Based On Baby, genres = ['indie', 'psychedelic', 'rock']
		My Heart Goes Bum Bum Bum, genres = ['indie', 'pop']
		Where Are You Going, genres = ['pop', 'rock']
		Porpoise Song, genres = ['classic', 'country', 'folk', 'mellow gold', 'pop', 'psychedelic', 'rock