# CMPE 256: Advanced Data Mining
## Spotify Music Recommendation System
## Approach: DBSCAN

#### Importing all the necessary libraries 

In [4]:
#importing all the libraries 
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import metrics


#### Data Preprocessing 

In [5]:
# Read the track-level feature table; the path is relative to the working directory.
DATASET_CSV = 'tracks_features.csv'
data = pd.read_csv(DATASET_CSV)
# Optional down-sampling for quicker experiments (currently disabled):
# data = data.sample(frac=0.9)

In [6]:
# Columns of `data` that make up the feature space for scaling, PCA and clustering.
features = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]
# Restrict the frame to those columns only.
X = data.loc[:, features]



In [7]:
# Impute every missing entry with the mean of its own column.
column_means = X.mean()
X_filled = X.fillna(value=column_means)


In [8]:
# Standardize each feature to zero mean and unit variance.
# Fit on the imputed frame (X_filled), not the raw X: StandardScaler keeps NaNs
# in its output, and the PCA step that follows rejects input containing NaN.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_filled)

#### Principal Component Analysis 

Reducing the dimensionality of the dataset

In [9]:
# Principal Component Analysis: project the standardized features onto the
# leading principal components.
n_components = 7
# random_state pins the result whenever scikit-learn selects a randomized SVD
# solver, so repeated runs produce the same projection and therefore the same
# DBSCAN input.
pca = PCA(n_components=n_components, random_state=42)
X_pca = pca.fit_transform(X_scaled)


#### DBSCAN Clustering 
Fitting the clustering model on the PCA-reduced feature set

In [10]:
# Density-based clustering of the tracks in PCA space.
dbscan = DBSCAN(eps=0.5, min_samples=10)
dbscan.fit(X_pca)
# One cluster label per row of X_pca, read back from the fitted estimator.
clusters = dbscan.labels_


In [11]:
# Store each track's DBSCAN label alongside its metadata. `clusters` holds one
# label per row of X_pca, which was derived row-for-row from `data`, so the
# assignment lines up by position. This adds a column to `data` in place.
data['cluster'] = clusters

#### Finding Recommendations 

Here, we generate recommendations for a randomly selected track_id, then evaluate and rank the recommended tracks by their cosine similarity to the input track.

In [12]:
# Draw a single row at random to act as the query track, and show its id.
input_song = data.sample(n=1)
input_track_id = input_song['id'].iloc[0]
input_track_id

In [13]:
# Extract and preprocess the features of the input song.
input_song_features = input_song[features]
# Impute with the dataset-wide column means, the same values used to build
# X_filled. The mean of a one-row frame is NaN wherever that row is missing a
# value, so filling with the row's own mean would leave every gap in place.
input_song_features_filled = input_song_features.fillna(X.mean())
# Reuse the already-fitted scaler so the query lives in the training feature space.
input_song_features_scaled = scaler.transform(input_song_features_filled)


In [14]:
# Cluster label that was assigned to the query track.
input_song_cluster = input_song['cluster'].iat[0]

In [15]:
# Get initial recommendations from the same cluster.
# The query track is removed from the pool so it cannot be recommended back
# (it would trivially score a cosine similarity of 1.0).
# NOTE: if input_song_cluster is -1 the pool consists of DBSCAN noise points
# rather than a genuine cluster.
in_same_cluster = data['cluster'] == input_song_cluster
is_other_track = data.index != input_song.index[0]
cluster_members = data[in_same_cluster & is_other_track]
# DataFrame.sample raises when asked for more rows than are available, so the
# request is capped at the number of remaining cluster members.
recommendations = cluster_members.sample(n=min(200, len(cluster_members)))


In [16]:
# Extract and preprocess the features of the recommended songs.
recommended_ids = recommendations['id'].values
# Take the features directly from `recommendations` so the rows keep the sampled
# order. Looking the ids up in `data` again returns rows in `data` order, which
# misaligns the similarity scores that are later attached to `recommendations`
# by position (and can change the row count if an id occurs more than once).
recommended_features = recommendations[features]
# Impute with the dataset-wide means, matching how X_filled was built, instead
# of the means of this particular sample.
recommended_features_filled = recommended_features.fillna(X.mean())
recommended_features_scaled = scaler.transform(recommended_features_filled)


In [17]:
# Apply PCA to the input song and recommended songs
input_song_features_pca = pca.transform(input_song_features_scaled)
recommended_features_pca = pca.transform(recommended_features_scaled)


In [18]:
# Calculate cosine similarity between the input song and recommended songs
cosine_similarities = cosine_similarity(input_song_features_pca, recommended_features_pca)


In [20]:
# Attach each candidate's similarity score, then order the candidates best match first.
similarity_scores = cosine_similarities.flatten()
recommendations['cosine_similarity'] = similarity_scores
recommendations_sorted = recommendations.sort_values('cosine_similarity', ascending=False)


In [21]:
# Filter the recommendations based on cosine similarity threshold
cosine_similarity_threshold = 0.7
filtered_recommendations = recommendations_sorted[recommendations_sorted['cosine_similarity'] > cosine_similarity_threshold]

In [22]:
# Show the query track, then the surviving recommendations with their scores.
display_columns = ['id', 'name', 'artists', 'album']
print("Input song:")
print(input_song[display_columns])
print("\nRecommended songs:")
print(filtered_recommendations[display_columns + ['cosine_similarity']])

Input song:
                             id            name             artists  \
1047344  48GyQrDhrA8GBGCJukg7cP  Proud This Day  ['Randy Peterson']   

               album  
1047344  Picture Day  

Recommended songs:
                             id  \
1014240  4Pv9VLOPdH0IjknxFL69JZ   
774907   2KRGsBdu4uiAT4MTXT0dCs   
378760   6EQ6SWDugGb4fRlXqFThfU   
475535   6q1NIOiUHixKUzJgifrXXu   
776488   4zFdL5rG7gg98eQgHFzicw   
116502   4ErPrvGKlfKieRd0VR10CB   
729302   2wYFJoH48CvUBWE3e8FsBo   
889662   6UXAw8nv7Tbn2X461qQ7Zd   
24359    5wFOJPmF5seOFtvvbqElZL   
544424   4trdc8XKiPHbVlGHiFZYTH   
1109804  0NWaEs60T2O8clNlDPuEE7   
862705   2GKsSI0ut4jUshkRDdq5In   
819016   0kUuGMQIld15gmnYguh6cx   
43220    7qdcE5qU3yXpbEkc6oRNQX   

                                                  name  \
1014240                         Never Enough (For You)   
774907                        Lord Keep Me With A Mind   
378760                          Lads of Wamphray March   
475535               

In [24]:
# Evaluate the clustering on clustered points only: rows labelled -1 (noise) are dropped.
non_noise_indices = ~(clusters == -1)
X_pca_no_noise, clusters_no_noise = X_pca[non_noise_indices], clusters[non_noise_indices]

# Internal validation indices computed from the points and their labels.
# The silhouette score is left disabled (presumably for cost reasons on a
# dataset of this size -- TODO confirm):
# silhouette_score = metrics.silhouette_score(X_pca_no_noise, clusters_no_noise)
davies_bouldin_score = metrics.davies_bouldin_score(X_pca_no_noise, clusters_no_noise)
calinski_harabasz_score = metrics.calinski_harabasz_score(X_pca_no_noise, clusters_no_noise)

# print("Silhouette Score (excluding noise points):", silhouette_score)
print("Davies-Bouldin Index (excluding noise points):", davies_bouldin_score)
print("Calinski-Harabasz Index (excluding noise points):", calinski_harabasz_score)


Davies-Bouldin Index (excluding noise points): 1.0215942595737155
Calinski-Harabasz Index (excluding noise points): 573.4611955534367
