In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import silhouette_score
import logging
import os
import joblib
from sklearn.decomposition import PCA

# --------------------- Setup --------------------- #
np.random.seed(42)
sns.set(style="whitegrid")
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# --------------------- Load Data --------------------- #
DATA_PATH = "../data/2_cleaned_songs_dataset.csv"
IMAGE_DIR = "../images"
MODEL_DIR = "../models"
OUTPUT_PATH = "../data/3_clustered_songs_dataset.csv"

os.makedirs(IMAGE_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

df = pd.read_csv(DATA_PATH)
logging.info(f"Data loaded with shape: {df.shape}")

# --------------------- Select & Preprocess Features --------------------- #
# List of features to include, excluding 'len' and other non-useful features
relevant_features = [
    'dating', 'violence', 'world/life', 'night/time', 'shake the audience', 
    'family/gospel', 'romantic', 'communication', 'obscene', 'music', 
    'movement/places', 'light/visual perceptions', 'family/spiritual', 
    'sadness', 'feelings', 'topic'
]

# Check if features are numeric
logging.info("Checking if features are numeric:")
for col in relevant_features:
    if not pd.api.types.is_numeric_dtype(df[col]):
        logging.info(f"Column '{col}' is not numeric.")
    else:
        logging.info(f"Column '{col}' is numeric.")

# Convert non-numeric columns to numeric if necessary (for example, encoding categorical data)
# If you have any categorical variables, you should encode them. Here we use label encoding as an example:

for col in relevant_features:
    if not pd.api.types.is_numeric_dtype(df[col]):
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        logging.info(f"Column '{col}' has been encoded.")

# Filter out only relevant features from the dataframe
features = df[relevant_features]

# Handle missing values by filling with column means
features.fillna(features.mean(), inplace=True)

# Scale the features
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)

# --------------------- Dimensionality Reduction --------------------- #
# PCA can be used to reduce dimensionality for better performance and visualization
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(scaled_features)
logging.info(f"PCA applied: Explained variance = {np.sum(pca.explained_variance_ratio_):.2f}")

# --------------------- Silhouette Score Evaluation for KMeans --------------------- #
k_values = range(2, 11)
silhouette_scores = []

logging.info("Evaluating silhouette scores for KMeans...")
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(reduced_features)
    score = silhouette_score(reduced_features, kmeans.labels_)
    silhouette_scores.append(score)
    logging.info(f"Silhouette Score for k={k}: {score:.4f}")

# Plot Silhouette Scores
plt.figure(figsize=(8, 5))
plt.plot(k_values, silhouette_scores, marker='o', color='orange')
plt.title("Silhouette Scores for KMeans Clustering")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Silhouette Score")
plt.tight_layout()
plt.savefig(os.path.join(IMAGE_DIR, "3.1_silhouette_scores.png"))
plt.show()

# Optimal number of clusters based on silhouette score
optimal_k = k_values[np.argmax(silhouette_scores)]
logging.info(f"Optimal number of clusters (KMeans) based on silhouette score: {optimal_k}")

# --------------------- Advanced Clustering Techniques --------------------- #
# 1. DBSCAN (Density-Based Spatial Clustering of Applications with Noise)
dbscan = DBSCAN(eps=0.5, min_samples=5)
df["dbscan_cluster"] = dbscan.fit_predict(reduced_features)
logging.info("DBSCAN clustering completed.")
logging.info(f"Number of clusters found by DBSCAN: {len(np.unique(df['dbscan_cluster'])) - (1 if -1 in df['dbscan_cluster'].values else 0)}")

# 2. Agglomerative Clustering
agg_clust = AgglomerativeClustering(n_clusters=optimal_k)
df["agg_cluster"] = agg_clust.fit_predict(reduced_features)
logging.info("Agglomerative Clustering completed.")

# --------------------- Apply Optimal KMeans --------------------- #
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
df["cluster"] = kmeans.fit_predict(reduced_features)

# Save the model and scaler
joblib.dump(scaler, os.path.join(MODEL_DIR, "scaler.pkl"))
joblib.dump(kmeans, os.path.join(MODEL_DIR, "kmeans_model.pkl"))
joblib.dump(dbscan, os.path.join(MODEL_DIR, "dbscan_model.pkl"))
joblib.dump(agg_clust, os.path.join(MODEL_DIR, "agg_clust_model.pkl"))

# Save clustered data
df.to_csv(OUTPUT_PATH, index=False)
logging.info(f"Clustered data saved to: {OUTPUT_PATH}")

# --------------------- Visualize Clusters --------------------- #
# Visualize in 2D using PCA-reduced features
plt.figure(figsize=(10, 6))
sns.scatterplot(x=reduced_features[:, 0], y=reduced_features[:, 1], hue=df["cluster"], palette="viridis", alpha=0.7)
plt.title("Clusters Based on PCA-Reduced Features (KMeans)")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.legend(title="Cluster")
plt.savefig(os.path.join(IMAGE_DIR, "3.2_song_clusters_kmeans.png"))
plt.show()

# Visualize DBSCAN Clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x=reduced_features[:, 0], y=reduced_features[:, 1], hue=df["dbscan_cluster"], palette="coolwarm", alpha=0.7)
plt.title("DBSCAN Clusters")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.legend(title="Cluster")
plt.savefig(os.path.join(IMAGE_DIR, "3.3_song_clusters_dbscan.png"))
plt.show()

# Visualize Agglomerative Clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x=reduced_features[:, 0], y=reduced_features[:, 1], hue=df["agg_cluster"], palette="Set2", alpha=0.7)
plt.title("Agglomerative Clusters")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.legend(title="Cluster")
plt.savefig(os.path.join(IMAGE_DIR, "3.4_song_clusters_agg.png"))
plt.show()

# --------------------- Cluster Summaries --------------------- #
for cluster in range(optimal_k):
    cluster_data = df[df["cluster"] == cluster]
    logging.info(f"\nSummary for Cluster {cluster}:")
    logging.info(cluster_data.describe())

# Additional DBSCAN and Agglomerative cluster summaries
logging.info(f"\nSummary for DBSCAN clusters:")
logging.info(df.groupby("dbscan_cluster").describe())

logging.info(f"\nSummary for Agglomerative clusters:")
logging.info(df.groupby("agg_cluster").describe())


KeyError: 'world/life'

In [50]:
# We will start off by importing our libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import silhouette_score
import joblib
from sklearn.decomposition import PCA
import os
import logging

In [51]:
# We will set up our paths and load our data
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

np.random.seed(42)
sns.set(style="whitegrid")

DATA_PATH = "../data/2_cleaned_songs_dataset.csv"
IMAGE_DIR = "../images"
MODEL_DIR = "../models"
OUTPUT_PATH = "../data/3_clustered_songs_dataset.csv"

df = pd.read_csv(DATA_PATH)
print(f"The data has been loaded with shape: {df.shape}")

The data has been loaded with shape: (28362, 22)


In [52]:
# Print the columns of the DataFrame to check what is available
logging.info("Available columns in the DataFrame:")
logging.info(df.columns.tolist())

In [55]:
# We will select and preprocess the features
relevant_features = [
    'dating', 'violence', 'world/life', 'night/time', 'shake the audience', 
    'family/gospel', 'romantic', 'communication', 'obscene', 'music', 
    'movement/places', 'light/visual perceptions', 'family/spiritual', 
    'sadness', 'feelings', 'topic'
]

# We will check if features are numeric
logging.info("Checking if features are numeric:")
for col in relevant_features:
    if not pd.api.types.is_numeric_dtype(df[col]):
        logging.info(f"Column '{col}' is not numeric.")
    else:
        logging.info(f"Column '{col}' is numeric.")

# Convert non-numeric columns to numeric if necessary 
for col in relevant_features:
    if not pd.api.types.is_numeric_dtype(df[col]):
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        logging.info(f"Column '{col}' has been encoded.")

# Filter out only relevant features from the dataframe
features = df[relevant_features]

# Handle missing values by filling with column means
features.fillna(features.mean(), inplace=True)

# Scale the features
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)

KeyError: 'world/life'

In [None]:
# We will perform dimensionality reduction
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(scaled_features)
print(f"The PCA has been applied: Explained variance = {np.sum(pca.explained_variance_ratio_):.2f}")

In [None]:
# We will obtain the Silhouette Score for KMeans
k_values = range(2, 11)
silhouette_scores = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(reduced_features)
    score = silhouette_score(reduced_features, kmeans.labels_)
    silhouette_scores.append(score)
    print(f"Silhouette Score for k={k}: {score:.4f}")

# We will plot the silhouette scores
plt.figure(figsize=(8, 5))
plt.plot(k_values, silhouette_scores, marker='o', color='orange')
plt.title("Silhouette Scores for KMeans Clustering")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Silhouette Score")
plt.tight_layout()
plt.savefig(os.path.join(IMAGE_DIR, "3.1_silhouette_scores.png"))
plt.show()

# We will find the optimal number of clusters based on the silhouette score
optimal_k = k_values[np.argmax(silhouette_scores)]
print(f"The optimal number of clusters (KMeans) based on silhouette score: {optimal_k}")

In [None]:
# We will perform advanced Clustering Techniques: DBSCAN & Agglomerative Clustering 
dbscan = DBSCAN(eps=0.5, min_samples=5)
df["dbscan_cluster"] = dbscan.fit_predict(reduced_features)
print(f"DBSCAN clustering completed. Number of clusters found: {len(np.unique(df['dbscan_cluster'])) - (1 if -1 in df['dbscan_cluster'].values else 0)}")

agg_clust = AgglomerativeClustering(n_clusters=optimal_k)
df["agg_cluster"] = agg_clust.fit_predict(reduced_features)

In [None]:
# We will apply Optimal KMeans, save the model, scaler and clustered data
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
df["cluster"] = kmeans.fit_predict(reduced_features)

joblib.dump(scaler, os.path.join(MODEL_DIR, "scaler.pkl"))
joblib.dump(kmeans, os.path.join(MODEL_DIR, "kmeans_model.pkl"))
joblib.dump(dbscan, os.path.join(MODEL_DIR, "dbscan_model.pkl"))
joblib.dump(agg_clust, os.path.join(MODEL_DIR, "agg_clust_model.pkl"))

df.to_csv(OUTPUT_PATH, index=False)
print(f"Clustered data saved to: {OUTPUT_PATH}")

In [None]:
# We will visualize the clusters: KMeans, DBSCAN and Agglomerative Clusters
# KMeans
plt.figure(figsize=(10, 6))
sns.scatterplot(x=reduced_features[:, 0], y=reduced_features[:, 1], hue=df["cluster"], palette="viridis", alpha=0.7)
plt.title("Clusters Based on PCA-Reduced Features (KMeans)")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.legend(title="Cluster")
plt.savefig(os.path.join(IMAGE_DIR, "3.2_song_clusters_kmeans.png"))
plt.show()

# DBSCAN
plt.figure(figsize=(10, 6))
sns.scatterplot(x=reduced_features[:, 0], y=reduced_features[:, 1], hue=df["dbscan_cluster"], palette="coolwarm", alpha=0.7)
plt.title("DBSCAN Clusters")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.legend(title="Cluster")
plt.savefig(os.path.join(IMAGE_DIR, "3.3_song_clusters_dbscan.png"))
plt.show()

# Agglomerative
plt.figure(figsize=(10, 6))
sns.scatterplot(x=reduced_features[:, 0], y=reduced_features[:, 1], hue=df["agg_cluster"], palette="Set2", alpha=0.7)
plt.title("Agglomerative Clusters")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.legend(title="Cluster")
plt.savefig(os.path.join(IMAGE_DIR, "3.4_song_clusters_agg.png"))
plt.show()

In [None]:
# We will print summaries of all 3 clustering methods
for cluster in range(optimal_k):
    cluster_data = df[df["cluster"] == cluster]
    print(f"\nSummary for Cluster {cluster}:")
    print(cluster_data.describe())

print("\nSummary for DBSCAN clusters:")
print(df.groupby("dbscan_cluster").describe())

print("\nSummary for Agglomerative clusters:")
print(df.groupby("agg_cluster").describe())