# Unsupervised Learning - Clustering

This notebook covers:
- K-Means clustering with elbow method
- Hierarchical clustering with dendrogram
- Cluster analysis and visualization
- Comparison with actual labels


In [None]:
# Import necessary libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.preprocessing import StandardScaler
# Suppress warnings for the rest of the kernel session.
# NOTE(review): no category or module is given, so this hides every warning,
# including any raised by the clustering calls in later cells - consider
# narrowing the filter if diagnostics matter.
warnings.filterwarnings('ignore')

# Set style for better plots
# The matplotlib style sheet is applied first, then the seaborn colour palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


In [None]:
# Load the data
X_selected = pd.read_csv('data/X_selected.csv')
y = pd.read_csv('data/y_target.csv').values.ravel()

# ravel() flattens *every* column of the target file, so an extra column
# (for example a saved index) would silently multiply the number of labels.
# Fail here with a clear message instead of much later, when the labels are
# compared against the cluster assignments.
assert len(y) == len(X_selected), (
    f"Target has {len(y)} values but features have {len(X_selected)} rows; "
    "check that 'data/y_target.csv' contains exactly one column"
)

print("Data loaded successfully!")
print(f"Features shape: {X_selected.shape}")
print(f"Target shape: {y.shape}")
print(f"Selected features: {list(X_selected.columns)}")

# Standardize the features for clustering.
# X_scaled is the plain array used by the clustering cells; X_scaled_df is
# the same data with the original column names attached.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_selected)
X_scaled_df = pd.DataFrame(X_scaled, columns=X_selected.columns)

print(f"\nData standardized for clustering")
print(f"Scaled features shape: {X_scaled_df.shape}")


In [None]:
# 1. K-Means Clustering with Elbow Method
print("1. K-Means Clustering Analysis:")
print("=" * 30)

# Test different numbers of clusters
k_range = range(2, 11)
inertias = []
silhouette_scores = []
# Fitted estimator for every k, so the winning model can be reused below
# instead of being fitted a second time.
kmeans_by_k = {}

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(X_scaled)

    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, cluster_labels))
    kmeans_by_k[k] = kmeans

# Plot elbow method and silhouette scores side by side
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Elbow method
axes[0].plot(k_range, inertias, 'bo-')
axes[0].set_xlabel('Number of Clusters (k)')
axes[0].set_ylabel('Inertia')
axes[0].set_title('Elbow Method for Optimal k')
axes[0].grid(True, alpha=0.3)

# Silhouette scores
axes[1].plot(k_range, silhouette_scores, 'ro-')
axes[1].set_xlabel('Number of Clusters (k)')
axes[1].set_ylabel('Silhouette Score')
axes[1].set_title('Silhouette Score vs Number of Clusters')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Find optimal k: the candidate with the highest silhouette score
# (np.argmax returns the first maximum, so ties resolve to the smaller k).
optimal_k = k_range[np.argmax(silhouette_scores)]
print(f"Optimal number of clusters (highest silhouette score): {optimal_k}")
print(f"Silhouette score: {max(silhouette_scores):.4f}")

# Reuse the estimator already fitted for optimal_k in the sweep above.
# It was built with the same n_clusters / random_state / n_init that a fresh
# fit would use, so its labels are the ones a re-fit would produce.
kmeans_optimal = kmeans_by_k[optimal_k]
kmeans_labels = kmeans_optimal.labels_

print(f"\nK-Means clustering completed with {optimal_k} clusters")
print(f"Cluster distribution: {np.bincount(kmeans_labels)}")


In [None]:
# 2. Hierarchical Clustering
print("\n2. Hierarchical Clustering Analysis:")
print("=" * 35)

# Create linkage matrix (Ward linkage on the standardized features)
linkage_matrix = linkage(X_scaled, method='ward')

# Plot dendrogram, truncated to the top levels of the tree.
# With truncation a leaf is either a single sample (tick shows its index) or
# a collapsed subtree (tick shows the number of samples in parentheses), so
# the x-axis label has to describe both kinds of tick.
plt.figure(figsize=(15, 8))
dendrogram(linkage_matrix, truncate_mode='level', p=5)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample Index or (Cluster Size)')
plt.ylabel('Distance')
plt.tight_layout()
plt.show()

# Apply hierarchical clustering with the cluster count selected in the
# K-Means cell, so both methods are compared at the same k.
hierarchical = AgglomerativeClustering(n_clusters=optimal_k, linkage='ward')
hierarchical_labels = hierarchical.fit_predict(X_scaled)

print(f"Hierarchical clustering completed with {optimal_k} clusters")
print(f"Cluster distribution: {np.bincount(hierarchical_labels)}")

# Calculate silhouette scores for both label sets on the same scaled data
kmeans_silhouette = silhouette_score(X_scaled, kmeans_labels)
hierarchical_silhouette = silhouette_score(X_scaled, hierarchical_labels)

print(f"\nSilhouette Scores:")
print(f"K-Means: {kmeans_silhouette:.4f}")
print(f"Hierarchical: {hierarchical_silhouette:.4f}")


In [None]:
# 3. Cluster Analysis and Visualization
print("\n3. Cluster Analysis:")
print("=" * 20)

# Compare each clustering with the actual labels loaded in the data cell
kmeans_ari = adjusted_rand_score(y, kmeans_labels)
hierarchical_ari = adjusted_rand_score(y, hierarchical_labels)

print(f"Adjusted Rand Index (comparison with actual labels):")
print(f"K-Means: {kmeans_ari:.4f}")
print(f"Hierarchical: {hierarchical_ari:.4f}")

# Use PCA for 2D visualization. The projection is computed once and shared
# by all scatter panels; random_state pins the result when scikit-learn
# selects its randomized solver (PCA is imported with the other imports at
# the top of the notebook).
pca_2d = PCA(n_components=2, random_state=42)
X_pca_2d = pca_2d.fit_transform(X_scaled)
pc1_label = f'PC1 ({pca_2d.explained_variance_ratio_[0]:.3f})'
pc2_label = f'PC2 ({pca_2d.explained_variance_ratio_[1]:.3f})'

# Visualize clusters
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Same points in every panel; only the colouring differs.
panels = [
    (axes[0, 0], kmeans_labels, 'K-Means Clusters'),
    (axes[0, 1], hierarchical_labels, 'Hierarchical Clusters'),
    (axes[1, 0], y, 'Actual Labels'),
]
scatters = []
for ax, colors, title in panels:
    scatters.append(
        ax.scatter(X_pca_2d[:, 0], X_pca_2d[:, 1], c=colors, cmap='viridis', alpha=0.6)
    )
    ax.set_title(title)
    ax.set_xlabel(pc1_label)
    ax.set_ylabel(pc2_label)

# Keep the individual handles available under their original names.
scatter1, scatter2, scatter3 = scatters

# Cluster comparison: text-only panel summarising both ARI values
axes[1, 1].text(0.5, 0.5, f'K-Means ARI: {kmeans_ari:.3f}\nHierarchical ARI: {hierarchical_ari:.3f}',
                ha='center', va='center', transform=axes[1, 1].transAxes, fontsize=12)
axes[1, 1].set_title('Cluster Quality Comparison')
axes[1, 1].axis('off')

plt.tight_layout()
plt.show()


In [None]:
# 4. Cluster Characteristics Analysis
print("\n4. Cluster Characteristics:")
print("=" * 28)

# Analyze cluster characteristics
def analyze_clusters(data, labels, feature_names):
    """Summarize every cluster's size and per-feature mean / standard deviation.

    Parameters
    ----------
    data : pandas.DataFrame or array-like of shape (n_samples, n_features)
        Feature values. A DataFrame is used as-is (its own column names are
        kept); anything else is wrapped in a DataFrame whose columns are
        ``feature_names``.
    labels : array-like with one entry per row of ``data``
        Cluster assignment of each sample. Only the values are used; a pandas
        index on ``labels`` is ignored, so rows are matched by position.
    feature_names : sequence of str
        Column names applied when ``data`` is not already a DataFrame.

    Returns
    -------
    dict
        Maps each distinct label (in sorted order) to a dict with keys
        ``'size'`` (number of samples), ``'mean_values'`` and ``'std_values'``
        (pandas Series indexed by feature name). Under pandas' default
        sample standard deviation a single-member cluster has NaN std.

    Raises
    ------
    ValueError
        If ``labels`` and ``data`` have different lengths.
    """
    if not isinstance(data, pd.DataFrame):
        data = pd.DataFrame(np.asarray(data), columns=list(feature_names))

    # Positional labels: avoids pandas index alignment between data and labels.
    labels = np.asarray(labels).ravel()
    if len(labels) != len(data):
        raise ValueError(
            f"labels has {len(labels)} entries but data has {len(data)} rows"
        )

    cluster_analysis = {}
    for cluster_id in np.unique(labels):
        cluster_data = data.loc[labels == cluster_id]
        cluster_analysis[cluster_id] = {
            'size': len(cluster_data),
            'mean_values': cluster_data.mean(),
            'std_values': cluster_data.std()
        }
    return cluster_analysis

# Summarize both clusterings on the original (unscaled) feature values.
# NOTE(review): because the means are unscaled, "top features by mean value"
# will favour features with large raw magnitudes - confirm this is intended.
kmeans_analysis = analyze_clusters(X_selected, kmeans_labels, X_selected.columns)
hierarchical_analysis = analyze_clusters(X_selected, hierarchical_labels, X_selected.columns)

# One reporting loop serves both methods; only the header line differs.
for header, method_analysis in (
    ("K-Means Cluster Analysis:", kmeans_analysis),
    ("\nHierarchical Cluster Analysis:", hierarchical_analysis),
):
    print(header)
    for cluster_id, analysis in method_analysis.items():
        print(f"\nCluster {cluster_id}:")
        print(f"  Size: {analysis['size']}")
        print(f"  Top features by mean value:")
        # Three features with the largest cluster mean, highest first
        top_features = analysis['mean_values'].sort_values(ascending=False).head(3)
        for feature, value in top_features.items():
            print(f"    {feature}: {value:.3f}")

# Persist both label sets next to the actual labels, one row per sample
clustering_results = pd.DataFrame({
    'kmeans_labels': kmeans_labels,
    'hierarchical_labels': hierarchical_labels,
    'actual_labels': y
})
clustering_results.to_csv('data/clustering_results.csv', index=False)
print(f"\nClustering results saved to 'data/clustering_results.csv'")

# The method with the strictly higher ARI wins; a tie reports 'Hierarchical'.
print(f"Best clustering method: {'K-Means' if kmeans_ari > hierarchical_ari else 'Hierarchical'}")
print(f"Best ARI score: {max(kmeans_ari, hierarchical_ari):.4f}")
