# Unsupervised Clustering - Student Segmentation

## Objective
Segment students into distinct learning personas using K-Means clustering based on behavioural features.

## CRISP-DM Stage
Modelling and Evaluation

## Methods
- Feature selection and standardisation
- Elbow Method for optimal k determination
- Silhouette Score analysis
- K-Means clustering
- PCA visualisation
- Cluster profiling

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, silhouette_samples
import warnings

# Install a global 'ignore' filter so warnings raised by the libraries
# above do not clutter the notebook output.
warnings.filterwarnings('ignore')
# Plot appearance: the matplotlib style sheet is applied first, then the
# seaborn colour palette is set on top of it.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Confirmation banner printed once the setup statements have run.
print('All libraries imported successfully')
print('=' * 80)

## Section 1: Load and Prepare Clustering Features

Select and standardise behavioural features for clustering.

In [None]:
print('\n' + '=' * 80)
print('CLUSTERING: FEATURE PREPARATION')
print('=' * 80)

# Load the encoded DataFrame from the outputs folder.
df_encoded = pd.read_pickle('../2_Outputs/df_encoded_full.pkl')

# Features requested for clustering. Any name that is not a column of
# df_encoded is filtered out, so the column selection below cannot fail
# on a missing feature.
requested_features = (
    'total_clicks', 'avg_clicks_per_day', 'avg_clicks_per_week',
    'days_active', 'late_clicks',
    'score_mean', 'submit_delay_mean', 'num_late_submissions',
    'num_assessments', 'target_score',
)
available_columns = df_encoded.columns
clustering_features = [
    name for name in requested_features if name in available_columns
]
print(f'Clustering features selected ({len(clustering_features)}): {clustering_features}')

# Missing values are replaced with 0 before the features are scaled.
X_clustering = df_encoded[clustering_features].fillna(0)

print(f'\nClustering dataset shape: {X_clustering.shape}')
print('Feature statistics:')
print(X_clustering.describe())

# Standardise the features; the fitted scaler is kept in `scaler`.
scaler = StandardScaler()
X_clustering_scaled = scaler.fit_transform(X_clustering)

print(f'\nScaled features shape: {X_clustering_scaled.shape}')

## Section 2: Determine Optimal Clusters - Elbow Method

Identify the optimal number of clusters by analysing the within-cluster sum of squares (inertia).

In [None]:
print('\n' + '=' * 80)
print('ELBOW METHOD FOR OPTIMAL K')
print('=' * 80)

# Candidate cluster counts: k = 2 .. 10 inclusive.
k_range = range(2, 11)
inertias = []

# Fit one seeded K-Means model per candidate k and record its inertia.
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_clustering_scaled)
    inertias.append(kmeans.inertia_)
    print(f'k={k}: Inertia = {inertias[-1]:.4f}')

# Inertia against k; the "elbow" of this curve is read off visually.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    k_range, inertias,
    marker='o', linestyle='-', linewidth=2, markersize=8, color='steelblue',
)
ax.set_xlabel('Number of Clusters (k)', fontsize=12)
ax.set_ylabel('Within-Cluster Sum of Squares (Inertia)', fontsize=12)
ax.set_title('Elbow Method for Optimal k', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
ax.set_xticks(k_range)
plt.tight_layout()
plt.show()

## Section 3: Determine Optimal Clusters - Silhouette Score

Evaluate cluster quality using Silhouette Score analysis.

In [None]:
print('\n' + '=' * 80)
print('SILHOUETTE SCORE FOR OPTIMAL K')
print('=' * 80)

silhouette_scores = []

# Re-fit K-Means for every candidate k and score the resulting labelling.
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(X_clustering_scaled)
    silhouette_avg = silhouette_score(X_clustering_scaled, cluster_labels)
    silhouette_scores.append(silhouette_avg)
    print(f'k={k}: Silhouette Score = {silhouette_avg:.4f}')

# The k with the highest average silhouette (first one wins on ties).
best_position = int(np.argmax(silhouette_scores))
optimal_k_silhouette = k_range[best_position]
print(f'\nOptimal k (based on Silhouette Score): {optimal_k_silhouette}')

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    k_range, silhouette_scores,
    marker='s', linestyle='-', linewidth=2, markersize=8, color='coral',
)
# Mark the silhouette-optimal k on the curve.
ax.axvline(
    x=optimal_k_silhouette, color='red', linestyle='--',
    label=f'Optimal k={optimal_k_silhouette}',
)
ax.set_xlabel('Number of Clusters (k)', fontsize=12)
ax.set_ylabel('Silhouette Score', fontsize=12)
ax.set_title('Silhouette Score for Different k Values', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
ax.set_xticks(k_range)
ax.legend()
plt.tight_layout()
plt.show()

# Manually chosen k used by the cells that follow. It is set independently
# of optimal_k_silhouette and can be adjusted after inspecting both plots.
optimal_k = 4
print(f'\nSelected optimal k: {optimal_k}')

## Section 4: K-Means Clustering

Fit the final K-Means model with the selected number of clusters (`optimal_k`).

In [None]:
print('\n' + '=' * 80)
print('K-MEANS CLUSTERING')
print('=' * 80)

# Final seeded model using the selected k; the labels are also stored on
# df_encoded as a new 'cluster' column.
kmeans_final = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
cluster_labels = kmeans_final.fit_predict(X_clustering_scaled)
df_encoded['cluster'] = cluster_labels

print(f'K-Means converged in {kmeans_final.n_iter_} iterations')

# Number of students per cluster, ordered by cluster id.
cluster_sizes = df_encoded['cluster'].value_counts().sort_index()
print('\nCluster Distribution:')
print(cluster_sizes)

# Overall silhouette score of the final labelling.
silhouette_avg = silhouette_score(X_clustering_scaled, cluster_labels)
print(f'\nSilhouette Score (final model): {silhouette_avg:.4f}')

# Per-sample silhouette values and their mean.
sample_silhouette_values = silhouette_samples(X_clustering_scaled, cluster_labels)
mean_sample_silhouette = sample_silhouette_values.mean()
print(f'Average sample Silhouette Value: {mean_sample_silhouette:.4f}')

## Section 5: PCA Visualisation

Reduce dimensions to 2D for cluster visualisation.

In [None]:
print('\n' + '=' * 80)
print('PCA DIMENSIONALITY REDUCTION')
print('=' * 80)

# Project the scaled features onto two principal components for plotting.
# random_state is fixed (matching the seed used for every KMeans call in
# this notebook) so the projection is reproducible if scikit-learn selects
# a randomised SVD solver; it has no effect on the exact solvers.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_clustering_scaled)

print(f'PCA Explained Variance Ratio: {pca.explained_variance_ratio_}')
print(f'Total Variance Explained: {pca.explained_variance_ratio_.sum():.4f}')

# 2-D coordinates together with each sample's cluster label.
pca_df = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2'])
pca_df['cluster'] = cluster_labels

# Transform centroids for visualisation (same projection as the samples)
centroids_pca = pca.transform(kmeans_final.cluster_centers_)

# Create scatter plot
plt.figure(figsize=(12, 8))

# One colour per cluster; the modulo below reuses colours if optimal_k
# exceeds the length of this list.
colours = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8']
for i in range(optimal_k):
    mask = cluster_labels == i
    plt.scatter(
        X_pca[mask, 0], X_pca[mask, 1],
        c=colours[i % len(colours)],
        label=f'Cluster {i}',
        alpha=0.6,
        s=50,
        edgecolors='black',
        linewidth=0.5
    )

# Overlay the projected centroids as large red stars.
plt.scatter(
    centroids_pca[:, 0], centroids_pca[:, 1],
    c='red', marker='*', s=500,
    label='Centroids',
    edgecolors='black',
    linewidth=2
)

# Axis labels report the share of variance captured by each component.
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)', fontsize=12)
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)', fontsize=12)
plt.title(f'K-Means Clustering Results (k={optimal_k}) - PCA Visualisation', fontsize=14, fontweight='bold')
plt.legend(loc='best', fontsize=10)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## Section 6: Cluster Profiling

Analyse cluster characteristics and create learning personas.

In [None]:
print('\n' + '=' * 80)
print('CLUSTER PROFILING AND INTERPRETATION')
print('=' * 80)

# Mean of every clustering feature within each cluster (rows = cluster id).
cluster_profiles = df_encoded.groupby('cluster')[clustering_features].mean()

print('\nCluster Profiles (Mean Feature Values):')
print('=' * 80)
print(cluster_profiles.round(4))

# Create learning persona names
# Each cluster is labelled High/Low (or On-Time/Late) by comparing its mean
# against the median of the cluster means for that feature. The medians do
# not change inside the loop, so they are computed once here.
# NOTE(review): 'total_clicks', 'target_score' and 'submit_delay_mean' must
# be present in clustering_features, otherwise the lookups below raise
# KeyError.
profile_medians = cluster_profiles.median()
cluster_names = {}
for cluster_id in range(optimal_k):
    profile = cluster_profiles.loc[cluster_id]

    engagement = 'High' if profile['total_clicks'] > profile_medians['total_clicks'] else 'Low'
    performance = 'High' if profile['target_score'] > profile_medians['target_score'] else 'Low'
    # A smaller mean submission delay is labelled as on-time.
    timeliness = 'On-Time' if profile['submit_delay_mean'] < profile_medians['submit_delay_mean'] else 'Late'

    cluster_names[cluster_id] = f'{engagement} Engagement, {performance} Performance, {timeliness}'

print('\n\nLearning Personas:')
print('=' * 80)
for cluster_id, name in cluster_names.items():
    count = (cluster_labels == cluster_id).sum()
    pct = count / len(cluster_labels) * 100
    print(f'\nCluster {cluster_id}: {name}')
    print(f'  Size: {count} students ({pct:.1f}%)')

# Visualise cluster profiles with heatmap
# Min-max scale each feature across clusters so colours are comparable
# between features. A feature whose cluster means are all equal has zero
# range; its divisor is replaced with 1 so the cells become 0 rather than
# NaN (which would blank out the cell in the heatmap).
feature_min = cluster_profiles.min()
feature_range = cluster_profiles.max() - feature_min
feature_range = feature_range.where(feature_range != 0, 1)
cluster_profiles_normalized = (cluster_profiles - feature_min) / feature_range

fig, ax = plt.subplots(figsize=(14, 6))
# Colours come from the normalised values; the annotations show the
# original (un-normalised) cluster means rounded to two decimals.
sns.heatmap(
    cluster_profiles_normalized.T,
    annot=cluster_profiles.T.round(2),
    fmt='g',
    cmap='YlGnBu',
    cbar_kws={'label': 'Normalised Value'},
    ax=ax,
    linewidths=0.5
)

ax.set_title('Cluster Profiles Heatmap (Normalised Features)', fontsize=14, fontweight='bold')
ax.set_xlabel('Cluster', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
plt.tight_layout()
plt.show()

# Save clustering results
# The files are opened with context managers so each handle is flushed and
# closed as soon as its object has been written.
import pickle
with open('../2_Outputs/kmeans_model.pkl', 'wb') as model_file:
    pickle.dump(kmeans_final, model_file)
with open('../2_Outputs/cluster_labels.pkl', 'wb') as labels_file:
    pickle.dump(cluster_labels, labels_file)
df_encoded.to_pickle('../2_Outputs/df_with_clusters.pkl')

print('\nClustering results saved for final notebook')