# Step 7: Clustering Engine - Interactive Analysis

This notebook implements student clustering using:
- Academic features (GPA, attendance)
- Skill embeddings (PCA components)
- Skill gaps
- Career predictions

**Algorithms**: KMeans (DBSCAN is imported but not run in this notebook)
**Visualizations**: t-SNE, UMAP, 3D PCA

In [None]:
import pandas as pd
import numpy as np
import json
import pickle
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# UMAP is an optional dependency: remember whether the import worked so the
# UMAP visualization cell can be skipped cleanly when the package is absent.
try:
    import umap
    UMAP_AVAILABLE = True
    print("✓ UMAP available")
except ImportError:
    UMAP_AVAILABLE = False
    print("⚠ UMAP not available (install with: pip install umap-learn)")

print("Libraries loaded successfully")

## 1. Load Data

In [None]:
# All input paths are resolved relative to the notebook's working directory.
BASE = Path(".")

# Load features
df_features = pd.read_csv(BASE / "models" / "features_all.csv")
print(f"Loaded {len(df_features)} student features")

# Load embeddings
# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load embeddings produced by a trusted step of this pipeline.
with open(BASE / "embeddings" / "embeddings_students.pkl", "rb") as f:
    emb_students = pickle.load(f)
print(f"Loaded student embeddings: {type(emb_students)}")

# Load skill gap profiles
# Read as UTF-8 explicitly: the platform default encoding (e.g. cp1252 on
# Windows) would garble or reject non-ASCII text in the profiles.
with open(BASE / "skill_gap_profiles" / "student_profiles.json", "r", encoding="utf-8") as f:
    profiles = json.load(f)
# Index the profiles by student id for O(1) lookup in later cells.
profiles_map = {p['student_id']: p for p in profiles}
print(f"Loaded {len(profiles)} skill gap profiles")

# Load student data
df_students = pd.read_csv(BASE / "digital_twin_students_1500_cleaned.csv", low_memory=False)
print(f"Loaded {len(df_students)} student records")

df_features.head()

## 2. Build Feature Matrix

In [None]:
def build_feature_matrix(features_df=None, profiles_by_id=None):
    """Build the per-student feature matrix used for clustering.

    Each row is the concatenation of:
      1. academic columns (GPA, Attendance, FailedCourses, CompletedCourses)
         that exist in the features frame,
      2. up to 32 columns whose name starts with ``emb_pca_``,
      3. two skill-gap features: number of missing skills and the highest
         priority score (0 when the student has no profile or no scores),
      4. a one-hot encoding of ``predicted_career`` over 8 categories.

    Missing numeric values are replaced by 0. A missing, NaN or unrecognised
    ``predicted_career`` is encoded as the 'Other' category so every student
    has exactly one active career column.

    Parameters
    ----------
    features_df : pandas.DataFrame, optional
        Frame with a ``StudentID`` column plus the feature columns above.
        Defaults to the notebook-level ``df_features``.
    profiles_by_id : dict, optional
        Mapping of student id to skill-gap profile dict. Defaults to the
        notebook-level ``profiles_map``.

    Returns
    -------
    tuple
        ``(X, student_ids)`` where ``X`` is a NumPy array with one row per
        student and ``student_ids`` is the list of ids in the same order.
    """
    if features_df is None:
        features_df = df_features
    if profiles_by_id is None:
        profiles_by_id = profiles_map

    # Column selection does not depend on the row, so resolve it once.
    academic_cols = [col for col in ('GPA', 'Attendance', 'FailedCourses', 'CompletedCourses')
                     if col in features_df.columns]
    emb_cols = [col for col in features_df.columns if col.startswith('emb_pca_')][:32]
    numeric_cols = academic_cols + emb_cols

    career_categories = ['Data', 'Machine Learning', 'Cloud', 'Cybersecurity',
                         'Software', 'Network', 'DevOps', 'Other']

    features_list = []
    student_ids = []

    for _, row in features_df.iterrows():
        student_id = row['StudentID']
        student_ids.append(student_id)

        # 1 + 2. Academic and PCA embedding features, NaN -> 0
        feature_vec = [row[col] if pd.notna(row[col]) else 0 for col in numeric_cols]

        # 3. Skill gap features (`or` guards against explicit nulls in the JSON)
        profile = profiles_by_id.get(student_id) or {}
        skill_gaps = profile.get('skill_gaps') or {}
        missing_skills = skill_gaps.get('missing_skills') or []
        feature_vec.append(len(missing_skills))

        priority_scores = skill_gaps.get('priority_scores') or {}
        feature_vec.append(max(priority_scores.values()) if priority_scores else 0)

        # 4. Career prediction one-hot; anything outside the known list
        # (including NaN) falls into the 'Other' bucket.
        predicted_career = row.get('predicted_career', 'Other')
        if predicted_career not in career_categories:
            predicted_career = 'Other'
        feature_vec.extend(1 if predicted_career == cat else 0 for cat in career_categories)

        features_list.append(feature_vec)

    X = np.array(features_list)
    return X, student_ids

# Materialise the matrix once; X and student_ids stay row-aligned for every
# later cell.
X, student_ids = build_feature_matrix()
print(f"Feature matrix shape: {X.shape}")
n_features = X.shape[1]
print(f"Features per student: {n_features}")

## 3. Normalize Features

In [None]:
# Standardise every feature column so no single feature dominates the
# distance computations used by the clustering and similarity cells.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(f"Scaled feature matrix: {X_scaled.shape}")
overall_mean = X_scaled.mean()
overall_std = X_scaled.std()
print(f"Mean: {overall_mean:.4f}, Std: {overall_std:.4f}")

## 4. KMeans Clustering

In [None]:
# Partition the scaled features into 7 clusters (seeded for reproducibility).
n_clusters = 7
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_scaled)

print("KMeans clustering complete")
print("\nCluster distribution:")
cluster_counts = Counter(cluster_labels)
# Report each cluster's size and its share of all students, by cluster id.
for cluster_id in sorted(cluster_counts):
    count = cluster_counts[cluster_id]
    print(f"  Cluster {cluster_id}: {count} students ({count/len(student_ids)*100:.1f}%)")

## 5. Evaluate Clustering Quality

In [None]:
# Internal validity metric 1: silhouette (cohesion vs. separation)
silhouette = silhouette_score(X_scaled, cluster_labels)
print("Silhouette Score: {:.3f}".format(silhouette))
print("  (Range: -1 to 1, higher is better)")

# Internal validity metric 2: Davies-Bouldin (average cluster similarity)
davies_bouldin = davies_bouldin_score(X_scaled, cluster_labels)
print("\nDavies-Bouldin Score: {:.3f}".format(davies_bouldin))
print("  (Lower is better)")

## 6. Map Clusters to Career Labels

In [None]:
def map_cluster_to_career(cluster_id, student_indices, ids=None, features_df=None):
    """Return the most common predicted career among a cluster's students.

    Parameters
    ----------
    cluster_id : int
        Cluster identifier. Not used in the computation; kept so existing
        callers keep working.
    student_indices : iterable of int
        Positions into ``ids`` of the students belonging to the cluster.
    ids : sequence, optional
        Student ids aligned with the feature matrix rows. Defaults to the
        notebook-level ``student_ids``.
    features_df : pandas.DataFrame, optional
        Frame with ``StudentID`` and (optionally) ``predicted_career``
        columns. Defaults to the notebook-level ``df_features``.

    Returns
    -------
    str
        The most frequent career label, or "Other" when no student in the
        cluster can be found in ``features_df``. Students whose career is
        missing (NaN) or whose frame lacks a ``predicted_career`` column
        count as "Other".
    """
    if ids is None:
        ids = student_ids
    if features_df is None:
        features_df = df_features

    # Build the StudentID -> career lookup once instead of scanning the whole
    # frame for every student. keep='first' mirrors the previous behaviour of
    # taking the first matching row when an id is duplicated.
    first_rows = features_df.drop_duplicates(subset='StudentID', keep='first')
    if 'predicted_career' in first_rows.columns:
        career_values = first_rows['predicted_career'].tolist()
    else:
        career_values = ['Other'] * len(first_rows)
    career_by_student = dict(zip(first_rows['StudentID'].tolist(), career_values))

    careers = []
    for idx in student_indices:
        student_id = ids[idx]
        if student_id not in career_by_student:
            continue  # student absent from the features frame
        career = career_by_student[student_id]
        # A NaN prediction must not become a cluster label.
        careers.append('Other' if pd.isna(career) else career)

    if careers:
        return Counter(careers).most_common(1)[0][0]
    return "Other"

# Label every cluster with the dominant predicted career of its members.
# Several clusters can end up with the same label.
cluster_career_map = {}
for cluster_id in range(n_clusters):
    cluster_indices = np.where(cluster_labels == cluster_id)[0]
    career_label = map_cluster_to_career(cluster_id, cluster_indices)
    cluster_career_map[cluster_id] = career_label
    print(f"Cluster {cluster_id} → {career_label}")

## 7. Visualize Clusters with t-SNE

In [None]:
# t-SNE dimensionality reduction
print("Running t-SNE (this may take a minute)...")
# scikit-learn requires perplexity < n_samples; cap it so the cell also runs
# on small cohorts. With more than 30 students this is still 30.
tsne_perplexity = min(30, max(1, len(X_scaled) - 1))
tsne = TSNE(n_components=2, random_state=42, perplexity=tsne_perplexity)
X_tsne = tsne.fit_transform(X_scaled)

# Plot the 2-D embedding, coloured by KMeans cluster id
plt.figure(figsize=(14, 10))
scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=cluster_labels, 
                     cmap='tab10', alpha=0.6, s=50, edgecolors='black', linewidth=0.5)
plt.colorbar(scatter, label='Cluster')
plt.title('Student Clusters (t-SNE Visualization)', fontsize=18, fontweight='bold')
plt.xlabel('t-SNE Component 1', fontsize=14)
plt.ylabel('t-SNE Component 2', fontsize=14)
plt.grid(alpha=0.3, linestyle='--')
plt.tight_layout()
plt.show()

print("✓ t-SNE visualization complete")

## 8. UMAP Visualization (Optional Enhancement)

In [None]:
# Only runs when the optional umap-learn package was imported successfully.
if UMAP_AVAILABLE:
    print("Running UMAP (this may take a minute)...")
    
    # UMAP dimensionality reduction (seeded for reproducibility)
    reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=15, min_dist=0.1)
    X_umap = reducer.fit_transform(X_scaled)
    
    # Plot the 2-D embedding, coloured by KMeans cluster id
    plt.figure(figsize=(14, 10))
    scatter = plt.scatter(X_umap[:, 0], X_umap[:, 1], c=cluster_labels, 
                         cmap='tab10', alpha=0.6, s=50, edgecolors='black', linewidth=0.5)
    plt.colorbar(scatter, label='Cluster')
    plt.title('Student Clusters (UMAP Visualization)', fontsize=18, fontweight='bold')
    plt.xlabel('UMAP Component 1', fontsize=14)
    plt.ylabel('UMAP Component 2', fontsize=14)
    plt.grid(alpha=0.3, linestyle='--')
    plt.tight_layout()
    plt.show()
    
    print("✓ UMAP visualization complete")
    print("\n📊 UMAP vs t-SNE:")
    print("  - UMAP preserves global structure better")
    print("  - UMAP is faster for large datasets")
    print("  - t-SNE focuses on local neighborhoods")
else:
    print("⚠ UMAP not available")
    print("Install with: pip install umap-learn")

## 9. 3D PCA Visualization

In [None]:
# 3D PCA for cluster visualization
# Importing Axes3D registers the '3d' projection on older matplotlib versions;
# the name itself is not referenced below.
from mpl_toolkits.mplot3d import Axes3D

# Seeded like the other reducers so the projection is reproducible even if
# scikit-learn selects a randomized SVD solver.
pca_3d = PCA(n_components=3, random_state=42)
X_pca_3d = pca_3d.fit_transform(X_scaled)

# Create 3D plot
fig = plt.figure(figsize=(14, 10))
ax = fig.add_subplot(111, projection='3d')

scatter = ax.scatter(X_pca_3d[:, 0], X_pca_3d[:, 1], X_pca_3d[:, 2], 
                     c=cluster_labels, cmap='tab10', alpha=0.6, s=50, 
                     edgecolors='black', linewidth=0.5)

ax.set_xlabel('PC1', fontsize=12)
ax.set_ylabel('PC2', fontsize=12)
ax.set_zlabel('PC3', fontsize=12)
ax.set_title('Student Clusters (3D PCA Visualization)', fontsize=18, fontweight='bold')

# Add colorbar
cbar = plt.colorbar(scatter, ax=ax, pad=0.1)
cbar.set_label('Cluster', fontsize=12)

plt.tight_layout()
plt.show()

print("✓ 3D PCA visualization complete")
# Share of total variance captured by the three plotted components.
print(f"Explained variance: {pca_3d.explained_variance_ratio_.sum():.2%}")

## 10. Compute Similar Students

In [None]:
# Compute cosine similarity between every pair of students
print("Computing similarity matrix...")
similarity_matrix = cosine_similarity(X_scaled)

# Find top 10 similar students
# The student is removed from their own ranking by index rather than by
# assuming they sort first: with tied scores (identical feature vectors) or an
# all-zero scaled vector, self is not guaranteed to be at position 0.
similar_students = {}
for i, student_id in enumerate(student_ids):
    ranked = np.argsort(similarity_matrix[i])[::-1]  # most similar first
    top_indices = ranked[ranked != i][:10]
    similar_students[student_id] = [student_ids[idx] for idx in top_indices]

print(f"✓ Computed similarities for {len(similar_students)} students")
print(f"\nExample - Similar students to S0001:")
# Prints an empty list when the id 'S0001' is not present in the data.
print(similar_students.get('S0001', [])[:5])

## 11. Generate Cluster Profiles

In [None]:
# Build one summary profile per cluster.
# Several clusters can share the same dominant career label. Keying the
# profiles by the bare label would let later clusters overwrite earlier ones,
# so a shared label is disambiguated with the cluster id. Labels used by a
# single cluster keep the plain label as their key.
cluster_profiles = {}
label_usage = Counter(cluster_career_map[cid] for cid in range(n_clusters))

for cluster_id in range(n_clusters):
    cluster_indices = np.where(cluster_labels == cluster_id)[0]
    cluster_student_ids = [student_ids[idx] for idx in cluster_indices]
    
    cluster_students = df_students[df_students['StudentID'].isin(cluster_student_ids)]
    
    # Fall back to 0 when the student table lacks the column entirely.
    avg_gpa = cluster_students['GPA'].mean() if 'GPA' in cluster_students else 0
    avg_attendance = cluster_students['Attendance'].mean() if 'Attendance' in cluster_students else 0
    
    # Get top missing skills (only each student's first five are counted)
    all_missing_skills = []
    for sid in cluster_student_ids:
        profile = profiles_map.get(sid, {})
        missing = profile.get('skill_gaps', {}).get('missing_skills', [])
        all_missing_skills.extend(missing[:5])
    
    top_missing = Counter(all_missing_skills).most_common(10)
    
    career_label = cluster_career_map[cluster_id]
    if label_usage[career_label] > 1:
        profile_key = f"{career_label} (cluster {cluster_id})"
    else:
        profile_key = career_label
    
    cluster_profiles[profile_key] = {
        "cluster_id": int(cluster_id),
        "career_label": career_label,
        "member_count": len(cluster_student_ids),
        "avg_gpa": float(avg_gpa),
        "avg_attendance": float(avg_attendance),
        "top_missing_skills": [skill for skill, count in top_missing]
    }

# Display cluster profiles
pd.DataFrame(cluster_profiles).T

## 12. Save Outputs

In [None]:
# Save clusters.json (career label -> list of student ids)
# Several clusters can share a career label, so the ids are accumulated per
# label; plain assignment would keep only the last cluster's students and
# drop everyone else from the file.
clusters_output = {}
for cluster_id in range(n_clusters):
    cluster_indices = np.where(cluster_labels == cluster_id)[0]
    cluster_student_ids = [student_ids[idx] for idx in cluster_indices]
    career_label = cluster_career_map[cluster_id]
    clusters_output.setdefault(career_label, []).extend(cluster_student_ids)

with open(BASE / "clusters.json", "w") as f:
    json.dump(clusters_output, f, indent=2)
print("✓ Saved clusters.json")

# Save similar_students.json
with open(BASE / "similar_students.json", "w") as f:
    json.dump(similar_students, f, indent=2)
print("✓ Saved similar_students.json")

# Save cluster_profiles.json
with open(BASE / "cluster_profiles.json", "w") as f:
    json.dump(cluster_profiles, f, indent=2)
print("✓ Saved cluster_profiles.json")

# Save cluster assignments (student id -> numeric cluster and its career label)
cluster_assignments = {
    student_ids[i]: {
        "cluster_id": int(cluster_labels[i]),  # plain int: NumPy ints are not JSON-serialisable
        "cluster_label": cluster_career_map[cluster_labels[i]]
    }
    for i in range(len(student_ids))
}

with open(BASE / "cluster_assignments.json", "w") as f:
    json.dump(cluster_assignments, f, indent=2)
print("✓ Saved cluster_assignments.json")

print("\n✅ All outputs saved successfully!")
print(f"\n📊 Summary:")
print(f"  - {len(student_ids)} students clustered")
print(f"  - {n_clusters} clusters created")
print(f"  - Silhouette score: {silhouette:.3f}")
print(f"  - Similarity network: {len(similar_students)} students")