# 03 — Player Sub-Roles: Clustering from Per-90 Z-Scores (Path B)

Cluster players into sub-roles using the ~75 per-90 z-score columns.
Because the feature space is high-dimensional, we first apply PCA to reduce
dimensions (retaining 90% variance), then cluster in PCA space.

In [None]:
import matplotlib
matplotlib.use("Agg")

import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

# Dynamic path resolution: scan the Documents folder for the first directory
# whose name contains both "Jorge" and "MacBook" and derive the data and
# notebook directories from it.
docs = Path("/Users/jorgepadilla/Documents")
for d in docs.iterdir():
    if "Jorge" in d.name and "MacBook" in d.name and d.is_dir():
        BASE = d / "thesis_data" / "raw_data"
        NB_DIR = d / "thesis_data" / "notebooks" / "player_subroles"
        break
else:
    # Without this guard a missing folder only surfaces further down as a
    # NameError on NB_DIR, which hides the real cause.
    raise FileNotFoundError(
        f"No directory with 'Jorge' and 'MacBook' in its name found under {docs}"
    )

# Load prepared z-score data
df = pd.read_parquet(NB_DIR / "player_data_zscores.parquet")
print(f"Loaded player_data_zscores: {df.shape}")

# Load feature maps (JSON text is read as UTF-8 regardless of the platform's
# locale default)
with open(NB_DIR / "feature_maps.json", "r", encoding="utf-8") as f:
    feature_maps = json.load(f)

# Mapping position -> list of z-score feature columns; its keys define the
# positions processed by the rest of the notebook.
position_zscore_features = feature_maps["position_zscore_features"]
positions = list(position_zscore_features.keys())
print(f"Positions: {positions}")

## 1. Dimensionality Reduction with PCA

In [None]:
pca_results = {}
variance_threshold = 0.90
# Derive the display text from the threshold so labels cannot drift from it.
threshold_label = f"{variance_threshold:.0%}"

# Size the subplot grid from the number of positions instead of assuming six;
# squeeze=False keeps `axes` two-dimensional even for a single row.
n_cols = 3
n_rows = max(1, -(-len(positions) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4.5 * n_rows), squeeze=False)
fig.suptitle("PCA Cumulative Explained Variance (Per-90 Z-Scores)", fontsize=14, fontweight="bold")

for idx, pos in enumerate(positions):
    ax = axes[idx // n_cols, idx % n_cols]
    feat_cols = position_zscore_features[pos]

    # Get position data, drop rows with NaN in any of this position's features
    pos_data = df[df["from_position"] == pos].dropna(subset=feat_cols).copy()
    X = pos_data[feat_cols].values

    # Standardize within the position subset before PCA
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Full PCA to obtain the cumulative explained-variance curve
    pca_full = PCA(random_state=42)
    pca_full.fit(X_scaled)
    cum_var = np.cumsum(pca_full.explained_variance_ratio_)

    # Smallest number of components whose cumulative variance reaches the
    # threshold. np.argmax on an all-False mask returns 0, so fall back to
    # every component when the threshold is never reached.
    reached = cum_var >= variance_threshold
    n_comp_90 = int(np.argmax(reached)) + 1 if reached.any() else len(cum_var)

    # Fit PCA with selected components; this fitted object is kept so cluster
    # centers can later be mapped back with inverse_transform.
    pca = PCA(n_components=n_comp_90, random_state=42)
    X_pca = pca.fit_transform(X_scaled)

    pca_results[pos] = {
        "pos_data": pos_data,
        "X_scaled": X_scaled,
        "X_pca": X_pca,
        "pca": pca,
        "scaler": scaler,
        "n_components": n_comp_90,
        "cum_var": cum_var,
        "feature_cols": feat_cols,
    }

    # Plot (at most the first 30 components)
    n_show = min(30, len(cum_var))
    ax.plot(range(1, n_show + 1), cum_var[:n_show], "o-", color="#4A6FA5", linewidth=2, markersize=4)
    ax.axhline(y=variance_threshold, color="#E8724A", linestyle="--", alpha=0.7, label=f"{threshold_label} threshold")
    ax.axvline(x=n_comp_90, color="#6B9F6B", linestyle="--", alpha=0.7, label=f"n={n_comp_90}")
    ax.set_title(f"{pos} (n={len(pos_data):,}, features={len(feat_cols)})", fontsize=10)
    ax.set_xlabel("# Components")
    ax.set_ylabel("Cumulative Variance")
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3)

    print(f"{pos}: {len(feat_cols)} features -> {n_comp_90} PCA components ({threshold_label} variance)")

# Hide grid cells that have no position assigned to them
for spare_ax in axes.ravel()[len(positions):]:
    spare_ax.set_visible(False)

plt.tight_layout()
save_path = NB_DIR / "zscores_pca_variance.png"
plt.savefig(save_path, dpi=150, bbox_inches="tight")
plt.show()
print(f"\nSaved: {save_path}")

## 2. Clustering in PCA Space

In [None]:
def cluster_position_pca(X_pca, pos_data, position, k_range=range(2, 9)):
    """
    Cluster players in PCA-reduced feature space.

    K-Means is fitted for every k in ``k_range`` and the k with the highest
    silhouette score is selected. Agglomerative clustering and a Gaussian
    mixture are then fitted with that same k for comparison.

    Parameters
    ----------
    X_pca : array of PCA coordinates, one row per player (already
        scaled/reduced by the caller).
    pos_data : the rows that ``X_pca`` was derived from; only its length is
        used here, and it is passed through in the result.
    position : label of the position being clustered (passed through).
    k_range : iterable of candidate cluster counts.

    Returns
    -------
    dict with keys ``position``, ``n_players``, ``best_k``, ``sil_scores``
    and ``inertias`` (both keyed by k), ``km_labels`` / ``hc_labels`` /
    ``gmm_labels``, ``km_sil`` / ``hc_sil`` / ``gmm_sil``, ``centers_pca``
    (K-Means centers for ``best_k``), ``pos_data`` and ``X_pca``.

    Raises
    ------
    ValueError
        If ``k_range`` yields no candidate k.
    """
    # K-means sweep. Each fitted model and labelling is kept so the winner
    # does not have to be refitted: with the fixed random_state a refit would
    # reproduce the same model, and its silhouette is already computed.
    sil_scores = {}
    inertias = {}
    km_models = {}
    km_labelings = {}
    for k in k_range:
        km = KMeans(n_clusters=k, n_init=10, random_state=42)
        km_labelings[k] = km.fit_predict(X_pca)
        sil_scores[k] = silhouette_score(X_pca, km_labelings[k])
        inertias[k] = km.inertia_
        km_models[k] = km

    if not sil_scores:
        raise ValueError("k_range must contain at least one candidate k")

    # First k with the maximal silhouette (ties resolve to the earliest k)
    best_k = max(sil_scores, key=sil_scores.get)
    km_best = km_models[best_k]
    km_labels = km_labelings[best_k]

    # Alternative methods at the same k
    hc = AgglomerativeClustering(n_clusters=best_k)
    hc_labels = hc.fit_predict(X_pca)

    gmm = GaussianMixture(n_components=best_k, random_state=42)
    gmm_labels = gmm.fit_predict(X_pca)

    return {
        "position": position,
        "n_players": len(pos_data),
        "best_k": best_k,
        "sil_scores": sil_scores,
        "inertias": inertias,
        "km_labels": km_labels,
        "hc_labels": hc_labels,
        "gmm_labels": gmm_labels,
        "km_sil": sil_scores[best_k],
        "hc_sil": silhouette_score(X_pca, hc_labels),
        "gmm_sil": silhouette_score(X_pca, gmm_labels),
        "centers_pca": km_best.cluster_centers_,
        "pos_data": pos_data,
        "X_pca": X_pca,
    }


# Run the clustering for every position and report a short summary:
# the selected k, per-method silhouette scores and per-method cluster sizes.
cluster_results = {}
banner = "=" * 60

for pos in positions:
    pr = pca_results[pos]
    print(f"\n{banner}")
    print(f"Position: {pos}  (PCA dims: {pr['n_components']})")

    cluster_results[pos] = cluster_position_pca(pr["X_pca"], pr["pos_data"], pos)
    res = cluster_results[pos]

    print(f"N players: {res['n_players']:,}")
    print(f"Best k: {res['best_k']}")
    print(f"Silhouette:  KMeans={res['km_sil']:.3f}  HC={res['hc_sil']:.3f}  GMM={res['gmm_sil']:.3f}")

    # Display name -> label array, in the order the methods are reported
    labelings = {
        "KMeans": res["km_labels"],
        "HC": res["hc_labels"],
        "GMM": res["gmm_labels"],
    }
    for method, labels in labelings.items():
        cluster_ids, cluster_counts = np.unique(labels, return_counts=True)
        sizes = ", ".join(f"c{cid}={cnt}" for cid, cnt in zip(cluster_ids, cluster_counts))
        print(f"  {method} cluster sizes: {sizes}")

print(f"\nClustering complete for {len(cluster_results)} positions.")

## 3. Silhouette Analysis

In [None]:
# One panel per position: K-Means silhouette score as a function of k, with
# the selected k highlighted.
# The grid is sized from the number of positions rather than assuming six;
# squeeze=False keeps `axes` two-dimensional even for a single row.
n_cols = 3
n_rows = max(1, -(-len(positions) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4.5 * n_rows), squeeze=False)
fig.suptitle("K-Means Silhouette Score vs. Number of Clusters (Z-Scores + PCA)", fontsize=14, fontweight="bold")

for idx, pos in enumerate(positions):
    ax = axes[idx // n_cols, idx % n_cols]
    res = cluster_results[pos]
    ks = sorted(res["sil_scores"])
    sils = [res["sil_scores"][k] for k in ks]

    ax.plot(ks, sils, "o-", color="#4A6FA5", linewidth=2, markersize=6)
    # Star marker on the k that maximized the silhouette score
    ax.plot(res["best_k"], res["sil_scores"][res["best_k"]], "*", color="#E8724A",
            markersize=18, zorder=5, label=f"Best k={res['best_k']}")
    ax.set_title(f"{pos} (n={res['n_players']:,})", fontsize=11)
    ax.set_xlabel("k")
    ax.set_ylabel("Silhouette Score")
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)

# Hide grid cells that have no position assigned to them
for spare_ax in axes.ravel()[len(positions):]:
    spare_ax.set_visible(False)

plt.tight_layout()
save_path = NB_DIR / "zscores_silhouette.png"
plt.savefig(save_path, dpi=150, bbox_inches="tight")
plt.show()
print(f"Saved: {save_path}")

## 4. Cluster Profiles (Radar Charts)

Since clustering was done in PCA space, we transform the PCA cluster centers back to the
original feature space to create interpretable radar charts. For readability, each chart
shows only the features with the highest variance across cluster centers (at most 12, the
default `max_features`).

In [None]:
def make_radar_chart_zscore(centers_original, feature_labels, position, n_clusters, save_path, max_features=12):
    """
    Draw and save a radar chart of cluster centers in original feature space.

    If there are more than ``max_features`` features, only those with the
    highest variance across the cluster centers are shown (kept in their
    original column order).

    Parameters
    ----------
    centers_original : 2-D array-like, one row per cluster and one column per
        entry of ``feature_labels``.
    feature_labels : list of feature names matching the columns.
    position : position name, used in the chart title.
    n_clusters : number of rows of ``centers_original`` to draw.
    save_path : destination of the PNG file.
    max_features : maximum number of spokes on the chart.
    """
    # Accept nested lists as well as arrays; column indexing below needs an array.
    centers_original = np.asarray(centers_original)

    # Select top features by variance across cluster centers
    if len(feature_labels) > max_features:
        var_across = np.var(centers_original, axis=0)
        top_idx = np.argsort(var_across)[::-1][:max_features]
        top_idx = np.sort(top_idx)  # keep original order
        centers_original = centers_original[:, top_idx]
        feature_labels = [feature_labels[i] for i in top_idx]

    n_features = len(feature_labels)
    angles = np.linspace(0, 2 * np.pi, n_features, endpoint=False).tolist()
    angles += angles[:1]  # repeat the first angle to close the polygon

    colors = ["#4A6FA5", "#E8724A", "#6B9F6B", "#9B6FA5", "#C4A44A", "#5AAFAF", "#D46A6A", "#8B8B8B"]

    fig, ax = plt.subplots(figsize=(9, 9), subplot_kw=dict(polar=True))
    try:
        ax.set_title(f"{position} — Cluster Profiles (Z-Scores, top-{n_features} features)",
                     fontsize=13, fontweight="bold", pad=20)

        for i in range(n_clusters):
            values = centers_original[i].tolist()
            values += values[:1]  # close the polygon
            color = colors[i % len(colors)]  # palette wraps for many clusters
            ax.plot(angles, values, "o-", linewidth=2, label=f"Cluster {i}", color=color)
            ax.fill(angles, values, alpha=0.1, color=color)

        # Clean up label names
        short_labels = [lbl.replace("from_z_score_", "").replace(" per 90", "/90") for lbl in feature_labels]
        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(short_labels, fontsize=7)
        ax.legend(loc="upper right", bbox_to_anchor=(1.35, 1.1), fontsize=9)

        # Operate on this figure explicitly rather than on pyplot's implicit
        # "current figure".
        fig.tight_layout()
        fig.savefig(save_path, dpi=150, bbox_inches="tight")
        plt.show()
        print(f"Saved: {save_path}")
    finally:
        # pyplot keeps every figure alive until it is closed; release it so
        # repeated calls do not accumulate open figures.
        plt.close(fig)


# Draw one radar chart per position from the K-Means centers.
for pos in positions:
    pr = pca_results[pos]
    res = cluster_results[pos]

    # The centers live in PCA space with shape (k, n_components). Undo the
    # PCA projection first, then undo the standardization, to express them
    # in the units of the original feature columns.
    centers_scaled = pr["pca"].inverse_transform(res["centers_pca"])
    centers_original = pr["scaler"].inverse_transform(centers_scaled)

    # File name: lower-cased position with spaces turned into underscores
    file_slug = pos.lower().replace(" ", "_")
    save_path = NB_DIR / f"zscores_radar_{file_slug}.png"

    make_radar_chart_zscore(
        centers_original,
        pr["feature_cols"],
        pos,
        res["best_k"],
        save_path,
    )

## 5. Method Comparison

In [None]:
# Build one summary row per position: PCA dimensionality, selected k, the
# silhouette score of each clustering method, and the method scoring highest.
comparison_rows = []
for pos in positions:
    res, pr = cluster_results[pos], pca_results[pos]
    methods_sil = {
        "KMeans": res["km_sil"],
        "Hierarchical": res["hc_sil"],
        "GMM": res["gmm_sil"],
    }
    best_method = max(methods_sil, key=methods_sil.get)

    row = {"Position": pos, "PCA Dims": pr["n_components"], "Best k": res["best_k"]}
    # Adds the "<method> Sil" columns in the same order as methods_sil
    for method_name, score in methods_sil.items():
        row[f"{method_name} Sil"] = round(score, 4)
    row["Best Method"] = best_method
    comparison_rows.append(row)

comp_df = pd.DataFrame(comparison_rows)
rule = "=" * 95
print(rule)
print("Method Comparison (Path B: Z-Scores + PCA)")
print(rule)
print(comp_df.to_string(index=False))

## 6. Cluster Naming and Export

In [None]:
# For every position: pick the labelling of the method with the highest
# silhouette score, derive a name for each cluster from its dominant feature,
# and collect the labelled rows for export.
all_labeled_dfs = []

for pos in positions:
    res = cluster_results[pos]
    pr = pca_results[pos]
    pos_df = pr["pos_data"].copy()
    feature_cols = pr["feature_cols"]
    best_k = res["best_k"]
    coords_pca = res["X_pca"]  # rows are in the same order as the label arrays

    # Pick best method's labels
    methods_sil = {"KMeans": res["km_sil"], "Hierarchical": res["hc_sil"], "GMM": res["gmm_sil"]}
    method_labels = {"KMeans": res["km_labels"], "Hierarchical": res["hc_labels"], "GMM": res["gmm_labels"]}
    best_method = max(methods_sil, key=methods_sil.get)
    labels = method_labels[best_method]

    pos_df["cluster_id"] = labels
    pos_df["cluster_method"] = best_method

    # Cluster centers must describe the SAME clusters as `labels`. The stored
    # centers belong to K-Means; cluster ids produced by the other methods are
    # unrelated to K-Means ids, so for those the center of each cluster is
    # taken as the mean PCA coordinate of its members. Ids without members
    # are skipped, since no row would receive their name.
    cluster_ids = [c_id for c_id in range(best_k) if np.any(labels == c_id)]
    if best_method == "KMeans":
        centers_pca = res["centers_pca"][cluster_ids]
    else:
        centers_pca = np.vstack([coords_pca[labels == c_id].mean(axis=0) for c_id in cluster_ids])

    # Transform centers back to original space for naming
    centers_scaled = pr["pca"].inverse_transform(centers_pca)
    centers_original = pr["scaler"].inverse_transform(centers_scaled)

    # Analyze cluster profiles
    print(f"\n{'='*60}")
    print(f"{pos} (k={best_k}, method={best_method})")
    print(f"{'='*60}")

    short_names = [c.replace("from_z_score_", "").replace(" per 90", "/90") for c in feature_cols]
    name_map = {}

    for c_id, center in zip(cluster_ids, centers_original):
        sorted_idx = np.argsort(center)[::-1]  # features by descending center value
        top3 = [(short_names[i], center[i]) for i in sorted_idx[:3]]

        top_metric = short_names[sorted_idx[0]]
        # Clean up name for label
        clean_top = top_metric.replace("/90", "").replace(" %", "").strip()
        clean_top = clean_top[:20]  # truncate if too long
        # The cluster id suffix keeps names distinct when two clusters share
        # the same top metric.
        name = f"{pos.split()[0]}_{clean_top.replace(' ', '_')}_{c_id}"
        name_map[c_id] = name

        n_in_cluster = np.sum(labels == c_id)
        print(f"\n  Cluster {c_id} (n={n_in_cluster:,}): '{name}'")
        print(f"    Top 3 defining z-score metrics:")
        for m_name, m_val in top3:
            print(f"      {m_name:45s} = {m_val:.3f}")

    pos_df["cluster_name"] = pos_df["cluster_id"].map(name_map)

    all_labeled_dfs.append(pos_df)

# Combine
df_labeled = pd.concat(all_labeled_dfs, ignore_index=True)

# Select output columns
id_cols = ["wy_player_id", "from_position", "cluster_id", "cluster_name", "cluster_method"]
extra_id = ["from_team_id", "from_season", "from_Minutes"]
name_col = [c for c in df_labeled.columns if c in ["from_player_name", "from_short_name", "player_name"]]
zscore_cols_in_df = [c for c in df_labeled.columns if c.startswith("from_z_score_")]

# Order-preserving de-duplication, keeping only columns that actually exist
out_cols = id_cols + name_col + extra_id + zscore_cols_in_df
out_cols_unique = [c for c in dict.fromkeys(out_cols) if c in df_labeled.columns]

df_export = df_labeled[out_cols_unique].copy()

save_path = NB_DIR / "player_subroles_zscores.parquet"
df_export.to_parquet(save_path, index=False)
print(f"\nExported: {save_path}")
print(f"Shape: {df_export.shape}")
print(f"\nCluster distribution:")
print(df_export.groupby(["from_position", "cluster_name"]).size().reset_index(name="count").to_string(index=False))