# 02 — Player Sub-Roles: Clustering from Qualities (Path A)

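Cluster players into sub-roles using the 20 pre-computed Twelve Football quality scores.
For each position we sweep K-means over k = 2..8, pick the k with the highest silhouette
score, then also fit Hierarchical (Ward) and GMM at that k for comparison. The labels
exported at the end come from whichever of the three methods scores the highest silhouette.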

In [None]:
import matplotlib
matplotlib.use("Agg")

import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

# Dynamic path resolution: use the first sub-directory of `docs` whose name
# contains both "Jorge" and "MacBook".
docs = Path("/Users/jorgepadilla/Documents")
for d in docs.iterdir():
    if "Jorge" in d.name and "MacBook" in d.name and d.is_dir():
        BASE = d / "thesis_data" / "raw_data"
        NB_DIR = d / "thesis_data" / "notebooks" / "player_subroles"
        break
else:
    # Without this guard a missing folder only surfaces further down as a
    # NameError on NB_DIR, which hides the real cause.
    raise FileNotFoundError(
        f"No directory with 'Jorge' and 'MacBook' in its name found under {docs}"
    )

# Load prepared quality data
df = pd.read_parquet(NB_DIR / "player_data_qualities.parquet")
print(f"Loaded player_data_qualities: {df.shape}")

# Load feature maps (JSON text is UTF-8; do not rely on the platform default)
with open(NB_DIR / "feature_maps.json", "r", encoding="utf-8") as f:
    feature_maps = json.load(f)

# Mapping of position -> list of quality feature columns used for clustering
position_quality_features = feature_maps["position_quality_features"]
positions = list(position_quality_features.keys())
print(f"Positions: {positions}")

## 1. Clustering Pipeline

In [None]:
def cluster_position(df, position, feature_cols, k_range=range(2, 9)):
    """
    Cluster players within a position using their quality features.

    Rows whose ``from_position`` equals ``position`` are selected, rows with a
    missing value in any of ``feature_cols`` are dropped, and the remaining
    feature values are standardized. K-means is fitted once for every usable
    k in ``k_range``; the k with the highest silhouette score wins (the first
    such k on ties). Ward hierarchical clustering and a Gaussian mixture are
    then fitted at that k for comparison.

    Parameters:
        df: DataFrame holding a ``from_position`` column and ``feature_cols``.
        position: value compared against ``from_position``.
        feature_cols: list of column names used as clustering features.
        k_range: iterable of candidate cluster counts. Values for which the
            silhouette score is undefined (k < 2 or k > n_players - 1) are
            skipped.

    Returns a dict with cluster labels, scores, centers, and metadata:
        position, n_players, best_k, sil_scores / inertias (keyed by k),
        km_labels / hc_labels / gmm_labels, km_sil / hc_sil / gmm_sil,
        centers (K-means centers in standardized space), feature_cols,
        scaler (the fitted StandardScaler), pos_df (the filtered rows) and
        X_scaled (the standardized feature matrix).

    Raises:
        ValueError: if no k in ``k_range`` is usable for the number of
            players left after dropping rows with missing features.
    """
    pos_df = df[df["from_position"] == position].dropna(subset=feature_cols).copy()
    n_players = len(pos_df)

    # silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1,
    # so drop the k values that could only fail inside scikit-learn.
    k_values = list(k_range)
    candidate_ks = [k for k in k_values if 2 <= k <= n_players - 1]
    if not candidate_ks:
        raise ValueError(
            f"Cannot cluster position {position!r}: {n_players} player(s) remain after "
            f"dropping rows with missing features, and no k in {k_values} satisfies "
            f"2 <= k <= n_players - 1."
        )

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(pos_df[feature_cols].values)

    # K-means sweep. Every fit is kept so the winner can be reused below
    # instead of being fitted and scored a second time with the same seed.
    sil_scores = {}
    inertias = {}
    km_fits = {}
    for k in candidate_ks:
        km = KMeans(n_clusters=k, n_init=10, random_state=42)
        labels = km.fit_predict(X_scaled)
        sil_scores[k] = silhouette_score(X_scaled, labels)
        inertias[k] = km.inertia_
        km_fits[k] = (km, labels)

    best_k = max(sil_scores, key=sil_scores.get)
    km_best, km_labels = km_fits[best_k]

    # Comparison models at the chosen k. Ward is scikit-learn's default
    # linkage; it is spelled out because the analysis relies on it.
    hc = AgglomerativeClustering(n_clusters=best_k, linkage="ward")
    hc_labels = hc.fit_predict(X_scaled)

    gmm = GaussianMixture(n_components=best_k, random_state=42)
    gmm_labels = gmm.fit_predict(X_scaled)

    return {
        "position": position,
        "n_players": n_players,
        "best_k": best_k,
        "sil_scores": sil_scores,
        "inertias": inertias,
        "km_labels": km_labels,
        "hc_labels": hc_labels,
        "gmm_labels": gmm_labels,
        "km_sil": sil_scores[best_k],
        "hc_sil": silhouette_score(X_scaled, hc_labels),
        "gmm_sil": silhouette_score(X_scaled, gmm_labels),
        "centers": km_best.cluster_centers_,
        "feature_cols": feature_cols,
        "scaler": scaler,
        "pos_df": pos_df,
        "X_scaled": X_scaled,
    }


print("Clustering pipeline defined.")

## 2. Run Clustering for Each Position

In [None]:
# Fit every position and report the headline numbers as we go.
# `results` maps position -> the dict returned by cluster_position.
results = {}

# Display name -> key of the corresponding label array in a result dict
label_keys = {"KMeans": "km_labels", "HC": "hc_labels", "GMM": "gmm_labels"}

for pos in positions:
    feat_cols = position_quality_features[pos]
    short_feats = [c.replace('from_', '') for c in feat_cols]
    print(f"\n{'='*60}")
    print(f"Position: {pos}")
    print(f"Features ({len(feat_cols)}): {short_feats}")

    res = results[pos] = cluster_position(df, pos, feat_cols)

    print(f"N players (after dropping NaN): {res['n_players']:,}")
    print(f"Best k: {res['best_k']}")
    print(f"Silhouette scores:  KMeans={res['km_sil']:.3f}  HC={res['hc_sil']:.3f}  GMM={res['gmm_sil']:.3f}")

    # Members per cluster id, for each of the three methods
    for method, key in label_keys.items():
        cluster_ids, member_counts = np.unique(res[key], return_counts=True)
        sizes = ", ".join(f"c{cid}={cnt}" for cid, cnt in zip(cluster_ids, member_counts))
        print(f"  {method} cluster sizes: {sizes}")

print(f"\n{'='*60}")
print(f"Clustering complete for {len(results)} positions.")

## 3. Silhouette Analysis

In [None]:
# One panel per position, three panels per row. The grid is sized from the
# number of positions (a fixed 2x3 grid raises IndexError for more than six
# positions and leaves blank panels for fewer); six positions still give the
# original 2x3 layout at 15x9 inches.
n_cols = 3
n_rows = max(1, -(-len(positions) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4.5 * n_rows), squeeze=False)
fig.suptitle("K-Means Silhouette Score vs. Number of Clusters (Qualities)", fontsize=14, fontweight="bold")

for idx, pos in enumerate(positions):
    ax = axes[idx // n_cols, idx % n_cols]
    res = results[pos]
    ks = sorted(res["sil_scores"].keys())
    sils = [res["sil_scores"][k] for k in ks]

    # Full sweep as a line, with the selected k highlighted by a star
    ax.plot(ks, sils, "o-", color="#4A6FA5", linewidth=2, markersize=6)
    ax.plot(res["best_k"], res["sil_scores"][res["best_k"]], "*", color="#E8724A",
            markersize=18, zorder=5, label=f"Best k={res['best_k']}")
    ax.set_title(f"{pos} (n={res['n_players']:,})", fontsize=11)
    ax.set_xlabel("k")
    ax.set_ylabel("Silhouette Score")
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)

# Hide the panels of the last row that have no position assigned to them.
for unused_ax in axes.ravel()[len(positions):]:
    unused_ax.set_visible(False)

plt.tight_layout()
save_path = NB_DIR / "qualities_silhouette.png"
plt.savefig(save_path, dpi=150, bbox_inches="tight")
plt.show()
print(f"Saved: {save_path}")

## 4. Cluster Profiles (Radar Charts)

In [None]:
def make_radar_chart(centers_original, feature_labels, position, n_clusters, save_path):
    """
    Create a radar chart for cluster centers (in original scale).

    Parameters:
        centers_original: array indexed by cluster; row ``i`` holds the value
            of every feature for cluster ``i``, in ``feature_labels`` order.
        feature_labels: feature names, one per spoke; a ``from_`` prefix is
            stripped for display.
        position: position name, used in the chart title.
        n_clusters: number of leading rows of ``centers_original`` to draw.
        save_path: file the figure is written to.

    The figure is saved, shown, and then closed; nothing is returned.
    """
    n_features = len(feature_labels)
    angles = np.linspace(0, 2 * np.pi, n_features, endpoint=False).tolist()
    angles += angles[:1]  # close the polygon

    # Fixed palette; reused cyclically when there are more clusters than colors.
    colors = ["#4A6FA5", "#E8724A", "#6B9F6B", "#9B6FA5", "#C4A44A", "#5AAFAF", "#D46A6A", "#8B8B8B"]

    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
    ax.set_title(f"{position} — Cluster Profiles (Qualities)", fontsize=13, fontweight="bold", pad=20)

    for i in range(n_clusters):
        values = centers_original[i].tolist()
        values += values[:1]  # close
        ax.plot(angles, values, "o-", linewidth=2, label=f"Cluster {i}", color=colors[i % len(colors)])
        ax.fill(angles, values, alpha=0.1, color=colors[i % len(colors)])

    # Labels (the duplicated closing angle gets no tick of its own)
    short_labels = [lbl.replace("from_", "") for lbl in feature_labels]
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(short_labels, fontsize=8)
    ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1), fontsize=9)

    plt.tight_layout()
    plt.savefig(save_path, dpi=150, bbox_inches="tight")
    plt.show()
    # pyplot keeps every figure alive until it is closed; this function is
    # called once per position, so release the figure once it has been saved.
    plt.close(fig)
    print(f"Saved: {save_path}")


# Draw one radar chart per position from its K-means centers.
for pos in positions:
    res = results[pos]
    file_stem = pos.replace(' ', '_').lower()
    save_path = NB_DIR / f"qualities_radar_{file_stem}.png"
    make_radar_chart(
        # Centers live in standardized space; map them back to original scale
        res["scaler"].inverse_transform(res["centers"]),
        res["feature_cols"],
        pos,
        res["best_k"],
        save_path,
    )

## 5. Method Comparison

In [None]:
def _comparison_row(pos, res):
    """Build one summary row: silhouette per method and the best-scoring method."""
    sil_by_method = {
        "KMeans": res["km_sil"],
        "Hierarchical": res["hc_sil"],
        "GMM": res["gmm_sil"],
    }
    return {
        "Position": pos,
        "Best k": res["best_k"],
        "KMeans Sil": round(sil_by_method["KMeans"], 4),
        "Hierarchical Sil": round(sil_by_method["Hierarchical"], 4),
        "GMM Sil": round(sil_by_method["GMM"], 4),
        # On ties max() keeps the first method in the dict order above
        "Best Method": max(sil_by_method, key=sil_by_method.get),
    }


comparison_rows = [_comparison_row(pos, results[pos]) for pos in positions]
comp_df = pd.DataFrame(comparison_rows)

banner = "=" * 85
print(banner)
print("Method Comparison (Path A: Qualities)")
print(banner)
print(comp_df.to_string(index=False))

## 6. Cluster Naming and Export

In [None]:
# For each position: label players with the best-scoring method's clusters,
# profile every cluster in original feature scale, and derive a name from its
# strongest quality. The labeled rows of all positions are then exported.
all_labeled_dfs = []

# Method name -> key of its label array in a result dict
labels_key_by_method = {"KMeans": "km_labels", "Hierarchical": "hc_labels", "GMM": "gmm_labels"}

for pos in positions:
    res = results[pos]
    pos_df = res["pos_df"].copy()
    feature_cols = res["feature_cols"]
    best_k = res["best_k"]

    # Pick the best method's labels (highest silhouette; first wins on ties)
    methods_sil = {"KMeans": res["km_sil"], "Hierarchical": res["hc_sil"], "GMM": res["gmm_sil"]}
    best_method = max(methods_sil, key=methods_sil.get)
    labels = res[labels_key_by_method[best_method]]

    pos_df["cluster_id"] = labels
    pos_df["cluster_method"] = best_method

    # The profile of cluster i must describe the players exported under id i.
    # K-means centers only line up with K-means labels: the ids produced by the
    # other two methods are unrelated to the K-means center indices, so those
    # clusters are profiled by the mean of their own members instead.
    if best_method == "KMeans":
        centers_original = res["scaler"].inverse_transform(res["centers"])
    else:
        centers_original = (
            pos_df.groupby("cluster_id")[feature_cols]
            .mean()
            .reindex(range(best_k))  # an id without members becomes a NaN row
            .to_numpy()
        )

    # Analyze cluster profiles: top 3 qualities per cluster (by center value)
    print(f"\n{'='*60}")
    print(f"{pos} (k={best_k}, method={best_method})")
    print(f"{'='*60}")

    short_names = [c.replace("from_", "") for c in feature_cols]
    cluster_names = []

    for c_id in range(best_k):
        center = centers_original[c_id]
        # Rank features by value (higher = more defining)
        sorted_idx = np.argsort(center)[::-1]
        top3 = [(short_names[i], center[i]) for i in sorted_idx[:3]]

        # Name = first word of the position + dominant quality + cluster id;
        # the id suffix keeps names unique within a position.
        top_quality = short_names[sorted_idx[0]]
        name = f"{pos.split()[0]}_{top_quality.replace(' ', '_')}_{c_id}"
        cluster_names.append(name)

        n_in_cluster = np.sum(labels == c_id)
        print(f"\n  Cluster {c_id} (n={n_in_cluster:,}): '{name}'")
        print(f"    Top 3 defining qualities:")
        for q_name, q_val in top3:
            print(f"      {q_name:30s} = {q_val:.2f}")

    # Map cluster names
    name_map = dict(enumerate(cluster_names))
    pos_df["cluster_name"] = pos_df["cluster_id"].map(name_map)

    all_labeled_dfs.append(pos_df)

# Combine all positions
df_labeled = pd.concat(all_labeled_dfs, ignore_index=True)

# Select output columns
id_cols = ["wy_player_id", "from_position", "cluster_id", "cluster_name", "cluster_method"]
# Add all quality columns
all_qual_cols = [c for c in df_labeled.columns if c.startswith("from_") and c != "from_position" and c != "from_Minutes"]
# Also keep from_team_id, from_season, from_Minutes
extra_id = ["from_team_id", "from_season", "from_Minutes"]
# Check for player name
name_col = [c for c in df_labeled.columns if c in ["from_player_name", "from_short_name", "player_name"]]

out_cols = id_cols + name_col + extra_id + all_qual_cols
# Remove duplicates while preserving order, keeping only columns that exist
out_cols_unique = [c for c in dict.fromkeys(out_cols) if c in df_labeled.columns]

df_export = df_labeled[out_cols_unique].copy()

save_path = NB_DIR / "player_subroles_qualities.parquet"
df_export.to_parquet(save_path, index=False)
print(f"\nExported: {save_path}")
print(f"Shape: {df_export.shape}")
print(f"\nCluster distribution:")
print(df_export.groupby(["from_position", "cluster_name"]).size().reset_index(name="count").to_string(index=False))