# 03 — Cluster Team Playing Styles

Clusters teams using the 6 style qualities (excluding outcome) with three methods:
- **K-Means** — hard clustering, spherical assumption
- **Hierarchical (Ward)** — reveals nested structure
- **Gaussian Mixture Models** — soft clustering (probabilistic)

**Output:** `team_style_clusters.parquet` with cluster assignments for every team-season.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, silhouette_samples, adjusted_rand_score
from scipy.cluster.hierarchy import dendrogram, linkage
import warnings
warnings.filterwarnings('ignore')

# Locate the thesis workspace: the first sub-directory of Documents whose
# name contains both "Jorge" and "MacBook".
docs = Path("/Users/jorgepadilla/Documents")
for d in docs.iterdir():
    if "Jorge" in d.name and "MacBook" in d.name and d.is_dir():
        BASE = d / "thesis_data" / "raw_data"
        NB_DIR = d / "thesis_data" / "notebooks" / "team_styles"
        break
else:
    # Without this guard a missing workspace only shows up further down as a
    # NameError on BASE / NB_DIR, which hides the real cause.
    raise FileNotFoundError(
        f"No directory with 'Jorge' and 'MacBook' in its name found under {docs}"
    )

qdf = pd.read_parquet(BASE / "Teams_stats" / "team_qualities.parquet")
style_cols = ["defence", "defensive_transition", "attacking_transition", "attack", "penetration", "chance_creation"]

PALETTE = ["#1A237E", "#1565C0", "#1E88E5", "#42A5F5", "#546E7A", "#78909C", "#37474F", "#455A64"]
LABELS_PRETTY = [c.replace('_', ' ').title() for c in style_cols]

# Prepare clustering data: drop rows with any missing style quality, remember
# which rows of qdf survived (valid_idx), then standardize each column.
X_raw = qdf[style_cols].dropna()
valid_idx = X_raw.index
scaler = StandardScaler()
X = scaler.fit_transform(X_raw)
print(f"Clustering: {X.shape[0]:,} team-seasons × {X.shape[1]} style qualities")

## 1. K-Means: Elbow & Silhouette Analysis

In [None]:
K_RANGE = range(2, 11)
km_results = {}

# Sweep k: fit K-Means once per candidate and keep the fitted model, its
# labels and both quality metrics so later cells can reuse them without refitting.
for n_clusters in K_RANGE:
    model = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    assignments = model.fit_predict(X)
    score = silhouette_score(X, assignments)
    km_results[n_clusters] = {
        "inertia": model.inertia_,
        "silhouette": score,
        "labels": assignments,
        "model": model,
    }
    print(f"  k={n_clusters:2d}: inertia={model.inertia_:,.0f}  silhouette={score:.4f}")

# Highest silhouette wins; on ties the smallest k is kept (first maximum).
best_k = max(km_results, key=lambda k: km_results[k]["silhouette"])
best_sil = km_results[best_k]["silhouette"]
print(f"\nBest k by silhouette: {best_k} (score={best_sil:.4f})")

ks = list(K_RANGE)
inertias = [km_results[k]["inertia"] for k in ks]
sils = [km_results[k]["silhouette"] for k in ks]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: inertia curve (elbow method)
ax1.plot(ks, inertias, "o-", color=PALETTE[0], lw=2)
ax1.set_ylabel("Inertia", fontsize=12)
ax1.set_title("Elbow Method", fontsize=13, fontweight="bold")

# Right panel: silhouette curve with the selected k highlighted
ax2.plot(ks, sils, "o-", color=PALETTE[1], lw=2)
ax2.axvline(best_k, color="red", ls="--", lw=1, label=f"Best k={best_k}")
ax2.scatter([best_k], [best_sil], color="red", s=100, zorder=5)
ax2.set_ylabel("Silhouette Score", fontsize=12)
ax2.set_title("Silhouette Analysis", fontsize=13, fontweight="bold")
ax2.legend()

# Settings shared by both panels
for panel in (ax1, ax2):
    panel.set_xlabel("Number of clusters (k)", fontsize=12)
    panel.set_xticks(ks)

plt.suptitle("K-Means: Optimal Number of Clusters", fontsize=15, fontweight="bold", y=1.02)
plt.tight_layout()
plt.savefig(NB_DIR / "kmeans_elbow_silhouette.png", dpi=150, bbox_inches="tight")
plt.show()

## 2. K-Means Cluster Profiles (Radar Charts)

In [None]:
def radar_chart(centers, labels_pretty, cluster_names, title, save_path, palette,
                ylim=(-2, 2)):
    """Draw one closed polygon per cluster centre on a polar (radar) axis.

    Args:
        centers: 2-D array-like, one row per cluster and one column per
            entry of ``labels_pretty``.
        labels_pretty: Axis labels, one per dimension, placed clockwise
            starting from the top of the chart.
        cluster_names: Legend label for each row of ``centers``.
        title: Figure title.
        save_path: Destination passed to ``plt.savefig``.
        palette: Sequence of colours; reused cyclically when there are more
            clusters than colours.
        ylim: ``(min, max)`` radial range. Defaults to ``(-2, 2)``; widen it
            when centre values fall outside that range, otherwise they are
            clipped by the axis.

    The figure is saved to ``save_path`` and then shown; nothing is returned.
    """
    # Accept nested lists as well as ndarrays: rows are sliced and converted
    # with ndarray methods below.
    centers = np.asarray(centers, dtype=float)
    n_dims = len(labels_pretty)
    angles = np.linspace(0, 2 * np.pi, n_dims, endpoint=False).tolist()
    angles += angles[:1]  # repeat the first angle to close the polygon

    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))
    ax.set_theta_offset(np.pi / 2)   # first axis at 12 o'clock
    ax.set_theta_direction(-1)       # proceed clockwise
    ax.set_thetagrids(np.degrees(angles[:-1]), labels_pretty, fontsize=11)

    for i, (center, name) in enumerate(zip(centers, cluster_names)):
        # Repeat the first value so the outline returns to its start.
        values = center.tolist() + center[:1].tolist()
        color = palette[i % len(palette)]
        ax.plot(angles, values, "o-", lw=2.5, color=color, label=name)
        ax.fill(angles, values, alpha=0.1, color=color)

    ax.set_ylim(*ylim)
    ax.set_title(title, fontsize=14, fontweight="bold", pad=30)
    ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1), fontsize=10)
    plt.tight_layout()
    plt.savefig(save_path, dpi=150, bbox_inches="tight")
    plt.show()

# Reuse the run already fitted at best_k during the sweep (nothing is refitted)
best_run = km_results[best_k]
km_best, km_labels = best_run["model"], best_run["labels"]

# Cluster centres as fitted, i.e. in the standardized feature space
centers_scaled = km_best.cluster_centers_

# Map the centres back through the scaler so they are expressed in
# quality-score units for interpretability
centers_orig = scaler.inverse_transform(centers_scaled)

# Auto-name clusters based on dominant qualities
def auto_name_cluster(center, style_cols):
    """Derive a short label for a cluster from its extreme dimensions.

    The highest-valued dimension wins if it exceeds 0.5 ("High <dim>");
    otherwise the lowest-valued dimension is used if it is below -0.5
    ("Low <dim>"); otherwise the cluster is "Balanced (<highest dim>)".
    """
    def pretty(idx):
        # "attacking_transition" -> "Attacking Transition"
        return style_cols[idx].replace('_', ' ').title()

    ascending = np.argsort(center)
    weakest, strongest = ascending[0], ascending[-1]

    if center[strongest] > 0.5:
        return f"High {pretty(strongest)}"
    if center[weakest] < -0.5:
        return f"Low {pretty(weakest)}"
    return f"Balanced ({pretty(strongest)})"

# Print a text profile per cluster (size, auto-generated name, one bar per
# quality) and collect the legend names used by the radar chart.
style_cols_arr = np.array(style_cols)
cluster_names_km = []
print(f"K-Means (k={best_k}) Cluster Profiles:\n")
for cluster_idx in range(best_k):
    center = centers_orig[cluster_idx]
    size = (km_labels == cluster_idx).sum()
    label = auto_name_cluster(center, style_cols_arr)
    cluster_names_km.append(f"C{cluster_idx}: {label}")
    print(f"  Cluster {cluster_idx} — \"{label}\" (n={size:,})")
    for col, value in zip(style_cols, center):
        # Bar length: 5 blocks per unit, shifted so that -2 maps to zero blocks.
        n_blocks = max(0, int((value + 2) * 5))
        print(f"    {col:30s}: {value:+.3f}  {'█' * n_blocks}")
    print()

radar_chart(
    centers_orig,
    LABELS_PRETTY,
    cluster_names_km,
    f"K-Means Team Style Clusters (k={best_k})",
    NB_DIR / f"kmeans_radar_k{best_k}.png",
    PALETTE,
)

## 3. Hierarchical Clustering

In [None]:
# Dendrogram on a random subsample (Ward linkage). The sample is capped at
# 5000 rows but is smaller when the data set itself has fewer rows.
np.random.seed(42)
sample_n = min(5000, len(X))
sample_idx = np.random.choice(len(X), sample_n, replace=False)
X_sample = X[sample_idx]

Z = linkage(X_sample, method="ward")

fig, ax = plt.subplots(figsize=(16, 6))
dendrogram(Z, truncate_mode="lastp", p=30, ax=ax, color_threshold=0,
           above_threshold_color=PALETTE[0])
# Report the sample size actually used rather than a hard-coded 5000.
ax.set_title(f"Hierarchical Clustering Dendrogram (Ward, n={sample_n} sample)",
             fontsize=14, fontweight="bold")
ax.set_xlabel("Cluster size")
ax.set_ylabel("Distance")
plt.tight_layout()
plt.savefig(NB_DIR / "hierarchical_dendrogram.png", dpi=150, bbox_inches="tight")
plt.show()

# Fit hierarchical clustering at best_k on the full data. Ward is
# scikit-learn's default linkage; spelled out so it visibly matches the
# dendrogram above.
hc = AgglomerativeClustering(n_clusters=best_k, linkage="ward")
hc_labels = hc.fit_predict(X)
hc_sil = silhouette_score(X, hc_labels)
print(f"\nHierarchical (k={best_k}): silhouette = {hc_sil:.4f}")
print(f"Cluster sizes: {np.bincount(hc_labels)}")

## 4. Gaussian Mixture Models

In [None]:
gmm_bic = {}
gmm_aic = {}
gmm_models = {}

# Fit one mixture per candidate component count and record both information
# criteria; fitted models are kept so the chosen one can be reused below.
for k in K_RANGE:
    gmm = GaussianMixture(n_components=k, random_state=42, n_init=3)
    gmm.fit(X)
    gmm_bic[k] = gmm.bic(X)
    gmm_aic[k] = gmm.aic(X)
    gmm_models[k] = gmm

best_k_gmm = min(gmm_bic, key=gmm_bic.get)
print(f"Best k by BIC: {best_k_gmm}")

fig, ax = plt.subplots(figsize=(10, 5))
ks = list(K_RANGE)
ax.plot(ks, [gmm_bic[k] for k in ks], "o-", color=PALETTE[0], lw=2, label="BIC")
ax.plot(ks, [gmm_aic[k] for k in ks], "s--", color=PALETTE[1], lw=2, label="AIC")
ax.axvline(best_k_gmm, color="red", ls="--", lw=1)
ax.set_xlabel("Number of components", fontsize=12)
ax.set_ylabel("Information Criterion", fontsize=12)
ax.set_title("GMM: Model Selection (BIC/AIC)", fontsize=13, fontweight="bold")
ax.set_xticks(ks)
ax.legend()
plt.tight_layout()
plt.savefig(NB_DIR / "gmm_model_selection.png", dpi=150, bbox_inches="tight")
plt.show()

# Reuse the mixture already fitted with best_k components. The K-Means
# best_k (not best_k_gmm) is used deliberately so all three methods are
# compared at the same number of clusters.
gmm_final = gmm_models[best_k]
gmm_labels = gmm_final.predict(X)
gmm_proba = gmm_final.predict_proba(X)
gmm_sil = silhouette_score(X, gmm_labels)
print(f"\nGMM (k={best_k}): silhouette = {gmm_sil:.4f}")
# minlength keeps one count per component even if trailing components
# received no hard assignments.
print(f"Cluster sizes: {np.bincount(gmm_labels, minlength=best_k)}")

# Show soft assignment example (at most 5 rows, fewer if the data set is smaller)
n_preview = min(5, len(gmm_proba))
print(f"\nSoft assignment example (first {n_preview} teams):")
for i, probs in enumerate(gmm_proba[:n_preview]):
    print(f"  Team {i}: " + "  ".join(f"C{j}={p:.2f}" for j, p in enumerate(probs)))

## 5. Method Comparison

In [None]:
km_sil = km_results[best_k]["silhouette"]

# Adjusted Rand Index between methods (pairwise agreement of the partitions)
ari_km_hc = adjusted_rand_score(km_labels, hc_labels)
ari_km_gmm = adjusted_rand_score(km_labels, gmm_labels)
ari_hc_gmm = adjusted_rand_score(hc_labels, gmm_labels)

print(f"{'Method':<20} {'Silhouette':>12} {'k':>5}")
print("-" * 40)
print(f"{'K-Means':<20} {km_sil:>12.4f} {best_k:>5}")
print(f"{'Hierarchical':<20} {hc_sil:>12.4f} {best_k:>5}")
print(f"{'GMM':<20} {gmm_sil:>12.4f} {best_k:>5}")
print(f"\nAdjusted Rand Index (agreement between methods):")
print(f"  K-Means vs Hierarchical: {ari_km_hc:.4f}")
print(f"  K-Means vs GMM:          {ari_km_gmm:.4f}")
print(f"  Hierarchical vs GMM:     {ari_hc_gmm:.4f}")

# PCA scatter comparison: project to 2 components purely for plotting
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

fig, axes = plt.subplots(1, 3, figsize=(20, 6))
all_labels = [(km_labels, "K-Means"), (hc_labels, "Hierarchical"), (gmm_labels, "GMM")]
# Silhouette scores were already computed on exactly these label arrays;
# reuse them instead of repeating the pairwise-distance computation per panel.
method_sils = {"K-Means": km_sil, "Hierarchical": hc_sil, "GMM": gmm_sil}

for ax, (method_labels, name) in zip(axes, all_labels):
    for c in range(best_k):
        mask = method_labels == c
        ax.scatter(X_pca[mask, 0], X_pca[mask, 1], alpha=0.1, s=5,
                   color=PALETTE[c % len(PALETTE)], label=f"C{c}")
    ax.set_title(f"{name} (sil={method_sils[name]:.3f})", fontsize=13, fontweight="bold")
    ax.set_xlabel(f"PC1 ({pca.explained_variance_ratio_[0]:.1%})")
    ax.set_ylabel(f"PC2 ({pca.explained_variance_ratio_[1]:.1%})")
    ax.legend(markerscale=3, fontsize=9)

plt.suptitle(f"Clustering Method Comparison (k={best_k})", fontsize=15, fontweight="bold", y=1.02)
plt.tight_layout()
plt.savefig(NB_DIR / "method_comparison.png", dpi=150, bbox_inches="tight")
plt.show()

## 6. Final Cluster Interpretation & Export

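Select the method with the highest silhouette score (all three are compared at the K-Means-selected k), describe each cluster by its highest and lowest mean qualities, and save the assignments.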

In [None]:
# Choose the method whose labelling has the highest silhouette score
methods = {
    "kmeans": (km_labels, km_sil),
    "hierarchical": (hc_labels, hc_sil),
    "gmm": (gmm_labels, gmm_sil),
}
best_method = max(methods, key=lambda name: methods[name][1])
final_labels, final_sil = methods[best_method]
print(f"Best method: {best_method} (silhouette={final_sil:.4f})")

# Attach the winning labels to the rows that went into clustering
result_df = qdf.loc[valid_idx].copy()
result_df["cluster_id"] = final_labels

banner = "=" * 70
print(f"\n{banner}")
print(f"FINAL CLUSTER PROFILES (k={best_k}, method={best_method})")
print(f"{banner}")

n_total = len(result_df)
cluster_summaries = []
for c_id in range(best_k):
    in_cluster = result_df["cluster_id"] == c_id
    n = in_cluster.sum()
    means = result_df.loc[in_cluster, style_cols].mean()

    # Description: up to two highest means above 0.3, then the single lowest
    # mean if it is below -0.3; "balanced" when nothing qualifies.
    highs = [f"high {dim.replace('_', ' ')}"
             for dim, val in means.nlargest(2).items() if val > 0.3]
    lows = [f"low {dim.replace('_', ' ')}"
            for dim, val in means.nsmallest(1).items() if val < -0.3]
    desc = ", ".join(highs + lows) or "balanced"

    summary_row = {"cluster_id": c_id, "n_teams": n, "description": desc}
    summary_row.update((col, means[col]) for col in style_cols)
    cluster_summaries.append(summary_row)

    print(f"\nCluster {c_id} — \"{desc}\" (n={n:,}, {n/n_total*100:.1f}%)")
    for col in style_cols:
        val = means[col]
        if abs(val) < 0.2:
            direction = "→"
        elif val > 0:
            direction = "↑"
        else:
            direction = "↓"
        print(f"    {col:30s}: {val:+.3f} {direction}")

summary_df = pd.DataFrame(cluster_summaries)
print(f"\nSummary table:")
summary_df

In [None]:
# Map each cluster id to the description built in the summary table
cluster_name_map = dict(zip(summary_df["cluster_id"], summary_df["description"]))
result_df["cluster_name"] = result_df["cluster_id"].map(cluster_name_map)

# Write the labelled team-seasons next to the input file (index not stored)
out_path = BASE / "Teams_stats" / "team_style_clusters.parquet"
result_df.to_parquet(out_path, index=False)
for report_line in (
    f"Saved: {out_path}",
    f"Shape: {result_df.shape}",
    f"Columns: {list(result_df.columns)}",
):
    print(report_line)

## 7. Publication-Quality Radar Chart

In [None]:
# Final radar: one polygon per cluster, drawn from the per-cluster mean of
# each style quality
cluster_ids = range(best_k)
centers_final = np.array([
    result_df.loc[result_df["cluster_id"] == cid, style_cols].mean().values
    for cid in cluster_ids
])
names_final = [f"{cluster_name_map[cid]}" for cid in cluster_ids]

radar_chart(
    centers_final,
    LABELS_PRETTY,
    names_final,
    f"Team Playing Style Clusters (k={best_k}, {best_method})",
    NB_DIR / "final_cluster_radar.png",
    PALETTE,
)

rule = "=" * 60
print(f"\n{rule}")
print(f"DONE — {best_k} team style clusters identified.")
print(f"Output: team_style_clusters.parquet")
print(f"{rule}")