# 02 — Explore & Validate Team Style Qualities

Explores the 6 style qualities and the outcome quality computed in notebook 01: correlations between the style qualities, their relationship with results, a PCA projection, season-to-season stability, and distributions by competition size.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Locate the project folder under Documents. The folder is matched on the
# substrings "Jorge" and "MacBook" instead of an exact name.
docs = Path("/Users/jorgepadilla/Documents")

# Path.iterdir() yields entries in arbitrary order, so sort the candidates to
# pick the same folder on every run when more than one matches.
project_dir = next(
    (d for d in sorted(docs.iterdir())
     if "Jorge" in d.name and "MacBook" in d.name and d.is_dir()),
    None,
)
if project_dir is None:
    # Fail here with a clear message instead of a NameError on BASE further down.
    raise FileNotFoundError(
        f"No directory containing 'Jorge' and 'MacBook' found under {docs}"
    )

BASE = project_dir / "thesis_data" / "raw_data"
NB_DIR = project_dir / "thesis_data" / "notebooks" / "team_styles"
# Figures are written to NB_DIR by every section; make sure it exists.
NB_DIR.mkdir(parents=True, exist_ok=True)

# Team-season quality scores (presumably written by notebook 01) and the
# season-level team stats. `ts` is loaded but not used by the cells below.
qdf = pd.read_parquet(BASE / "Teams_stats" / "team_qualities.parquet")
ts = pd.read_parquet(BASE / "Teams_stats" / "team_stats_season.parquet")

# The six style dimensions, plus the outcome column used as the performance target.
style_cols = ["defence", "defensive_transition", "attacking_transition", "attack", "penetration", "chance_creation"]
all_q_cols = style_cols + ["outcome"]

# One colour per panel in the multi-panel figures (indexed by style-column position).
PALETTE = ["#1A237E", "#1565C0", "#1976D2", "#1E88E5", "#42A5F5", "#546E7A", "#78909C"]

print(f"Loaded: {len(qdf):,} team-seasons | {qdf.team_id.nunique():,} teams | {qdf.competition_id.nunique()} competitions")
# Last expression of the cell: summary statistics table rendered as cell output.
qdf[all_q_cols].describe().round(3)

## 1. Correlation Between Style Qualities

Ideally the 6 style dimensions should be relatively independent — high correlation would indicate redundant information.

In [None]:
# Pairwise Pearson correlations between the six style qualities.
corr = qdf[style_cols].corr()

# Human-readable tick labels, shared by both heatmap axes.
pretty_labels = [name.replace('_', ' ').title() for name in style_cols]

# Hide the strict upper triangle (k=1 keeps the diagonal visible) so each
# pair is shown only once.
upper_mask = np.triu(np.ones_like(corr, dtype=bool), k=1)

fig, ax = plt.subplots(figsize=(9, 7))
sns.heatmap(
    corr,
    mask=upper_mask,
    cmap=sns.diverging_palette(220, 20, as_cmap=True),
    center=0,
    annot=True,
    fmt=".2f",
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    ax=ax,
    xticklabels=pretty_labels,
    yticklabels=pretty_labels,
)
ax.set_title("Correlation Between Team Style Qualities", fontsize=14, fontweight="bold", pad=15)
plt.tight_layout()
plt.savefig(NB_DIR / "style_correlations.png", dpi=150, bbox_inches="tight")
plt.show()

# Collect every unordered pair whose absolute correlation exceeds 0.4.
n_cols = len(style_cols)
high_corr = [
    (style_cols[i], style_cols[j], corr.iloc[i, j])
    for i in range(n_cols)
    for j in range(i + 1, n_cols)
    if abs(corr.iloc[i, j]) > 0.4
]

if not high_corr:
    print("All correlations below |0.4| — qualities are reasonably independent.")
else:
    print("Notable correlations (|r| > 0.4):")
    # Strongest relationship first, regardless of sign.
    for a, b, r in sorted(high_corr, key=lambda item: abs(item[2]), reverse=True):
        print(f"  {a} ↔ {b}: r={r:.3f}")

## 2. Style vs Outcome

Which playing styles correlate with better results?

In [None]:
# One scatter panel per style quality against the outcome quality.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

for ax, col, color in zip(axes, style_cols, PALETTE):
    # Very low alpha so dense regions remain readable.
    ax.scatter(qdf[col], qdf["outcome"], alpha=0.03, s=5, color=color)

    # Pearson r. Series.corr already ignores rows where either value is
    # missing, so no manual dropna / label-based re-selection is needed.
    r = qdf[col].corr(qdf["outcome"])
    ax.set_title(f"{col.replace('_', ' ').title()}\nr = {r:.3f}", fontsize=12, fontweight="bold")
    ax.set_xlabel("Style quality score")
    ax.set_ylabel("Outcome quality")
    # Reference lines through zero on both axes.
    ax.axhline(0, color="gray", lw=0.5, ls="--")
    ax.axvline(0, color="gray", lw=0.5, ls="--")

plt.suptitle("Style Qualities vs Outcome (Team Performance)", fontsize=15, fontweight="bold", y=1.02)
plt.tight_layout()
plt.savefig(NB_DIR / "style_vs_outcome.png", dpi=150, bbox_inches="tight")
plt.show()

## 3. PCA of Style Qualities

Project the 6 style dimensions into 2D to see the overall landscape of team styles.

In [None]:
# Keep only rows with all six style qualities present, then standardise
# each column before running PCA.
X = qdf[style_cols].dropna()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=min(6, len(style_cols)))
X_pca = pca.fit_transform(X_scaled)

# Per-component and running explained variance.
variance_ratios = pca.explained_variance_ratio_
print("PCA Explained Variance:")
for pc_number, (ev, running_total) in enumerate(
    zip(variance_ratios, np.cumsum(variance_ratios)), start=1
):
    print(f"  PC{pc_number}: {ev:.1%} (cumulative: {running_total:.1%})")

# Scatter of the first two components, coloured by the outcome of the same
# rows that went into the PCA (selected through X's index).
fig, ax = plt.subplots(figsize=(12, 9))
points = ax.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=qdf.loc[X.index, "outcome"].values,
    cmap="RdYlBu_r",
    alpha=0.15,
    s=8,
    vmin=-2,
    vmax=2,
)
plt.colorbar(points, ax=ax, label="Outcome (team performance)", shrink=0.8)

# Loading arrows: one per style quality, from the origin to its (PC1, PC2)
# loading, stretched by a factor of 3 for visibility.
arrow_tips = pca.components_[:2].T * 3
for (tip_x, tip_y), col in zip(arrow_tips, style_cols):
    ax.annotate(
        "",
        xy=(tip_x, tip_y),
        xytext=(0, 0),
        arrowprops=dict(arrowstyle="->", color="#1A237E", lw=2),
    )
    # Place the label slightly beyond the arrow head.
    ax.text(
        tip_x * 1.12,
        tip_y * 1.12,
        col.replace('_', '\n').title(),
        fontsize=9,
        fontweight="bold",
        ha="center",
        va="center",
        color="#1A237E",
    )

ax.set_title("PCA of Team Style Qualities (colored by outcome)", fontsize=14, fontweight="bold")
ax.set_xlabel(f"PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)", fontsize=12)
ax.set_ylabel(f"PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)", fontsize=12)
for draw_line in (ax.axhline, ax.axvline):
    draw_line(0, color="gray", lw=0.3)
plt.tight_layout()
plt.savefig(NB_DIR / "pca_style_qualities.png", dpi=150, bbox_inches="tight")
plt.show()

## 4. Season Stability

How much do team styles change year-to-year? Low change = stable identity, high change = tactical shifts.

In [None]:
# Year-over-year change in each style quality, restricted to teams that
# appear in the same competition for at least 3 seasons.
group_keys = ["team_id", "competition_id"]
qdf_sorted = qdf.sort_values(group_keys + ["season"])

# Number of seasons per (team, competition) pair; keep pairs with >= 3.
team_comp_counts = qdf_sorted.groupby(group_keys).size()
valid_tc = team_comp_counts.loc[lambda counts: counts >= 3].index
print(f"Team-competition pairs with ≥3 seasons: {len(valid_tc):,}")

# Differences between successive rows of each qualifying pair.
# NOTE(review): rows are differenced in season order without checking that
# the seasons are adjacent — confirm whether gaps should be excluded.
yoy_diffs = {c: [] for c in style_cols}
for pair_key, grp in qdf_sorted.groupby(group_keys):
    if pair_key in valid_tc:
        by_season = grp.sort_values("season")
        for c in style_cols:
            # The first row of each pair has no predecessor; dropna removes it.
            yoy_diffs[c].extend(by_season[c].diff().dropna().values)

# One histogram per style quality, titled with the mean and std of the changes.
fig, axes = plt.subplots(2, 3, figsize=(16, 9))
for ax, col, color in zip(axes.flatten(), style_cols, PALETTE):
    vals = np.array(yoy_diffs[col])
    ax.hist(vals, bins=50, color=color, alpha=0.8, edgecolor="white")
    ax.axvline(0, color="red", lw=1, ls="--")
    ax.set_title(f"{col.replace('_', ' ').title()}\nμ={vals.mean():.3f}, σ={vals.std():.3f}",
                 fontsize=11, fontweight="bold")
    ax.set_xlabel("Year-over-year change")

plt.suptitle("Season-to-Season Style Stability (teams with ≥3 seasons)", fontsize=14, fontweight="bold", y=1.02)
plt.tight_layout()
plt.savefig(NB_DIR / "style_stability.png", dpi=150, bbox_inches="tight")
plt.show()

# Magnitude of change irrespective of direction.
print("\nAverage absolute YoY change per quality:")
for c in style_cols:
    abs_change = np.abs(yoy_diffs[c])
    print(f"  {c:30s}: {np.mean(abs_change):.3f} (median: {np.median(abs_change):.3f})")

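## 5. Quality Profiles by Competition Size

Compares the distribution of each style quality across competitions grouped by the number of teams in the competition-season (≤10, 11–18, 19–24, 25+).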

In [None]:
# Number of teams in each competition-season, attached to every team-season row.
comp_sizes = qdf.groupby(["competition_id", "season"]).size().reset_index(name="n_teams")
qdf_aug = qdf.merge(comp_sizes, on=["competition_id", "season"])

# Bucket competition-seasons by team count. Intervals are right-closed, so
# 10 falls in "Small", 18 in "Medium", 24 in "Large"; counts above 100 get NaN.
qdf_aug["comp_tier"] = pd.cut(qdf_aug["n_teams"], bins=[0, 10, 18, 24, 100],
                               labels=["Small (≤10)", "Medium (11-18)", "Large (19-24)", "Very large (25+)"])

# Tier order and labels are the same for every panel, so compute them once.
tiers = qdf_aug["comp_tier"].cat.categories
tier_labels = [str(t) for t in tiers]

fig, axes = plt.subplots(2, 3, figsize=(16, 9))
axes = axes.flatten()
for ax, col in zip(axes, style_cols):
    data_by_tier = [qdf_aug.loc[qdf_aug["comp_tier"] == t, col].dropna().values
                    for t in tiers]
    bp = ax.boxplot(data_by_tier, patch_artist=True, showfliers=False)
    # Set tick labels on the axes rather than through boxplot's `labels`
    # keyword, which Matplotlib 3.9 deprecated in favour of `tick_labels`;
    # this form works on both older and newer releases.
    ax.set_xticklabels(tier_labels)
    # One fill colour per tier.
    for patch, color in zip(bp["boxes"], PALETTE[:4]):
        patch.set_facecolor(color)
        patch.set_alpha(0.7)
    ax.set_title(col.replace('_', ' ').title(), fontsize=11, fontweight="bold")
    ax.tick_params(axis='x', rotation=20)

plt.suptitle("Style Quality Distributions by Competition Size", fontsize=14, fontweight="bold", y=1.02)
plt.tight_layout()
plt.savefig(NB_DIR / "quality_by_comp_size.png", dpi=150, bbox_inches="tight")
plt.show()

## 6. Summary

In [None]:
# Closing recap. The dataset and PCA figures are computed from `qdf` and `pca`;
# the stability and next-step lines are fixed text.
banner = "=" * 60
n_team_seasons = len(qdf)
n_competitions = qdf.competition_id.nunique()
pc12_share = sum(pca.explained_variance_ratio_[:2])

summary_lines = [
    banner,
    "EXPLORATION SUMMARY",
    banner,
    f"\n• Dataset: {n_team_seasons:,} team-seasons across {n_competitions} competitions",
    "• 6 style qualities + 1 outcome quality",
    f"• PCA: PC1+PC2 explain {pc12_share:.1%} of style variance",
    "• Season stability: styles change moderately year-to-year",
    "• Next step: cluster teams based on these 6 style dimensions → notebook 03",
]
for line in summary_lines:
    print(line)