# 01 — Calculate Team Style Qualities

Implements the supervisor's methodology: for each team-season, calculate **6 style qualities + 1 outcome quality** as weighted averages of z-scores computed **within each competition-season**.

Source: `teams_qualities.md` (Twelve Football internal methodology)

**Output:** `team_qualities.parquet` — one row per (team, competition, season) with 7 quality scores.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Dynamic path resolution: the target folder name contains a Unicode
# apostrophe, so it is matched by substrings instead of being hard-coded.
docs = Path("/Users/jorgepadilla/Documents")
BASE = next(
    (
        d / "thesis_data" / "raw_data"
        for d in docs.iterdir()
        if "Jorge" in d.name and "MacBook" in d.name and d.is_dir()
    ),
    None,
)
if BASE is None:
    # Fail here with a clear message instead of hitting a NameError (or
    # silently reusing a BASE left over from an earlier run) further down.
    raise FileNotFoundError(
        f"No directory with 'Jorge' and 'MacBook' in its name found under {docs}"
    )

# Season-level team statistics: one row per team-season.
ts = pd.read_parquet(BASE / "Teams_stats" / "team_stats_season.parquet")
print(f"Loaded: {ts.shape[0]:,} team-seasons  |  {ts.team_id.nunique():,} teams  |  {ts.competition_id.nunique()} competitions  |  seasons {ts.season.min()}-{ts.season.max()}")

## 1. Define Quality Formulas

Each quality is a weighted average of z-scores. Metrics flagged `higher_is_better: False` (a **lower** raw value indicates more of the high-end style) get their z-scores negated before weighting, so that a positive quality score always means "more of that style".

Quality directions:
- **DEFENCE:** low = compact low block → high = aggressive high press
- **DEFENSIVE TRANSITION:** low = drop back → high = counter-press
- **ATTACKING TRANSITION:** low = retain & build up → high = quick counter-attack
- **ATTACK:** low = patient build-up → high = direct long balls
- **PENETRATION:** low = wide crossing → high = progressive carrying
- **CHANCE CREATION:** low = sustained possession → high = direct/fast chances
- **OUTCOME:** low = poor results → high = strong results (separate from style)

In [None]:
# QUALITIES maps each quality name to a human-readable description and the
# metrics that feed it. Every metric entry carries two settings:
#   weight:           relative weight of the metric's z-score within the quality
#                     (the weighted sum is later divided by the total weight)
#   higher_is_better: True means a high raw value pushes the quality HIGHER
#                     False means the z-score gets negated (lower raw = higher quality)
# NOTE: despite its name, the flag encodes the style direction described in
# "description" (left pole = low score, right pole = high score), not whether
# a value is good or bad.

QUALITIES = {
    "defence": {
        "description": "Low block ← → High press",
        "metrics": {
            "defensive_intensity":       {"weight": 1.0, "higher_is_better": True},
            "ppda":                      {"weight": 1.0, "higher_is_better": False},  # lower ppda = more pressing
            "final_third_recoveries_pct": {"weight": 1.0, "higher_is_better": True},
            "defensive_action_height_m": {"weight": 1.0, "higher_is_better": True},
        },
    },
    "defensive_transition": {
        "description": "Drop back ← → Counter-press",
        "metrics": {
            "recoveries_within_5s_pct":                          {"weight": 1.0, "higher_is_better": True},
            "time_to_defensive_action_after_loss_att_half_s":    {"weight": 2.0, "higher_is_better": False},  # lower = faster
            "time_to_defensive_action_after_loss_own_half_s":    {"weight": 1.0, "higher_is_better": False},
        },
    },
    "attacking_transition": {
        "description": "Retain & build up ← → Quick counter-attack",
        "metrics": {
            "possessions_retained_after_5s_pct":                          {"weight": 0.5, "higher_is_better": False},  # high retention = NOT counter
            "final_third_entry_within_10s_after_recovery_own_half_pct":   {"weight": 0.5, "higher_is_better": True},
            "first_pass_forward_after_recovery_own_half_pct":             {"weight": 1.0, "higher_is_better": True},
            "median_time_to_first_forward_pass_own_half_s":               {"weight": 0.5, "higher_is_better": False},  # lower = faster
        },
    },
    "attack": {
        "description": "Patient build-up ← → Direct long balls",
        "metrics": {
            "long_ball_pct":                       {"weight": 2.0, "higher_is_better": True},
            "forward_passes_from_middle_third_pct": {"weight": 1.0, "higher_is_better": True},
            "buildups_from_goalkicks_pct":          {"weight": 1.0, "higher_is_better": False},  # high = build from back = NOT direct
        },
    },
    "penetration": {
        "description": "Wide crossing ← → Progressive carrying",
        "metrics": {
            "box_entries_from_carries_pct":      {"weight": 2.0, "higher_is_better": True},
            "box_entries_from_crosses_pct":      {"weight": 2.0, "higher_is_better": False},  # high crosses = NOT carrying
            "crosses_per_final_third_possession": {"weight": 1.0, "higher_is_better": False},
        },
    },
    "chance_creation": {
        "description": "Sustained possession ← → Direct fast chances",
        "metrics": {
            "shots_per_final_third_pass":      {"weight": 1.0, "higher_is_better": True},
            "shots_from_direct_attacks_pct":   {"weight": 2.0, "higher_is_better": True},
            "shots_from_sustained_attacks_pct": {"weight": 2.0, "higher_is_better": False},  # high sustained = NOT direct
        },
    },
    # Results-based quality, kept separate from the six style qualities above.
    "outcome": {
        "description": "Poor results ← → Strong results (not a style)",
        "metrics": {
            "xpts":   {"weight": 1.5, "higher_is_better": True},
            "points": {"weight": 1.0, "higher_is_better": True},
        },
    },
}

# Verify all metrics exist.
# Collect every metric referenced by any quality, then report ALL missing
# columns in one explicit error. A bare `assert` would stop at the first
# missing column and would be skipped entirely when Python runs with -O.
all_metrics = {m for qdef in QUALITIES.values() for m in qdef["metrics"]}
missing = sorted(all_metrics.difference(ts.columns))
if missing:
    raise KeyError(f"Missing column(s): {missing}")

print(f"All {len(all_metrics)} metrics verified in dataset.")
# Per-quality overview: number of metrics and the sum of their weights.
for q, qdef in QUALITIES.items():
    weights = [mdef["weight"] for mdef in qdef["metrics"].values()]
    print(f"  {q:25s} — {len(weights)} metrics, total weight {sum(weights):.1f}")

## 2. Calculate Z-Scores Within Competition-Season

For each metric, standardize within `(competition_id, season)` so that z-scores reflect how a team compares **to others in the same league and season**.

In [None]:
# Inspect how many teams each competition-season holds; z-scores computed
# over fewer than 3 teams are not meaningful.
group_cols = ["competition_id", "season"]
group_sizes = ts.groupby(group_cols).size()
print(f"Competition-seasons: {len(group_sizes):,}")
print(f"Teams per group: min={group_sizes.min()}, median={group_sizes.median():.0f}, max={group_sizes.max()}")
for threshold in (3, 5):
    too_small = group_sizes < threshold
    print(f"Groups with <{threshold} teams: {too_small.sum()} ({too_small.mean()*100:.1f}%)")

# Keep only the rows belonging to competition-seasons with enough teams.
MIN_TEAMS = 3
large_enough = group_sizes >= MIN_TEAMS
valid_groups = group_sizes[large_enough].index
ts_filtered = ts.set_index(group_cols).loc[valid_groups].reset_index()
kept_pct = len(ts_filtered) / len(ts) * 100
print(f"\nAfter filtering (≥{MIN_TEAMS} teams): {len(ts_filtered):,} rows ({kept_pct:.1f}%)")

In [None]:
# Calculate z-scores within each (competition_id, season).
# The metric names are put in a fixed alphabetical order so the z-score
# columns are always produced in the same order.
metrics_list = list(all_metrics)
metrics_list.sort()

def zscore_within_group(group, cols):
    """Z-score each metric within a competition-season group.

    Args:
        group: DataFrame holding the rows of one group.
        cols: Names of the columns in ``group`` to standardize.

    Returns:
        A new DataFrame with the same index as ``group`` and only ``cols``.
        Each column is ``(value - group mean) / group std``, using the
        pandas default sample standard deviation. ``group`` is not modified.

        When a column has no usable spread (std is 0, or NaN because fewer
        than two values are observed), observed values get a z-score of 0.0.
        Missing values stay NaN in every case, so missing data is never
        turned into a fabricated "average" score.
    """
    result = group[cols].copy()
    for c in cols:
        values = result[c]
        mean = values.mean()
        std = values.std()
        if std > 0:
            result[c] = (values - mean) / std
        else:
            # No variation in this group (std == 0) or std undefined (NaN):
            # observed rows are neutral, missing rows remain missing.
            result[c] = np.where(values.isna(), np.nan, 0.0)
    return result

# Apply groupwise z-scoring
z_scores = ts_filtered.groupby(["competition_id", "season"], group_keys=False).apply(
    zscore_within_group, cols=metrics_list
)

# Attach back to keys. The join aligns on the row index instead of copying
# `.values` positionally, so each z-score lands on its own team-season even
# if `apply` hands the rows back in a different order.
z_df = ts_filtered[["team_id", "competition_id", "season"]].join(
    z_scores[metrics_list].add_prefix("z_")
)

print(f"Z-scores computed: {z_df.shape}")
print(f"\nSample z-score distributions (should be ~mean=0, std=1 within each group):")
# Only report metrics that were actually z-scored: "ball_possession_pct" is not
# used by any quality, so looking up its z-column unconditionally raised KeyError.
sample_metrics = [
    m
    for m in ["defensive_intensity", "ppda", "ball_possession_pct"]
    if f"z_{m}" in z_df.columns
]
for m in sample_metrics:
    vals = z_df[f"z_{m}"]
    print(f"  z_{m}: mean={vals.mean():.4f}, std={vals.std():.3f}, range=[{vals.min():.2f}, {vals.max():.2f}]")

## 3. Calculate Weighted Quality Scores

For each quality: negate the z-scores of "lower is better" metrics, then compute the weighted average (the weighted sum of z-scores divided by the quality's total weight).

In [None]:
# Start from the identifying columns and add one column per quality.
id_cols = ["team_id", "competition_id", "season"]
quality_df = z_df[id_cols].copy()

for q_name, q_def in QUALITIES.items():
    specs = q_def["metrics"]
    total_weight = sum(spec["weight"] for spec in specs.values())

    weighted_sum = np.zeros(len(z_df))
    for metric, spec in specs.items():
        # Flip the sign when a lower raw value means "more" of this quality.
        direction = 1.0 if spec["higher_is_better"] else -1.0
        weighted_sum += spec["weight"] * (direction * z_df[f"z_{metric}"].values)

    # Weighted average = weighted sum normalised by the quality's total weight.
    quality_df[q_name] = weighted_sum / total_weight

# Every column that is not an identifier is a quality score.
style_cols = [col for col in quality_df.columns if col not in id_cols]
print(f"Computed {len(style_cols)} qualities for {len(quality_df):,} team-seasons:")
for col in style_cols:
    scores = quality_df[col]
    print(f"  {col:25s}  mean={scores.mean():+.3f}  std={scores.std():.3f}  range=[{scores.min():.2f}, {scores.max():.2f}]")

In [None]:
# Quick sanity check: quality distributions should be roughly centered around 0
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# Size the grid from the number of qualities (4 per row) so that adding or
# removing a quality cannot run past the available axes. With 7 qualities this
# is the same 2x4 grid and 20x8 figure as before.
n_cols = 4
n_rows = max(1, -(-len(style_cols) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 4 * n_rows), squeeze=False)
axes = axes.flatten()

# One histogram per quality, with a dashed red reference line at 0.
for ax, q in zip(axes, style_cols):
    ax.hist(quality_df[q].dropna(), bins=60, color="#1565C0", alpha=0.8, edgecolor="white")
    ax.set_title(q.replace("_", " ").title(), fontsize=13, fontweight="bold")
    ax.axvline(0, color="red", lw=1, ls="--")
    ax.set_xlabel("Quality score")

# Hide every unused subplot, not only the last one
for ax in axes[len(style_cols):]:
    ax.set_visible(False)

plt.suptitle("Team Quality Score Distributions (z-scored within competition-season)", fontsize=16, fontweight="bold", y=1.02)
plt.tight_layout()
# savefig does not create missing folders, so make sure the target exists.
fig_dir = BASE.parent / "notebooks" / "team_styles"
fig_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(str(fig_dir / "quality_distributions.png"), dpi=150, bbox_inches="tight")
plt.show()
print("Saved: quality_distributions.png")

## 4. Save Results

In [None]:
# Write the quality table to parquet next to the input stats, then report
# where it went and what it contains.
out_path = BASE / "Teams_stats" / "team_qualities.parquet"
quality_df.to_parquet(out_path, index=False)
report = (
    ("Saved", out_path),
    ("Shape", quality_df.shape),
    ("Columns", list(quality_df.columns)),
)
for label, value in report:
    print(f"{label}: {value}")
print("\nSample (first 10 rows):")
# Last expression of the cell: the notebook renders it as the cell output.
quality_df.head(10)

In [None]:
# Summary stats: dataset coverage followed by a per-quality null count.
rule = "=" * 60
total_rows = len(quality_df)
style_only = [name for name in style_cols if name != "outcome"]

print(f"\n{rule}")
print("TEAM QUALITIES SUMMARY")
print(rule)
print(f"Total team-seasons: {total_rows:,}")
print(f"Unique teams: {quality_df.team_id.nunique():,}")
print(f"Unique competitions: {quality_df.competition_id.nunique()}")
print(f"Season range: {quality_df.season.min()}-{quality_df.season.max()}")
print(f"\nStyle qualities (6): {style_only}")
print("Outcome quality (1): outcome")
print("\nNull check:")
null_counts = quality_df[style_cols].isnull().sum()
for col, n_null in null_counts.items():
    # The percentage is only shown when there is at least one null.
    pct_note = f" ({n_null/total_rows*100:.2f}%)" if n_null > 0 else ""
    print(f"  {col}: {n_null} nulls{pct_note}")