# Computing Team Tactical DNA

Compute the 6+1 tactical quality scores (six style dimensions plus one outcome score) for every top-flight team-season in our dataset.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
# Ignore every warning raised from this point on.
warnings.filterwarnings('ignore')

# -- Paths --
# Use the first sub-directory of `docs` whose name contains both "Jorge" and
# "MacBook"; the raw and processed data roots live under its "thesis_data" folder.
docs = Path("/Users/jorgepadilla/Documents")
for d in docs.iterdir():
    if "Jorge" in d.name and "MacBook" in d.name and d.is_dir():
        RAW = d / "thesis_data" / "raw_data"
        PROCESSED = d / "thesis_data" / "processed_data"
        break
else:
    # No match: fail here with a clear message instead of leaving RAW and
    # PROCESSED undefined, which would surface later as an opaque NameError.
    raise FileNotFoundError(
        f"No directory with 'Jorge' and 'MacBook' in its name found under {docs}"
    )

# Load the season-level team statistics and the competition lookup table.
ts = pd.read_parquet(RAW / "Teams_stats" / "team_stats_season.parquet")
comps = pd.read_parquet(RAW / "Wyscout" / "competitions_wyscout.parquet")

# Keep only team-seasons whose competition has division == 1.
div1_ids = comps.loc[comps["division"] == 1, "competition_id"].unique()
ts = ts.loc[ts["competition_id"].isin(div1_ids)].copy()

# Quick sanity summary of what was loaded.
n_leagues = ts["competition_id"].nunique()
first_season, last_season = ts["season"].min(), ts["season"].max()
print(f"Loaded: {len(ts):,} team-seasons from {n_leagues} top-flight leagues")
print(f"Seasons: {first_season}\u2013{last_season}")


## The Seven Tactical Qualities

In [None]:
# -- Define the seven tactical qualities --
# Compact table: quality name -> [(metric column, weight, higher_is_better), ...].
# It is expanded below into the nested mapping
#   QUALITIES[quality]["metrics"][metric] == {"weight": ..., "higher_is_better": ...}
_QUALITY_SPEC = {
    "Defence": [
        ("defensive_intensity", 1.0, True),
        ("ppda", 1.0, False),
        ("final_third_recoveries_pct", 1.0, True),
        ("defensive_action_height_m", 1.0, True),
    ],
    "Def. Transition": [
        ("recoveries_within_5s_pct", 1.0, True),
        ("time_to_defensive_action_after_loss_att_half_s", 2.0, False),
        ("time_to_defensive_action_after_loss_own_half_s", 1.0, False),
    ],
    "Att. Transition": [
        ("possessions_retained_after_5s_pct", 0.5, False),
        ("final_third_entry_within_10s_after_recovery_own_half_pct", 0.5, True),
        ("first_pass_forward_after_recovery_own_half_pct", 1.0, True),
        ("median_time_to_first_forward_pass_own_half_s", 0.5, False),
    ],
    "Attack": [
        ("long_ball_pct", 2.0, True),
        ("forward_passes_from_middle_third_pct", 1.0, True),
        ("buildups_from_goalkicks_pct", 1.0, False),
    ],
    "Penetration": [
        ("box_entries_from_carries_pct", 2.0, True),
        ("box_entries_from_crosses_pct", 2.0, False),
        ("crosses_per_final_third_possession", 1.0, False),
    ],
    "Chance Creation": [
        ("shots_per_final_third_pass", 1.0, True),
        ("shots_from_direct_attacks_pct", 2.0, True),
        ("shots_from_sustained_attacks_pct", 2.0, False),
    ],
    "Outcome": [
        ("xpts", 1.5, True),
        ("points", 1.0, True),
    ],
}

QUALITIES = {
    quality: {
        "metrics": {
            metric: {"weight": weight, "higher_is_better": higher_is_better}
            for metric, weight, higher_is_better in rows
        },
    }
    for quality, rows in _QUALITY_SPEC.items()
}

# Collect every metric column referenced by any quality.
# dict.fromkeys de-duplicates while keeping definition order, so anything derived
# from this list (e.g. the order of the z_* columns) is the same on every run;
# a set of strings has no stable order across interpreter sessions.
all_metrics = list(dict.fromkeys(
    metric
    for q_info in QUALITIES.values()
    for metric in q_info["metrics"]
))

# Report metrics that are defined above but absent from the loaded table.
missing = [m for m in all_metrics if m not in ts.columns]
if missing:
    print(f"WARNING -- missing columns: {missing}")

def compute_group_zscores(group, metrics):
    """Add a ``z_<metric>`` column to a copy of *group* for each available metric.

    Args:
        group: DataFrame to standardise; it is not modified.
        metrics: Iterable of column names. Names not present in *group* are
            skipped.

    Returns:
        ``None`` when *group* has fewer than 3 rows. Otherwise a copy of
        *group* with one extra ``z_<metric>`` column per metric found, holding
        ``(value - mean) / std`` computed over the group. When the standard
        deviation is not strictly positive (zero or NaN) the column is filled
        with 0.0 instead.
    """
    if len(group) < 3:
        return None

    scored = group.copy()
    for name in metrics:
        if name not in scored.columns:
            continue
        values = scored[name]
        centre = values.mean()
        spread = values.std()
        z_name = f"z_{name}"
        if spread > 0:
            scored[z_name] = (values - centre) / spread
        else:
            # No usable spread: every row gets a neutral score.
            scored[z_name] = 0.0
    return scored

# Standardise each (competition, season) group separately; groups for which
# compute_group_zscores returns None are left out.
scored_groups = (
    compute_group_zscores(grp, all_metrics)
    for _, grp in ts.groupby(["competition_id", "season"])
)
groups = [scored for scored in scored_groups if scored is not None]

qdf = pd.concat(groups, ignore_index=True)
print(f"After z-scoring: {len(qdf):,} team-seasons")

# Each quality score is the weighted mean of its metrics' z-scores, with the
# sign flipped for metrics where higher is not better. Metrics without a z_*
# column are left out of both the sum and the weight total.
for quality_name, q_info in QUALITIES.items():
    score = pd.Series(0.0, index=qdf.index)
    weight_sum = 0.0
    for metric, meta in q_info["metrics"].items():
        z_name = f"z_{metric}"
        if z_name in qdf.columns:
            direction = 1.0 if meta["higher_is_better"] else -1.0
            score = score + direction * meta["weight"] * qdf[z_name]
            weight_sum += meta["weight"]
    # With no usable metric the score stays at 0.0 (nothing to normalise).
    qdf[quality_name] = score / weight_sum if weight_sum > 0 else score

# Every quality except "Outcome" counts as a style dimension.
style_dims = [name for name in QUALITIES if name != "Outcome"]
print(f"Style dimensions: {style_dims}")
print(f"Sample quality scores:\n{qdf[list(QUALITIES)].describe().round(2)}")


## Export

In [None]:
# Write the scored table to <PROCESSED>/team_styles/, creating the folder if needed.
out_dir = PROCESSED / "team_styles"
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "team_qualities.parquet"
qdf.to_parquet(out_path, index=False)
print(f"\u2705 Saved: {out_path}")
print(f"   Shape: {qdf.shape}")
