# Merge de Team Season Stats al dataset de Transfers

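Se cruza `Teams_stats/team_stats_season.parquet` con el dataset principal
usando la clave `(team_id, competition_id, season)`, que en el dataset de transfers
corresponde a `(from_team_id, from_competition, from_season)` para FROM y a
`(to_team_id, to_competition, to_season)` para TO.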

## Estructura del archivo fuente (77 cols, 36,228 rows)

| Rango | Contenido | Acción |
|-------|-----------|--------|
| [0-2] `team_id`, `competition_id`, `season` | Join keys | Usados para el merge |
| [3-76] 74 métricas del equipo | fouls_commited → xg_from_direct_attacks | **Merge** como `from_team_stats_*` y `to_team_stats_*` |

**Total métricas:** 74 por lado (from/to) = **148 columnas nuevas**

**Datos:** 7,529 equipos, 375 competiciones y 13 temporadas (2014-2026) distintos; sin duplicados en la clave `(team_id, competition_id, season)`.

In [None]:
# -- Paths (resolve Unicode dir name dynamically) --
from pathlib import Path

import pandas as pd

# The data folder is located by substring match on its name instead of being
# spelled out, so the exact Unicode form of the directory name does not matter.
docs = Path("/Users/jorgepadilla/Documents")
RAW = PROCESSED = None
for _d in docs.iterdir():
    if "Jorge" in _d.name and "MacBook" in _d.name and _d.is_dir():
        RAW = _d / "thesis_data" / "raw_data"
        PROCESSED = _d / "thesis_data" / "processed_data"
        break
if RAW is None:
    # Fail here with a clear message instead of a NameError further down.
    raise FileNotFoundError(
        f"No directory with 'Jorge' and 'MacBook' in its name found under {docs}"
    )

# `base` is the root both parquet files are read from (and the output is later
# written to). It was never assigned, so use whichever data root actually
# contains both input files.
_ts_rel = Path("Teams_stats") / "team_stats_season.parquet"
_tf_rel = Path("Transfers") / "male_transfers_model_2018_2025.parquet"
base = next(
    (
        _p
        for _p in (PROCESSED, RAW)
        if (_p / _ts_rel).is_file() and (_p / _tf_rel).is_file()
    ),
    None,
)
if base is None:
    raise FileNotFoundError(
        f"Could not find both {_ts_rel} and {_tf_rel} under {PROCESSED} or {RAW}"
    )

ts = pd.read_parquet(base / _ts_rel)
tf = pd.read_parquet(base / _tf_rel)

# Drop team stats cols if re-running, so the merges below do not produce
# duplicated / suffixed columns on a dataset that was already enriched.
old = [c for c in tf.columns if "team_stats" in c]
if old:
    tf = tf.drop(columns=old)
    print(f"Dropped {len(old)} existing team stats cols")

print(f"Team stats: {ts.shape}")
print(f"Transfers:  {tf.shape}")

## 1. Validación de estructura

In [None]:
join_keys = ["team_id", "competition_id", "season"]

# Fail early if the source file does not carry the expected key columns.
missing_keys = [k for k in join_keys if k not in ts.columns]
if missing_keys:
    raise KeyError(f"Join keys missing from team stats: {missing_keys}")

# Select metrics by name rather than by position: slicing `columns[3:]` would
# silently treat a key as a metric if the column order ever changed, and the
# count check below would pass regardless.
metrics = [c for c in ts.columns if c not in join_keys]  # 74 metrics expected

print(f"Join keys:    {len(join_keys)} → {join_keys}")
print(f"Métricas:     {len(metrics)} cols")
print(f"Total:        {len(join_keys) + len(metrics)} / {len(ts.columns)}")
# With name-based selection this fails if a key column appears more than once.
assert len(join_keys) + len(metrics) == len(ts.columns)

# Granularity check: one row per (team_id, competition_id, season), which is
# what keeps the left merges from multiplying transfer rows.
combos = ts.groupby(join_keys).ngroups
print(f"\nRows: {len(ts):,}")
print(f"Unique (team_id, competition_id, season): {combos:,}")
print(f"Duplicates: {ts.duplicated(subset=join_keys).sum()}")
assert combos == len(ts), "Hay duplicados en la key!"

print(f"\nEquipos: {ts['team_id'].nunique():,}")
print(f"Comps:   {ts['competition_id'].nunique()}")
print(f"Seasons: {sorted(ts['season'].unique())}")

In [None]:
# List every metric column with its position; the null count is appended
# only for columns that actually contain missing values.
print("=== 74 TEAM METRICS ===")
for position, column in enumerate(metrics):
    missing = ts[column].isna().sum()
    line = f"  [{position:>2d}] {column}"
    if missing > 0:
        line += f"  (nulls={missing})"
    print(line)

## 2. Preparar y mergear

In [None]:
# FROM merge: prefix every metric, rename the join keys to their from_*
# names, and cast the keys to fixed integer widths before the left join.
from_metric_names = {metric: f"from_team_stats_{metric}" for metric in metrics}
from_key_names = {
    "team_id": "from_team_id",
    "competition_id": "from_competition",
    "season": "from_season",
}
from_key_dtypes = {
    "from_team_id": "int32",
    "from_competition": "int32",
    "from_season": "int16",
}
ts_from = (
    ts.rename(columns=from_metric_names)
    .rename(columns=from_key_names)
    .astype(from_key_dtypes)
)

# A left join must keep exactly one output row per transfer.
n_before = len(tf)
df = tf.merge(ts_from, on=list(from_key_dtypes), how="left")
assert len(df) == n_before, f"Row count changed! {n_before} -> {len(df)}"

# Coverage is measured on one metric column: rows where it is non-null.
from_cov = df["from_team_stats_goals"].notna().sum()
from_pct = from_cov / n_before * 100
print(f"FROM merge: {from_cov:,} / {n_before:,} ({from_pct:.1f}%)")

In [None]:
# TO merge: same preparation as the FROM side, using the to_* names.
to_metric_names = {metric: f"to_team_stats_{metric}" for metric in metrics}
to_key_names = {
    "team_id": "to_team_id",
    "competition_id": "to_competition",
    "season": "to_season",
}
to_key_dtypes = {
    "to_team_id": "int32",
    "to_competition": "int32",
    "to_season": "int16",
}
ts_to = (
    ts.rename(columns=to_metric_names)
    .rename(columns=to_key_names)
    .astype(to_key_dtypes)
)

# Row count is compared against the size recorded before the FROM merge.
df = df.merge(ts_to, on=list(to_key_dtypes), how="left")
assert len(df) == n_before, f"Row count changed! {n_before} -> {len(df)}"

# Coverage is measured on one metric column: rows where it is non-null.
to_cov = df["to_team_stats_goals"].notna().sum()
to_pct = to_cov / n_before * 100
print(f"TO merge:   {to_cov:,} / {n_before:,} ({to_pct:.1f}%)")
print(f"\nShape: {df.shape}")
print(f"\nShape: {df.shape}")

In [None]:
# Sanity checks: repeated column names, and the `_x` / `_y` suffixes pandas
# adds when both sides of a merge share a non-key column name.
column_index = df.columns
duped_cols = column_index[column_index.duplicated()].tolist()
xy_cols = [name for name in column_index if name.endswith(("_x", "_y"))]
print(f"Duplicate cols: {duped_cols or 'None'}")
print(f"Merge artifacts: {xy_cols or 'None'}")

# Verify from/to mirror: both sides should expose the same metric names.
from_ts_suf = sorted(
    name.replace("from_team_stats_", "")
    for name in column_index
    if name.startswith("from_team_stats_")
)
to_ts_suf = sorted(
    name.replace("to_team_stats_", "")
    for name in column_index
    if name.startswith("to_team_stats_")
)
print(f"from_team_stats: {len(from_ts_suf)} cols")
print(f"to_team_stats:   {len(to_ts_suf)} cols")
print(f"Perfect mirror:  {from_ts_suf == to_ts_suf}")

## 3. Reordenar y guardar

In [None]:
# Reorder columns by section
all_cols = list(df.columns)

meta_cols = []
wyscout_cols = []
from_player_cols = []
from_comp_cols = []
from_ts_cols = []
to_player_cols = []
to_comp_cols = []
to_ts_cols = []

# The first matching prefix wins, so the specific prefixes must come before
# the generic "from_" / "to_" ones. Anything unmatched is treated as metadata.
prefix_buckets = [
    ("from_team_stats_", from_ts_cols),
    ("from_comp_", from_comp_cols),
    ("from_", from_player_cols),
    ("to_team_stats_", to_ts_cols),
    ("to_comp_", to_comp_cols),
    ("to_", to_player_cols),
    ("wyscout_", wyscout_cols),
]
for c in all_cols:
    for prefix, bucket in prefix_buckets:
        if c.startswith(prefix):
            bucket.append(c)
            break
    else:
        meta_cols.append(c)

ordered = (meta_cols + wyscout_cols +
           from_player_cols + from_comp_cols + from_ts_cols +
           to_player_cols + to_comp_cols + to_ts_cols)

# Every column must land in exactly one section.
assert set(ordered) == set(all_cols) and len(ordered) == len(all_cols)
df = df[ordered]

# Section summary
sections = [
    ("player_meta", meta_cols),
    ("wyscout_player", wyscout_cols),
    ("from_player", from_player_cols),
    ("from_comp_meta", from_comp_cols),
    ("from_team_stats", from_ts_cols),
    ("to_player", to_player_cols),
    ("to_comp_meta", to_comp_cols),
    ("to_team_stats", to_ts_cols),
]

idx = 0
print(f"{'Sección':<25s} {'Rango':<14s} {'#':>4s}  {'Coverage':>8s}")
print("-" * 56)
for name, cols in sections:
    if cols:
        # Coverage is the non-null share of the section's FIRST column only.
        cov = df[cols[0]].notna().sum() / len(df) * 100
        rng_str = f"[{idx:>3d} - {idx+len(cols)-1:>3d}]"
        cov_str = f"{cov:>7.1f}%"
    else:
        # An empty section (e.g. no wyscout_* columns) has neither a column
        # range nor a coverage; indexing cols[0] would raise IndexError.
        rng_str = f"{'[  empty  ]':<11s}"
        cov_str = f"{'n/a':>8s}"
    print(f"{name:<25s} {rng_str}  {len(cols):>4d}  {cov_str}")
    idx += len(cols)
print("-" * 56)
print(f"{'TOTAL':<25s} {'':14s} {idx:>4d}")

In [None]:
out_path = base / "Transfers" / "male_transfers_model_2018_2025.parquet"

# The output path is the same file the transfers were loaded from. Write to a
# sibling temp file, verify it, and only then swap it in, so a failed write or
# a failed check never damages the existing dataset.
tmp_path = out_path.with_name(out_path.name + ".tmp")
try:
    df.to_parquet(tmp_path, index=False)

    # Verify the round trip before replacing the original file.
    df_check = pd.read_parquet(tmp_path)
    if list(df_check.columns) != ordered or df_check.shape != df.shape:
        raise ValueError(
            f"Round-trip check failed: wrote {df.shape}, read back {df_check.shape}"
        )

    # Same directory, so this is a rename on one filesystem.
    tmp_path.replace(out_path)
finally:
    # No-op after a successful replace; removes the leftover on failure.
    tmp_path.unlink(missing_ok=True)

print(f"Guardado: {out_path}")
print(f"Shape:    {df.shape}")
print(f"Tamaño:   {out_path.stat().st_size / 1024 / 1024:.1f} MB")
print(f"Verificado: {df_check.shape[0]:,} rows × {df_check.shape[1]} cols")