# Enriquecimiento con metadata de Wyscout

Dos merges sobre `male_transfers_model_2018_2025.parquet`:

1. **Players** (`players_wyscout.parquet`): left join por `player_id` → 9 columnas `wyscout_*`
2. **Competitions** (`competitions_wyscout.parquet`): left join por `(competition, season)` × 2 (from + to) → 8 `from_comp_*` + 8 `to_comp_*`

Resultado final: **262,340 rows × 385 cols**

In [None]:
# -- Paths (resolve Unicode dir name dynamically) --
# Rather than hard-coding the data directory name, scan Documents for the
# first directory whose name contains both "Jorge" and "MacBook" and derive
# the raw / processed data roots from it.
from pathlib import Path
docs = Path("/Users/jorgepadilla/Documents")
for _d in docs.iterdir():
    if "Jorge" in _d.name and "MacBook" in _d.name and _d.is_dir():
        RAW = _d / "thesis_data" / "raw_data"
        PROCESSED = _d / "thesis_data" / "processed_data"
        break
else:
    # No candidate matched: fail here with an actionable message instead of
    # leaving RAW / PROCESSED undefined and hitting a NameError further down.
    raise FileNotFoundError(
        f"No directory containing 'Jorge' and 'MacBook' found under {docs}"
    )

import pandas as pd
from pathlib import Path



# Load the three inputs: Wyscout player metadata, Wyscout competition
# metadata and the transfers model table.
# NOTE(review): `base` is not assigned anywhere in the visible notebook (the
# paths cell only sets RAW / PROCESSED) — confirm which root it should be.
wyscout_dir = base / "Wyscout"
transfers_path = base / "Transfers" / "male_transfers_model_2018_2025.parquet"
ws = pd.read_parquet(wyscout_dir / "players_wyscout.parquet")
wc = pd.read_parquet(wyscout_dir / "competitions_wyscout.parquet")
tf = pd.read_parquet(transfers_path)

# Re-run safety: strip competition metadata columns left over from a
# previous run so the competition merges do not collide with them.
comp_prefixes = ("from_comp_", "to_comp_")
old_comp = [col for col in tf.columns if col.startswith(comp_prefixes)]
if len(old_comp) > 0:
    tf = tf.drop(columns=old_comp)
    print(f"Dropped {len(old_comp)} existing comp metadata cols")

# Quick shape report for the three frames.
print(f"Transfers:    {tf.shape}")
print(f"Players WS:   {ws.shape}")
print(f"Comps WS:     {wc.shape}")

---
## Parte 1: Wyscout Players metadata

In [None]:
# Per-column profile of the Wyscout players table: dtype, null count and
# number of distinct values.
print(f"players_wyscout columns ({len(ws.columns)}):")
for col_name in ws.columns:
    series = ws[col_name]
    dtype_txt = str(series.dtype)
    n_null = series.isna().sum()
    n_distinct = series.nunique()
    print(f"  {col_name:20s} dtype={dtype_txt:10s} nulls={n_null:>6,}  nunique={n_distinct:>6,}")

In [None]:
# Coverage by player_id: how many transfer players (and transfer rows) have a
# matching record in the Wyscout players table.
tf_ids = set(tf["player_id"].unique())
ws_ids = set(ws["player_id"].unique())
overlap = tf_ids.intersection(ws_ids)

n_tf_players = len(tf_ids)
n_ws_players = len(ws_ids)
n_shared = len(overlap)
print(f"Jugadores únicos en transfers: {n_tf_players:,}")
print(f"Jugadores únicos en wyscout:   {n_ws_players:,}")
print(f"Overlap:                       {n_shared:,} ({n_shared/n_tf_players*100:.1f}%)")

# Row-level view: a player can appear in several transfer rows.
rows_match = tf["player_id"].isin(ws_ids).sum()
n_rows = len(tf)
print(f"\nRows con match:  {rows_match:>7,} / {n_rows:,} ({rows_match/n_rows*100:.1f}%)")

In [None]:
# Merge player metadata: attach the selected Wyscout player attributes to
# every transfer row, prefixed with `wyscout_`.
ws_target = ["player_id", "first_name", "last_name", "height", "weight",
             "passport", "birth_country", "image_url", "foot", "role"]
ws_meta = ws[ws_target].rename(columns={c: f"wyscout_{c}" for c in ws_target if c != "player_id"})

# Re-run safety: the notebook saves its result over the parquet it loads, so
# on a second run `tf` already carries these `wyscout_*` columns. Merging on
# top of them would make pandas emit `_x` / `_y` suffixed duplicates instead
# of refreshing the values, so drop the stale copies first.
stale_ws = [c for c in ws_meta.columns if c != "player_id" and c in tf.columns]
if stale_ws:
    tf = tf.drop(columns=stale_ws)
    print(f"Dropped {len(stale_ws)} existing wyscout metadata cols")

df = tf.merge(ws_meta, on="player_id", how="left")
# A left join must not add rows; more rows would mean duplicated player_id
# values on the Wyscout side.
assert len(df) == len(tf)
print(f"Shape: {df.shape} (added {len(df.columns) - len(tf.columns)} cols)")

In [None]:
# Fill-rate table for every merged `wyscout_*` column.
print(f"{'Columna':<35s} {'Filled':>7s} / {'Total':>7s}  {'%':>5s}")
print("-" * 60)
wyscout_cols = [col for col in df.columns if col.startswith("wyscout_")]
total_rows = len(df)
for col_name in wyscout_cols:
    filled = df[col_name].notna().sum()
    print(f"{col_name:<35s} {filled:>7,} / {total_rows:>7,}  {filled/total_rows*100:>5.1f}%")

---
## Parte 2: Wyscout Competitions metadata

Cada competición tiene múltiples seasons. Join key compuesto: `(competition_id, season)`.

Se aplica dos veces:
- `from_competition` + `from_season` → columnas `from_comp_*`
- `to_competition` + `to_season` → columnas `to_comp_*`

In [None]:
# Granularity check of the competitions table on (competition_id, season).
_season_key = ["competition_id", "season"]
print(f"competitions_wyscout: {wc.shape}")
print(f"\nGranularity: {wc.groupby(_season_key).ngroups} unique (comp_id, season) from {len(wc)} rows")
# `duplicated()` flags every row after the first of each key, so its sum is
# the number of surplus rows, not the number of affected keys. The two only
# coincide when every duplicated key has exactly two rows, so count the
# distinct duplicated keys separately.
_surplus_rows = wc.duplicated(subset=_season_key)
n_dupes = _surplus_rows.sum()
n_dupe_combos = len(wc.loc[_surplus_rows, _season_key].drop_duplicates())
print(f"Duplicates on (comp_id, season): {n_dupes} → {n_dupe_combos} combos con >1 registro")

In [None]:
# Preview the first five rows of the competitions table.
wc.head(n=5)

### 2.1 Análisis de los 42 duplicados (competition_id, season)

Se agrupan en 3 patrones:

| Patrón | Casos | Descripción |
|--------|-------|-------------|
| NCAA Spring/Fall | 29 | Ligas universitarias USA con 2 mitades por año |
| COVID suspended | 8 | Seasons 2020 suspendidas + reabiertas (Australia, China, Venezuela, Brazil, Zimbabwe) |
| Calendarios split | 5 | Ligas con calendarios año natural partido (Angola, Ghana, India, Nicaragua, Rep. Dominicana, Vietnam, Grecia) |

**Impacto:** Solo 1,938 filas en el lado `from` (0.7%) y 1,301 filas en el lado `to` (0.5%) del dataset de transfers caen en estas combos.

**Criterio de deduplicación:**
- Deprioritizar registros cuyo `season_name` contenga "suspended", "cancelled" o "spring"
- De los restantes, quedarse con el de `end_date` más tardía (season más completa / Fall)

In [None]:
# Show every record that shares its (competition_id, season) with another
# one, printed group by group for manual inspection.
dupe_key = ["competition_id", "season"]
is_dupe = wc.duplicated(subset=dupe_key, keep=False)
dupes = wc[is_dupe].sort_values(["country", "name", "season"])
n_combos = dupes.groupby(dupe_key).ngroups
print(f"Total rows con duplicado: {len(dupes)} ({n_combos} combos)\n")

for (cid, s), grp in dupes.groupby(dupe_key):
    # The first row of the group supplies the country / name for the header.
    head_row = grp.iloc[0]
    print(f"--- {head_row['country']} | {head_row['name']} | season={s} ---")
    for _, rec in grp.iterrows():
        print(f"  season_id={rec['season_id']:<8}  {rec['start_date']} → {rec['end_date']}  \"{rec['season_name']}\"  completed={rec['completed']}")
    print()

In [None]:
# Impact on transfers: count the transfer rows whose origin or destination
# (competition, season) is one of the duplicated keys.
dupe_keys = set(dupes.groupby(["competition_id", "season"]).groups.keys())
dupe_df = pd.DataFrame(list(dupe_keys), columns=["comp", "season"])

lookup_cols = ["comp", "season"]
from_hits = tf.merge(
    dupe_df,
    how="inner",
    left_on=["from_competition", "from_season"],
    right_on=lookup_cols,
)
to_hits = tf.merge(
    dupe_df,
    how="inner",
    left_on=["to_competition", "to_season"],
    right_on=lookup_cols,
)

n_transfers = len(tf)
n_from, n_to = len(from_hits), len(to_hits)
print(f"Transfer rows afectadas en FROM: {n_from:,} / {n_transfers:,} ({n_from/n_transfers*100:.2f}%)")
print(f"Transfer rows afectadas en TO:   {n_to:,} / {n_transfers:,} ({n_to/n_transfers*100:.2f}%)")

In [None]:
# Deduplicate the competitions table so each (competition_id, season) keeps
# exactly one record.
comp_target = ["competition_id", "season", "name", "country", "division",
               "season_id", "start_date", "end_date", "completed", "season_name"]
wc_meta = wc[comp_target].copy()
wc_meta["start_date"] = pd.to_datetime(wc_meta["start_date"])
wc_meta["end_date"] = pd.to_datetime(wc_meta["end_date"])

# Deprioritize suspended/cancelled/spring seasons. The match is a
# case-insensitive substring test; `na=False` makes a missing season_name
# count as "not deprioritized" instead of raising on a non-string value.
drop_keywords = ["suspended", "cancelled", "spring"]
_season_lower = wc_meta["season_name"].str.lower()
_deprioritize = pd.Series(False, index=wc_meta.index)
for _kw in drop_keywords:
    _deprioritize = _deprioritize | _season_lower.str.contains(_kw, regex=False, na=False)
wc_meta["_deprioritize"] = _deprioritize

# Preferred records first (not deprioritized, then latest end_date), so that
# keep="first" retains the preferred record of each key.
wc_meta = wc_meta.sort_values(["_deprioritize", "end_date"], ascending=[True, False])
wc_meta = wc_meta.drop_duplicates(subset=["competition_id", "season"], keep="first")
wc_meta = wc_meta.drop(columns=["_deprioritize"])

print(f"Deduped: {len(wc_meta)} rows (dropped {len(wc) - len(wc_meta)} duplicates)")
assert wc_meta.duplicated(subset=["competition_id", "season"]).sum() == 0

In [None]:
# Check which record survived for each duplicated key (expected: none of the
# suspended / cancelled / spring ones).
dupe_comp_ids = dupes["competition_id"]
kept = wc_meta[wc_meta["competition_id"].isin(dupe_comp_ids)]
dupe_pairs = dupes[["competition_id","season"]].drop_duplicates()
kept_in_dupes = kept.merge(dupe_pairs, on=["competition_id","season"])
print("Registros elegidos para las 42 combos duplicadas:")
display_cols = ["competition_id", "season", "name", "country", "season_name", "start_date", "end_date"]
print(kept_in_dupes[display_cols].to_string(index=False))

### 2.2 Merge competitions (FROM + TO)

In [None]:
def _comp_meta_for(side):
    """Return the deduplicated competition metadata keyed for one side.

    `side` is "from" or "to". Non-key columns get the `<side>_comp_` prefix
    and the two key columns become `<side>_competition` / `<side>_season`.
    """
    key_cols = ["competition_id", "season"]
    renames = {c: f"{side}_comp_{c}" for c in comp_target if c not in key_cols}
    renames["competition_id"] = f"{side}_competition"
    renames["season"] = f"{side}_season"
    meta = wc_meta.rename(columns=renames)
    # Keys are cast to int32 / int16 before merging — presumably to match the
    # key dtypes of the transfers table; TODO confirm.
    meta[f"{side}_competition"] = meta[f"{side}_competition"].astype("int32")
    meta[f"{side}_season"] = meta[f"{side}_season"].astype("int16")
    return meta


# FROM join
from_meta = _comp_meta_for("from")
df = df.merge(from_meta, on=["from_competition", "from_season"], how="left")
print(f"After FROM comp merge: {df.shape}")

# TO join
to_meta = _comp_meta_for("to")
df = df.merge(to_meta, on=["to_competition", "to_season"], how="left")
print(f"After TO comp merge:   {df.shape}")

# Both joins are left joins against deduplicated keys, so the row count must
# still equal that of the transfers table.
assert len(df) == len(tf), "Row count changed!"

In [None]:
# Coverage: non-null count per merged competition metadata column, reported
# separately for the FROM and TO sides.
coverage_sections = (
    ("=== FROM competition metadata ===", "from_comp_"),
    ("\n=== TO competition metadata ===", "to_comp_"),
)
for banner, prefix in coverage_sections:
    print(banner)
    for col_name in sorted(col for col in df.columns if col.startswith(prefix)):
        filled = df[col_name].notna().sum()
        print(f"  {col_name:30s} {filled:>7,} / {len(df):,} ({filled/len(df)*100:.1f}%)")

---
## Parte 3: Reordenar y guardar

In [None]:
def classify(c):
    """Return the category of a prefixed column name.

    Everything up to and including the first underscore is discarded; the
    remainder decides the category, checked in this order:
    "meta" for a fixed set of identifier names, "zscore" when it starts with
    ``z_score_``, "per90" when it contains ``per 90``, otherwise "raw".
    """
    suffix = c.split("_", 1)[1]
    if suffix in ("team_id", "competition", "season", "position", "Minutes"):
        category = "meta"
    elif suffix.startswith("z_score_"):
        category = "zscore"
    elif "per 90" in suffix:
        category = "per90"
    else:
        category = "raw"
    return category

# Fixed leading blocks of the final column layout.
player_meta = ["player_id", "short_name", "birth_date", "player_season_age", "transfer_type"]
wyscout_player = sorted(col for col in df.columns if col.startswith("wyscout_"))
transfer_dates = ["competition_start_date", "first_played_date", "last_played_date"]

# Original from_/to_ columns come from `tf`; the competition metadata columns
# come from the merged `df`.
from_orig = [col for col in tf.columns if col.startswith("from_")]
to_orig = [col for col in tf.columns if col.startswith("to_")]
from_comp = sorted(col for col in df.columns if col.startswith("from_comp_"))
to_comp = sorted(col for col in df.columns if col.startswith("to_comp_"))


def _bucket_by_category(cols):
    """Group column names by the category `classify` assigns, keeping order."""
    buckets = {"meta": [], "raw": [], "per90": [], "zscore": []}
    for col in cols:
        buckets[classify(col)].append(col)
    return buckets


from_by_cat = _bucket_by_category(from_orig)
to_by_cat = _bucket_by_category(to_orig)

# Player blocks first, then for each side: meta, competition metadata, raw
# metrics, per-90 metrics, z-scores.
ordered = [*player_meta, *wyscout_player, *transfer_dates]
for side_cats, side_comp in ((from_by_cat, from_comp), (to_by_cat, to_comp)):
    ordered += (
        side_cats["meta"] + side_comp + side_cats["raw"]
        + side_cats["per90"] + side_cats["zscore"]
    )

# The new order must be an exact permutation of the existing columns.
assert set(ordered) == set(df.columns)
assert len(ordered) == len(df.columns)
df_final = df[ordered]
print(f"Final: {df_final.shape}")

In [None]:
def _side_blocks(tag, by_cat, comp_cols):
    """Return the five (label, columns) pairs of one side, in layout order."""
    return [
        (f"{tag} metadata", by_cat["meta"]),
        (f"{tag} comp metadata", comp_cols),
        (f"{tag} raw metrics", by_cat["raw"]),
        (f"{tag} per 90", by_cat["per90"]),
        (f"{tag} z-scores", by_cat["zscore"]),
    ]


# (label, columns) pairs in the same order used to build the final layout.
blocks = [
    ("Player metadata", player_meta),
    ("Wyscout player", wyscout_player),
    ("Transfer dates", transfer_dates),
]
blocks += _side_blocks("FROM", from_by_cat, from_comp)
blocks += _side_blocks("TO", to_by_cat, to_comp)

# Print the positional range each block occupies; `idx` is the running
# start offset.
rule = "-" * 45
idx = 0
print(f"{'Bloque':<25s} {'Rango':<14s} {'#':>4s}")
print(rule)
for label, block_cols in blocks:
    width = len(block_cols)
    last = idx + width - 1
    print(f"{label:<25s} [{idx:>3d} - {last:>3d}]  {width:>4d}")
    idx += width
print(rule)
print(f"{'TOTAL':<25s} {'':14s} {idx:>4d}")

In [None]:
# Check that the FROM and TO competition metadata blocks mirror each other:
# with the side prefix removed, both must expose the same column names.
from_comp_suf = sorted(name.replace("from_comp_", "") for name in from_comp)
to_comp_suf = sorted(name.replace("to_comp_", "") for name in to_comp)
suffixes_match = from_comp_suf == to_comp_suf
print(f"from_comp_ == to_comp_ suffixes: {suffixes_match}")
print(f"Columns: {from_comp_suf}")

In [None]:
# Write the enriched table to parquet.
# NOTE(review): this is the same path the transfers table is loaded from, so
# the input file is overwritten in place.
out_path = base / "Transfers" / "male_transfers_model_2018_2025.parquet"
df_final.to_parquet(out_path, index=False)

size_mb = out_path.stat().st_size / 1024 / 1024
print(f"Guardado: {out_path}")
print(f"Shape:    {df_final.shape}")
print(f"Tamaño:   {size_mb:.1f} MB")

# Round-trip check: read the file back and compare column order and shape.
df_check = pd.read_parquet(out_path)
assert list(df_check.columns) == ordered
assert df_check.shape == df_final.shape
n_rows_saved, n_cols_saved = df_check.shape[0], df_check.shape[1]
print(f"\n✅ Verificado: {n_rows_saved:,} rows × {n_cols_saved} cols")