# Validación final y resumen del dataset de Transfers

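Este notebook abre el parquet del modelo de transfers (`male_transfers_model_2018_2025.parquet`), verifica su integridad, reordena las columnas por sección y guarda el parquet final (`transfers_model_final_2018_2025.parquet`).
Además genera una vista de resumen con la estructura, el conteo de columnas y la cobertura por sección, y un diagrama de la estructura del dataset.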

In [None]:
# -- Paths (resolve Unicode dir name dynamically) --
from pathlib import Path

docs = Path("/Users/jorgepadilla/Documents")

# The data folder is found by matching the "Jorge" and "MacBook" fragments of its
# name rather than by spelling the full directory name literally.
for _d in docs.iterdir():
    if "Jorge" in _d.name and "MacBook" in _d.name and _d.is_dir():
        RAW = _d / "thesis_data" / "raw_data"
        PROCESSED = _d / "thesis_data" / "processed_data"
        break
else:
    # Without this guard a missing folder leaves RAW/PROCESSED unbound (or stale
    # from a previous kernel run) and the failure only shows up further down.
    raise FileNotFoundError(
        f"No directory with 'Jorge' and 'MacBook' in its name found under {docs}"
    )

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from pathlib import Path

# Load the transfers model table from the raw "Transfers" folder and report its size.
base = RAW / "Transfers"
df = pd.read_parquet(base / "male_transfers_model_2018_2025.parquet")

# In-memory footprint in MB, computed with deep=True.
mem_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"Shape: {df.shape}")
print(f"Memory: {mem_mb:.0f} MB")

## 1. Integridad: sin duplicados, sin artifacts

In [None]:
# Duplicated column names
dup_mask = df.columns.duplicated()
duped_cols = df.columns[dup_mask].tolist()
print(f"Columnas duplicadas:       {duped_cols or 'Ninguna ✅'}")

# Merge artifacts: column names still ending in _x / _y
xy_cols = [c for c in df.columns if c.endswith(("_x", "_y"))]
print(f"Artifacts de merge (_x/_y): {xy_cols or 'Ninguno ✅'}")

# Fully duplicated rows
n_dupes = df.duplicated().sum()
print(f"Rows duplicadas:           {n_dupes or 'Ninguna ✅'}")

## 2. Clasificación de columnas

In [None]:
def categorize_columns(df):
    """Group the column names of `df` by logical section.

    Each column is assigned to exactly one section, decided by its name:

    * ``from_`` / ``to_`` columns are split into team opponent stats
      (``<side>_team_stats_opp_``), team stats (``<side>_team_stats_``),
      competition metadata (``<side>_comp_``) and player columns; player
      columns are further split into meta, z-score, per-90 and raw.
    * ``wyscout_`` columns go to ``wyscout_player``.
    * Everything else goes to ``player_meta``.

    Returns a dict mapping section name -> list of column names, with both the
    keys and the lists in order of first appearance in ``df.columns``.
    """
    meta_names = {"team_id", "competition", "season", "position", "Minutes"}

    def section_of(col):
        # A column can start with at most one of the two side prefixes.
        for side in ("from", "to"):
            prefix = side + "_"
            if not col.startswith(prefix):
                continue
            # The opponent prefix must be tested before the plain team prefix,
            # because the latter is a prefix of the former.
            if col.startswith(prefix + "team_stats_opp_"):
                return side + "_team_opp_stats"
            if col.startswith(prefix + "team_stats_"):
                return side + "_team_stats"
            if col.startswith(prefix + "comp_"):
                return side + "_comp_meta"
            # Remaining side columns are player columns; classify by the bare name.
            bare = col[len(prefix):]
            if bare in meta_names:
                return side + "_player_meta"
            if bare.startswith("z_score_"):
                return side + "_player_zscore"
            if "per 90" in bare:
                return side + "_player_per90"
            return side + "_player_raw"
        if col.startswith("wyscout_"):
            return "wyscout_player"
        return "player_meta"

    grouped = {}
    for col in df.columns:
        grouped.setdefault(section_of(col), []).append(col)
    return grouped

cats = categorize_columns(df)

# Display order: identity blocks first, then the FROM sections, then the same
# sections for TO.
_side_sections = (
    "player_meta", "comp_meta", "player_raw", "player_per90", "player_zscore",
    "team_stats", "team_opp_stats",
)
section_order = ["player_meta", "wyscout_player"]
for _side in ("from", "to"):
    section_order += [f"{_side}_{_name}" for _name in _side_sections]

# One table row per section: column count, coverage and positional range.
rule = "-" * 65
print(f"{'Sección':<25s} {'# Cols':>6s} {'Cobertura':>10s}  {'Rango'}")
print(rule)

total = 0
section_data = []
for sec in section_order:
    cols = cats.get(sec, [])
    n = len(cols)
    # Coverage is the non-null share of the section's FIRST column only.
    if cols:
        cov = df[cols[0]].notna().sum() / len(df) * 100
    else:
        cov = 0
    last = total + n - 1
    rng = f"[{total:>3d} - {last:>3d}]"
    print(f"{sec:<25s} {n:>6d} {cov:>9.1f}%  {rng}")
    section_data.append((sec, n, cov))
    total += n

print(rule)
print(f"{'TOTAL':<25s} {total:>6d}")

## 3. Validación de espejos FROM ↔ TO

In [None]:
# (label, FROM section, TO section, expected FROM count, expected TO count)
mirrors = [
    ("Player meta",   "from_player_meta",    "to_player_meta",    5, 5),
    ("Comp meta",     "from_comp_meta",      "to_comp_meta",      8, 8),
    ("Player raw",    "from_player_raw",     "to_player_raw",     50, 50),
    ("Player per 90", "from_player_per90",   "to_player_per90",   46, 46),
    ("Player zscore", "from_player_zscore",  "to_player_zscore",  75, 75),
    ("Team stats",    "from_team_stats",     "to_team_stats",     75, 75),
    ("Team opp stats","from_team_opp_stats", "to_team_opp_stats", 74, 74),
]

all_ok = True
for label, fk, tk, exp_f, exp_t in mirrors:
    f_cols = cats.get(fk, [])
    t_cols = cats.get(tk, [])

    # Every column in a from_*/to_* section starts with its side prefix (that is
    # how categorize_columns assigns them), so dropping the leading prefix leaves
    # the side-independent part of the name.
    f_suf = sorted(c[len("from_"):] for c in f_cols)
    t_suf = sorted(c[len("to_"):] for c in t_cols)

    count_ok = (len(f_cols), len(t_cols)) == (exp_f, exp_t)
    match_ok = f_suf == t_suf
    block_ok = count_ok and match_ok
    all_ok = all_ok and block_ok

    status = "✅" if block_ok else "❌"
    print(f"{status} {label:<18s}  from={len(f_cols):>3d}  to={len(t_cols):>3d}  mirror={match_ok}")

if all_ok:
    print("\n✅ Todos los bloques FROM/TO son espejos perfectos con conteos correctos")

## 4. Resumen de cobertura por sección

In [None]:
# Per-section detail: header with column count and coverage, then a sample of
# up to five column names.
banner = "=" * 50
for sec in section_order:
    cols = cats.get(sec, [])
    if cols:
        # Coverage is taken from the first column of the section.
        cov = df[cols[0]].notna().sum() / len(df) * 100
        print(f"\n{banner}")
        print(f"{sec} ({len(cols)} cols, {cov:.1f}% coverage)")
        print(f"{banner}")
        print("\n".join(f"  {c}" for c in cols[:5]))
        hidden = len(cols) - 5
        if hidden > 0:
            print(f"  ... ({hidden} more)")

## 5. Guardar parquet final limpio

In [None]:
# Flatten the per-section column lists following section_order defined above.
ordered = [col for sec in section_order for col in cats.get(sec, [])]

# The reordered list must hold the same columns, and as many of them, as df.
missing = set(df.columns) - set(ordered)
assert set(ordered) == set(df.columns), f"Mismatch: {missing}"
assert len(ordered) == len(df.columns)

df_final = df[ordered]

# NOTE(review): the output is written next to the input under RAW / "Transfers",
# while PROCESSED (set in the first cell) is never used here; confirm the location.
out_path = base / "transfers_model_final_2018_2025.parquet"
df_final.to_parquet(out_path, index=False)

size_mb = out_path.stat().st_size / 1024 / 1024
print(f"Guardado: {out_path.name}")
print(f"Shape:    {df_final.shape}")
print(f"Tamaño:   {size_mb:.1f} MB")

# Round-trip check: re-read the file and compare column order and shape.
df_v = pd.read_parquet(out_path)
assert list(df_v.columns) == ordered
assert df_v.shape == df_final.shape
print(f"\n✅ Verificado: {df_v.shape[0]:,} rows × {df_v.shape[1]} cols")

## 6. Diagrama de estructura del dataset

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.patches import FancyBboxPatch

# Single canvas using a 0-100 coordinate grid on both axes; the axes themselves
# are hidden, so all positions below are plain layout coordinates.
fig, ax = plt.subplots(figsize=(18, 13))
ax.set(xlim=(0, 100), ylim=(0, 100))
ax.set_axis_off()

# Color palette
C_ROOT, C_PLAYER = "#2C3E50", "#3498DB"
C_FROM, C_TO = "#E67E22", "#27AE60"
C_TEAM = "#8E44AD"
C_TEXT, C_SUBTEXT = "white", "#ECF0F1"

def draw_box(ax, x, y, w, h, text, color, fontsize=9, alpha=0.9, subtext=None):
    """Draw a rounded, filled box with a bold centred title on `ax`.

    (x, y) is the lower-left corner and (w, h) the size, in data coordinates.
    `color` fills the box and `alpha` sets its opacity. When `subtext` is
    given, the title is shifted up by 1.2 and the subtext is drawn 2.5 below
    it in a smaller italic font.
    """
    ax.add_patch(
        FancyBboxPatch(
            (x, y), w, h,
            boxstyle="round,pad=0.3",
            facecolor=color,
            edgecolor="white",
            linewidth=1.5,
            alpha=alpha,
        )
    )

    mid_x = x + w/2
    title_y = y + h/2
    if subtext:
        # Make room for the second line below the title.
        title_y += 1.2

    ax.text(mid_x, title_y, text, ha="center", va="center", fontsize=fontsize,
            fontweight="bold", color=C_TEXT)
    if subtext:
        ax.text(mid_x, title_y - 2.5, subtext, ha="center", va="center", fontsize=7,
                color=C_SUBTEXT, style="italic")

def draw_line(ax, x1, y1, x2, y2, color="#BDC3C7"):
    """Draw a straight connector from (x1, y1) to (x2, y2) on `ax`, at zorder 0."""
    xs = [x1, x2]
    ys = [y1, y2]
    ax.plot(xs, ys, color=color, linewidth=1.2, zorder=0)

# ========== ROOT ==========
# The label comes from out_path so it always names the file that was written.
root_size_mb = out_path.stat().st_size / 1024**2
draw_box(ax, 22, 90, 56, 7,
         out_path.name,
         C_ROOT, fontsize=12,
         subtext=f"{len(df_final):,} rows  ×  {len(df_final.columns)} cols  |  {root_size_mb:.0f} MB")

# ========== LEVEL 1: Player Identity + FROM + TO ==========
# Player identity counts; .get() keeps the cell running if a section is absent.
n_pm = len(cats.get("player_meta", []))
ws_cols = cats.get("wyscout_player", [])
n_ws = len(ws_cols)
# Wyscout coverage is the non-null share of the first wyscout column.
cov_ws = df[ws_cols[0]].notna().sum() / len(df) * 100 if ws_cols else 0

# Trunk from the root box down to the horizontal bar at y=83
draw_line(ax, 50, 90, 50, 83)
# Horizontal bar out to the left and right branches
draw_line(ax, 50, 83, 15, 83)
draw_line(ax, 50, 83, 85, 83)
# Drops from the bar to the three level-1 boxes
for branch_x in (15, 50, 85):
    draw_line(ax, branch_x, 83, branch_x, 79)

# Player Identity block
draw_box(ax, 1, 72, 28, 7,
         "Player Identity",
         C_PLAYER, fontsize=10,
         subtext=f"{n_pm} core + {n_ws} wyscout = {n_pm+n_ws} cols | wyscout: {cov_ws:.0f}%")

# FROM block: total over every section whose key starts with "from_"
n_from = sum(len(cols) for key, cols in cats.items() if key.startswith("from_"))
draw_box(ax, 33, 72, 34, 7,
         "FROM  (origin club/season)",
         C_FROM, fontsize=10,
         subtext=f"{n_from} cols total")

# TO block: total over every section whose key starts with "to_"
n_to = sum(len(cols) for key, cols in cats.items() if key.startswith("to_"))
draw_box(ax, 70, 72, 29, 7,
         "TO  (destination club/season)",
         C_TO, fontsize=10,
         subtext=f"{n_to} cols total")

# ========== LEVEL 2 + 3: FROM and TO subtrees ==========
# Both sides share one drawing routine and read every column count from `cats`,
# so the diagram cannot drift from the data.
#
# NOTE(review): box positions are unchanged. On the FROM side the last player
# sub-box (centre 57, width 5.6) and the first team sub-box (centre 55.5, width 5)
# span overlapping x ranges; the same holds for 93 / 91 on the TO side.
player_sub_y = 50


def _section_ncols(*keys):
    """Return the total number of columns classified under the given section keys."""
    return sum(len(cats.get(key, [])) for key in keys)


def _section_cov(key):
    """Return the non-null percentage of the first column of section `key` (0 if empty)."""
    cols = cats.get(key, [])
    return df[cols[0]].notna().sum() / len(df) * 100 if cols else 0


def _draw_side(ax, side, color, center_x, children_x, player_subs_x, team_subs_x):
    """Draw the level-2 boxes and level-3 breakdowns for one side of the transfer.

    side          -- "from" or "to"; used as the prefix of the section keys.
    color         -- fill color of the side's level-2 and player sub-boxes.
    center_x      -- x of the parent level-1 box the connectors start from.
    children_x    -- four x centres: metadata, comp meta, player stats, team stats.
    player_subs_x -- three x centres: raw, per-90, z-score sub-boxes.
    team_subs_x   -- two x centres: team and opponent sub-boxes.
    """
    player_keys = [f"{side}_player_raw", f"{side}_player_per90", f"{side}_player_zscore"]
    team_keys = [f"{side}_team_stats", f"{side}_team_opp_stats"]

    # Connectors from the level-1 box down to each level-2 box
    for x in children_x:
        draw_line(ax, center_x, 72, x, 67)
        draw_line(ax, x, 67, x, 64)

    # Level 2: the two "stats" parents report the sum of their level-3 children;
    # coverage is taken from the first column of the group's first section.
    level2 = [
        ("Metadata",      [f"{side}_player_meta"]),
        ("Comp Meta",     [f"{side}_comp_meta"]),
        ("Player\nStats", player_keys),
        ("Team\nStats",   team_keys),
    ]
    for x, (label, keys) in zip(children_x, level2):
        draw_box(ax, x - 3.5, 57, 7.5, 7,
                 label, color, fontsize=7, alpha=0.7,
                 subtext=f"{_section_ncols(*keys)}c | {_section_cov(keys[0]):.0f}%")

    player_x, team_x = children_x[2], children_x[3]

    # Level 3: player stats breakdown
    for sx, key, label in zip(player_subs_x, player_keys, ("Raw", "Per90", "Z-score")):
        draw_line(ax, player_x, 57, sx, player_sub_y + 5.5)
        draw_box(ax, sx - 2.8, player_sub_y, 5.6, 5.5,
                 label, color, fontsize=7, alpha=0.55,
                 subtext=f"{_section_ncols(key)}c")

    # Level 3: team stats breakdown, in the team color shown in the legend
    for sx, key, label in zip(team_subs_x, team_keys, ("Team", "Opp")):
        draw_line(ax, team_x, 57, sx, player_sub_y + 5.5)
        draw_box(ax, sx - 2.5, player_sub_y, 5, 5.5,
                 label, C_TEAM, fontsize=7, alpha=0.55,
                 subtext=f"{_section_ncols(key)}c | {_section_cov(key):.0f}%")


# FROM subtree
_draw_side(ax, "from", C_FROM,
           center_x=50,
           children_x=[34.5, 42.5, 50.5, 58.5],
           player_subs_x=[44, 50.5, 57],
           team_subs_x=[55.5, 61.5])

# TO subtree (mirror)
_draw_side(ax, "to", C_TO,
           center_x=85,
           children_x=[71.5, 79, 86.5, 94],
           player_subs_x=[80, 86.5, 93],
           team_subs_x=[91, 97])

# ========== LEVEL 2: Player Identity children ==========
pi_center = 15
pi_children_x = [8, 22]
for child_x in pi_children_x:
    # Diagonal from the parent box, then a short vertical drop to the child box
    draw_line(ax, pi_center, 72, child_x, 67)
    draw_line(ax, child_x, 67, child_x, 64)

# The core coverage figure is fixed text; the Wyscout one comes from cov_ws.
core_caption = f"{n_pm}c | 100%"
wyscout_caption = f"{n_ws}c | {cov_ws:.0f}%"

draw_box(ax, 4, 57, 8, 7,
         "Core IDs", C_PLAYER, fontsize=7, alpha=0.7,
         subtext=core_caption)
draw_box(ax, 18, 57, 8.5, 7,
         "Wyscout", C_PLAYER, fontsize=7, alpha=0.7,
         subtext=wyscout_caption)

# ========== LEGEND ==========
legend_spec = [
    (C_PLAYER, "Player Identity"),
    (C_FROM, "FROM (origin)"),
    (C_TO, "TO (destination)"),
    (C_TEAM, "Team Season Stats"),
]
legend_items = [mpatches.Patch(color=swatch, label=caption) for swatch, caption in legend_spec]
ax.legend(
    handles=legend_items,
    loc="lower left",
    fontsize=9,
    framealpha=0.9,
    facecolor="#2C3E50",
    edgecolor="white",
    labelcolor="white",
)

# Footnote explaining the abbreviations used in the box captions
ax.text(
    50, 44,
    "c = columns  |  % = non-null coverage  |  FROM and TO are perfect structural mirrors",
    ha="center", fontsize=8, color="#7F8C8D", style="italic",
)

# The PNG is written relative to the current working directory.
plt.tight_layout()
plt.savefig("dataset_structure.png", dpi=150, bbox_inches="tight", facecolor="white")
plt.show()
print("Saved: dataset_structure.png")

## 7. Resumen ejecutivo

In [None]:
# Print a boxed plain-text summary of the final dataset: shape and on-disk size,
# Player Identity counts, per-section FROM counts and the TO total.
# Reads n_pm, n_ws and cov_ws from the diagram cell, and df_final / out_path from
# the save cell, so those cells must have run first.
# The "100% coverage" for core IDs, the season range and the transfer types are
# fixed text in the template, not values computed here.
# NOTE: the right-hand border is positioned with literal spaces, so it only lines
# up when the interpolated values have the widths the padding was written for.
print(f"""
╔══════════════════════════════════════════════════════════════════╗
║  transfers_model_final_2018_2025.parquet                       ║
║  {df_final.shape[0]:,} rows × {df_final.shape[1]} cols | {out_path.stat().st_size/1024**2:.0f} MB                        ║
╠══════════════════════════════════════════════════════════════════╣
║                                                                ║
║  PLAYER IDENTITY          {n_pm+n_ws:>3d} cols                            ║
║    Core IDs & meta        {n_pm:>3d}  (100% coverage)                  ║
║    Wyscout metadata       {n_ws:>3d}  ({cov_ws:.0f}% coverage)                   ║
║                                                                ║
║  FROM (origin)            {sum(len(cats[k]) for k in cats if k.startswith('from_')):>3d} cols                            ║
║    Player meta              {len(cats['from_player_meta']):>2d}  (IDs, position, minutes)       ║
║    Competition meta          {len(cats['from_comp_meta']):>1d}  (name, country, division...)    ║
║    Player raw metrics       {len(cats['from_player_raw']):>2d}  (absolutes, %, indices)        ║
║    Player per 90            {len(cats['from_player_per90']):>2d}  (normalized metrics)           ║
║    Player z-scores          {len(cats['from_player_zscore']):>2d}  (standardized)                ║
║    Team season stats        {len(cats['from_team_stats']):>2d}  (team performance avg)         ║
║    Team opp season stats    {len(cats['from_team_opp_stats']):>2d}  (opponents performance avg)   ║
║                                                                ║
║  TO (destination)         {sum(len(cats[k]) for k in cats if k.startswith('to_')):>3d} cols  ← perfect mirror of FROM   ║
║                                                                ║
║  Seasons: 2018 - 2025                                         ║
║  Transfer types: same_competition + different_competition      ║
╚══════════════════════════════════════════════════════════════════╝
""")