# Alineación y reordenamiento de columnas — Transfers Dataset

Partimos de `male_transfers_model_2018_2025.parquet` (262,340 rows).

**Objetivos:**
1. Verificar que las 176 columnas `from_` y 176 `to_` son espejos exactos
2. Identificar y eliminar columnas redundantes (`team_id_to`, `competition_to`, `season_to`)
3. Reordenar en bloques lógicos y guardar parquet limpio (360 cols)

In [None]:
# -- Paths (resolve Unicode dir name dynamically) --
from pathlib import Path
# Locate the thesis data directory by scanning Documents for a folder whose
# name contains both "Jorge" and "MacBook", instead of hard-coding the full
# folder name. The first matching directory yielded by iterdir() wins.
docs = Path("/Users/jorgepadilla/Documents")
for _d in docs.iterdir():
    if "Jorge" in _d.name and "MacBook" in _d.name and _d.is_dir():
        RAW = _d / "thesis_data" / "raw_data"
        PROCESSED = _d / "thesis_data" / "processed_data"
        break
else:
    # Without this, RAW/PROCESSED would stay undefined and the failure would
    # only surface later as a NameError far from its cause.
    raise FileNotFoundError(
        f"No directory containing 'Jorge' and 'MacBook' found under {docs}"
    )

import pandas as pd
from pathlib import Path

# Folder holding the transfers parquet; `base` is reused when saving.
base = RAW.joinpath("Transfers")
_transfers_file = base.joinpath("male_transfers_model_2018_2025.parquet")
df = pd.read_parquet(_transfers_file)
print(f"Shape: {df.shape}")

## 1. Verificación de alineación from_ ↔ to_

In [None]:
# Split the columns into the two prefixed families.
from_cols = sorted(c for c in df.columns if c.startswith("from_"))
to_cols = sorted(c for c in df.columns if c.startswith("to_"))

# Drop only the leading prefix so the remaining suffixes can be compared.
from_suffixes = sorted(c[len("from_"):] for c in from_cols)
to_suffixes = sorted(c[len("to_"):] for c in to_cols)

print(f"from_ columns: {len(from_cols)}")
print(f"to_   columns: {len(to_cols)}")
print(f"\nSuffixes match perfectly: {from_suffixes == to_suffixes}")

# Suffixes present on one side only (empty lists mean a perfect mirror).
_from_set, _to_set = set(from_suffixes), set(to_suffixes)
only_from = sorted(_from_set - _to_set)
only_to = sorted(_to_set - _from_set)

if only_from:
    print(f"\n‚ö†Ô∏è Only in from_: {only_from}")
if only_to:
    print(f"\n‚ö†Ô∏è Only in to_: {only_to}")
if not (only_from or only_to):
    # NOTE(review): the "176" in this message is hard-coded, not derived
    # from len(from_cols).
    print("\n‚úÖ Todas las 176 m√©tricas from_ tienen su espejo exacto en to_")

## 2. Clasificación de sufijos en categorías

In [None]:
def classify_suffix(s):
    """Classify a column suffix as "meta", "zscore", "per90" or "raw".

    Rules are checked in order: exact match against the fixed metadata
    names, then the ``z_score_`` prefix, then the substring ``"per 90"``.
    Anything else is "raw".
    """
    meta_names = ("team_id", "competition", "season", "position", "Minutes")
    if s in meta_names:
        category = "meta"
    elif s.startswith("z_score_"):
        # Checked before "per 90" so z-scored per-90 metrics stay in zscore.
        category = "zscore"
    elif "per 90" in s:
        category = "per90"
    else:
        category = "raw"
    return category

# Group every from_ suffix under the category classify_suffix assigns it.
categories = {}
for suffix in from_suffixes:
    categories.setdefault(classify_suffix(suffix), []).append(suffix)

# Print each category with its member suffixes in sorted order.
_rule = "=" * 60
for label in ("meta", "raw", "per90", "zscore"):
    members = categories.get(label, [])
    print(f"\n{_rule}")
    print(f"{label.upper()} ({len(members)} columnas)")
    print(f"{_rule}")
    for suffix in sorted(members):
        print(f"  {suffix}")

## 3. Columnas globales (metadata del jugador y del transfer)

In [None]:
# Columns that belong to neither the from_ nor the to_ family.
meta_global = [c for c in df.columns if not c.startswith(("from_", "to_"))]
print(f"Columnas globales ({len(meta_global)}):")
for col in meta_global:
    series = df[col]
    print(f"  {col:30s} dtype={str(series.dtype):20s} nulls={series.isna().sum():>6,}")

## 4. Tabla de verificación from ↔ to lado a lado

In [None]:
# One record per suffix: the from_/to_ column pair, their dtypes, and
# whether those dtypes agree.
alignment_rows = []
for suffix in sorted(from_suffixes):
    category = classify_suffix(suffix)
    from_name, to_name = f"from_{suffix}", f"to_{suffix}"
    from_dtype = str(df[from_name].dtype)
    to_dtype = str(df[to_name].dtype)
    alignment_rows.append(
        dict(
            suffix=suffix,
            category=category,
            from_col=from_name,
            from_dtype=from_dtype,
            to_col=to_name,
            to_dtype=to_dtype,
            dtype_match=from_dtype == to_dtype,
        )
    )

df_align = pd.DataFrame(alignment_rows)
print(f"Total pares from/to: {len(df_align)}")
print(f"Dtypes coinciden en todos: {df_align['dtype_match'].all()}")

# Rows whose two dtypes differ.
mismatches = df_align.loc[~df_align["dtype_match"]]
if mismatches.empty:
    print("\n‚úÖ Todos los pares from/to tienen el mismo dtype")
else:
    print(f"\n‚ö†Ô∏è Dtype mismatches ({len(mismatches)}):")
    print(mismatches[["suffix", "from_dtype", "to_dtype"]].to_string(index=False))

print("\nüìä Conteo por categor√≠a:")
_category_counts = df_align["category"].value_counts().sort_index()
print(_category_counts.to_string())

In [None]:
# Raise the row limit so the whole alignment table is displayed.
pd.set_option("display.max_rows", 200)
# The trailing expression is the cell's displayed output.
df_align.loc[:, ["category", "from_col", "to_col", "from_dtype", "dtype_match"]].sort_values(
    by=["category", "from_col"]
)

## 5. Reordenar columnas y guardar parquet limpio

Orden final:

| Bloque | # Cols |
|--------|--------|
| Player metadata (`player_id`, `short_name`, `birth_date`, `player_season_age`, `transfer_type`) | 5 |
| Transfer dates (`competition_start_date`, `first_played_date`, `last_played_date`) | 3 |
| FROM metadata (team_id, competition, season, position, Minutes) | 5 |
| FROM raw metrics | 50 |
| FROM per 90 | 46 |
| FROM z-scores | 75 |
| TO metadata (team_id, competition, season, position, Minutes) | 5 |
| TO raw metrics | 50 |
| TO per 90 | 46 |
| TO z-scores | 75 |
| **TOTAL** | **360** |

In [None]:
# Assemble the final column order: global columns first, then the FROM
# blocks, then the TO blocks in the same category order.
player_meta = ["player_id", "short_name", "birth_date", "player_season_age", "transfer_type"]
transfer_dates = ["competition_start_date", "first_played_date", "last_played_date"]

_category_order = ("meta", "raw", "per90", "zscore")

# FROM sections, one list per category, suffixes sorted within each.
from_meta, from_raw, from_p90, from_z = (
    [f"from_{s}" for s in sorted(categories[cat])] for cat in _category_order
)

# TO sections, built from the same sorted suffixes so they mirror FROM.
to_meta, to_raw, to_p90, to_z = (
    [f"to_{s}" for s in sorted(categories[cat])] for cat in _category_order
)

ordered_cols = [
    *player_meta, *transfer_dates,
    *from_meta, *from_raw, *from_p90, *from_z,
    *to_meta, *to_raw, *to_p90, *to_z,
]

# Every df column must appear in the ordering and vice versa; the length
# check additionally catches repeated names in ordered_cols.
_df_names, _ordered_names = set(df.columns), set(ordered_cols)
missing = _df_names - _ordered_names
extra = _ordered_names - _df_names
print(f"Columnas en df pero no en orden: {missing if missing else '‚àÖ'}")
print(f"Columnas en orden pero no en df: {extra if extra else '‚àÖ'}")
print(f"Total: {len(ordered_cols)} (expected {len(df.columns)})")
assert not missing and not extra and len(ordered_cols) == len(df.columns)
print("\n‚úÖ Todas las columnas incluidas, sin duplicados ni faltantes")

In [None]:
# Apply the new column order on an independent copy.
df_ordered = df[ordered_cols].copy()

# (label, columns) for each block, in final column order.
sections = [
    ("Player metadata", player_meta),
    ("Transfer dates", transfer_dates),
    ("FROM metadata", from_meta),
    ("FROM raw metrics", from_raw),
    ("FROM per 90", from_p90),
    ("FROM z-scores", from_z),
    ("TO metadata", to_meta),
    ("TO raw metrics", to_raw),
    ("TO per 90", to_p90),
    ("TO z-scores", to_z),
]

# Print a table with each block's zero-based, inclusive column index range.
_divider = "-" * 44
print(f"{'Bloque':<22s} {'Rango':<14s} {'# Cols':>6s}")
print(_divider)
col_idx = 0
for name, cols in sections:
    width = len(cols)
    first, last = col_idx, col_idx + width - 1
    print(f"{name:<22s} [{first:>3d} - {last:>3d}]  {width:>5d}")
    col_idx += width
print(_divider)
print(f"{'TOTAL':<22s} {'':14s} {col_idx:>5d}")

In [None]:
# Final check: within each category the from_ and to_ lists must carry the
# same suffixes in the same order.
misaligned = []
for cat_name, f_list, t_list in [
    ("meta", from_meta, to_meta),
    ("raw", from_raw, to_raw),
    ("per90", from_p90, to_p90),
    ("zscore", from_z, to_z)
]:
    f_suf = [c.replace("from_", "", 1) for c in f_list]
    t_suf = [c.replace("to_", "", 1) for c in t_list]
    match = f_suf == t_suf
    print(f"{cat_name:>8s}: {len(f_list)} from == {len(t_list)} to, orden id√©ntico: {match}")
    if not match:
        misaligned.append(cat_name)

# Only report success when every category matched; previously the success
# line was printed unconditionally, even after a False result above.
if misaligned:
    raise ValueError(f"Estructura from/to desalineada en: {misaligned}")

print("\n‚úÖ Estructura from/to perfectamente alineada")

In [None]:
# Save the reordered frame.
# NOTE: out_path is the same file the data was loaded from, so this
# overwrites the input. To avoid destroying it on a failed or bad write, the
# frame is first written to a sibling temp file, read back and verified, and
# only then moved over the target.
out_path = base / "male_transfers_model_2018_2025.parquet"
tmp_path = out_path.with_name(out_path.name + ".tmp")

try:
    df_ordered.to_parquet(tmp_path, index=False)

    # Verify the round trip before touching the real file.
    df_check = pd.read_parquet(tmp_path)
    assert list(df_check.columns) == ordered_cols
    assert df_check.shape == df_ordered.shape

    tmp_path.replace(out_path)
finally:
    # After a successful replace the temp file no longer exists; this only
    # cleans up after a failed write or failed verification.
    if tmp_path.exists():
        tmp_path.unlink()

print(f"Guardado: {out_path}")
print(f"Shape:    {df_ordered.shape}")
print(f"Tama√±o:   {out_path.stat().st_size / 1024 / 1024:.1f} MB")
print(f"\n‚úÖ Verificado: {df_check.shape[0]:,} rows √ó {df_check.shape[1]} cols, orden correcto")

In [None]:
# Snapshot: show up to the first three columns of each block, followed by a
# count of how many more the block contains.
_preview_size = 3
print("Primeras columnas de cada bloque:")
for name, cols in sections:
    print(f"\n  {name}:")
    for column in cols[:_preview_size]:
        print(f"    {column}")
    remaining = len(cols) - _preview_size
    if remaining > 0:
        print(f"    ... ({remaining} m√°s)")