# Ingest & Clean

Load raw CSVs from `data/raw/`, fix dtypes, remove duplicates, handle missing values.

In [1]:
# Setup: import the project's ingest helpers and path constants, then echo
# the raw/interim directories so their resolved locations are visible.
from pathlib import Path
import pandas as pd
from src.utils.ingest import load_raw_csvs, save_interim
from src.utils.paths import RAW_DIR, INTERIM_DIR
# Last top-level expression of the cell, so the notebook displays this tuple.
RAW_DIR, INTERIM_DIR

(WindowsPath('C:/Users/javie/Documents/personal_projects/fm-ml/data/raw'),
 WindowsPath('C:/Users/javie/Documents/personal_projects/fm-ml/data/interim'))

In [2]:
# Read every raw CSV via the project loader, then show the keys found and
# the shape of the table stored under each key.
raw = load_raw_csvs()
list(raw), {name: frame.shape for name, frame in raw.items()}

([], {})

In [3]:
# Example cleaning function you can adapt
def basic_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with normalized column names and no duplicate rows.

    Each column label is converted to ``str``, stripped of surrounding
    whitespace, lower-cased, and has its spaces replaced by underscores.
    Non-string labels (for example integer headers) are stringified instead
    of raising ``AttributeError``.

    Args:
        df: Frame to clean. It is copied first and never modified.

    Returns:
        A new DataFrame with standardized column names and exact duplicate
        rows dropped (the first occurrence is kept).
    """
    out = df.copy()
    # str() first: labels are not guaranteed to be strings.
    out.columns = [str(c).strip().lower().replace(" ", "_") for c in out.columns]
    return out.drop_duplicates()

# If a 'players' CSV exists, clean it and preview the first rows.
# The preview is printed explicitly: an expression nested inside an `if` body
# is not the cell's last top-level expression, so the notebook would not
# display it on its own.
if 'players' in raw:
    players_clean = basic_clean(raw['players'])
    print(players_clean.head())

In [4]:
# Save cleaned data to interim as Parquet
# Guard on 'players_clean', the name bound when a 'players' CSV was cleaned.
# No global called 'players' is ever created, so checking for it meant the
# save could never run.
if 'players_clean' in globals():
    out_path = save_interim(players_clean, 'players_cleaned')
    # Printed explicitly: a bare expression inside `if` is not displayed.
    print(out_path, out_path.exists())

## Next steps
- Build features in `01_feature_build.ipynb` using `src/features/`.
- Train a baseline in `02_match_outcome_baseline.ipynb`.

In [19]:
# Load the FM HTML exports via the project loader, then show the keys found
# and the shape of the table stored under each key.
from src.utils.ingest import load_raw_htmls, parse_fm_html_table
html = load_raw_htmls()
list(html), {name: table.shape for name, table in html.items()}

(['squad_1', 'squad_mental', 'squad_physical', 'squad_technical'],
 {'squad_1': (29, 35),
  'squad_mental': (29, 18),
  'squad_physical': (29, 12),
  'squad_technical': (29, 32)})

In [16]:
# Save squad table (first HTML, by sorted key order) to interim
if html:
    # min() over the keys is the same element as sorted(keys)[0].
    first_key = min(html)
    squad_df = html[first_key]
    out_path = save_interim(squad_df, f"{first_key}_cleaned")
    # Printed explicitly: a bare expression inside `if` is not displayed.
    print(out_path)

In [22]:
# Merge multiple squad tables into a single per-player dataset
from src.features.squad import merge_squad_tables
if html:
    squad_merged = merge_squad_tables(html)
    # Show the merged shape and the first ten column names. Printed
    # explicitly: a bare expression inside `if` is not displayed.
    print(squad_merged.shape, squad_merged.columns[:10].tolist())

In [24]:
# Save merged squad features
from src.utils.paths import FEATURES_DIR
def dedupe_column_names(names: list) -> list:
    """Return *names* with repeated labels renamed so every entry is unique.

    The first occurrence of a label is kept as-is. Each later occurrence
    becomes ``<label>__dup<k>``, using the smallest counter ``k`` whose
    result does not clash with any original label or any name already
    generated.

    Args:
        names: Column labels in their original order (must be hashable).

    Returns:
        A new list of the same length with no repeated entries.
    """
    taken = set(names)  # every name that must not be produced again
    counters = {}
    unique = []
    for name in names:
        if name not in counters:
            counters[name] = 0
            unique.append(name)
            continue
        candidate = name
        # Skip suffixes that already exist, e.g. a real 'apps__dup1' column.
        while candidate in taken:
            counters[name] += 1
            candidate = f"{name}__dup{counters[name]}"
        taken.add(candidate)
        unique.append(candidate)
    return unique


if 'squad_merged' in globals():
    # Debug: report the merged columns and how many labels are repeated.
    print("Columns:", list(squad_merged.columns))
    print("Duplicates:", squad_merged.columns.duplicated().sum())

    # Make column labels unique in place before writing the Parquet file.
    squad_merged.columns = dedupe_column_names(squad_merged.columns.tolist())

    FEATURES_DIR.mkdir(parents=True, exist_ok=True)
    feat_path = FEATURES_DIR / 'squad_features.parquet'
    squad_merged.to_parquet(feat_path, index=False)
    # Printed explicitly: a bare expression inside `if` is not displayed.
    print(feat_path, feat_path.exists())

Columns: ['age', 'apps', 'mins', 'av_rat', 'Player', 'gl_mst', 'int_90', 'pres_c_90', 'tck_r', 'tck_90', 'tcon_90', 'k_hdrs', 'hdrs_w_90', 'cr_c_a', 'ch_c_90', 'op_kp_90', 'drb_90', 'asts_90', 'op_kp', 'ast', 'xa', 'tgls_90', 'pts_gm', 'xg', 'gls', 'xg_shot', 'gls_90', 'sht_90', 'shot', 'ability', 'potential', 'det', 'personality', 'apps', 'apps_subs']
Duplicates: 1
