# 01 — Prepare Player Data for Sub-Role Clustering

This notebook loads the transfer dataset, de-duplicates player-seasons, keeps only player-seasons with at least 500 minutes played, and prepares two feature sets for clustering:

- **Path A**: 20 Player Qualities (pre-computed by Twelve Football)
- **Path B**: 75 Per-90 Z-Score columns

In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd

# Dynamic path resolution: pick the first directory under Documents whose
# name contains both "Jorge" and "MacBook", then derive the raw-data and
# notebook-output folders from it.
# NOTE(review): the Documents root is an absolute, user-specific path; it has
# to be edited before the notebook can run on another machine.
docs = Path("/Users/jorgepadilla/Documents")
for d in docs.iterdir():
    if "Jorge" in d.name and "MacBook" in d.name and d.is_dir():
        BASE = d / "thesis_data" / "raw_data"
        NB_DIR = d / "thesis_data" / "notebooks" / "player_subroles"
        break
else:
    # Fail loudly here; otherwise a missing folder only surfaces below as a
    # NameError on NB_DIR / BASE.
    raise FileNotFoundError(
        f"No directory under {docs} has both 'Jorge' and 'MacBook' in its name"
    )

# Output folder for the artefacts written by this notebook
NB_DIR.mkdir(parents=True, exist_ok=True)

# Load the main transfer dataset
transfer_path = BASE / "Transfers" / "transfers_model_v2_2018_2025.parquet"
df = pd.read_parquet(transfer_path)
print(f"Loaded dataset: {df.shape[0]:,} rows x {df.shape[1]} columns")
print("\nPosition distribution (from_position):")
print(df["from_position"].value_counts().sort_values(ascending=False))

## 1. De-duplicate Player-Seasons

In [None]:
# Columns that together identify one (player, team, season) combination
dedup_keys = ["wy_player_id", "from_team_id", "from_season"]

# Count unique transfer rows vs unique player-team-season combinations.
# The count is taken with DataFrame.duplicated so it uses the same rule as the
# drop_duplicates call below; groupby(...).ngroups skips rows with a missing
# key by default and would then disagree with the rows actually dropped.
n_total = len(df)
n_unique_combos = int((~df.duplicated(subset=dedup_keys, keep="first")).sum())
print(f"Total transfer rows:                   {n_total:,}")
print(f"Unique (player, team, season) combos:  {n_unique_combos:,}")
print(f"Duplicate rows to drop:                {n_total - n_unique_combos:,}")
print()

# Before de-duplication counts per position
print("--- BEFORE de-duplication ---")
before_counts = df["from_position"].value_counts().sort_values(ascending=False)
print(before_counts)
print()

# De-duplicate: keep first occurrence per (wy_player_id, from_team_id, from_season)
df_dedup = df.drop_duplicates(subset=dedup_keys, keep="first").copy()
print("--- AFTER de-duplication ---")
after_counts = df_dedup["from_position"].value_counts().sort_values(ascending=False)
print(after_counts)
print(f"\nTotal after de-duplication: {len(df_dedup):,}")

# Filter to minimum 500 minutes (rows with missing from_Minutes are dropped too,
# because the comparison is False for NaN)
df_dedup = df_dedup[df_dedup["from_Minutes"] >= 500].copy()
print("\n--- AFTER >= 500 minutes filter ---")
final_counts = df_dedup["from_position"].value_counts().sort_values(ascending=False)
print(final_counts)
print(f"\nTotal after minutes filter: {len(df_dedup):,}")

## 2. Prepare Path A: 20 Player Qualities

In [None]:
# Path A feature names: the 20 player qualities, without the "from_" prefix
all_qualities = [
    "Active defence",
    "Aerial threat",
    "Box threat",
    "Chance prevention",
    "Composure",
    "Defensive heading",
    "Dribbling",
    "Effectiveness",
    "Finishing",
    "Hold-up play",
    "Intelligent defence",
    "Involvement",
    "Passing quality",
    "Poaching",
    "Pressing",
    "Progression",
    "Providing teammates",
    "Run quality",
    "Territorial dominance",
    "Winning duels",
]

# Same names as they are expected to appear as dataset columns
quality_cols = ["from_" + quality for quality in all_qualities]

# Split the expected columns into those present in the data and those absent
available_cols = set(df_dedup.columns)
existing_quality_cols = [c for c in quality_cols if c in available_cols]
missing_quality_cols = [c for c in quality_cols if c not in available_cols]
print(f"Quality columns found: {len(existing_quality_cols)} / {len(quality_cols)}")
if len(missing_quality_cols) > 0:
    print(f"Missing: {missing_quality_cols}")

# Position-specific quality expectations.
# These three lists are not referenced again in this cell; the coverage loop
# below measures every quality for every position instead.
cb_fb_extras = ["Chance prevention", "Territorial dominance"]
striker_winger_extras = ["Poaching"]
# Universal qualities (expected for all outfield positions): everything that
# is not one of the position-specific extras, in the order of all_qualities
universal_qualities = [
    quality
    for quality in all_qualities
    if quality not in cb_fb_extras and quality not in striker_winger_extras
]

positions = ["Central Defender", "Full Back", "Midfielder", "Winger", "Striker", "Goalkeeper"]

# A quality counts as usable for a position when at least this share of the
# position's rows is non-null
coverage_threshold = 0.80
print("\n--- Coverage Analysis (% non-null per position per quality) ---")

position_quality_features = {}
for pos in positions:
    pos_data = df_dedup.loc[df_dedup["from_position"] == pos]
    n_pos = pos_data.shape[0]
    if n_pos == 0:
        # Positions without any rows get no entry in the mapping
        continue

    print(f"\n{pos} (n={n_pos:,}):")
    valid_features = []
    for quality, col in zip(all_qualities, quality_cols):
        if col not in available_cols:
            continue
        coverage = pos_data[col].notna().mean()
        passes = coverage >= coverage_threshold
        marker = "YES" if passes else "no"
        print(f"  {quality:30s} coverage={coverage:.1%}  [{marker}]")
        if passes:
            valid_features.append(col)

    position_quality_features[pos] = valid_features
    print(f"  => {len(valid_features)} features pass the 80% coverage threshold")

# Build output: identifiers + all quality columns (even if NaN for some positions)
id_cols = ["wy_player_id", "from_team_id", "from_season", "from_position", "from_Minutes"]
# Add the first player-name column that exists, if any
name_candidates = ["short_name", "from_short_name", "from_player_name", "player_name"]
name_col = next((nc for nc in name_candidates if nc in available_cols), None)
if name_col is not None:
    id_cols.append(name_col)

out_cols = id_cols + existing_quality_cols
df_qualities = df_dedup.loc[:, out_cols].copy()

# Save
out_path = NB_DIR / "player_data_qualities.parquet"
df_qualities.to_parquet(out_path, index=False)
print(f"\nSaved: {out_path}")
print(f"Shape: {df_qualities.shape}")

## 3. Prepare Path B: 75 Per-90 Z-Scores

In [None]:
# Collect every z-score column by its shared name prefix
zscore_prefix = "from_z_score_"
zscore_cols = [c for c in df_dedup.columns if c.startswith(zscore_prefix)]
print(f"Total z-score columns found: {len(zscore_cols)}")

# Per position, keep the z-score columns whose non-null share reaches
# coverage_threshold (defined in the Path A cell)
print("\n--- Z-Score Coverage Analysis ---")

position_zscore_features = {}
for pos in positions:
    pos_data = df_dedup.loc[df_dedup["from_position"] == pos]
    n_pos = pos_data.shape[0]
    if n_pos == 0:
        # Positions without any rows get no entry in the mapping
        continue

    # Share of non-null values per z-score column for this position
    coverages = pos_data[zscore_cols].notna().mean()
    valid_zscores = coverages.loc[coverages.ge(coverage_threshold)].index.tolist()
    low_coverage = coverages.loc[coverages.lt(coverage_threshold)]
    n_low = len(low_coverage)

    position_zscore_features[pos] = valid_zscores
    print(f"\n{pos} (n={n_pos:,}):")
    print(f"  z-score features with >80% coverage: {len(valid_zscores)} / {len(zscore_cols)}")
    if n_low > 0:
        print(f"  Dropped (low coverage): {n_low} columns")
        # Show only the five worst-covered columns to keep the output short
        worst_five = low_coverage.sort_values().head(5)
        for col_name, cov in worst_five.items():
            short_name_col = col_name.replace(zscore_prefix, "")
            print(f"    {short_name_col:50s} coverage={cov:.1%}")
        if n_low > 5:
            print(f"    ... and {n_low - 5} more")

# Build output: identifiers + every z-score column
id_cols_z = ["wy_player_id", "from_team_id", "from_season", "from_position", "from_Minutes"]
# Add the first player-name column that exists, if any
name_candidates = ["short_name", "from_short_name", "from_player_name", "player_name"]
name_col_z = next((nc for nc in name_candidates if nc in df_dedup.columns), None)
if name_col_z is not None:
    id_cols_z.append(name_col_z)

out_cols_z = id_cols_z + zscore_cols
df_zscores = df_dedup.loc[:, out_cols_z].copy()

# Save
out_path_z = NB_DIR / "player_data_zscores.parquet"
df_zscores.to_parquet(out_path_z, index=False)
print(f"\nSaved: {out_path_z}")
print(f"Shape: {df_zscores.shape}")

## 4. Summary

In [None]:
# Summary table: one row per position with the number of player-seasons left
# after filtering and the number of features that passed the coverage check.
# Positions that had no rows have no entry in the feature mappings, hence .get.
summary_rows = []
for pos in positions:
    n_players = len(df_dedup[df_dedup["from_position"] == pos])
    n_qual = len(position_quality_features.get(pos, []))
    n_zsc = len(position_zscore_features.get(pos, []))
    summary_rows.append({
        "Position": pos,
        "N Players (>=500 min)": n_players,
        "Quality Features (Path A)": n_qual,
        "Z-Score Features (Path B)": n_zsc,
    })

summary_df = pd.DataFrame(summary_rows)
print("=" * 75)
print("SUMMARY: Features Available per Position")
print("=" * 75)
print(summary_df.to_string(index=False))

# Report the paths that the Path A / Path B cells actually wrote, instead of
# re-typing the file names here where they could drift out of sync.
print("\nFiles saved:")
print(f"  - {out_path}")
print(f"  - {out_path_z}")

# Save the feature mappings for downstream notebooks
# (json is imported in the notebook's import cell)
feature_map_path = NB_DIR / "feature_maps.json"
feature_map = {
    "position_quality_features": dict(position_quality_features),
    "position_zscore_features": dict(position_zscore_features),
}
# Explicit encoding so the file does not depend on the platform default
with open(feature_map_path, "w", encoding="utf-8") as f:
    json.dump(feature_map, f, indent=2)
print(f"  - {feature_map_path}")