In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
def load_parquet_optional(p: Path):
    """Read a parquet file if it is available, otherwise return None.

    Parameters
    ----------
    p : Path
        Location of the parquet file.

    Returns
    -------
    pandas.DataFrame or None
        The loaded table; None when the path does not exist or when
        reading it raises (the failure is printed, not re-raised).
    """
    # Guard clause: a missing file is treated as "table not available".
    if not p.exists():
        return None

    try:
        frame = pd.read_parquet(p)
    except Exception as err:
        # Best-effort loader: report the problem and carry on without the table.
        print(f"Failed to read {p}: {err}")
        return None
    return frame


def sanitize_numeric_series(s):
    """Convert ``s`` to numeric values, turning unparseable entries into NaN.

    Thin wrapper around ``pd.to_numeric`` with ``errors='coerce'``; the
    result has whatever container type ``pd.to_numeric`` returns for ``s``.
    """
    coerced = pd.to_numeric(s, errors='coerce')
    return coerced


# small plotting helper
def save_plot(fig, path: Path):
    """Save ``fig`` to ``path`` and release it from pyplot.

    Parameters
    ----------
    fig
        Object exposing ``savefig(path, bbox_inches=...)`` (a matplotlib
        figure in normal use).
    path : Path
        Destination file; missing parent directories are created.

    Notes
    -----
    The notebook's import cell does not bind ``plt``, so referring to it
    here raised ``NameError`` on a fresh kernel. pyplot is instead looked up
    among the already-loaded modules: a figure can only be registered with
    pyplot if pyplot has been imported, so when it is absent there is
    nothing to close.
    """
    import sys  # function-scope import: `sys` is not imported at notebook level

    path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(path, bbox_inches='tight')

    pyplot = sys.modules.get('matplotlib.pyplot')
    if pyplot is not None:
        # Drop the figure from pyplot's registry so repeated plotting
        # does not accumulate open figures in the kernel.
        pyplot.close(fig)

In [3]:
# Season to analyse; when pasted into a notebook, set SEASON manually.
SEASON = 2020

# The project root is taken to be two levels above the current working
# directory, so the notebook must be run from a folder at that depth.
PROJECT_ROOT = Path.cwd().parents[1]
BASE_DIR = PROJECT_ROOT / "data" / "seasons" / str(SEASON)

SAVE_REPORT = False
OUT_DIR = BASE_DIR / 'eda_report'
PLOTS_DIR = OUT_DIR / 'plots'

# Note: both report folders are created here regardless of SAVE_REPORT.
OUT_DIR.mkdir(parents=True, exist_ok=True)
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

print(f"Season folder: {BASE_DIR.resolve()}")

# %%
# Locate the normalized season tables under BASE_DIR
matches_fp = BASE_DIR / 'matches.parquet'
players_fp = BASE_DIR / 'player_stats.parquet'
events_fp = BASE_DIR / 'events.parquet'
teams_fp = BASE_DIR / 'teams.parquet'
processed_players_fp = BASE_DIR / 'processed_player_stats.parquet'

# Each table is optional: a missing or unreadable file yields None
matches = load_parquet_optional(matches_fp)
players_stats = load_parquet_optional(players_fp)
events = load_parquet_optional(events_fp)
teams = load_parquet_optional(teams_fp)
processed_players_stats = load_parquet_optional(processed_players_fp)

# Shape summary; tables that failed to load are reported as None
print('Loaded:')
for label, table in (
    (' matches:', matches),
    (' players:', players_stats),
    (' processed_players:', processed_players_stats),
    (' events :', events),
    (' teams :', teams),
):
    print(label, getattr(table, 'shape', None))

Season folder: /home/kamil/projects/tipster/data/seasons/2020
Loaded:
 matches: (380, 17)
 players: (14643, 43)
 processed_players: (10178, 11)
 events : (11132, 9)
 teams : (20, 2)


In [4]:
# Peek at the last five rows of the processed per-player feature table
processed_players_stats.tail(5)

Unnamed: 0,fixture_id,player_id,team_id,fixture_dt,rating_ema5,shots_on_per90_ema5,shots_total_per90_ema5,passes_key_per90_ema5,minutes_share_ema5,games_rating,minutes
10173,592854,280688,39,2021-05-16 13:05:00+00:00,,0.0,0.0,0.0,0.0,6.9,8
10174,592843,296458,62,2021-05-08 14:00:00+00:00,,,,,,6.3,25
10175,592848,296458,62,2021-05-16 18:00:00+00:00,6.3,0.0,3.6,0.0,0.277778,7.2,90
10176,592862,296458,62,2021-05-19 17:00:00+00:00,6.6,0.666667,3.066667,0.333333,0.518519,6.7,90
10177,592873,296458,62,2021-05-23 15:00:00+00:00,6.633333,0.444444,2.044444,0.222222,0.679012,6.3,79


In [5]:
# Sanity checks on the processed per-player feature table.

# 1) Descriptives: quick scale/sanity profile of the numeric columns
print(processed_players_stats.describe().T)

# 2) Missingness: per-column NA counts, worst columns first
nulls = processed_players_stats.isna().sum().sort_values(ascending=False)
print(f'nulls: \n{nulls.head(10)}')

# 3) Duplicates on (fixture_id, player_id): expected to be 0
dupes = processed_players_stats.duplicated(subset=['fixture_id', 'player_id']).sum()
print("Duplicate (fixture_id, player_id):", dupes)

# 4) Range rules.
# Series.between() evaluates to False for NaN, so negating it alone would
# count every missing value as a violation. Missing values are already
# reported in step 2, so only non-null values are range-checked here.
if 'rating_ema5' in processed_players_stats.columns:
    rating = processed_players_stats['rating_ema5']
    bad = rating.notna() & ~rating.between(0, 10, inclusive='both')  # ratings expected within 0-10
    print("rating_ema5 out of [0,10]:", int(bad.sum()))
if 'minutes_share_ema5' in processed_players_stats.columns:
    share = processed_players_stats['minutes_share_ema5']
    bad = share.notna() & ~share.between(0, 1.5, inclusive='both')  # upper bound leaves headroom for extra time
    print("minutes_share_ema5 out of [0,1.5]:", int(bad.sum()))

# 5) IQR outliers for EMA columns (Tukey fences at 1.5 * IQR).
# Comparisons against NaN are False, so missing values are never flagged.
ema_cols = [c for c in processed_players_stats.columns if c.endswith('_ema5')]
for c in ema_cols:
    q1, q3 = processed_players_stats[c].quantile([0.25, 0.75])
    iqr = q3 - q1
    lo, hi = q1 - 1.5*iqr, q3 + 1.5*iqr
    mask = (processed_players_stats[c] < lo) | (processed_players_stats[c] > hi)
    print(f"{c} IQR outliers:", int(mask.sum()))

# 6) Chronology / leakage: count non-NaN EMA values on each player's earliest
# row. Presumably the EMAs are shifted by one match upstream (not visible in
# this notebook), in which case the earliest row has no history and every
# count should be 0 before any NaN filling.
first_rows = processed_players_stats.sort_values(['player_id', 'fixture_dt']).groupby('player_id').head(1)
print({c: int(first_rows[c].notna().sum()) for c in ema_cols})

                          count           mean           std       min  \
fixture_id              10178.0  592470.801533    272.814624  592141.0   
player_id               10178.0   14294.439183  24640.467411      17.0   
team_id                 10178.0      47.393201      9.328303      33.0   
rating_ema5              9664.0       6.895944       0.34612       4.7   
shots_on_per90_ema5      9820.0       0.394344      0.944144       0.0   
shots_total_per90_ema5   9820.0       0.804372      1.347174       0.0   
passes_key_per90_ema5    9820.0       0.810805       1.17888       0.0   
minutes_share_ema5       9820.0       0.722142      0.303238       0.0   
minutes                 10178.0      73.912262      26.84279       2.0   

                             25%       50%       75%        max  
fixture_id              592236.0  592331.0  592781.0   592875.0  
player_id                 1438.0   18742.0   19093.0   296458.0  
team_id                     40.0      46.0      52.0       66