In [2]:
from pathlib import Path
import pandas as pd
import numpy as np

In [3]:
def load_parquet_optional(p: Path):
    """Read a parquet file if it is present and readable.

    Parameters
    ----------
    p : Path
        Location of the parquet file.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` when the path does not exist or when
        ``pd.read_parquet`` raises (the failure is printed, not re-raised).
    """
    # Missing file: nothing to load, and this is not reported as an error.
    if not p.exists():
        return None

    try:
        frame = pd.read_parquet(p)
    except Exception as e:
        # Best-effort load: report the problem and let the caller handle None.
        print(f"Failed to read {p}: {e}")
        return None
    return frame


def sanitize_numeric_series(s):
    """Convert ``s`` to numeric values via ``pd.to_numeric``.

    Values that cannot be parsed are coerced to NaN (``errors='coerce'``)
    instead of raising.
    """
    numeric = pd.to_numeric(s, errors='coerce')
    return numeric


# small plotting helper
def save_plot(fig, path: Path):
    """Write ``fig`` to ``path`` and close it.

    The parent directory of ``path`` is created if it does not exist, the
    figure is saved with ``bbox_inches='tight'``, and the figure is then
    closed through ``plt.close``.

    NOTE(review): this relies on a module-level ``plt`` (presumably
    ``matplotlib.pyplot``); the visible import cell does not define it —
    confirm it is imported elsewhere in the notebook.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    fig.savefig(path, bbox_inches='tight')
    plt.close(fig)

In [4]:
# when pasted into a notebook, set SEASON variable manually
SEASON = 2025

# The project root is taken to be two levels above the current working
# directory, so the result depends on where the kernel was started.
PROJECT_ROOT = Path.cwd().parents[1]
BASE_DIR = PROJECT_ROOT / "data" / "seasons" / str(SEASON)

# NOTE(review): SAVE_REPORT is not consulted in this cell; the report
# directories below are created regardless of its value.
SAVE_REPORT = False
OUT_DIR = BASE_DIR / 'eda_report'
PLOTS_DIR = OUT_DIR / 'plots'

# parents=True creates OUT_DIR (and BASE_DIR) on the way to PLOTS_DIR.
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

print(f"Season folder: {BASE_DIR.resolve()}")

# %%
# Load normalized tables
# Each table is optional: load_parquet_optional returns None when the file is
# absent or unreadable, so the names below may be bound to None.
matches_fp = BASE_DIR / 'matches.parquet'
matches = load_parquet_optional(matches_fp)

players_fp = BASE_DIR / 'player_stats.parquet'
players_stats = load_parquet_optional(players_fp)

events_fp = BASE_DIR / 'events.parquet'
events = load_parquet_optional(events_fp)

teams_fp = BASE_DIR / 'teams.parquet'
teams = load_parquet_optional(teams_fp)

processed_players_fp = BASE_DIR / 'processed_player_stats.parquet'
processed_players_stats = load_parquet_optional(processed_players_fp)

# Report the shape of every table; getattr falls back to None for tables
# that failed to load.
print('Loaded:')
for label, table in (
    (' matches:', matches),
    (' players:', players_stats),
    (' processed_players:', processed_players_stats),
    (' events :', events),
    (' teams :', teams),
):
    print(label, getattr(table, 'shape', None))

Season folder: /home/kamil/projects/tipster/data/seasons/2025
Loaded:
 matches: (320, 17)
 players: (1200, 43)
 processed_players: (902, 12)
 events : (1098, 9)
 teams : (20, 2)


In [5]:
# Inspect the column names of players_stats.
players_stats.columns

Index(['fixture_id', 'player_id', 'player_name', 'team_id', 'games_rating',
       'minutes', 'goals', 'assists', 'yellow_cards', 'red_cards', 'raw',
       'offsides', 'games_minutes', 'games_number', 'games_position',
       'games_captain', 'games_substitute', 'shots_total', 'shots_on',
       'goals_total', 'goals_conceded', 'goals_assists', 'goals_saves',
       'passes_total', 'passes_key', 'passes_accuracy', 'tackles_total',
       'tackles_blocks', 'tackles_interceptions', 'duels_total', 'duels_won',
       'dribbles_attempts', 'dribbles_success', 'dribbles_past', 'fouls_drawn',
       'fouls_committed', 'cards_yellow', 'cards_red', 'penalty_won',
       'penalty_commited', 'penalty_scored', 'penalty_missed',
       'penalty_saved'],
      dtype='object')

In [6]:
# Inspect the column names of processed_players_stats.
processed_players_stats.columns

Index(['fixture_id', 'player_id', 'team_id', 'fixture_dt', 'rating_ema5',
       'shots_on_per90_ema5', 'shots_total_per90_ema5',
       'passes_key_per90_ema5', 'minutes_share_ema5', 'games_rating',
       'minutes', 'games_substitute'],
      dtype='object')

In [12]:
# Show the `raw` value of players_stats at index 35.
players_stats.raw[35]

'{"player": {"id": 151756, "name": "Will Dennis", "photo": "https://media.api-sports.io/football/players/151756.png"}, "statistics": [{"games": {"minutes": null, "number": 40, "position": "G", "rating": null, "captain": false, "substitute": true}, "offsides": null, "shots": {"total": null, "on": null}, "goals": {"total": null, "conceded": 0, "assists": null, "saves": null}, "passes": {"total": null, "key": null, "accuracy": null}, "tackles": {"total": null, "blocks": null, "interceptions": null}, "duels": {"total": null, "won": null}, "dribbles": {"attempts": null, "success": null, "past": null}, "fouls": {"drawn": null, "committed": null}, "cards": {"yellow": 0, "red": 0}, "penalty": {"won": null, "commited": null, "scored": 0, "missed": 0, "saved": null}}]}'

In [13]:
# Show the `raw` value of players_stats at index 36.
players_stats.raw[36]

'{"player": {"id": 51051, "name": "Juli\\u00e1n Araujo", "photo": "https://media.api-sports.io/football/players/51051.png"}, "statistics": [{"games": {"minutes": null, "number": 2, "position": "D", "rating": null, "captain": false, "substitute": true}, "offsides": null, "shots": {"total": null, "on": null}, "goals": {"total": null, "conceded": 0, "assists": null, "saves": null}, "passes": {"total": null, "key": null, "accuracy": null}, "tackles": {"total": null, "blocks": null, "interceptions": null}, "duels": {"total": null, "won": null}, "dribbles": {"attempts": null, "success": null, "past": null}, "fouls": {"drawn": null, "committed": null}, "cards": {"yellow": 0, "red": 0}, "penalty": {"won": null, "commited": null, "scored": 0, "missed": 0, "saved": null}}]}'

In [14]:
# Show the `raw` value of players_stats at index 37.
players_stats.raw[37]

'{"player": {"id": 363333, "name": "Julio Soler", "photo": "https://media.api-sports.io/football/players/363333.png"}, "statistics": [{"games": {"minutes": null, "number": 20, "position": "D", "rating": null, "captain": false, "substitute": true}, "offsides": null, "shots": {"total": null, "on": null}, "goals": {"total": null, "conceded": 0, "assists": null, "saves": null}, "passes": {"total": null, "key": null, "accuracy": null}, "tackles": {"total": null, "blocks": null, "interceptions": null}, "duels": {"total": null, "won": null}, "dribbles": {"attempts": null, "success": null, "past": null}, "fouls": {"drawn": null, "committed": null}, "cards": {"yellow": 0, "red": 0}, "penalty": {"won": null, "commited": null, "scored": 0, "missed": 0, "saved": null}}]}'

In [6]:
# Sanity checks on processed_players_stats: descriptives, missingness,
# key duplicates, range rules, IQR outliers and a first-row EMA check.

# 1) Descriptives
print(processed_players_stats.describe().T)  # quick scale/sanity profile

# 2) Missingness
nulls = processed_players_stats.isna().sum().sort_values(ascending=False)  # per-column NA counts
print(f'nulls: \n{nulls.head(10)}')

# 3) Duplicates on (fixture_id, player_id)
dupes = processed_players_stats.duplicated(subset=['fixture_id', 'player_id']).sum()  # should be 0
print("Duplicate (fixture_id, player_id):", dupes)

# 4) Range rules
# Series.between() yields False for NaN, so a bare `~between(...)` would count
# every missing value as "out of range". Missingness is already reported in
# step 2, so only non-null values are tested against the bounds here.
range_rules = {
    'rating_ema5': (0, 10),           # typical 0–10
    'minutes_share_ema5': (0, 1.5),   # allow some ET
}
for c, (lo, hi) in range_rules.items():
    if c in processed_players_stats.columns:
        values = processed_players_stats[c]
        bad = values.notna() & ~values.between(lo, hi, inclusive='both')
        print(f"{c} out of [{lo},{hi}]:", int(bad.sum()))

# 5) IQR outliers for EMA columns
ema_cols = [c for c in processed_players_stats.columns if c.endswith('_ema5')]
for c in ema_cols:
    values = processed_players_stats[c]
    q1, q3 = values.quantile([0.25, 0.75])  # both quartiles in one pass
    iqr = q3 - q1
    lo, hi = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    # Comparisons with NaN are False, so missing values are never flagged.
    mask = (values < lo) | (values > hi)
    print(f"{c} IQR outliers:", int(mask.sum()))

# 6) Chronology / leakage: count non-NaN EMA values on each player's earliest
# row (ordered by fixture_dt). The expected count is 0 if the EMAs are shifted
# and not yet filled; after a fill this check can no longer detect leakage.
# TODO confirm against the feature pipeline.
first_rows = processed_players_stats.sort_values(['player_id', 'fixture_dt']).groupby('player_id').head(1)
print({c: int(first_rows[c].notna().sum()) for c in ema_cols})

                          count            mean            std        min  \
fixture_id              11333.0  1208211.963911     110.667093  1208021.0   
player_id               11333.0     70325.10015  104148.376828        5.0   
team_id                 11333.0        46.54434       9.301055       33.0   
rating_ema5             10785.0        6.936705       0.345074        4.3   
shots_on_per90_ema5     10941.0        0.452372       0.978199        0.0   
shots_total_per90_ema5  10941.0        0.905672       1.401155        0.0   
passes_key_per90_ema5   10941.0         0.93786       1.722703        0.0   
minutes_share_ema5      10941.0          0.6785       0.312656        0.0   
minutes                 11333.0       66.644401      29.938195        6.0   

                              25%        50%        75%        max  
fixture_id              1208117.0  1208212.0  1208308.0  1208402.0  
player_id                  2490.0    19130.0   126949.0   460853.0  
team_id               