In [1]:
import argparse
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import sys

In [2]:
def load_parquet_optional(p: Path):
    """Read a parquet file if it is present, otherwise return ``None``.

    Parameters
    ----------
    p : Path
        Location of the parquet file.

    Returns
    -------
    pandas.DataFrame or None
        The loaded table. ``None`` when the path does not exist, or when
        reading raises (the error is printed instead of being re-raised).
    """
    # Nothing on disk: there is nothing to load.
    if not p.exists():
        return None

    try:
        frame = pd.read_parquet(p)
    except Exception as e:
        # Best-effort load: report the problem and continue without this table.
        print(f"Failed to read {p}: {e}")
        return None
    return frame


def sanitize_numeric_series(s):
    """Convert ``s`` to numeric values via ``pandas.to_numeric``.

    Called with ``errors='coerce'``, so entries that cannot be parsed as
    numbers become NaN instead of raising.
    """
    coerced = pd.to_numeric(s, errors='coerce')
    return coerced


# small plotting helper
def save_plot(fig, path: Path):
    """Write ``fig`` to ``path`` and then close the figure.

    The parent directory of ``path`` is created first if it is missing.
    The figure is saved with ``bbox_inches='tight'`` and afterwards closed
    through ``plt.close``.
    """
    out_dir = path.parent
    out_dir.mkdir(parents=True, exist_ok=True)  # make sure the target folder exists
    fig.savefig(path, bbox_inches='tight')
    plt.close(fig)

In [4]:
# Notebook configuration: in a notebook the SEASON variable is set manually here.
SEASON = 2020

# The project root is taken as two levels above the current working directory,
# so this cell expects the kernel to be started from a folder two levels below it.
PROJECT_ROOT = Path.cwd().parents[1]
BASE_DIR = PROJECT_ROOT / "data" / "seasons" / str(SEASON)

SAVE_REPORT = False
OUT_DIR = BASE_DIR / 'eda_report'

# NOTE(review): the report folders are created even though SAVE_REPORT is False —
# confirm whether that is intended before gating the mkdir calls on the flag.
OUT_DIR.mkdir(parents=True, exist_ok=True)
PLOTS_DIR = OUT_DIR / 'plots'
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

print(f"Season folder: {BASE_DIR.resolve()}")

# %%
# Load normalized tables
matches_fp = BASE_DIR / 'matches.parquet'
players_fp = BASE_DIR / 'player_stats.parquet'
events_fp = BASE_DIR / 'events.parquet'
teams_fp = BASE_DIR / 'teams.parquet'

# Each table is None when its file is missing or could not be read.
matches = load_parquet_optional(matches_fp)
players = load_parquet_optional(players_fp)
events = load_parquet_optional(events_fp)
teams = load_parquet_optional(teams_fp)

# getattr with a None default keeps the summary working for tables that failed to load.
print('Loaded:')
print(' matches:', getattr(matches, 'shape', None))
print(' players:', getattr(players, 'shape', None))
print(' events :', getattr(events, 'shape', None))
print(' teams :', getattr(teams, 'shape', None))


Season folder: /home/kamil/projects/tipster/data/seasons/2020
Loaded:
 matches: (380, 17)
 players: (14643, 10)
 events : (11132, 9)
 teams : (20, 2)
