# Data Verification Notebook
Verify that the parquet files in data/01-raw are correct.

In [None]:
import pandas as pd
from pathlib import Path

# Root folder holding one sub-directory per season of raw parquet files.
RAW_DIR = Path('01-raw/premier_league')
print(f'Raw data directory: {RAW_DIR}')

# Every sub-directory name is reported as a season, in sorted order.
season_names = sorted(entry.name for entry in RAW_DIR.iterdir() if entry.is_dir())
print(f'Available seasons: {season_names}')

## Season 2025

In [None]:
# Load the 2025 fixtures and report the frame's dimensions.
matches_2025 = pd.read_parquet(RAW_DIR / '2025' / 'matches.parquet')
row_count_2025, col_count_2025 = matches_2025.shape
print(f'Total matches 2025: {row_count_2025}')
print(f'Columns: {col_count_2025}')

# Only the first 20 column names are printed to keep the output short.
print('\nColumn names:')
print(matches_2025.columns[:20].tolist())

In [None]:
# Summary columns to preview; any that are absent from the frame are skipped.
cols = [
    'fixture.id',
    'fixture.date',
    'teams.home.name',
    'teams.away.name',
    'goals.home',
    'goals.away',
    'fixture.status.short',
]
available_cols = []
for wanted in cols:
    if wanted in matches_2025.columns:
        available_cols.append(wanted)

print('Last 5 matches (2025):')
# Trailing expression: the notebook renders it as the cell output.
matches_2025.loc[:, available_cols].tail(5)

In [None]:
# Count 2025 matches per short status code, when that column is present.
print('Match status breakdown (2025):')
status_column = 'fixture.status.short'
if status_column in matches_2025.columns:
    status_counts_2025 = matches_2025[status_column].value_counts()
    print(status_counts_2025)

## Season 2024

In [None]:
# Load the 2024 fixtures and report the frame's dimensions.
matches_2024 = pd.read_parquet(RAW_DIR / '2024' / 'matches.parquet')
row_count_2024, col_count_2024 = matches_2024.shape
print(f'Total matches 2024: {row_count_2024}')
print(f'Columns: {col_count_2024}')

In [None]:
# Filter the summary columns (`cols`, defined in the 2025 section) against the
# 2024 frame itself. Reusing the list that was filtered on the 2025 columns
# would raise KeyError whenever a column exists in 2025 but not in 2024.
available_cols_2024 = [c for c in cols if c in matches_2024.columns]
print('Last 5 matches (2024):')
# Trailing expression: the notebook renders it as the cell output.
matches_2024[available_cols_2024].tail(5)

In [None]:
# Count 2024 matches per short status code, when that column is present.
print('Match status breakdown (2024):')
status_column = 'fixture.status.short'
if status_column in matches_2024.columns:
    status_counts_2024 = matches_2024[status_column].value_counts()
    print(status_counts_2024)

## Check Other Parquet Files

In [None]:
# Read every parquet file in the 2025 folder and print its dimensions.
print('Parquet files in 2025:')
season_dir_2025 = RAW_DIR / '2025'
for f in sorted(season_dir_2025.glob('*.parquet')):
    df = pd.read_parquet(f)
    n_rows, n_cols = df.shape
    print(f'  {f.name}: {n_rows} rows, {n_cols} cols')

In [None]:
# Preview the 2025 events file; the cell does nothing if the file is absent.
events_file = RAW_DIR / '2025' / 'events.parquet'
if events_file.exists():
    events = pd.read_parquet(events_file)
    # Only the first 15 column names are printed.
    leading_event_cols = events.columns[:15].tolist()
    print(f'Events columns: {leading_event_cols}')
    print('\nSample events:')
    display(events.head(3))

In [None]:
# Preview the 2025 lineups file; the cell does nothing if the file is absent.
lineups_file = RAW_DIR / '2025' / 'lineups.parquet'
if lineups_file.exists():
    lineups = pd.read_parquet(lineups_file)
    # Only the first 15 column names are printed.
    leading_lineup_cols = lineups.columns[:15].tolist()
    print(f'Lineups columns: {leading_lineup_cols}')
    print('\nSample lineups:')
    display(lineups.head(3))

In [None]:
# Preview the 2025 player stats file; the cell does nothing if the file is absent.
stats_file = RAW_DIR / '2025' / 'player_stats.parquet'
if stats_file.exists():
    stats = pd.read_parquet(stats_file)
    # Only the first 15 column names are printed.
    leading_stat_cols = stats.columns[:15].tolist()
    print(f'Player stats columns: {leading_stat_cols}')
    print('\nSample player stats:')
    display(stats.head(3))