# Data Verification Notebook

Verify the match data collected from the API (the local copy under `data/01-raw`) that is uploaded to Hugging Face.

**Checks:**
1. Available leagues and seasons
2. Match counts and completeness
3. Data quality (missing values, duplicates)
4. Supporting data (events, lineups, player_stats)
5. Feature engineering readiness

In [1]:
# Standard library first, third-party second.
import warnings
from pathlib import Path

import pandas as pd

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas deprecation warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Root of the raw data tree; the cells below read <league>/<season>/*.parquet from it.
DATA_DIR = Path('../data/01-raw')
# Fixture count used as the 100% mark when computing season completeness.
EXPECTED_MATCHES_PER_SEASON = 380

# Echo the resolved location so a wrong working directory is obvious straight away.
print(f'Data directory: {DATA_DIR.absolute()}')
print(f'Exists: {DATA_DIR.exists()}')

Data directory: /home/kamil/projects/bettip/notebooks/../data/01-raw
Exists: True


## 1. Overview: Available Leagues and Seasons

In [2]:
def get_data_overview(data_dir: Path, expected_matches: "int | None" = None) -> pd.DataFrame:
    """Summarise every ``<league>/<season>`` folder found under ``data_dir``.

    A season is included only when its folder contains ``matches.parquet``.
    Plain files at either directory level are skipped.

    Args:
        data_dir: Root directory whose sub-directories are leagues, each
            holding one sub-directory per season.
        expected_matches: Number of fixtures that counts as 100% complete.
            Defaults to the module-level ``EXPECTED_MATCHES_PER_SEASON``.

    Returns:
        One row per season with the columns ``League``, ``Season``,
        ``Total Matches``, ``Completed``, ``Completeness``, ``Events``,
        ``Lineups`` and ``Stats``. The columns are present even when no
        season is found, so callers can index them on an empty result.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist.
    """
    columns = ['League', 'Season', 'Total Matches', 'Completed',
               'Completeness', 'Events', 'Lineups', 'Stats']
    if expected_matches is None:
        expected_matches = EXPECTED_MATCHES_PER_SEASON

    # Status codes that are counted as a completed match.
    completed_statuses = ['FT', 'AET', 'PEN']
    status_col = 'fixture.status.short'

    def yes_no(path: Path) -> str:
        return 'Yes' if path.exists() else 'No'

    rows = []
    for league_dir in sorted(data_dir.iterdir()):
        if not league_dir.is_dir():
            continue

        for season_dir in sorted(league_dir.iterdir()):
            if not season_dir.is_dir():
                continue

            matches_file = season_dir / 'matches.parquet'
            if not matches_file.exists():
                continue

            df = pd.read_parquet(matches_file)

            # Without a status column every row is treated as completed.
            if status_col in df.columns:
                completed = int(df[status_col].isin(completed_statuses).sum())
            else:
                completed = len(df)

            rows.append({
                'League': league_dir.name,
                'Season': season_dir.name,
                'Total Matches': len(df),
                'Completed': completed,
                'Completeness': f'{completed/expected_matches*100:.1f}%',
                'Events': yes_no(season_dir / 'events.parquet'),
                'Lineups': yes_no(season_dir / 'lineups.parquet'),
                'Stats': yes_no(season_dir / 'player_stats.parquet'),
            })

    # Passing the column list keeps the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

# Build the overview table once; the later cells loop over its rows.
overview = get_data_overview(DATA_DIR)

season_count = len(overview)
league_count = overview["League"].nunique()
match_total = overview["Total Matches"].sum()

print(f'Total: {season_count} season(s) across {league_count} league(s)')
print(f'Total matches: {match_total:,}')
print()
display(overview)

Total: 10 season(s) across 3 league(s)
Total matches: 3,800



Unnamed: 0,League,Season,Total Matches,Completed,Completeness,Events,Lineups,Stats
0,la_liga,2023,380,380,100.0%,Yes,Yes,Yes
1,la_liga,2024,380,380,100.0%,Yes,Yes,Yes
2,premier_league,2020,380,380,100.0%,Yes,Yes,Yes
3,premier_league,2021,380,380,100.0%,Yes,Yes,Yes
4,premier_league,2022,380,380,100.0%,Yes,Yes,Yes
5,premier_league,2023,380,380,100.0%,Yes,Yes,Yes
6,premier_league,2024,380,380,100.0%,Yes,Yes,Yes
7,premier_league,2025,380,134,35.3%,Yes,Yes,Yes
8,serie_a,2024,380,380,100.0%,Yes,Yes,Yes
9,serie_a,2025,380,165,43.4%,Yes,Yes,Yes


## 2. Detailed Match Data Quality

In [3]:
def analyze_matches(league: str, season: str) -> "dict | None":
    """Analyze match data quality for a specific season.

    Reads ``DATA_DIR/<league>/<season>/matches.parquet`` and supports two
    column layouts: the raw API layout (detected by a ``fixture.id`` column)
    and the cleaned layout (``fixture_id``, ``date``, ``ft_home``, ...).

    Args:
        league: League folder name under ``DATA_DIR``.
        season: Season folder name under the league folder.

    Returns:
        ``None`` when the matches file does not exist. Otherwise a dict that
        always contains ``total_matches``, ``columns``, ``format``,
        ``unique_fixtures`` and ``duplicates``. The remaining keys are added
        only when the columns they need are present:
        ``status_breakdown``/``completed``, ``date_range``,
        ``unique_teams``/``teams``, and the goal averages with
        ``missing_scores``. ``date_range`` is also omitted when the date
        column holds no valid date.
    """
    path = DATA_DIR / league / season / 'matches.parquet'
    if not path.exists():
        return None

    df = pd.read_parquet(path)

    is_raw_api = 'fixture.id' in df.columns

    if is_raw_api:
        id_col = 'fixture.id'
        date_col = 'fixture.date'
        home_col = 'teams.home.name'
        away_col = 'teams.away.name'
        home_goals = 'goals.home'
        away_goals = 'goals.away'
        status_col = 'fixture.status.short'
    else:
        id_col = 'fixture_id'
        date_col = 'date'
        home_col = 'home_team_name'
        away_col = 'away_team_name'
        home_goals = 'ft_home'
        away_goals = 'ft_away'
        status_col = 'status'

    has_ids = id_col in df.columns
    result = {
        'total_matches': len(df),
        'columns': len(df.columns),
        'format': 'Raw API' if is_raw_api else 'Clean',
        'unique_fixtures': df[id_col].nunique() if has_ids else 'N/A',
        'duplicates': len(df) - df[id_col].nunique() if has_ids else 0,
    }

    # Rows counted as completed; without a status column every row qualifies.
    if status_col in df.columns:
        df_completed = df[df[status_col].isin(['FT', 'AET', 'PEN'])]
        result['status_breakdown'] = df[status_col].value_counts().to_dict()
        result['completed'] = len(df_completed)
    else:
        df_completed = df

    if date_col in df.columns:
        # Drop missing dates first: min()/max() of an empty or all-null
        # column is NaT, and NaT cannot be formatted with strftime.
        dates = pd.to_datetime(df[date_col]).dropna()
        if not dates.empty:
            result['date_range'] = f"{dates.min().strftime('%Y-%m-%d')} to {dates.max().strftime('%Y-%m-%d')}"

    if home_col in df.columns and away_col in df.columns:
        # Null names are excluded so sorted() never compares NaN with str.
        all_teams = set(df[home_col].dropna().unique()) | set(df[away_col].dropna().unique())
        result['unique_teams'] = len(all_teams)
        result['teams'] = sorted(all_teams)

    # Goals are averaged over completed matches only.
    if home_goals in df.columns and away_goals in df.columns:
        result['avg_home_goals'] = df_completed[home_goals].mean()
        result['avg_away_goals'] = df_completed[away_goals].mean()
        result['avg_total_goals'] = (df_completed[home_goals] + df_completed[away_goals]).mean()
        result['missing_scores'] = df_completed[[home_goals, away_goals]].isna().any(axis=1).sum()

    return result

# Print the quality metrics of every season listed in the overview
for _, row in overview.iterrows():
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"{row['League']} - Season {row['Season']}")
    print(rule)

    analysis = analyze_matches(row['League'], row['Season'])
    if not analysis:
        # No matches file for this season: only the header is printed.
        continue

    print(f"Total matches: {analysis['total_matches']}")
    print(f"Completed: {analysis.get('completed', 'N/A')}")
    print(f"Format: {analysis['format']}")
    print(f"Duplicates: {analysis['duplicates']}")
    print(f"Date range: {analysis.get('date_range', 'N/A')}")
    print(f"Teams: {analysis.get('unique_teams', 'N/A')}")

    # Optional metrics are present only when their source columns exist.
    avg_goals = analysis.get('avg_total_goals')
    if 'avg_total_goals' in analysis:
        print(f"Avg goals/match: {avg_goals:.2f}")
    if 'status_breakdown' in analysis:
        print(f"Status: {analysis['status_breakdown']}")


la_liga - Season 2023
Total matches: 380
Completed: 380
Format: Raw API
Duplicates: 0
Date range: 2023-08-11 to 2024-05-26
Teams: 20
Avg goals/match: 2.64
Status: {'FT': 380}

la_liga - Season 2024
Total matches: 380
Completed: 380
Format: Raw API
Duplicates: 0
Date range: 2024-08-15 to 2025-05-25
Teams: 20
Avg goals/match: 2.62
Status: {'FT': 380}

premier_league - Season 2020
Total matches: 380
Completed: 380
Format: Raw API
Duplicates: 0
Date range: 2020-09-12 to 2021-05-23
Teams: 20
Avg goals/match: 2.69
Status: {'FT': 380}

premier_league - Season 2021
Total matches: 380
Completed: 380
Format: Raw API
Duplicates: 0
Date range: 2021-08-13 to 2022-05-22
Teams: 20
Avg goals/match: 2.82
Status: {'FT': 380}

premier_league - Season 2022
Total matches: 380
Completed: 380
Format: Raw API
Duplicates: 0
Date range: 2022-08-05 to 2023-05-28
Teams: 20
Avg goals/match: 2.85
Status: {'FT': 380}

premier_league - Season 2023
Total matches: 380
Completed: 380
Format: Raw API
Duplicates: 0
Date 

## 3. Supporting Data Quality (Events, Lineups, Player Stats)

In [11]:
def analyze_supporting_data(league: str, season: str) -> dict:
    """Report row count and column names of each supporting parquet file.

    Looks in ``DATA_DIR/<league>/<season>`` for the events, lineups and
    player-stats files. A file that does not exist gets no entry.

    Returns:
        Dict keyed by ``'events'``, ``'lineups'`` and ``'player_stats'``
        (in that order, present files only); each value is
        ``{'rows': int, 'columns': list}``.
    """
    season_dir = DATA_DIR / league / season

    # Result key -> file name, in the order the entries are reported.
    parquet_files = {
        'events': 'events.parquet',
        'lineups': 'lineups.parquet',
        'player_stats': 'player_stats.parquet',
    }

    summary = {}
    for key, filename in parquet_files.items():
        file_path = season_dir / filename
        if not file_path.exists():
            continue
        frame = pd.read_parquet(file_path)
        summary[key] = {
            'rows': len(frame),
            'columns': list(frame.columns),
        }

    return summary

# Inspect the supporting files of the first season in the overview (skipped when it has no rows)
if len(overview):
    sample = overview.iloc[0]
    sample_league, sample_season = sample['League'], sample['Season']
    print(f"Supporting data sample: {sample_league} {sample_season}")
    print('=' * 60)

    support = analyze_supporting_data(sample_league, sample_season)
    for name in support:
        info = support[name]
        print(f"\n{name.upper()}:")
        print(f"  Rows: {info['rows']}")
        print(f"  Columns: {info['columns']}")

Supporting data sample: la_liga 2023

EVENTS:
  Rows: 4710
  Columns: ['fixture_id', 'date', 'status', 'home_team', 'away_team', 'score_home', 'score_away', 'type', 'detail', 'comments', 'time.elapsed', 'time.extra', 'team.id', 'team.name', 'team.logo', 'player.id', 'player.name', 'assist.id', 'assist.name']

LINEUPS:
  Rows: 11713
  Columns: ['fixture_id', 'date', 'status', 'home_team', 'away_team', 'score_home', 'score_away', 'team_name', 'type', 'id', 'name', 'number', 'pos', 'grid']

PLAYER_STATS:
  Rows: 11667
  Columns: ['fixture_id', 'date', 'status', 'home_team', 'away_team', 'score_home', 'score_away', 'team_name', 'id', 'name', 'photo', 'offsides', 'games.minutes', 'games.number', 'games.position', 'games.rating', 'games.captain', 'games.substitute', 'shots.total', 'shots.on', 'goals.total', 'goals.conceded', 'goals.assists', 'goals.saves', 'passes.total', 'passes.key', 'passes.accuracy', 'tackles.total', 'tackles.blocks', 'tackles.interceptions', 'duels.total', 'duels.won', 

## 4. Sample Data Preview

In [12]:
def show_sample_matches(league: str, season: str, n: int = 5):
    """Display the last ``n`` rows of a season's matches file.

    Rows are shown in stored order (no sorting is applied here). Only the
    id, date, team, score and status columns that actually exist in the file
    are displayed. Prints a notice and returns when the file is missing.
    """
    path = DATA_DIR / league / season / 'matches.parquet'
    if not path.exists():
        print(f"No data for {league}/{season}")
        return

    df = pd.read_parquet(path)

    raw_api_cols = ['fixture.id', 'fixture.date', 'teams.home.name', 'teams.away.name',
                    'goals.home', 'goals.away', 'fixture.status.short']
    clean_cols = ['fixture_id', 'date', 'home_team_name', 'away_team_name',
                  'ft_home', 'ft_away', 'status']

    # The raw API layout is recognised by its dotted fixture id column.
    wanted = raw_api_cols if 'fixture.id' in df.columns else clean_cols
    present = list(filter(lambda col: col in df.columns, wanted))

    print(f"\nLast {n} matches from {league} {season}:")
    display(df[present].tail(n))

# Preview the last three stored rows of every season in the overview
for _, row in overview.iterrows():
    league_name, season_name = row['League'], row['Season']
    show_sample_matches(league_name, season_name, n=3)


Last 3 matches from la_liga 2023:


Unnamed: 0,fixture.id,fixture.date,teams.home.name,teams.away.name,goals.home,goals.away,fixture.status.short
377,1038331,2024-05-26T14:15:00+00:00,Las Palmas,Alaves,1,1,FT
378,1038323,2024-05-26T14:15:00+00:00,Celta Vigo,Valencia,2,2,FT
379,1038329,2024-05-26T19:00:00+00:00,Sevilla,Barcelona,1,2,FT



Last 3 matches from la_liga 2024:


Unnamed: 0,fixture.id,fixture.date,teams.home.name,teams.away.name,goals.home,goals.away,fixture.status.short
377,1208832,2025-05-25T12:00:00+00:00,Girona,Atletico Madrid,0,4,FT
378,1208835,2025-05-25T14:15:00+00:00,Villarreal,Sevilla,4,2,FT
379,1208827,2025-05-25T19:00:00+00:00,Athletic Club,Barcelona,0,3,FT



Last 3 matches from premier_league 2020:


Unnamed: 0,fixture.id,fixture.date,teams.home.name,teams.away.name,goals.home,goals.away,fixture.status.short
377,592873,2021-05-23T15:00:00+00:00,Sheffield Utd,Burnley,1,0,FT
378,592869,2021-05-23T15:00:00+00:00,Leeds,West Brom,3,1,FT
379,592867,2021-05-23T15:00:00+00:00,Aston Villa,Chelsea,2,1,FT



Last 3 matches from premier_league 2021:


Unnamed: 0,fixture.id,fixture.date,teams.home.name,teams.away.name,goals.home,goals.away,fixture.status.short
377,710931,2022-05-22T15:00:00+00:00,Crystal Palace,Manchester United,1,0,FT
378,710927,2022-05-22T15:00:00+00:00,Brentford,Leeds,1,2,FT
379,710935,2022-05-22T15:00:00+00:00,Norwich,Tottenham,0,5,FT



Last 3 matches from premier_league 2022:


Unnamed: 0,fixture.id,fixture.date,teams.home.name,teams.away.name,goals.home,goals.away,fixture.status.short
377,868318,2023-05-28T15:30:00+00:00,Brentford,Manchester City,1,0,FT
378,868322,2023-05-28T15:30:00+00:00,Leeds,Tottenham,1,4,FT
379,868317,2023-05-28T15:30:00+00:00,Aston Villa,Brighton,2,1,FT



Last 3 matches from premier_league 2023:


Unnamed: 0,fixture.id,fixture.date,teams.home.name,teams.away.name,goals.home,goals.away,fixture.status.short
377,1035545,2024-05-19T15:00:00+00:00,Brentford,Newcastle,2,4,FT
378,1035553,2024-05-19T15:00:00+00:00,Sheffield Utd,Tottenham,0,3,FT
379,1035551,2024-05-19T15:00:00+00:00,Luton,Fulham,2,4,FT



Last 3 matches from premier_league 2024:


Unnamed: 0,fixture.id,fixture.date,teams.home.name,teams.away.name,goals.home,goals.away,fixture.status.short
377,1208401,2025-05-25T15:00:00+00:00,Tottenham,Brighton,1,4,FT
378,1208395,2025-05-25T15:00:00+00:00,Ipswich,West Ham,1,3,FT
379,1208399,2025-05-25T15:00:00+00:00,Nottingham Forest,Chelsea,0,1,FT



Last 3 matches from premier_league 2025:


Unnamed: 0,fixture.id,fixture.date,teams.home.name,teams.away.name,goals.home,goals.away,fixture.status.short
377,1379341,2026-05-24T15:00:00+00:00,Crystal Palace,Arsenal,,,NS
378,1379345,2026-05-24T15:00:00+00:00,Nottingham Forest,Bournemouth,,,NS
379,1379346,2026-05-24T15:00:00+00:00,Sunderland,Chelsea,,,NS



Last 3 matches from serie_a 2024:


Unnamed: 0,fixture.id,fixture.date,teams.home.name,teams.away.name,goals.home,goals.away,fixture.status.short
377,1223972,2025-05-25T18:45:00+00:00,Torino,AS Roma,0,2,FT
378,1223968,2025-05-25T18:45:00+00:00,Empoli,Verona,1,2,FT
379,1223974,2025-05-25T18:45:00+00:00,Venezia,Juventus,2,3,FT



Last 3 matches from serie_a 2025:


Unnamed: 0,fixture.id,fixture.date,teams.home.name,teams.away.name,goals.home,goals.away,fixture.status.short
377,1378235,2026-05-24T13:00:00+00:00,Cremonese,Como,,,NS
378,1378241,2026-05-24T13:00:00+00:00,Parma,Sassuolo,,,NS
379,1378238,2026-05-24T13:00:00+00:00,Lecce,Genoa,,,NS


## 5. Data Quality Summary

In [13]:
def generate_quality_report(
    overview_df: pd.DataFrame,
    current_season: str = '2025',
    expected_matches: "int | None" = None,
) -> None:
    """Print totals and a list of potential issues for the overview table.

    Args:
        overview_df: Frame with the columns ``League``, ``Season``,
            ``Total Matches``, ``Completed``, ``Events``, ``Lineups`` and
            ``Stats`` (the last three holding ``'Yes'``/``'No'``).
        current_season: Season that is still being played. It is exempt from
            the "too few completed matches" check; missing supporting files
            are still reported for it.
        expected_matches: Fixtures in a full season. Defaults to the
            module-level ``EXPECTED_MATCHES_PER_SEASON``.

    A season is flagged when fewer than 90% of ``expected_matches`` are
    completed, or when any supporting file is marked ``'No'``.
    """
    if expected_matches is None:
        expected_matches = EXPECTED_MATCHES_PER_SEASON

    rule = "=" * 60
    print("\n" + rule)
    print("DATA QUALITY SUMMARY")
    print(rule)

    total_matches = overview_df['Total Matches'].sum()
    total_completed = overview_df['Completed'].sum()

    print(f"\nTotal seasons: {len(overview_df)}")
    print(f"Total matches: {total_matches:,}")
    print(f"Total completed: {total_completed:,}")
    # An empty overview sums to 0; avoid dividing by it.
    if total_matches:
        print(f"Overall completion: {total_completed/total_matches*100:.1f}%")
    else:
        print("Overall completion: N/A")

    min_completed = expected_matches * 0.9
    # Overview column -> wording used in the issue message.
    supporting_columns = (
        ('Events', 'events'),
        ('Lineups', 'lineups'),
        ('Stats', 'player stats'),
    )

    issues = []
    for _, row in overview_df.iterrows():
        season_label = f"{row['League']} {row['Season']}"

        # The season in progress is expected to be incomplete, so it is not flagged.
        is_current = str(row['Season']) == str(current_season)
        if row['Completed'] < min_completed and not is_current:
            issues.append(f"{season_label}: Only {row['Completed']} completed matches")

        for column, label in supporting_columns:
            if row[column] == 'No':
                issues.append(f"{season_label}: Missing {label} data")

    if issues:
        print(f"\nPotential issues ({len(issues)}):")
        for issue in issues:
            print(f"  - {issue}")
    else:
        print("\nNo issues detected!")

    print("\n" + rule)

# Print the summary for the `overview` table built in section 1.
generate_quality_report(overview)


DATA QUALITY SUMMARY

Total seasons: 10
Total matches: 3,800
Total completed: 3,339
Overall completion: 87.9%

No issues detected!



## 6. Test Feature Engineering Pipeline

In [19]:
# Put the parent of the current working directory at the front of the import
# path so the `src` package imported below can be resolved.
# NOTE(review): assumes the kernel's working directory is the notebook folder — confirm.
import sys
sys.path.insert(0, str(Path('..').absolute()))

# Project-local cleaner and feature engineers exercised by the check below.
from src.features.cleaners import MatchDataCleaner
from src.features.engineers import TeamFormFeatureEngineer, MatchOutcomeFeatureEngineer

def test_feature_engineering(league: str, season: str):
    """Run the cleaner and both feature engineers on one season's matches.

    Returns:
        ``(False, "Data not found")`` when the matches file is missing.
        ``(False, <exception text>)`` when any step raises.
        ``(True, info)`` otherwise, where ``info`` holds ``cleaned_rows``
        plus the column names of the form and outcome feature frames under
        ``form_features`` and ``outcome_features``.
    """
    matches_path = DATA_DIR / league / season / 'matches.parquet'
    if not matches_path.exists():
        return False, "Data not found"

    try:
        # Load the raw frame and clean it.
        raw_matches = pd.read_parquet(matches_path)
        cleaned = MatchDataCleaner().clean(raw_matches)

        # Both engineers receive the same input mapping.
        engineer_input = {'matches': cleaned}
        form_features = TeamFormFeatureEngineer(n_matches=5).create_features(engineer_input)
        outcome_features = MatchOutcomeFeatureEngineer().create_features(engineer_input)

        info = {
            'cleaned_rows': len(cleaned),
            'form_features': list(form_features.columns),
            'outcome_features': list(outcome_features.columns)
        }
    except Exception as exc:
        # Any failure is reported as text so the caller can keep going.
        return False, str(exc)

    return True, info

# Run the feature-engineering check on every season and print PASS/FAIL.
print("Testing feature engineering on each season...\n")
for _, row in overview.iterrows():
    success, result = test_feature_engineering(row['League'], row['Season'])
    status = 'PASS' if success else 'FAIL'
    print(f"{row['League']} {row['Season']}: {status}")
    if not success:
        # On failure `result` is the error text, not the info dict, so it
        # must not be indexed by key.
        print(f"  Error: {result}")
    else:
        print(f"outcome_features: {result['outcome_features']}")
        print(f"  Cleaned rows: {result['cleaned_rows']}")

Testing feature engineering on each season...

Matches: 380 (with full scores)
Created 380 team form features (last 5 matches)
Created target variables
la_liga 2023: PASS
outcome_features: ['fixture_id', 'match_result', 'home_win', 'draw', 'away_win', 'total_goals', 'goal_difference', 'gd_form_diff']
  Cleaned rows: 380
Matches: 380 (with full scores)
Created 380 team form features (last 5 matches)
Created target variables
la_liga 2024: PASS
outcome_features: ['fixture_id', 'match_result', 'home_win', 'draw', 'away_win', 'total_goals', 'goal_difference', 'gd_form_diff']
  Cleaned rows: 380
Matches: 380 (with full scores)
Created 380 team form features (last 5 matches)
Created target variables
premier_league 2020: PASS
outcome_features: ['fixture_id', 'match_result', 'home_win', 'draw', 'away_win', 'total_goals', 'goal_difference', 'gd_form_diff']
  Cleaned rows: 380
Matches: 380 (with full scores)
Created 380 team form features (last 5 matches)
Created target variables
premier_league 2

## 7. Compare with Hugging Face (Optional)

In [None]:
# Uncomment to download and compare with HF data
# This requires huggingface_hub to be installed

# from huggingface_hub import HfApi
# 
# api = HfApi()
# repo_id = "your-username/your-repo"  # Update with your repo
# 
# files = api.list_repo_files(repo_id)
# print(f"Files in HF repo: {len(files)}")
# for f in files:
#     print(f"  {f}")