# Data Verification Notebook

Verify the data collected from the API and uploaded to Hugging Face.

**Checks:**
1. Available leagues and seasons
2. Match counts and completeness
3. Data quality (missing values, duplicates)
4. Supporting data (events, lineups, player_stats)
5. Feature engineering readiness

In [None]:
import pandas as pd
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Root of the raw data tree; the cells below read it as <league>/<season>/*.parquet.
# The path is relative, so it depends on the notebook's working directory.
DATA_DIR = Path('../data/01-raw')
# Expected fixtures in a full season. The same value is used as the
# completeness denominator for every league directory found under DATA_DIR.
EXPECTED_MATCHES_PER_SEASON = 380  # Premier League

# Print the absolute path and whether it exists so a wrong working directory is easy to spot.
print(f'Data directory: {DATA_DIR.absolute()}')
print(f'Exists: {DATA_DIR.exists()}')

## 1. Overview: Available Leagues and Seasons

In [None]:
def get_data_overview(data_dir: Path) -> pd.DataFrame:
    """Summarise every league/season directory that contains match data.

    Walks ``data_dir/<league>/<season>/`` and, for each season directory
    holding a ``matches.parquet`` file, records the match counts and whether
    the ``events``, ``lineups`` and ``player_stats`` parquet files exist.

    Args:
        data_dir: Root directory whose sub-directories are leagues.

    Returns:
        One row per season with the columns ``League``, ``Season``,
        ``Total Matches``, ``Completed``, ``Completeness``, ``Events``,
        ``Lineups`` and ``Stats``. The columns are always present, even when
        no data is found or ``data_dir`` does not exist, so callers can
        index them without hitting a ``KeyError``.
    """
    columns = [
        'League', 'Season', 'Total Matches', 'Completed',
        'Completeness', 'Events', 'Lineups', 'Stats',
    ]
    # Status codes this notebook counts as a finished match.
    completed_statuses = ['FT', 'AET', 'PEN']
    status_col = 'fixture.status.short'
    rows = []

    # A missing root directory means "no data", not a crash.
    if not data_dir.is_dir():
        return pd.DataFrame(rows, columns=columns)

    for league_dir in sorted(data_dir.iterdir()):
        if not league_dir.is_dir():
            continue

        for season_dir in sorted(league_dir.iterdir()):
            if not season_dir.is_dir():
                continue

            matches_file = season_dir / 'matches.parquet'
            if not matches_file.exists():
                continue

            df = pd.read_parquet(matches_file)

            # Without a status column every row is treated as completed.
            if status_col in df.columns:
                completed = int(df[status_col].isin(completed_statuses).sum())
            else:
                completed = len(df)

            # Check supporting files
            has_events = (season_dir / 'events.parquet').exists()
            has_lineups = (season_dir / 'lineups.parquet').exists()
            has_stats = (season_dir / 'player_stats.parquet').exists()

            # NOTE(review): the same expected count is applied to every
            # league, so leagues with a different season length will show a
            # skewed percentage.
            rows.append({
                'League': league_dir.name,
                'Season': season_dir.name,
                'Total Matches': len(df),
                'Completed': completed,
                'Completeness': f'{completed/EXPECTED_MATCHES_PER_SEASON*100:.1f}%',
                'Events': 'Yes' if has_events else 'No',
                'Lineups': 'Yes' if has_lineups else 'No',
                'Stats': 'Yes' if has_stats else 'No',
            })

    # Passing explicit columns keeps the schema stable when ``rows`` is empty.
    return pd.DataFrame(rows, columns=columns)

# Build the season overview once; later cells reuse the `overview` table.
overview = get_data_overview(DATA_DIR)

# Headline numbers, followed by a blank line and the full table.
print(f'Total: {len(overview)} season(s) across {overview["League"].nunique()} league(s)')
print(f'Total matches: {overview["Total Matches"].sum():,}', end='\n\n')
display(overview)

## 2. Detailed Match Data Quality

In [None]:
def analyze_matches(league: str, season: str) -> "dict | None":
    """Analyze match data quality for a specific season.

    Reads ``DATA_DIR/<league>/<season>/matches.parquet`` and detects whether
    it uses the raw API column names (a ``fixture.id`` column is present) or
    the cleaned column names, then collects basic quality metrics.

    Args:
        league: League directory name under ``DATA_DIR``.
        season: Season directory name under the league directory.

    Returns:
        ``None`` when the matches file does not exist. Otherwise a dict that
        always contains ``total_matches``, ``columns``, ``format``,
        ``unique_fixtures`` and ``duplicates``. The following keys are added
        only when the columns they need are present:
        ``status_breakdown`` / ``completed``, ``date_range``,
        ``unique_teams`` / ``teams``, and the goal statistics
        (``avg_home_goals``, ``avg_away_goals``, ``avg_total_goals``,
        ``missing_scores``). ``date_range`` is also omitted when the date
        column holds no valid dates.
    """
    path = DATA_DIR / league / season / 'matches.parquet'
    if not path.exists():
        return None

    df = pd.read_parquet(path)

    # Status codes this notebook counts as a finished match.
    completed_statuses = ['FT', 'AET', 'PEN']

    is_raw_api = 'fixture.id' in df.columns

    if is_raw_api:
        id_col = 'fixture.id'
        date_col = 'fixture.date'
        home_col = 'teams.home.name'
        away_col = 'teams.away.name'
        home_goals = 'goals.home'
        away_goals = 'goals.away'
        status_col = 'fixture.status.short'
    else:
        id_col = 'fixture_id'
        date_col = 'date'
        home_col = 'home_team_name'
        away_col = 'away_team_name'
        home_goals = 'ft_home'
        away_goals = 'ft_away'
        status_col = 'status'

    has_id = id_col in df.columns
    unique_fixtures = df[id_col].nunique() if has_id else None

    result = {
        'total_matches': len(df),
        'columns': len(df.columns),
        'format': 'Raw API' if is_raw_api else 'Clean',
        'unique_fixtures': unique_fixtures if has_id else 'N/A',
        'duplicates': len(df) - unique_fixtures if has_id else 0,
    }

    # Compute the completed-match mask once and reuse it for the goal stats.
    if status_col in df.columns:
        completed_mask = df[status_col].isin(completed_statuses)
        result['status_breakdown'] = df[status_col].value_counts().to_dict()
        result['completed'] = int(completed_mask.sum())
        df_completed = df[completed_mask]
    else:
        df_completed = df

    if date_col in df.columns:
        # Drop missing dates first: min()/max() of an empty or all-null
        # column is NaT, and NaT.strftime raises ValueError.
        dates = pd.to_datetime(df[date_col]).dropna()
        if not dates.empty:
            result['date_range'] = f"{dates.min().strftime('%Y-%m-%d')} to {dates.max().strftime('%Y-%m-%d')}"

    if home_col in df.columns and away_col in df.columns:
        # Missing names are excluded so sorted() never compares NaN with str.
        all_teams = set(df[home_col].dropna()) | set(df[away_col].dropna())
        result['unique_teams'] = len(all_teams)
        result['teams'] = sorted(all_teams)

    # Goals (completed matches only when a status column is available)
    if home_goals in df.columns and away_goals in df.columns:
        result['avg_home_goals'] = df_completed[home_goals].mean()
        result['avg_away_goals'] = df_completed[away_goals].mean()
        result['avg_total_goals'] = (df_completed[home_goals] + df_completed[away_goals]).mean()
        result['missing_scores'] = int(df_completed[[home_goals, away_goals]].isna().any(axis=1).sum())

    return result

# Print a quality report for every season listed in the overview table.
separator = '=' * 60
for _, season_row in overview.iterrows():
    league_name, season_name = season_row['League'], season_row['Season']

    print(f"\n{separator}")
    print(f"{league_name} - Season {season_name}")
    print(separator)

    report = analyze_matches(league_name, season_name)
    if not report:
        # Nothing to print when the matches file is missing.
        continue

    print(f"Total matches: {report['total_matches']}")
    print(f"Completed: {report.get('completed', 'N/A')}")
    print(f"Format: {report['format']}")
    print(f"Duplicates: {report['duplicates']}")
    print(f"Date range: {report.get('date_range', 'N/A')}")
    print(f"Teams: {report.get('unique_teams', 'N/A')}")
    # Optional sections are only present when the source columns existed.
    if 'avg_total_goals' in report:
        print(f"Avg goals/match: {report['avg_total_goals']:.2f}")
    if 'status_breakdown' in report:
        print(f"Status: {report['status_breakdown']}")

## 3. Supporting Data Quality (Events, Lineups, Player Stats)

In [None]:
def analyze_supporting_data(league: str, season: str) -> dict:
    """Report row counts and column names for a season's supporting files.

    Looks in ``DATA_DIR/<league>/<season>/`` for the events, lineups and
    player-stats parquet files. Each file that exists contributes one entry
    (``'events'``, ``'lineups'`` or ``'player_stats'``) mapping to a dict
    with ``'rows'`` and ``'columns'``; files that are absent are skipped.
    """
    season_dir = DATA_DIR / league / season

    # (result key, file name) pairs, in the order they should appear.
    datasets = (
        ('events', 'events.parquet'),
        ('lineups', 'lineups.parquet'),
        ('player_stats', 'player_stats.parquet'),
    )

    summary = {}
    for key, file_name in datasets:
        parquet_file = season_dir / file_name
        if not parquet_file.exists():
            continue
        frame = pd.read_parquet(parquet_file)
        summary[key] = {
            'rows': len(frame),
            'columns': list(frame.columns),
        }

    return summary

# Inspect the supporting files of the first season in the overview, if any.
if len(overview):
    first_season = overview.iloc[0]
    league_name, season_name = first_season['League'], first_season['Season']

    print(f"Supporting data sample: {league_name} {season_name}")
    print('=' * 60)

    for dataset, details in analyze_supporting_data(league_name, season_name).items():
        print(f"\n{dataset.upper()}:")
        print(f"  Rows: {details['rows']}")
        print(f"  Columns: {details['columns']}")

## 4. Sample Data Preview

In [None]:
def show_sample_matches(league: str, season: str, n: int = 5):
    """Display the final ``n`` rows of a season's matches file.

    Prints a notice and returns when
    ``DATA_DIR/<league>/<season>/matches.parquet`` does not exist. Otherwise
    shows the id, date, team, score and status columns that are present,
    using the raw API names when a ``fixture.id`` column exists and the
    cleaned names otherwise. Rows are taken in file order.
    """
    matches_path = DATA_DIR / league / season / 'matches.parquet'
    if not matches_path.exists():
        print(f"No data for {league}/{season}")
        return

    matches = pd.read_parquet(matches_path)

    raw_api_cols = ['fixture.id', 'fixture.date', 'teams.home.name', 'teams.away.name',
                    'goals.home', 'goals.away', 'fixture.status.short']
    clean_cols = ['fixture_id', 'date', 'home_team_name', 'away_team_name',
                  'ft_home', 'ft_away', 'status']

    # Pick the column set matching the file's format, then keep what exists.
    wanted = raw_api_cols if 'fixture.id' in matches.columns else clean_cols
    present = [col for col in wanted if col in matches.columns]

    print(f"\nLast {n} matches from {league} {season}:")
    display(matches[present].tail(n))

# Preview three rows from every season in the overview.
for _, season_row in overview.iterrows():
    show_sample_matches(league=season_row['League'], season=season_row['Season'], n=3)

## 5. Data Quality Summary

In [None]:
def generate_quality_report(
    overview_df: pd.DataFrame,
    current_season: str = '2025',
    expected_matches: "int | None" = None,
) -> None:
    """Print an overall data quality report for the season overview.

    Args:
        overview_df: Table with the columns ``League``, ``Season``,
            ``Total Matches``, ``Completed``, ``Events``, ``Lineups`` and
            ``Stats`` (the last three holding ``'Yes'``/``'No'``).
        current_season: Season that is allowed to be incomplete; it is never
            flagged for a low completed-match count.
        expected_matches: Full-season match count used for the 90% threshold.
            Defaults to the module-level ``EXPECTED_MATCHES_PER_SEASON``.

    Prints totals, the overall completion rate and a list of potential
    issues: seasons below 90% of the expected completed matches, and seasons
    missing events, lineups or player stats. An empty overview produces a
    short "no data" report instead of raising.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("DATA QUALITY SUMMARY")
    print(rule)

    # An empty overview may have no columns at all, so stop before indexing.
    if overview_df.empty:
        print("\nTotal seasons: 0")
        print("No season data found - nothing to check.")
        print("\n" + rule)
        return

    if expected_matches is None:
        expected_matches = EXPECTED_MATCHES_PER_SEASON
    min_completed = expected_matches * 0.9

    total_matches = overview_df['Total Matches'].sum()
    total_completed = overview_df['Completed'].sum()

    print(f"\nTotal seasons: {len(overview_df)}")
    print(f"Total matches: {total_matches:,}")
    print(f"Total completed: {total_completed:,}")
    # Guard the ratio: every listed season could have an empty matches file.
    if total_matches:
        print(f"Overall completion: {total_completed/total_matches*100:.1f}%")
    else:
        print("Overall completion: N/A")

    # (overview column, label used in the message) for the supporting files.
    supporting = (
        ('Events', 'events'),
        ('Lineups', 'lineups'),
        ('Stats', 'player stats'),
    )

    # Check for issues
    issues = []
    for _, row in overview_df.iterrows():
        tag = f"{row['League']} {row['Season']}"

        # The current season is expected to be incomplete, so it is skipped.
        is_current = str(row['Season']) == str(current_season)
        if row['Completed'] < min_completed and not is_current:
            issues.append(f"{tag}: Only {row['Completed']} completed matches")

        for column, label in supporting:
            if row[column] == 'No':
                issues.append(f"{tag}: Missing {label} data")

    if issues:
        print(f"\nPotential issues ({len(issues)}):")
        for issue in issues:
            print(f"  - {issue}")
    else:
        print("\nNo issues detected!")

    print("\n" + rule)

# Print the quality summary for every season in the overview table.
generate_quality_report(overview)

## 6. Test Feature Engineering Pipeline

In [None]:
import sys
sys.path.insert(0, str(Path('..').absolute()))

from src.features.cleaners import MatchDataCleaner
from src.features.engineers import TeamFormFeatureEngineer, MatchOutcomeFeatureEngineer

def test_feature_engineering(league: str, season: str):
    """Run the cleaning and feature-engineering steps on one season.

    Returns a ``(success, detail)`` pair. On success ``detail`` is a dict
    with ``cleaned_rows``, ``form_features`` and ``outcome_features``; on
    failure it is a message: ``"Data not found"`` when the matches file is
    missing, otherwise the text of the exception that was raised.
    """
    matches_path = DATA_DIR / league / season / 'matches.parquet'
    if not matches_path.exists():
        return False, "Data not found"

    try:
        # Load the raw matches and clean them.
        raw_matches = pd.read_parquet(matches_path)
        cleaned = MatchDataCleaner().clean(raw_matches)

        # Both engineers receive the same {'matches': ...} payload.
        data = {'matches': cleaned}
        form_features = TeamFormFeatureEngineer(n_matches=5).create_features(data)
        outcome_features = MatchOutcomeFeatureEngineer().create_features(data)

        summary = {
            'cleaned_rows': len(cleaned),
            'form_features': list(form_features.columns),
            'outcome_features': list(outcome_features.columns)
        }
    except Exception as exc:
        # Deliberately broad: any failure is reported as a FAIL for this
        # season rather than aborting the whole notebook run.
        return False, str(exc)

    return True, summary

print("Testing feature engineering on each season...\n")
for _, season_row in overview.iterrows():
    passed, outcome = test_feature_engineering(season_row['League'], season_row['Season'])
    verdict = 'PASS' if passed else 'FAIL'
    print(f"{season_row['League']} {season_row['Season']}: {verdict}")
    # On success `outcome` is the summary dict; on failure it is the message.
    if passed:
        print(f"  Cleaned rows: {outcome['cleaned_rows']}")
    else:
        print(f"  Error: {outcome}")

## 7. Compare with Hugging Face (Optional)

In [None]:
# Uncomment to list the files in the HF repo, as a starting point for
# comparing them with the local data. Requires huggingface_hub to be installed.

# from huggingface_hub import HfApi
# 
# api = HfApi()
# repo_id = "your-username/your-repo"  # Update with your repo
# 
# files = api.list_repo_files(repo_id)
# print(f"Files in HF repo: {len(files)}")
# for f in files:
#     print(f"  {f}")