In [None]:
# Premier League xG Data Collection (2015-2025)
# This notebook collects Expected Goals (xG) data from Understat for 10 seasons
# to match the timeframe of our Premier League matches dataset

%pip install understatapi pandas --quiet



In [1]:
import pandas as pd
import understatapi
import matplotlib.pyplot as plt
import numpy as np
import os
from pathlib import Path
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Output directory for processed datasets, relative to the notebook's working
# directory. It is created up front (parents included) so later cells can write to it.
PROCESSED_DATA_PATH = Path("../data/processed/")
PROCESSED_DATA_PATH.mkdir(exist_ok=True, parents=True)

print(
    "Libraries imported successfully!",
    f"Data will be saved to: {PROCESSED_DATA_PATH}",
    sep="\n",
)

# One Understat client for the whole notebook; the collection cell reuses it.
client = understatapi.UnderstatClient()
print("Understat client initialized!")

Libraries imported successfully!
Data will be saved to: ../data/processed
Understat client initialized!


In [2]:

def to_df(obj):
    """
    Coerce an API response of unknown shape into a pandas DataFrame.

    Dispatch rules:
      * DataFrame            -> returned as-is (same object, no copy)
      * dict                 -> if it has one of the container keys "data",
                                "matches", "fixtures", "results" or "items"
                                (checked in that order), recurse into that
                                value; otherwise flatten the dict itself with
                                ``pd.json_normalize``
      * empty list           -> empty DataFrame
      * list of DataFrames   -> concatenated with a fresh index
      * list of dicts        -> ``pd.json_normalize`` (nested keys become
                                dotted column names)
      * any other list       -> single-column frame named "value"
      * anything else        -> handed to the ``pd.DataFrame`` constructor

    For lists, only the first element is inspected to decide the branch.
    """
    if isinstance(obj, pd.DataFrame):
        return obj

    if isinstance(obj, dict):
        container_keys = ("data", "matches", "fixtures", "results", "items")
        # First container key present wins; None means "treat dict as a record".
        hit = next((k for k in container_keys if k in obj), None)
        if hit is None:
            return pd.json_normalize(obj)
        return to_df(obj[hit])

    if not isinstance(obj, list):
        # Last resort for non-list, non-dict inputs.
        return pd.DataFrame(obj)

    if not obj:
        return pd.DataFrame()

    first = obj[0]
    if isinstance(first, pd.DataFrame):
        return pd.concat(obj, ignore_index=True)
    if isinstance(first, dict):
        return pd.json_normalize(obj)
    # Plain values: wrap them in a one-column frame.
    return pd.DataFrame({"value": obj})

def standardize_team_names(df, team_columns=("Home", "Away"), mapping=None):
    """
    Return a copy of ``df`` with team names rewritten to the spellings used in
    the Premier League matches dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding team-name columns. It is not modified.
    team_columns : str or iterable of str, default ("Home", "Away")
        Columns to rewrite. Columns that are absent from ``df`` are skipped
        silently. The default keeps the original behaviour; passing e.g.
        ("HomeTeam", "AwayTeam") avoids renaming columns before and after the call.
    mapping : dict, optional
        Source-name -> target-name overrides. When omitted, the built-in
        mapping below is used.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with exact-match replacements applied to the
        requested columns; names not present in the mapping pass through.
    """
    team_name_mapping = {
        'Manchester Utd': 'Manchester United',
        'Manchester City': 'Manchester City',  # Already correct
        'Tottenham': 'Tottenham',  # Already correct  
        'Leicester': 'Leicester City',
        'Wolverhampton Wanderers': 'Wolverhampton Wanderers',  # Already correct
        'Brighton': 'Brighton & Hove Albion',
        'West Bromwich Albion': 'West Bromwich Albion',  # Already correct
        'Stoke City': 'Stoke City',  # Already correct
        'Swansea': 'Swansea City',
        'Hull City': 'Hull City',  # Already correct
        'Cardiff': 'Cardiff City',
        'Norwich': 'Norwich City',
        'Sheffield Utd': 'Sheffield Utd',  # Already correct
        'Newcastle': 'Newcastle United',
        'West Ham': 'West Ham United'
    }
    # NOTE(review): the notebook's recorded team list still contains 'Hull',
    # 'Stoke' and 'Sheffield United', so the 'Hull City', 'Stoke City' and
    # 'Sheffield Utd' keys above did not match anything in that run. Confirm
    # the spellings used by the matches dataset and extend the mapping.

    name_map = team_name_mapping if mapping is None else mapping

    # Accept a single column name as well as an iterable of names.
    if isinstance(team_columns, str):
        team_columns = (team_columns,)

    df_clean = df.copy()

    # Apply the mapping only to requested columns that actually exist.
    for column in team_columns:
        if column in df_clean.columns:
            df_clean[column] = df_clean[column].replace(name_map)

    return df_clean

# Seasons to pull, chosen to line up with the Premier League matches dataset.
# Understat names a season by its starting year ("2015" is the 2015-16 season),
# so the years 2015..2024 span 2015-16 through 2024-25.
seasons = list(map(str, range(2015, 2025)))
print(f"Collecting xG data for seasons: {seasons}")

# Fetch season by season. A failure (exception or empty result) is recorded in
# failed_seasons and the loop moves on, so one bad season does not abort the run.
print("Collecting xG data from Understat...")
frames, successful_seasons, failed_seasons = [], [], []

for s in seasons:
    try:
        season_span = f"{s}-{int(s)+1}"
        print(f"Fetching data for season {season_span}...")
        raw = client.league(league="EPL").get_match_data(season=s)
        df = to_df(raw)

        if df is not None and not df.empty:
            # Tag every row with its season before the frames are merged.
            df["season"] = s
            df["season_label"] = season_span  # readable "YYYY-YYYY" form
            frames.append(df)
            successful_seasons.append(s)
            print(f"  ✓ Collected {len(df)} matches for season {season_span}")
        else:
            print(f"  ⚠ No data found for season {s}")
            failed_seasons.append(s)

    except Exception as exc:
        print(f"  ✗ Error fetching season {s}: {str(exc)}")
        failed_seasons.append(s)

# Merge the per-season frames into one table and report what was collected.
if not frames:
    league_data = pd.DataFrame()
    print("❌ No data collected!")
else:
    league_data = pd.concat(frames, ignore_index=True)
    print("\n" + "=" * 60)
    print("DATA COLLECTION SUMMARY:")
    print(f"  Successful seasons: {len(successful_seasons)} ({successful_seasons})")
    print(f"  Failed seasons: {len(failed_seasons)} ({failed_seasons})")
    print(f"  Total matches collected: {len(league_data)}")
    kickoff_times = league_data['datetime']
    print(f"  Date range: {kickoff_times.min()} to {kickoff_times.max()}")

if league_data.empty:
    print("No data to show columns")
else:
    print(f"Available columns: {list(league_data.columns)}")

Collecting xG data for seasons: ['2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']
Collecting xG data from Understat...
Fetching data for season 2015-2016...
  ✓ Collected 380 matches for season 2015-2016
Fetching data for season 2016-2017...
  ✓ Collected 380 matches for season 2016-2017
Fetching data for season 2017-2018...
  ✓ Collected 380 matches for season 2017-2018
Fetching data for season 2018-2019...
  ✓ Collected 380 matches for season 2018-2019
Fetching data for season 2019-2020...
  ✓ Collected 380 matches for season 2019-2020
Fetching data for season 2020-2021...
  ✓ Collected 380 matches for season 2020-2021
Fetching data for season 2021-2022...
  ✓ Collected 380 matches for season 2021-2022
Fetching data for season 2022-2023...
  ✓ Collected 380 matches for season 2022-2023
Fetching data for season 2023-2024...
  ✓ Collected 380 matches for season 2023-2024
Fetching data for season 2024-2025...
  ✓ Collected 380 matches for season 2024-2025


In [3]:
# List every column name of the combined frame as a plain Python list.
print(list(league_data.columns))

['id', 'isResult', 'datetime', 'h.id', 'h.title', 'h.short_title', 'a.id', 'a.title', 'a.short_title', 'goals.h', 'goals.a', 'xG.h', 'xG.a', 'forecast.w', 'forecast.d', 'forecast.l', 'season', 'season_label']


In [5]:
# Data Processing and Cleaning
# Turns the combined Understat frame (`league_data`) into the export-ready
# `xg_data_final`: friendlier column names, numeric xG/goal columns,
# standardized team names, parsed dates and two derived xG metrics.
# When nothing was collected, `xg_data_final` is set to an empty DataFrame so
# the later cells can still test `.empty`.
if not league_data.empty:
    print("Processing and cleaning xG data...")
    
    # Rename the dotted json_normalize column names for clarity and consistency
    league_data = league_data.rename(columns={
        'h.title': 'HomeTeam',
        'h.short_title': 'Home_Abbreviation', 
        'a.title': 'AwayTeam',
        'a.short_title': 'Away_Abbreviation', 
        'goals.h': 'Home_Goals', 
        'goals.a': 'Away_Goals', 
        'xG.h': 'Home_xG', 
        'xG.a': 'Away_xG',
        'datetime': 'Date'
    })
    
    # Convert numeric columns to proper data types.
    # errors='coerce' turns unparseable values into NaN instead of raising;
    # the null counts printed below make any such coercion visible.
    numeric_columns = ['Home_xG', 'Away_xG', 'Home_Goals', 'Away_Goals']
    for col in numeric_columns:
        if col in league_data.columns:
            league_data[col] = pd.to_numeric(league_data[col], errors='coerce')
    
    # Check for any conversion issues
    print("Data types after conversion:")
    for col in numeric_columns:
        if col in league_data.columns:
            print(f"  {col}: {league_data[col].dtype} (nulls: {league_data[col].isnull().sum()})")
    
    # Standardize team names to match Premier League dataset.
    # standardize_team_names() looks for 'Home'/'Away' columns by default, so
    # the team columns are renamed for the call and renamed back afterwards.
    league_data = standardize_team_names(league_data.rename(columns={'HomeTeam': 'Home', 'AwayTeam': 'Away'}))
    league_data = league_data.rename(columns={'Home': 'HomeTeam', 'Away': 'AwayTeam'})
    
    # Convert date to proper datetime format
    league_data['Date'] = pd.to_datetime(league_data['Date'])
    
    # Add calculated columns (now that we have numeric data)
    league_data['Total_xG'] = league_data['Home_xG'] + league_data['Away_xG']
    league_data['xG_Difference'] = league_data['Home_xG'] - league_data['Away_xG']
    
    # Add season in format matching Premier League data (YYYY-YYYY)
    league_data['Season'] = league_data['season'].apply(lambda x: f"{x}-{int(x)+1}")
    
    # Select final columns for export
    export_columns = [
        'Date', 'id', 'Season', 'HomeTeam', 'AwayTeam', 
        'Home_xG', 'Away_xG', 'Total_xG', 'xG_Difference',
        'Home_Goals', 'Away_Goals', 'isResult',
        'forecast.w', 'forecast.d', 'forecast.l'
    ]
    
    # Filter to include only columns that exist
    available_export_columns = [col for col in export_columns if col in league_data.columns]
    xg_data_final = league_data[available_export_columns].copy()
    
    print("✓ Data processing completed!")
    print(f"  Final shape: {xg_data_final.shape}")
    print(f"  Date range: {xg_data_final['Date'].min().strftime('%Y-%m-%d')} to {xg_data_final['Date'].max().strftime('%Y-%m-%d')}")
    print(f"  Seasons: {sorted(xg_data_final['Season'].unique())}")
    print(f"  Unique home teams: {xg_data_final['HomeTeam'].nunique()}")
    print(f"  Unique away teams: {xg_data_final['AwayTeam'].nunique()}")
    
    # Preview the result. Jupyter only auto-renders a cell's last top-level
    # expression, so a bare `.head(10)` nested inside this `if` block shows
    # nothing; display() (provided by IPython in notebooks) renders it.
    display(xg_data_final.head(10))
else:
    print("❌ No data to process!")
    xg_data_final = pd.DataFrame()

Processing and cleaning xG data...
Data types after conversion:
  Home_xG: float64 (nulls: 0)
  Away_xG: float64 (nulls: 0)
  Home_Goals: int64 (nulls: 0)
  Away_Goals: int64 (nulls: 0)
✓ Data processing completed!
  Final shape: (3800, 15)
  Date range: 2015-08-08 to 2025-05-25
  Seasons: ['2015-2016', '2016-2017', '2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022', '2022-2023', '2023-2024', '2024-2025']
  Unique home teams: 34
  Unique away teams: 34


In [6]:
# Write the processed xG table to the processed-data directory, re-read the
# file as a sanity check, then print a short summary of what was exported.
if xg_data_final.empty:
    print("❌ No data to export!")
else:
    # Destination file inside PROCESSED_DATA_PATH
    output_filename = "PL_xG_10years_understat.csv"
    output_path = PROCESSED_DATA_PATH / output_filename

    print("Exporting xG data...")
    print(f"Output file: {output_path}")

    # index=False keeps the row index out of the CSV
    xg_data_final.to_csv(output_path, index=False)

    if not output_path.exists():
        print("✗ Export failed!")
    else:
        file_size = output_path.stat().st_size / (1024 ** 2)  # bytes -> MB
        print("✓ Export successful!")
        print(f"  File size: {file_size:.2f} MB")

        # Round-trip check: only (rows, columns) is compared, not cell values.
        verification_df = pd.read_csv(output_path)
        n_rows, n_cols = verification_df.shape
        print(f"  Verification: {n_rows} rows, {n_cols} columns")

        shapes_match = verification_df.shape == xg_data_final.shape
        print(
            "✓ Verification passed - file integrity confirmed"
            if shapes_match
            else "⚠ Verification failed - shape mismatch"
        )

    # Summary block; printed regardless of the outcome of the existence check.
    separator = "=" * 60
    print("\n" + separator)
    print("xG DATA EXPORT COMPLETE!")
    print(separator)
    print(f"Final dataset: {output_filename}")
    print(f"Location: {PROCESSED_DATA_PATH}")
    print(f"Shape: {xg_data_final.shape}")
    match_dates = xg_data_final['Date']
    print(f"Date range: {match_dates.min().strftime('%Y-%m-%d')} to {match_dates.max().strftime('%Y-%m-%d')}")
    print(f"Seasons covered: {len(xg_data_final['Season'].unique())}")
    print(f"Total matches: {len(xg_data_final)}")
    # A club counts once whether it appears as home side, away side, or both.
    teams_seen = set(xg_data_final['HomeTeam'].unique()).union(xg_data_final['AwayTeam'].unique())
    print(f"Unique teams: {len(teams_seen)}")


Exporting xG data...
Output file: ../data/processed/PL_xG_10years_understat.csv
✓ Export successful!
  File size: 0.48 MB
  Verification: 3800 rows, 15 columns
✓ Verification passed - file integrity confirmed

xG DATA EXPORT COMPLETE!
Final dataset: PL_xG_10years_understat.csv
Location: ../data/processed
Shape: (3800, 15)
Date range: 2015-08-08 to 2025-05-25
Seasons covered: 10
Total matches: 3800
Unique teams: 34


In [7]:
# Data Quality Analysis
# Read-only report on `xg_data_final`: missing values per column, xG value
# ranges, match counts per season, the set of teams, and a 10-row preview.
if not xg_data_final.empty:
    print("FINAL DATA QUALITY ANALYSIS")
    print("="*50)
    
    # Missing values analysis (per-column null counts)
    missing_values = xg_data_final.isnull().sum()
    total_missing = missing_values.sum()
    
    if total_missing > 0:
        print("⚠ Missing values found:")
        # Report only the columns that actually contain nulls
        missing_summary = missing_values[missing_values > 0]
        for col, count in missing_summary.items():
            percentage = (count / len(xg_data_final)) * 100
            print(f"  {col}: {count} ({percentage:.1f}%)")
    else:
        print("✓ No missing values in final dataset")
    
    # Data range validation (min/max are printed for manual inspection;
    # nothing is asserted here)
    print("\nData Range Validation:")
    print(f"  xG values: {xg_data_final['Home_xG'].min():.2f} to {xg_data_final['Home_xG'].max():.2f} (Home)")
    print(f"  xG values: {xg_data_final['Away_xG'].min():.2f} to {xg_data_final['Away_xG'].max():.2f} (Away)")
    print(f"  Total xG per match: {xg_data_final['Total_xG'].min():.2f} to {xg_data_final['Total_xG'].max():.2f}")
    
    # Season coverage (matches per season, in season order)
    print("\nSeason Coverage:")
    season_counts = xg_data_final['Season'].value_counts().sort_index()
    for season, count in season_counts.items():
        print(f"  {season}: {count} matches")
    
    # Team coverage (union of teams seen as home or away side)
    all_teams = set(xg_data_final['HomeTeam'].unique()) | set(xg_data_final['AwayTeam'].unique())
    print("\nTeam Coverage:")
    print(f"  Total unique teams: {len(all_teams)}")
    print(f"  Teams: {sorted(all_teams)}")
    
    print("\n✓ Data quality analysis completed!")
    
    # Display sample of final data.
    # Jupyter only auto-renders a cell's last top-level expression, so a bare
    # `.head(10)` nested inside this `if` block shows nothing; display()
    # (provided by IPython in notebooks) renders the table.
    print("\nSample of processed xG data:")
    display_columns = ['Date', 'Season', 'HomeTeam', 'AwayTeam', 'Home_xG', 'Away_xG', 'Total_xG']
    available_display_columns = [col for col in display_columns if col in xg_data_final.columns]
    display(xg_data_final[available_display_columns].head(10))
else:
    print("❌ No data available for quality analysis!")

FINAL DATA QUALITY ANALYSIS
✓ No missing values in final dataset

Data Range Validation:
  xG values: 0.00 to 6.67 (Home)
  xG values: 0.00 to 5.83 (Away)
  Total xG per match: 0.28 to 7.74

Season Coverage:
  2015-2016: 380 matches
  2016-2017: 380 matches
  2017-2018: 380 matches
  2018-2019: 380 matches
  2019-2020: 380 matches
  2020-2021: 380 matches
  2021-2022: 380 matches
  2022-2023: 380 matches
  2023-2024: 380 matches
  2024-2025: 380 matches

Team Coverage:
  Total unique teams: 34
  Teams: ['Arsenal', 'Aston Villa', 'Bournemouth', 'Brentford', 'Brighton & Hove Albion', 'Burnley', 'Cardiff City', 'Chelsea', 'Crystal Palace', 'Everton', 'Fulham', 'Huddersfield', 'Hull', 'Ipswich', 'Leeds', 'Leicester City', 'Liverpool', 'Luton', 'Manchester City', 'Manchester United', 'Middlesbrough', 'Newcastle United', 'Norwich City', 'Nottingham Forest', 'Sheffield United', 'Southampton', 'Stoke', 'Sunderland', 'Swansea City', 'Tottenham', 'Watford', 'West Bromwich Albion', 'West Ham Unit

In [8]:
# Closing banner: a fixed recap of what the notebook does and where the output
# goes. The text is static; it does not inspect the results of earlier cells.
closing_rule = "=" * 60
closing_lines = [
    closing_rule,
    "xG DATA COLLECTION AND EXPORT COMPLETED!",
    closing_rule,
    "",
    "This notebook has successfully:",
    "1. ✓ Collected xG data from Understat for Premier League seasons 2015-2025",
    "2. ✓ Processed and cleaned the data with standardized team names",
    "3. ✓ Added calculated xG metrics for enhanced analysis",
    "4. ✓ Exported final dataset to data/processed/PL_xG_10years_understat.csv",
    "",
    "The xG dataset is now ready for integration with:",
    "- Premier League matches data (PL_matches_10years_cleaned.csv)",
    "- Club ELO ratings data",
    "",
    "Next steps: Use this xG data in your DATA1002 analysis!",
    closing_rule,
]
print("\n".join(closing_lines))

xG DATA COLLECTION AND EXPORT COMPLETED!

This notebook has successfully:
1. ✓ Collected xG data from Understat for Premier League seasons 2015-2025
2. ✓ Processed and cleaned the data with standardized team names
3. ✓ Added calculated xG metrics for enhanced analysis
4. ✓ Exported final dataset to data/processed/PL_xG_10years_understat.csv

The xG dataset is now ready for integration with:
- Premier League matches data (PL_matches_10years_cleaned.csv)
- Club ELO ratings data

Next steps: Use this xG data in your DATA1002 analysis!
