# 03 - Feature Engineering

This notebook computes rolling statistics, form metrics, and head-to-head features for backtesting.

## Features to Compute
1. Team form (last 5/10 games): W-D-L, points, goals
2. Rolling stats: possession, shots, corners averages
3. Home/away specific metrics
4. Head-to-head history
5. Match outcome labels (ground truth)



In [10]:
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime, timedelta
from typing import Optional, Dict, List, Tuple
import warnings
warnings.filterwarnings('ignore')

DATA_DIR = Path('../data')
PROCESSED_DIR = DATA_DIR / 'processed'

print("Loading cleaned data...")



Loading cleaned data...


In [11]:
# Load cleaned matches
matches = pd.read_parquet(PROCESSED_DIR / 'matches_base.parquet')
matches['date'] = pd.to_datetime(matches['date'])
matches = matches.sort_values('date').reset_index(drop=True)

print(f"âœ… Loaded {len(matches):,} matches")
print(f"   Date range: {matches['date'].min()} to {matches['date'].max()}")



âœ… Loaded 59,550 matches
   Date range: 2024-01-01 05:00:00+00:00 to 2025-12-15 20:45:00+00:00


## 1. Build Team Match History

Create a unified view of all matches from each team's perspective for rolling calculations.



In [12]:
def build_team_history(matches_df: pd.DataFrame) -> pd.DataFrame:
    """
    Build a unified team history DataFrame where each row represents
    one team's participation in a match.
    """
    # Home team perspective
    home = matches_df[['eventId', 'date', 'leagueId', 'tier',
                       'homeTeamId', 'awayTeamId', 
                       'homeTeamScore', 'awayTeamScore',
                       'home_possessionPct', 'home_totalShots', 'home_shotsOnTarget',
                       'home_wonCorners', 'home_foulsCommitted', 
                       'home_yellowCards', 'home_redCards']].copy()
    
    home.columns = ['eventId', 'date', 'leagueId', 'tier',
                    'teamId', 'opponentId',
                    'goals_for', 'goals_against',
                    'possession', 'shots', 'shots_on_target',
                    'corners', 'fouls', 'yellow_cards', 'red_cards']
    home['is_home'] = True
    
    # Away team perspective
    away = matches_df[['eventId', 'date', 'leagueId', 'tier',
                       'awayTeamId', 'homeTeamId',
                       'awayTeamScore', 'homeTeamScore',
                       'away_possessionPct', 'away_totalShots', 'away_shotsOnTarget',
                       'away_wonCorners', 'away_foulsCommitted',
                       'away_yellowCards', 'away_redCards']].copy()
    
    away.columns = ['eventId', 'date', 'leagueId', 'tier',
                    'teamId', 'opponentId',
                    'goals_for', 'goals_against',
                    'possession', 'shots', 'shots_on_target',
                    'corners', 'fouls', 'yellow_cards', 'red_cards']
    away['is_home'] = False
    
    # Combine and sort
    history = pd.concat([home, away], ignore_index=True)
    history = history.sort_values(['teamId', 'date']).reset_index(drop=True)
    
    # Compute result
    history['result'] = np.where(
        history['goals_for'] > history['goals_against'], 'W',
        np.where(history['goals_for'] < history['goals_against'], 'L', 'D')
    )
    history['points'] = history['result'].map({'W': 3, 'D': 1, 'L': 0})
    history['clean_sheet'] = (history['goals_against'] == 0).astype(int)
    history['failed_to_score'] = (history['goals_for'] == 0).astype(int)
    
    return history

team_history = build_team_history(matches)
print(f"âœ… Built team history: {len(team_history):,} records")
print(f"   Unique teams: {team_history['teamId'].nunique():,}")



âœ… Built team history: 119,100 records
   Unique teams: 4,049


## 2. Compute Rolling Form Features



In [13]:
def compute_rolling_form(team_history: pd.DataFrame, n_games: int = 5) -> pd.DataFrame:
    """
    Compute rolling form metrics for each team before each match.
    Uses shift(1) to ensure we only use data from BEFORE the current match.
    """
    # Sort by team and date
    df = team_history.sort_values(['teamId', 'date']).copy()
    
    # Group by team
    grouped = df.groupby('teamId')
    
    # Rolling metrics (shifted to exclude current match)
    df[f'form_wins_{n_games}'] = grouped['result'].transform(
        lambda x: (x == 'W').shift(1).rolling(n_games, min_periods=1).sum()
    )
    df[f'form_draws_{n_games}'] = grouped['result'].transform(
        lambda x: (x == 'D').shift(1).rolling(n_games, min_periods=1).sum()
    )
    df[f'form_losses_{n_games}'] = grouped['result'].transform(
        lambda x: (x == 'L').shift(1).rolling(n_games, min_periods=1).sum()
    )
    df[f'form_points_{n_games}'] = grouped['points'].transform(
        lambda x: x.shift(1).rolling(n_games, min_periods=1).sum()
    )
    df[f'form_goals_scored_{n_games}'] = grouped['goals_for'].transform(
        lambda x: x.shift(1).rolling(n_games, min_periods=1).mean()
    )
    df[f'form_goals_conceded_{n_games}'] = grouped['goals_against'].transform(
        lambda x: x.shift(1).rolling(n_games, min_periods=1).mean()
    )
    df[f'form_clean_sheets_{n_games}'] = grouped['clean_sheet'].transform(
        lambda x: x.shift(1).rolling(n_games, min_periods=1).sum()
    )
    
    return df

# Compute form for last 5 and 10 games
team_history = compute_rolling_form(team_history, n_games=5)
team_history = compute_rolling_form(team_history, n_games=10)

print(f"âœ… Computed rolling form (5 & 10 games)")



âœ… Computed rolling form (5 & 10 games)


## 3. Compute Rolling Statistics



In [14]:
def compute_rolling_stats(team_history: pd.DataFrame, n_games: int = 5) -> pd.DataFrame:
    """Compute rolling averages for match statistics."""
    df = team_history.copy()
    grouped = df.groupby('teamId')
    
    stats_cols = ['possession', 'shots', 'shots_on_target', 'corners', 'fouls']
    
    for col in stats_cols:
        df[f'{col}_avg_{n_games}'] = grouped[col].transform(
            lambda x: x.shift(1).rolling(n_games, min_periods=1).mean()
        )
    
    return df

team_history = compute_rolling_stats(team_history, n_games=5)
print(f"âœ… Computed rolling stats (5 games)")



âœ… Computed rolling stats (5 games)


## 4. Compute Home/Away Specific Form



In [15]:
def compute_venue_specific_form(team_history: pd.DataFrame, n_games: int = 5) -> pd.DataFrame:
    """Compute form metrics specific to home or away matches."""
    df = team_history.copy()
    
    # Home-specific form
    home_df = df[df['is_home']].copy()
    home_grouped = home_df.groupby('teamId')
    
    home_df[f'home_form_wins_{n_games}'] = home_grouped['result'].transform(
        lambda x: (x == 'W').shift(1).rolling(n_games, min_periods=1).sum()
    )
    home_df[f'home_form_goals_{n_games}'] = home_grouped['goals_for'].transform(
        lambda x: x.shift(1).rolling(n_games, min_periods=1).mean()
    )
    
    # Away-specific form
    away_df = df[~df['is_home']].copy()
    away_grouped = away_df.groupby('teamId')
    
    away_df[f'away_form_wins_{n_games}'] = away_grouped['result'].transform(
        lambda x: (x == 'W').shift(1).rolling(n_games, min_periods=1).sum()
    )
    away_df[f'away_form_goals_{n_games}'] = away_grouped['goals_for'].transform(
        lambda x: x.shift(1).rolling(n_games, min_periods=1).mean()
    )
    
    # Merge back
    home_cols = [f'home_form_wins_{n_games}', f'home_form_goals_{n_games}']
    away_cols = [f'away_form_wins_{n_games}', f'away_form_goals_{n_games}']
    
    df = df.merge(
        home_df[['eventId', 'teamId'] + home_cols],
        on=['eventId', 'teamId'],
        how='left'
    )
    df = df.merge(
        away_df[['eventId', 'teamId'] + away_cols],
        on=['eventId', 'teamId'],
        how='left'
    )
    
    return df

team_history = compute_venue_specific_form(team_history, n_games=5)
print(f"âœ… Computed home/away specific form")




âœ… Computed home/away specific form


## 5. Compute Head-to-Head Features



In [16]:
def compute_h2h_features(matches_df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute head-to-head history between teams for each match.
    Only uses matches BEFORE the current match date.
    """
    df = matches_df.copy()
    
    # Create H2H lookup
    h2h_records = []
    
    for idx, row in df.iterrows():
        home_id = row['homeTeamId']
        away_id = row['awayTeamId']
        match_date = row['date']
        
        # Find previous meetings
        prev_meetings = df[
            (df['date'] < match_date) &
            (
                ((df['homeTeamId'] == home_id) & (df['awayTeamId'] == away_id)) |
                ((df['homeTeamId'] == away_id) & (df['awayTeamId'] == home_id))
            )
        ]
        
        if len(prev_meetings) == 0:
            h2h_records.append({
                'eventId': row['eventId'],
                'h2h_matches': 0,
                'h2h_home_wins': 0,
                'h2h_away_wins': 0,
                'h2h_draws': 0,
                'h2h_avg_goals': np.nan
            })
        else:
            # Count results from home team's perspective
            home_wins = 0
            away_wins = 0
            draws = 0
            total_goals = 0
            
            for _, m in prev_meetings.iterrows():
                if m['homeTeamId'] == home_id:
                    if m['homeTeamScore'] > m['awayTeamScore']:
                        home_wins += 1
                    elif m['homeTeamScore'] < m['awayTeamScore']:
                        away_wins += 1
                    else:
                        draws += 1
                else:
                    if m['awayTeamScore'] > m['homeTeamScore']:
                        home_wins += 1
                    elif m['awayTeamScore'] < m['homeTeamScore']:
                        away_wins += 1
                    else:
                        draws += 1
                total_goals += m['homeTeamScore'] + m['awayTeamScore']
            
            h2h_records.append({
                'eventId': row['eventId'],
                'h2h_matches': len(prev_meetings),
                'h2h_home_wins': home_wins,
                'h2h_away_wins': away_wins,
                'h2h_draws': draws,
                'h2h_avg_goals': total_goals / len(prev_meetings)
            })
    
    h2h_df = pd.DataFrame(h2h_records)
    return df.merge(h2h_df, on='eventId', how='left')

# Note: H2H computation is slow for large datasets
# For production, pre-compute and cache
print("Computing H2H features (this may take a while for large datasets)...")

# For efficiency, only compute H2H for Tier 1 leagues
tier1_matches = matches[matches['tier'] == 1].copy()
if len(tier1_matches) > 0:
    tier1_with_h2h = compute_h2h_features(tier1_matches.head(1000))  # Sample for demo
    print(f"âœ… Computed H2H for {len(tier1_with_h2h)} Tier 1 matches (sample)")



Computing H2H features (this may take a while for large datasets)...
âœ… Computed H2H for 1000 Tier 1 matches (sample)


## 6. Compute Match Outcome Labels (Ground Truth)



In [17]:
def compute_outcome_labels(matches_df: pd.DataFrame) -> pd.DataFrame:
    """Compute match outcome labels for backtesting evaluation."""
    df = matches_df.copy()
    
    # Result (1X2)
    df['result'] = np.where(
        df['homeTeamScore'] > df['awayTeamScore'], 'H',
        np.where(df['homeTeamScore'] < df['awayTeamScore'], 'A', 'D')
    )
    
    # Total goals
    df['total_goals'] = df['homeTeamScore'] + df['awayTeamScore']
    
    # Over/Under
    df['over_0_5'] = (df['total_goals'] > 0.5).astype(int)
    df['over_1_5'] = (df['total_goals'] > 1.5).astype(int)
    df['over_2_5'] = (df['total_goals'] > 2.5).astype(int)
    df['over_3_5'] = (df['total_goals'] > 3.5).astype(int)
    
    # Both Teams to Score
    df['btts'] = ((df['homeTeamScore'] > 0) & (df['awayTeamScore'] > 0)).astype(int)
    
    # Clean sheets
    df['home_clean_sheet'] = (df['awayTeamScore'] == 0).astype(int)
    df['away_clean_sheet'] = (df['homeTeamScore'] == 0).astype(int)
    
    return df

matches = compute_outcome_labels(matches)
print(f"âœ… Computed outcome labels")
print(f"\nðŸ“Š Result Distribution:")
print(matches['result'].value_counts(normalize=True).round(3))
print(f"\nðŸ“Š Over 2.5 Goals: {matches['over_2_5'].mean()*100:.1f}%")
print(f"ðŸ“Š BTTS: {matches['btts'].mean()*100:.1f}%")



âœ… Computed outcome labels

ðŸ“Š Result Distribution:
result
H    0.452
A    0.299
D    0.249
Name: proportion, dtype: float64

ðŸ“Š Over 2.5 Goals: 49.6%
ðŸ“Š BTTS: 49.2%


## 7. Merge Features Back to Matches



In [18]:
def merge_team_features_to_matches(matches_df: pd.DataFrame, 
                                    team_history: pd.DataFrame) -> pd.DataFrame:
    """Merge computed team features back to the matches DataFrame."""
    
    # Feature columns to merge
    feature_cols = [col for col in team_history.columns if 'form_' in col or '_avg_' in col]
    
    # Home team features
    home_features = team_history[team_history['is_home']][['eventId', 'teamId'] + feature_cols].copy()
    home_features.columns = ['eventId', 'homeTeamId'] + [f'home_{col}' for col in feature_cols]
    
    # Away team features
    away_features = team_history[~team_history['is_home']][['eventId', 'teamId'] + feature_cols].copy()
    away_features.columns = ['eventId', 'awayTeamId'] + [f'away_{col}' for col in feature_cols]
    
    # Merge
    df = matches_df.merge(home_features, on=['eventId', 'homeTeamId'], how='left')
    df = matches_df.merge(away_features, on=['eventId', 'awayTeamId'], how='left')
    
    return df

# Merge features
matches_enriched = merge_team_features_to_matches(matches, team_history)
print(f"âœ… Merged team features to matches")
print(f"   Total columns: {len(matches_enriched.columns)}")



âœ… Merged team features to matches
   Total columns: 71


## 8. Save Enriched Data



In [19]:
# Save enriched matches
output_path = PROCESSED_DIR / 'matches_enriched.parquet'
matches_enriched.to_parquet(output_path, index=False)
print(f"âœ… Saved enriched matches to {output_path}")
print(f"   Shape: {matches_enriched.shape}")
print(f"   Size: {output_path.stat().st_size / 1024 / 1024:.2f} MB")

# Save team history for reference
history_path = PROCESSED_DIR / 'team_history.parquet'
team_history.to_parquet(history_path, index=False)
print(f"âœ… Saved team history to {history_path}")



âœ… Saved enriched matches to ../data/processed/matches_enriched.parquet
   Shape: (83184, 71)
   Size: 3.71 MB
âœ… Saved team history to ../data/processed/team_history.parquet


## Summary

Features computed:
- **Form (5 & 10 games)**: wins, draws, losses, points, goals scored/conceded
- **Rolling stats (5 games)**: possession, shots, corners averages
- **Home/Away specific**: venue-specific form metrics
- **Outcomes**: result (1X2), over/under, BTTS, clean sheets

Next: `04_data_export.ipynb` for final validation and PostgreSQL export

