# Draft Minnester Rating Model - Baseline

This notebook implements a simple baseline model to predict match outcomes and generate team ratings.

## Phases:
1. Data Wrangling & Structure
2. Simple Feature Extraction
3. Model Architecture (Linear/Logistic Regression)
4. Training & Validation (Walk-Forward)
5. Draft Minnester Rating Output



In [None]:
import pandas as pd
import numpy as np
import sqlite3
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# ML Libraries - Simple baseline models
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import log_loss, accuracy_score, brier_score_loss

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

print("Libraries loaded successfully")


## Phase 1: Data Wrangling & Structure


In [100]:
# Connect to databases
db_path = "football_database.sqlite"
connection = sqlite3.connect(db_path)

draft_db_path = "../draft_ministers.db"
conn_draft = sqlite3.connect(draft_db_path)

# Load all dataframes
df_match = pd.read_sql_query("SELECT * FROM Match", connection)
df_team = pd.read_sql_query("SELECT * FROM Team", connection)
df_team_attributes = pd.read_sql_query("SELECT * FROM Team_Attributes", connection)
df_player_attributes = pd.read_sql_query("SELECT * FROM Player_Attributes", connection)
df_player = pd.read_sql_query("SELECT * FROM Player", connection)
df_draft_teams = pd.read_sql_query("SELECT * FROM soccer_teams", conn_draft)

print(f"Loaded {len(df_match)} matches")
print(f"Loaded {len(df_draft_teams)} draft teams")

df_match.head()


Loaded 25979 matches
Loaded 20 draft teams


Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1,1,1,2008/2009,1,2008-08-17 00:00:00,492473,9987,9993,1,...,4.0,1.65,3.4,4.5,1.78,3.25,4.0,1.73,3.4,4.2
1,2,1,1,2008/2009,1,2008-08-16 00:00:00,492474,10000,9994,0,...,3.8,2.0,3.25,3.25,1.85,3.25,3.75,1.91,3.25,3.6
2,3,1,1,2008/2009,1,2008-08-16 00:00:00,492475,9984,8635,0,...,2.5,2.35,3.25,2.65,2.5,3.2,2.5,2.3,3.2,2.75
3,4,1,1,2008/2009,1,2008-08-17 00:00:00,492476,9991,9998,5,...,7.5,1.45,3.75,6.5,1.5,3.75,5.5,1.44,3.75,6.5
4,5,1,1,2008/2009,1,2008-08-16 00:00:00,492477,7947,9985,1,...,1.73,4.5,3.4,1.65,4.5,3.5,1.65,4.75,3.3,1.67


In [101]:
# Step 1: Filter matches to only include teams in df_draft_teams
# Normalize team names for matching
df_team['team_name_normalized'] = df_team['team_long_name'].str.lower().str.strip()
df_draft_teams['name_normalized'] = df_draft_teams['name'].str.lower().str.strip()

# Get team_api_ids for draft teams
draft_team_ids = df_team[df_team['team_name_normalized'].isin(df_draft_teams['name_normalized'])]['team_api_id'].unique()
print(f"Found {len(draft_team_ids)} draft teams in football database")

# Filter matches where BOTH teams are in draft teams
df_match_filtered = df_match[
    (df_match['home_team_api_id'].isin(draft_team_ids)) & 
    (df_match['away_team_api_id'].isin(draft_team_ids))
].copy()

print(f"Filtered to {len(df_match_filtered)} matches with both teams in draft league")
print(f"Original matches: {len(df_match)}, Filtered: {len(df_match_filtered)}")


Found 11 draft teams in football database
Filtered to 512 matches with both teams in draft league
Original matches: 25979, Filtered: 512


In [102]:
# Step 2: Date Alignment - Convert all date columns to datetime
df_match_filtered['date'] = pd.to_datetime(df_match_filtered['date'])
df_team_attributes['date'] = pd.to_datetime(df_team_attributes['date'])
df_player_attributes['date'] = pd.to_datetime(df_player_attributes['date'])

# Sort matches by date for proper time-series processing
df_match_filtered = df_match_filtered.sort_values('date').reset_index(drop=True)

print(f"Date range: {df_match_filtered['date'].min()} to {df_match_filtered['date'].max()}")
print(f"Total matches: {len(df_match_filtered)}")

df_match_filtered.head()


Date range: 2008-08-17 00:00:00 to 2016-05-17 00:00:00
Total matches: 512


Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1733,1729,1729,2008/2009,1,2008-08-17,489046,10252,8456,4,...,3.75,1.9,3.5,4.35,1.91,3.25,4.0,1.91,3.3,3.75
1,1844,1729,1729,2008/2009,2,2008-08-23,489057,9879,9825,1,...,1.57,6.5,3.9,1.55,6.0,3.5,1.6,6.0,3.6,1.57
2,1953,1729,1729,2008/2009,3,2008-08-31,489066,10252,8650,0,...,2.38,2.85,3.25,2.55,2.75,3.2,2.5,2.75,3.25,2.4
3,2052,1729,1729,2008/2009,4,2008-09-13,489075,8456,8455,1,...,1.91,4.4,3.35,1.9,3.75,3.4,1.95,4.2,3.3,1.83
4,2049,1729,1729,2008/2009,4,2008-09-13,489072,8650,10260,2,...,2.63,3.1,3.1,2.5,2.85,3.0,2.6,2.8,3.0,2.5


In [103]:
# Step 3: Target Variable Encoding
# Primary Target: Match Result (Home Win / Draw / Away Win)
# Secondary Target: Goal Difference

def get_match_result(row):
    """Determine match result from goals"""
    home_goals = row.get('home_team_goal', 0)
    away_goals = row.get('away_team_goal', 0)
    
    if pd.isna(home_goals) or pd.isna(away_goals):
        return None
    
    if home_goals > away_goals:
        return 'Home Win'
    elif home_goals < away_goals:
        return 'Away Win'
    else:
        return 'Draw'

df_match_filtered['result'] = df_match_filtered.apply(get_match_result, axis=1)
df_match_filtered['goal_difference'] = df_match_filtered['home_team_goal'] - df_match_filtered['away_team_goal']

# Remove matches with missing results
df_match_filtered = df_match_filtered[df_match_filtered['result'].notna()].copy()

print(f"Match results distribution:")
print(df_match_filtered['result'].value_counts())
print(f"\nGoal difference stats:")
print(df_match_filtered['goal_difference'].describe())


Match results distribution:
result
Home Win    223
Away Win    157
Draw        132
Name: count, dtype: int64

Goal difference stats:
count    512.000000
mean       0.333984
std        1.860017
min       -6.000000
25%       -1.000000
50%        0.000000
75%        1.000000
max        8.000000
Name: goal_difference, dtype: float64


## Phase 2: Simple Feature Extraction (No Complex Engineering)


In [104]:
# Simple feature extraction with MINIMAL historical context
# Add basic win rates and goal averages to give model team strength signal

print("Creating features with minimal historical context...")

# Create simple feature dataframe
df_features = df_match_filtered[['id', 'date', 'home_team_api_id', 'away_team_api_id', 
                                 'result', 'goal_difference', 'home_team_goal', 'away_team_goal']].copy()

# Add date features
df_features['year'] = pd.to_datetime(df_features['date']).dt.year
df_features['month'] = pd.to_datetime(df_features['date']).dt.month
df_features['day_of_week'] = pd.to_datetime(df_features['date']).dt.dayofweek

# Encode team IDs as categorical
df_features['home_team_id'] = df_features['home_team_api_id'].astype('category').cat.codes
df_features['away_team_id'] = df_features['away_team_api_id'].astype('category').cat.codes

# Add league ID if available
if 'league_id' in df_match_filtered.columns:
    df_features['league_id'] = df_match_filtered['league_id'].astype('category').cat.codes
else:
    df_features['league_id'] = 0

# Calculate MINIMAL historical features (win rates, goal averages)
# These are calculated from matches BEFORE the current match
print("Calculating historical team statistics...")

features_list = []
for idx, match in df_features.iterrows():
    if idx % 100 == 0:
        print(f"Processing match {idx}/{len(df_features)}")
    
    match_date = match['date']
    home_team = match['home_team_api_id']
    away_team = match['away_team_api_id']
    
    # Get historical matches for home team (BEFORE this match)
    home_prev = df_features[
        ((df_features['home_team_api_id'] == home_team) | (df_features['away_team_api_id'] == home_team)) &
        (df_features['date'] < match_date)
    ]
    
    # Get historical matches for away team (BEFORE this match)
    away_prev = df_features[
        ((df_features['home_team_api_id'] == away_team) | (df_features['away_team_api_id'] == away_team)) &
        (df_features['date'] < match_date)
    ]
    
    # Home team stats
    if len(home_prev) > 0:
        home_wins = 0
        home_goals_scored = []
        home_goals_conceded = []
        
        for _, prev_match in home_prev.iterrows():
            is_home = prev_match['home_team_api_id'] == home_team
            h_goals = prev_match['home_team_goal']
            a_goals = prev_match['away_team_goal']
            
            if is_home:
                home_goals_scored.append(h_goals)
                home_goals_conceded.append(a_goals)
                if h_goals > a_goals:
                    home_wins += 1
            else:
                home_goals_scored.append(a_goals)
                home_goals_conceded.append(h_goals)
                if a_goals > h_goals:
                    home_wins += 1
        
        home_win_rate = home_wins / len(home_prev) if len(home_prev) > 0 else 0.33
        home_avg_goals_scored = np.mean(home_goals_scored) if home_goals_scored else 1.0
        home_avg_goals_conceded = np.mean(home_goals_conceded) if home_goals_conceded else 1.0
    else:
        home_win_rate = 0.33  # Default to average
        home_avg_goals_scored = 1.0
        home_avg_goals_conceded = 1.0
    
    # Away team stats
    if len(away_prev) > 0:
        away_wins = 0
        away_goals_scored = []
        away_goals_conceded = []
        
        for _, prev_match in away_prev.iterrows():
            is_home = prev_match['home_team_api_id'] == away_team
            h_goals = prev_match['home_team_goal']
            a_goals = prev_match['away_team_goal']
            
            if is_home:
                away_goals_scored.append(h_goals)
                away_goals_conceded.append(a_goals)
                if h_goals > a_goals:
                    away_wins += 1
            else:
                away_goals_scored.append(a_goals)
                away_goals_conceded.append(h_goals)
                if a_goals > h_goals:
                    away_wins += 1
        
        away_win_rate = away_wins / len(away_prev) if len(away_prev) > 0 else 0.33
        away_avg_goals_scored = np.mean(away_goals_scored) if away_goals_scored else 1.0
        away_avg_goals_conceded = np.mean(away_goals_conceded) if away_goals_conceded else 1.0
    else:
        away_win_rate = 0.33
        away_avg_goals_scored = 1.0
        away_avg_goals_conceded = 1.0
    
    # Add to match row
    match_row = match.copy()
    match_row['home_win_rate'] = home_win_rate
    match_row['home_avg_goals_scored'] = home_avg_goals_scored
    match_row['home_avg_goals_conceded'] = home_avg_goals_conceded
    match_row['away_win_rate'] = away_win_rate
    match_row['away_avg_goals_scored'] = away_avg_goals_scored
    match_row['away_avg_goals_conceded'] = away_avg_goals_conceded
    
    features_list.append(match_row)

df_features = pd.DataFrame(features_list)

print(f"\nCreated {len(df_features)} feature rows")
print(f"Feature columns: {df_features.columns.tolist()}")
display(df_features.head())


Creating features with minimal historical context...
Calculating historical team statistics...
Processing match 0/512
Processing match 100/512
Processing match 200/512
Processing match 300/512
Processing match 400/512
Processing match 500/512

Created 512 feature rows
Feature columns: ['id', 'date', 'home_team_api_id', 'away_team_api_id', 'result', 'goal_difference', 'home_team_goal', 'away_team_goal', 'year', 'month', 'day_of_week', 'home_team_id', 'away_team_id', 'league_id', 'home_win_rate', 'home_avg_goals_scored', 'home_avg_goals_conceded', 'away_win_rate', 'away_avg_goals_scored', 'away_avg_goals_conceded']


Unnamed: 0,id,date,home_team_api_id,away_team_api_id,result,goal_difference,home_team_goal,away_team_goal,year,month,day_of_week,home_team_id,away_team_id,league_id,home_win_rate,home_avg_goals_scored,home_avg_goals_conceded,away_win_rate,away_avg_goals_scored,away_avg_goals_conceded
0,1733,2008-08-17,10252,8456,Home Win,2,4,2,2008,8,6,9,2,0,0.33,1.0,1.0,0.33,1.0,1.0
1,1844,2008-08-23,9879,9825,Home Win,1,1,0,2008,8,5,8,6,0,0.33,1.0,1.0,0.33,1.0,1.0
2,1953,2008-08-31,10252,8650,Draw,0,0,0,2008,8,6,9,3,0,1.0,4.0,2.0,0.33,1.0,1.0
3,2052,2008-09-13,8456,8455,Away Win,-2,1,3,2008,9,5,2,1,0,0.0,2.0,4.0,0.33,1.0,1.0
4,2049,2008-09-13,8650,10260,Home Win,1,2,1,2008,9,5,3,10,0,0.0,0.0,0.0,0.33,1.0,1.0


In [105]:
# Feature summary
print("Simple features created:")
print(f"  - Team IDs (home/away): {df_features['home_team_id'].nunique()} unique teams")
print(f"  - Date features: year, month, day_of_week")
print(f"  - League ID: {df_features['league_id'].nunique()} unique leagues")
print(f"\nTotal features: {len([c for c in df_features.columns if c not in ['id', 'date', 'home_team_api_id', 'away_team_api_id', 'result', 'goal_difference']])}")


Simple features created:
  - Team IDs (home/away): 11 unique teams
  - Date features: year, month, day_of_week
  - League ID: 1 unique leagues

Total features: 14


In [106]:
# Display feature statistics
print("\nFeature statistics:")
display(df_features.describe())



Feature statistics:


Unnamed: 0,id,date,home_team_api_id,away_team_api_id,goal_difference,home_team_goal,away_team_goal,year,month,day_of_week,home_team_id,away_team_id,league_id,home_win_rate,home_avg_goals_scored,home_avg_goals_conceded,away_win_rate,away_avg_goals_scored,away_avg_goals_conceded
count,512.0,512,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0,512.0
mean,3309.580078,2012-08-31 10:41:15,9267.460938,9267.460938,0.333984,1.5625,1.228516,2012.154297,6.652344,4.541016,5.195312,5.195312,0.0,0.360532,1.354615,1.367769,0.35517,1.335273,1.35218
min,1733.0,2008-08-17 00:00:00,8191.0,8191.0,-6.0,0.0,0.0,2008.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2490.75,2010-07-21 12:00:00,8456.0,8456.0,-1.0,1.0,0.0,2010.0,3.0,5.0,2.0,2.0,0.0,0.272254,1.086051,1.176471,0.26087,1.05362,1.18464
50%,3349.5,2012-11-10 12:00:00,8673.0,8673.0,0.0,1.0,1.0,2012.0,8.0,5.0,4.5,4.5,0.0,0.375,1.443872,1.30532,0.351602,1.411146,1.315074
75%,4103.0,2014-10-05 00:00:00,9879.0,9879.0,1.0,2.0,2.0,2014.0,10.0,6.0,8.0,8.0,0.0,0.454545,1.589025,1.529871,0.448276,1.598661,1.52
max,4762.0,2016-05-17 00:00:00,10260.0,10260.0,8.0,8.0,6.0,2016.0,12.0,6.0,10.0,10.0,0.0,1.0,4.0,4.0,1.0,2.0,2.5
std,892.283752,,772.231797,772.231797,1.860017,1.328596,1.232521,2.385052,3.894428,1.856463,3.187693,3.187693,0.0,0.138247,0.355076,0.392273,0.136661,0.333059,0.323466


In [107]:
# Check for missing values
print(f"\nMissing values: {df_features.isnull().sum().sum()}")
if df_features.isnull().sum().sum() > 0:
    print("Missing values per column:")
    print(df_features.isnull().sum()[df_features.isnull().sum() > 0])



Missing values: 0


In [108]:
# Ready for modeling
print("\n✓ Simple feature extraction complete!")
print("Features ready for baseline linear models")



✓ Simple feature extraction complete!
Features ready for baseline linear models


In [109]:
# Features are already created above - this cell is just for reference
pass


In [110]:
# Final feature preparation
print("Final feature columns for modeling:")
feature_cols = [
    'home_team_id', 'away_team_id', 
    'year', 'month', 'day_of_week', 'league_id',
    'home_win_rate', 'home_avg_goals_scored', 'home_avg_goals_conceded',
    'away_win_rate', 'away_avg_goals_scored', 'away_avg_goals_conceded'
]
print(feature_cols)
print(f"\nTotal features: {len(feature_cols)}")
print("\nThis gives the model basic team strength signals (win rates, goal averages)")


Final feature columns for modeling:
['home_team_id', 'away_team_id', 'year', 'month', 'day_of_week', 'league_id', 'home_win_rate', 'home_avg_goals_scored', 'home_avg_goals_conceded', 'away_win_rate', 'away_avg_goals_scored', 'away_avg_goals_conceded']

Total features: 12

This gives the model basic team strength signals (win rates, goal averages)


## Phase 3: Model Architecture (Simple Baseline)


In [111]:
# Prepare features and targets
# Use feature columns with historical context
feature_cols = [
    'home_team_id', 'away_team_id', 
    'year', 'month', 'day_of_week', 'league_id',
    'home_win_rate', 'home_avg_goals_scored', 'home_avg_goals_conceded',
    'away_win_rate', 'away_avg_goals_scored', 'away_avg_goals_conceded'
]

X = df_features[feature_cols].copy()
y_class = df_features['result'].copy()
y_reg = df_features['goal_difference'].copy()

# Encode target variable
le = LabelEncoder()
y_encoded = le.fit_transform(y_class)

print(f"Feature matrix shape: {X.shape}")
print(f"Features ({len(feature_cols)}): {feature_cols}")
print(f"Class distribution: {pd.Series(y_class).value_counts()}")
print(f"Classes: {le.classes_}")


Feature matrix shape: (512, 12)
Features (12): ['home_team_id', 'away_team_id', 'year', 'month', 'day_of_week', 'league_id', 'home_win_rate', 'home_avg_goals_scored', 'home_avg_goals_conceded', 'away_win_rate', 'away_avg_goals_scored', 'away_avg_goals_conceded']
Class distribution: result
Home Win    223
Away Win    157
Draw        132
Name: count, dtype: int64
Classes: ['Away Win' 'Draw' 'Home Win']


In [112]:
# Handle missing values and scale features for linear models
print(f"Missing values: {X.isnull().sum().sum()}")
if X.isnull().sum().sum() > 0:
    X = X.fillna(X.median())
    print("Filled missing values with median")
else:
    print("No missing values - ready for modeling!")

# Scale features for linear models (important for convergence)
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index
)

print(f"\nFeatures scaled for linear models")
print(f"Feature ranges: min={X_scaled.min().min():.2f}, max={X_scaled.max().max():.2f}")


Missing values: 0
No missing values - ready for modeling!

Features scaled for linear models
Feature ranges: min=-4.18, max=7.46


## Phase 4: Train Simple Baseline Model


In [113]:
# Simple baseline: Train on all data
print("Training baseline models on all data...")
print(f"Total samples: {len(X_scaled)}")


Training baseline models on all data...
Total samples: 512


In [114]:
# Train models on all data
# Model 1: Logistic Regression for Classification
print("Training Logistic Regression Classifier...")
final_clf = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)
final_clf.fit(X_scaled, y_encoded)
print("✓ Classifier trained")

# Model 2: Linear Regression for Goal Difference
print("Training Linear Regression...")
final_reg = LinearRegression()
final_reg.fit(X_scaled, y_reg)
print("✓ Regressor trained")

print("\nBaseline models trained successfully!")


Training Logistic Regression Classifier...
✓ Classifier trained
Training Linear Regression...
✓ Regressor trained

Baseline models trained successfully!


In [115]:
# Models are already trained above
print("Models ready for predictions")


Models ready for predictions


## Phase 5: Draft Minnester Rating (DMR) Output


In [116]:
def calculate_dmr(win_prob, draw_prob, loss_prob, goal_diff_pred, recent_form_bonus=0):
    """
    Calculate Draft Minnester Rating (DMR) score (0-100)
    
    Formula: DMR = (P(Win) × 100) + (P(Draw) × 33) + Recent Form Bonus
    """
    base_rating = (win_prob * 100) + (draw_prob * 33)
    
    # Add form bonus (capped at ±10 points)
    form_bonus = np.clip(recent_form_bonus, -10, 10)
    
    # Add goal difference adjustment (scaled)
    goal_diff_bonus = np.clip(goal_diff_pred * 5, -15, 15)
    
    dmr = base_rating + form_bonus + goal_diff_bonus
    
    # Ensure DMR is between 0 and 100
    dmr = np.clip(dmr, 0, 100)
    
    return dmr

print("DMR calculation function defined")


DMR calculation function defined


In [117]:
# Generate predictions and DMR for all matches
print("Generating predictions and DMR ratings...")

# Get predictions from final models (using scaled features)
y_pred_proba_all = final_clf.predict_proba(X_scaled)
y_pred_reg_all = final_reg.predict(X_scaled)

# Create results dataframe
df_predictions = df_features[['id', 'date', 'home_team_api_id', 'away_team_api_id', 
                              'result', 'goal_difference']].copy()
df_predictions.rename(columns={'id': 'match_id'}, inplace=True)

# Add predictions
df_predictions['pred_win_prob'] = y_pred_proba_all[:, le.transform(['Home Win'])[0]]
df_predictions['pred_draw_prob'] = y_pred_proba_all[:, le.transform(['Draw'])[0]]
df_predictions['pred_loss_prob'] = y_pred_proba_all[:, le.transform(['Away Win'])[0]]
df_predictions['pred_goal_diff'] = y_pred_reg_all

# Calculate DMR for home and away teams (no form bonus with simple model)
df_predictions['home_dmr'] = df_predictions.apply(
    lambda row: calculate_dmr(
        row['pred_win_prob'],
        row['pred_draw_prob'],
        row['pred_loss_prob'],
        row['pred_goal_diff'],
        recent_form_bonus=0  # No form bonus in simple model
    ), axis=1
)

# For away team, flip the probabilities
df_predictions['away_dmr'] = df_predictions.apply(
    lambda row: calculate_dmr(
        row['pred_loss_prob'],  # Away win = Home loss
        row['pred_draw_prob'],
        row['pred_win_prob'],   # Away loss = Home win
        -row['pred_goal_diff'],  # Flipped goal diff
        recent_form_bonus=0  # No form bonus in simple model
    ), axis=1
)

print(f"Generated predictions for {len(df_predictions)} matches")
display(df_predictions.head(10))


Generating predictions and DMR ratings...
Generated predictions for 512 matches


Unnamed: 0,match_id,date,home_team_api_id,away_team_api_id,result,goal_difference,pred_win_prob,pred_draw_prob,pred_loss_prob,pred_goal_diff,home_dmr,away_dmr
0,1733,2008-08-17,10252,8456,Home Win,2,0.519892,0.262649,0.217459,0.282718,62.070241,28.999706
1,1844,2008-08-23,9879,9825,Home Win,1,0.552692,0.261662,0.185647,0.549625,66.652116,24.451385
2,1953,2008-08-31,10252,8650,Draw,0,0.56953,0.382867,0.047603,1.740708,78.29113,8.691401
3,2052,2008-09-13,8456,8455,Away Win,-2,0.079662,0.267199,0.65314,-1.143551,11.065958,79.849291
4,2049,2008-09-13,8650,10260,Home Win,1,0.811568,0.065714,0.122719,1.416201,90.406313,7.359416
5,2068,2008-09-21,8455,10260,Draw,0,0.487875,0.488337,0.023788,2.191573,75.860493,7.536049
6,2072,2008-09-27,8668,8650,Away Win,-2,0.52578,0.280434,0.193786,0.421761,63.941123,26.524111
7,2088,2008-10-05,8455,10252,Home Win,2,0.504306,0.25636,0.239333,0.693408,62.35757,28.926176
8,2083,2008-10-05,8456,8650,Away Win,-1,0.058138,0.191064,0.750798,-1.562995,4.303937,89.199889
9,2090,2008-10-18,9825,8668,Home Win,2,0.663002,0.205098,0.1319,1.132137,78.729096,14.29756


In [118]:
# Create team rating summary
team_ratings = []

for team_id in draft_team_ids:
    team_matches = df_predictions[
        (df_predictions['home_team_api_id'] == team_id) | 
        (df_predictions['away_team_api_id'] == team_id)
    ]
    
    if len(team_matches) == 0:
        continue
    
    # Get team name
    team_name = df_team[df_team['team_api_id'] == team_id]['team_long_name'].values
    team_name = team_name[0] if len(team_name) > 0 else f"Team {team_id}"
    
    # Calculate average DMR (weighted by recency)
    home_matches = team_matches[team_matches['home_team_api_id'] == team_id]
    away_matches = team_matches[team_matches['away_team_api_id'] == team_id]
    
    home_dmrs = home_matches['home_dmr'].values if len(home_matches) > 0 else []
    away_dmrs = away_matches['away_dmr'].values if len(away_matches) > 0 else []
    
    all_dmrs = list(home_dmrs) + list(away_dmrs)
    
    if len(all_dmrs) > 0:
        # Recent matches weighted more (last 10 matches)
        recent_matches = team_matches.sort_values('date').tail(10)
        recent_dmrs = []
        for _, match in recent_matches.iterrows():
            if match['home_team_api_id'] == team_id:
                recent_dmrs.append(match['home_dmr'])
            else:
                recent_dmrs.append(match['away_dmr'])
        
        avg_dmr = np.mean(all_dmrs)
        recent_avg_dmr = np.mean(recent_dmrs) if recent_dmrs else avg_dmr
        
        team_ratings.append({
            'team_api_id': team_id,
            'team_name': team_name,
            'avg_dmr': avg_dmr,
            'recent_dmr': recent_avg_dmr,
            'total_matches': len(team_matches)
        })

df_team_ratings = pd.DataFrame(team_ratings).sort_values('recent_dmr', ascending=False)

print("Team Ratings (Draft Minnester Rating):")
print("="*60)
display(df_team_ratings)


Team Ratings (Draft Minnester Rating):


Unnamed: 0,team_api_id,team_name,avg_dmr,recent_dmr,total_matches
2,8650,Liverpool,60.078802,65.417506,120
4,8456,Manchester City,53.438472,61.15749,120
7,8455,Chelsea,55.823629,57.570183,120
0,10260,Manchester United,51.223032,56.773784,120
1,9825,Arsenal,48.571583,50.48345,120
5,8668,Everton,42.699252,49.521265,120
9,9826,Crystal Palace,25.275332,31.331097,48
6,9879,Fulham,33.585799,27.487048,88
3,10252,Aston Villa,33.666636,25.573217,120
10,8678,Bournemouth,22.739878,25.308023,16


In [119]:
# Function to predict a match outcome
def predict_match(home_team_id, away_team_id, match_date=None):
    """
    Predict outcome for a specific match using features with historical context
    
    Args:
        home_team_id: Home team API ID
        away_team_id: Away team API ID
        match_date: Match date (datetime), if None uses most recent date
    
    Returns:
        Dictionary with predictions and DMR
    """
    if match_date is None:
        match_date = df_features['date'].max()
    else:
        match_date = pd.to_datetime(match_date)
    
    # Get team names
    home_name = df_team[df_team['team_api_id'] == home_team_id]['team_long_name'].values
    home_name = home_name[0] if len(home_name) > 0 else f"Team {home_team_id}"
    away_name = df_team[df_team['team_api_id'] == away_team_id]['team_long_name'].values
    away_name = away_name[0] if len(away_name) > 0 else f"Team {away_team_id}"
    
    # Get team ID codes (must match encoding from df_features)
    home_team_code = df_features[df_features['home_team_api_id'] == home_team_id]['home_team_id'].values
    if len(home_team_code) == 0:
        home_team_code = df_features['home_team_id'].median()
    else:
        home_team_code = home_team_code[0]
    
    away_team_code = df_features[df_features['away_team_api_id'] == away_team_id]['away_team_id'].values
    if len(away_team_code) == 0:
        away_team_code = df_features['away_team_id'].median()
    else:
        away_team_code = away_team_code[0]
    
    # Calculate historical features (from matches BEFORE match_date)
    home_prev = df_features[
        ((df_features['home_team_api_id'] == home_team_id) | (df_features['away_team_api_id'] == home_team_id)) &
        (df_features['date'] < match_date)
    ]
    
    away_prev = df_features[
        ((df_features['home_team_api_id'] == away_team_id) | (df_features['away_team_api_id'] == away_team_id)) &
        (df_features['date'] < match_date)
    ]
    
    # Home team historical stats
    if len(home_prev) > 0:
        home_wins = sum(1 for _, m in home_prev.iterrows() 
                      if (m['home_team_api_id'] == home_team_id and m['home_team_goal'] > m['away_team_goal']) or
                         (m['away_team_api_id'] == home_team_id and m['away_team_goal'] > m['home_team_goal']))
        home_win_rate = home_wins / len(home_prev)
        home_goals_scored = [m['home_team_goal'] if m['home_team_api_id'] == home_team_id else m['away_team_goal'] 
                            for _, m in home_prev.iterrows()]
        home_goals_conceded = [m['away_team_goal'] if m['home_team_api_id'] == home_team_id else m['home_team_goal'] 
                              for _, m in home_prev.iterrows()]
        home_avg_goals_scored = np.mean(home_goals_scored)
        home_avg_goals_conceded = np.mean(home_goals_conceded)
    else:
        home_win_rate = 0.33
        home_avg_goals_scored = 1.0
        home_avg_goals_conceded = 1.0
    
    # Away team historical stats
    if len(away_prev) > 0:
        away_wins = sum(1 for _, m in away_prev.iterrows() 
                      if (m['home_team_api_id'] == away_team_id and m['home_team_goal'] > m['away_team_goal']) or
                         (m['away_team_api_id'] == away_team_id and m['away_team_goal'] > m['home_team_goal']))
        away_win_rate = away_wins / len(away_prev)
        away_goals_scored = [m['home_team_goal'] if m['home_team_api_id'] == away_team_id else m['away_team_goal'] 
                            for _, m in away_prev.iterrows()]
        away_goals_conceded = [m['away_team_goal'] if m['home_team_api_id'] == away_team_id else m['home_team_goal'] 
                              for _, m in away_prev.iterrows()]
        away_avg_goals_scored = np.mean(away_goals_scored)
        away_avg_goals_conceded = np.mean(away_goals_conceded)
    else:
        away_win_rate = 0.33
        away_avg_goals_scored = 1.0
        away_avg_goals_conceded = 1.0
    
    # Create feature row with historical context
    X_pred = pd.DataFrame({
        'home_team_id': [home_team_code],
        'away_team_id': [away_team_code],
        'year': [match_date.year],
        'month': [match_date.month],
        'day_of_week': [match_date.dayofweek],
        'league_id': [df_features['league_id'].iloc[0]],
        'home_win_rate': [home_win_rate],
        'home_avg_goals_scored': [home_avg_goals_scored],
        'home_avg_goals_conceded': [home_avg_goals_conceded],
        'away_win_rate': [away_win_rate],
        'away_avg_goals_scored': [away_avg_goals_scored],
        'away_avg_goals_conceded': [away_avg_goals_conceded]
    })
    
    # Scale features (using the same scaler from training)
    X_pred_scaled = pd.DataFrame(
        scaler.transform(X_pred),
        columns=X_pred.columns
    )
    
    # Predict
    proba = final_clf.predict_proba(X_pred_scaled)[0]
    goal_diff = final_reg.predict(X_pred_scaled)[0]
    
    win_prob = proba[le.transform(['Home Win'])[0]]
    draw_prob = proba[le.transform(['Draw'])[0]]
    loss_prob = proba[le.transform(['Away Win'])[0]]
    
    # Calculate DMR
    home_dmr = calculate_dmr(win_prob, draw_prob, loss_prob, goal_diff, 0)
    away_dmr = calculate_dmr(loss_prob, draw_prob, win_prob, -goal_diff, 0)
    
    return {
        'home_team': home_name,
        'away_team': away_name,
        'home_win_prob': win_prob,
        'draw_prob': draw_prob,
        'away_win_prob': loss_prob,
        'predicted_goal_diff': goal_diff,
        'home_dmr': home_dmr,
        'away_dmr': away_dmr,
        'prediction': 'Home Win' if win_prob > max(draw_prob, loss_prob) 
                     else 'Away Win' if loss_prob > draw_prob 
                     else 'Draw'
    }

print("Match prediction function defined")
print("\nExample prediction:")
if len(draft_team_ids) >= 2:
    example = predict_match(draft_team_ids[0], draft_team_ids[1])
    for key, value in example.items():
        print(f"  {key}: {value}")


Match prediction function defined

Example prediction:
  home_team: Manchester United
  away_team: Arsenal
  home_win_prob: 0.5163406111399255
  draw_prob: 0.26989691712421426
  away_win_prob: 0.2137624717358603
  predicted_goal_diff: 0.47890779148877444
  home_dmr: 62.935198336535485
  away_dmr: 27.88830648124123
  prediction: Home Win


In [120]:
# Feature coefficients (for Logistic Regression)
print("Feature Coefficients (Logistic Regression):")
# Get coefficients for each class
coef_df = pd.DataFrame(
    final_clf.coef_,
    columns=feature_cols,
    index=le.classes_
).T

# Calculate average absolute coefficient as importance measure
coef_df['avg_abs_coef'] = coef_df.abs().mean(axis=1)
coef_df = coef_df.sort_values('avg_abs_coef', ascending=False)

print("\nTop features by average absolute coefficient:")
display(coef_df[['avg_abs_coef']].head(10))


Feature Coefficients (Logistic Regression):

Top features by average absolute coefficient:


Unnamed: 0,avg_abs_coef
home_win_rate,0.31158
home_avg_goals_conceded,0.292105
home_avg_goals_scored,0.243829
away_avg_goals_scored,0.166026
day_of_week,0.092474
year,0.080479
home_team_id,0.055756
month,0.047398
away_team_id,0.036094
away_win_rate,0.032658


In [121]:
# Close database connections
connection.close()
conn_draft.close()
print("Database connections closed")


Database connections closed
