# Soccer Forecasting Project

## Setup

### Libraries

In [76]:
import pandas as pd
import os
from dotenv import load_dotenv
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, log_loss, accuracy_score
from datetime import date
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier
# from eval import time_based_kfold

In [2]:
load_dotenv()

True

### Load Data

In [3]:
os.environ.get("SOCCER_DATA_PATH")

'/Users/heshamnawaz/Desktop/Projects/soccer-forecasting-project/Match-Data'

In [4]:
# Resolve the data directory from the SOCCER_DATA_PATH environment variable.
# There is no built-in default: the variable must be set (e.g. via the .env
# file picked up by load_dotenv()).
data_path = os.environ.get("SOCCER_DATA_PATH")

# Print the data path to debug
print(f"Data path: {data_path}")

# Fail early with an actionable message; otherwise os.path.join(None, ...)
# raises an opaque TypeError a few lines further down.
if not data_path:
    raise RuntimeError(
        "SOCCER_DATA_PATH is not set. Define it in the environment or in the "
        ".env file so it points at the folder containing EloRatings.csv and Matches.csv."
    )

# Load the two source tables from that directory. low_memory=False makes pandas
# read Matches.csv in a single pass instead of chunk-wise.
elo_data = pd.read_csv(os.path.join(data_path, "EloRatings.csv"))
matches = pd.read_csv(os.path.join(data_path, "Matches.csv"), low_memory=False)

Data path: /Users/heshamnawaz/Desktop/Projects/soccer-forecasting-project/Match-Data


In [5]:
# Our rows are going to be the matches. For each match, we have the following information:
# - Date of the match
# - Home Team
# - Away Team
# - Home Team Goals
# - Away Team Goals
# - Home Team Elo Rating on the most recent date before the match
# - Away Team Elo Rating on the most recent date before the match

### Process and Filter Matches

In [6]:
matches.columns

Index(['Division', 'MatchDate', 'MatchTime', 'HomeTeam', 'AwayTeam', 'HomeElo',
       'AwayElo', 'Form3Home', 'Form5Home', 'Form3Away', 'Form5Away', 'FTHome',
       'FTAway', 'FTResult', 'HTHome', 'HTAway', 'HTResult', 'HomeShots',
       'AwayShots', 'HomeTarget', 'AwayTarget', 'HomeFouls', 'AwayFouls',
       'HomeCorners', 'AwayCorners', 'HomeYellow', 'AwayYellow', 'HomeRed',
       'AwayRed', 'OddHome', 'OddDraw', 'OddAway', 'MaxHome', 'MaxDraw',
       'MaxAway', 'Over25', 'Under25', 'MaxOver25', 'MaxUnder25', 'HandiSize',
       'HandiHome', 'HandiAway'],
      dtype='object')

In [7]:
matches.FTResult.value_counts()

FTResult
H    101928
A     65870
D     60576
Name: count, dtype: int64

In [8]:
# Parse MatchDate once and derive every calendar field from the parsed values,
# instead of re-parsing the raw column for each derived field.
match_dates = pd.to_datetime(matches.MatchDate)

# Add the parsed date and its day/month/year parts, then order the matches
# chronologically (oldest first).
matches = (
    matches
    .assign(
        date=match_dates,
        day=match_dates.dt.day,
        month=match_dates.dt.month,
        year=match_dates.dt.year,
    )
    .sort_values(by='date', ascending=True)
)

In [9]:
def filter_matches(matches, matches_rel_cols, filter_start_date, filter_end_date, divisions_list):
    """Keep the requested columns for matches inside a date window and a set of divisions.

    Args:
        matches: DataFrame of matches. `matches_rel_cols` must include the
            'date' and 'Division' columns, because the row filters are applied
            after the column selection.
        matches_rel_cols: List of column names to keep.
        filter_start_date: Lower bound for 'date' (inclusive).
        filter_end_date: Upper bound for 'date' (inclusive).
        divisions_list: Division codes to keep.

    Returns:
        A new, independent DataFrame with the selected columns and rows. It is
        an explicit copy, so callers can add or modify columns on it without
        triggering pandas' SettingWithCopyWarning.
    """
    matches_col_filtered = matches[matches_rel_cols]

    # All three conditions must hold (logical AND).
    in_date_window = (
        (matches_col_filtered['date'] >= filter_start_date)
        & (matches_col_filtered['date'] <= filter_end_date)
    )
    in_divisions = matches_col_filtered['Division'].isin(divisions_list)

    # .copy() detaches the result from `matches`, so downstream column
    # assignments operate on a real frame rather than a flagged slice.
    return matches_col_filtered[in_date_window & in_divisions].copy()

In [10]:
# Columns carried forward into the modeling frame.
# NOTE(review): 'AwayTeam' is kept but 'HomeTeam' is not - confirm this is intended.
matches_rel_cols = [
    'Division',
    'date', 'day', 'month', 'year',
    'HomeElo', 'AwayElo',
    'AwayTeam',
    'FTResult',
]

# Date window (both bounds inclusive) and the division codes to keep.
filter_start_date, filter_end_date = '2023-08-01', '2024-07-31'
divisions_list = ['E0']

filtered_matches = filter_matches(
    matches=matches,
    matches_rel_cols=matches_rel_cols,
    filter_start_date=filter_start_date,
    filter_end_date=filter_end_date,
    divisions_list=divisions_list,
)

filtered_matches

Unnamed: 0,Division,date,day,month,year,HomeElo,AwayElo,AwayTeam,FTResult
210362,E0,2023-08-11,11,8,2023,1726.42,2077.27,Man City,A
210454,E0,2023-08-12,12,8,2023,1876.02,1828.01,Aston Villa,H
210378,E0,2023-08-12,12,8,2023,1644.08,1757.41,Crystal Palace,A
210377,E0,2023-08-12,12,8,2023,1708.25,1736.65,Fulham,A
210376,E0,2023-08-12,12,8,2023,1828.20,1606.54,Luton,H
...,...,...,...,...,...,...,...,...,...
221144,E0,2024-05-19,19,5,2024,1798.52,1695.08,Bournemouth,H
221142,E0,2024-05-19,19,5,2024,1732.68,1759.74,Man United,A
221141,E0,2024-05-19,19,5,2024,1721.75,1798.56,Newcastle,A
221140,E0,2024-05-19,19,5,2024,1944.71,1709.04,Everton,H


### Additional Feature Generation

#### Elo Difference

In [11]:
# Home-minus-away Elo gap. `assign` returns a new frame instead of writing a
# column into the filtered slice, which avoids SettingWithCopyWarning and keeps
# this cell safe to re-run.
filtered_matches = filtered_matches.assign(
    EloDiff=filtered_matches['HomeElo'] - filtered_matches['AwayElo']
)

filtered_matches

Unnamed: 0,Division,date,day,month,year,HomeElo,AwayElo,AwayTeam,FTResult,EloDiff
210362,E0,2023-08-11,11,8,2023,1726.42,2077.27,Man City,A,-350.85
210454,E0,2023-08-12,12,8,2023,1876.02,1828.01,Aston Villa,H,48.01
210378,E0,2023-08-12,12,8,2023,1644.08,1757.41,Crystal Palace,A,-113.33
210377,E0,2023-08-12,12,8,2023,1708.25,1736.65,Fulham,A,-28.40
210376,E0,2023-08-12,12,8,2023,1828.20,1606.54,Luton,H,221.66
...,...,...,...,...,...,...,...,...,...,...
221144,E0,2024-05-19,19,5,2024,1798.52,1695.08,Bournemouth,H,103.44
221142,E0,2024-05-19,19,5,2024,1732.68,1759.74,Man United,A,-27.06
221141,E0,2024-05-19,19,5,2024,1721.75,1798.56,Newcastle,A,-76.81
221140,E0,2024-05-19,19,5,2024,1944.71,1709.04,Everton,H,235.67


#### Team Market Value

In [None]:
# TO IMPLEMENT

## Modeling

**General**

We have a dataset of historical soccer matches. Each row is a single match and the columns contain the name of the home team, the name of the away team and the final result of the match (H for Home Win, D for Draw, A for Away Win).

We aim to use this historical data, as well as a set of features we think might be informative, to predict match outcomes for the upcoming 25/26 season.

**Current**

We have the Elo rating (add reference) for each team at the moment the games were played. We also have the current Elo rating for the teams whose match outcomes we hope to predict.

Therefore, the features we can use in the model are Home Elo, Away Elo and EloDiff, as well as any desired transformations of these.

### Data Split

In [44]:
def train_test_split_pipeline(filtered_matches, target, random_state=42, val_size=0.15, test_size=0.15):
    """Split the matches into stratified train / validation / test sets.

    The split is done in two passes: first the combined validation+test share
    is carved off, then that hold-out part is divided into validation and test.
    Rows are sampled at random, stratified on the target; the match date plays
    no role in the split.

    Args:
        filtered_matches: DataFrame containing the feature columns and `target`.
        target: Name of the label column.
        random_state: Seed passed to both `train_test_split` calls.
        val_size: Fraction of all rows intended for validation.
        test_size: Fraction of all rows intended for testing.

    Returns:
        Tuple (X_train, X_val, X_test, y_train, y_val, y_test).
    """
    all_features = filtered_matches.drop(columns=[target])
    all_labels = filtered_matches[target]

    # Pass 1: training rows versus everything held out.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        all_features,
        all_labels,
        test_size=(val_size + test_size),
        random_state=random_state,
        stratify=all_labels,
    )

    # Pass 2: divide the hold-out rows; the validation share is expressed
    # relative to the hold-out size, and the remainder becomes the test set.
    val_test_ratio = val_size / (val_size + test_size)
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout,
        y_holdout,
        test_size=(1 - val_test_ratio),
        random_state=random_state,
        stratify=y_holdout,
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

### Baseline

#### 1. Heuristic baselines

In [46]:
def generate_heuristic_predictions(X_test, condition):
    """Apply a row-wise heuristic to every match and return the predicted labels.

    Args:
        X_test: DataFrame of matches; each row is passed to `condition`.
        condition: Callable taking one row and returning the predicted outcome.

    Returns:
        A Series named 'predicted_outcome', indexed like `X_test`. The input
        frame is left untouched (no column is added to it).
    """
    # DataFrame.apply(axis=1) on a frame without rows does not produce a label
    # Series, so return an empty, correctly named one explicitly.
    if len(X_test) == 0:
        return pd.Series(index=X_test.index, dtype=object, name='predicted_outcome')

    return X_test.apply(condition, axis=1).rename('predicted_outcome')

##### 1a) Predict higher Elo team to win with 100% probability

In [15]:
# # Example usage:
# condition = lambda row: "H" if row['HomeElo'] > row['AwayElo'] else "A" if row['HomeElo'] < row['AwayElo'] else "D"
# generate_heuristic_predictions(X_test, condition)

### Non-ordinal

Technically, modeling soccer match outcomes is an ordinal regression problem, i.e. the outcomes have an inherent ordering (away win < draw < home win).

Treating this task as an ordinal regression problem instead of a multiclass classification problem seems more theoretically appropriate and enhances interpretability. However, only a full training and evaluation pipeline for both classes of methods can tell us whether it actually improves prediction performance.

In [47]:
features = ['HomeElo', 'AwayElo', 'EloDiff']
# features = ['EloDiff']

In [48]:
def generate_model_predictions(X_train, X_test, y_train, features, model):
    """Fit `model` on the training rows and predict the test rows.

    Only the columns listed in `features` are shown to the model. The model is
    fitted in place, so the caller's estimator object is trained afterwards.

    Args:
        X_train: Training DataFrame.
        X_test: Test DataFrame.
        y_train: Training labels.
        features: Column names used as model inputs.
        model: Estimator exposing fit, predict and predict_proba.

    Returns:
        Tuple (y_pred, predicted_probabilities) as returned by
        `model.predict` and `model.predict_proba` on the test inputs.
    """
    train_inputs = X_train[features]
    test_inputs = X_test[features]

    model.fit(train_inputs, y_train)

    return model.predict(test_inputs), model.predict_proba(test_inputs)

### Ordinal

In [18]:
# TO IMPLEMENT

In [19]:
# # Create outcome encoding (A=0, D=1, H=2)
# outcome_mapping = {'A': 0, 'D': 1, 'H': 2}
# filtered_matches['outcome_enc'] = filtered_matches['FTResult'].map(outcome_mapping)

# # Calculate Elo difference (Home - Away)
# filtered_matches['elo_diff'] = filtered_matches['HomeElo'] - filtered_matches['AwayElo']

# # Create and fit the ordinal logistic regression model
# model = OrderedModel(
#     filtered_matches['outcome_enc'],
#     filtered_matches[['elo_diff']], # Removed sm.add_constant()
#     distr='logit'
# )

# result = model.fit(method='bfgs')
# print(result.summary())

In [20]:
# filtered_matches_with_predictions = filtered_matches.copy()

# # Get predicted probabilities for each outcome
# predicted_probs = model.predict(result.params, exog=filtered_matches[['elo_diff']])

# # Add probabilities to the dataframe
# filtered_matches_with_predictions['prob_away'] = predicted_probs[:, 0]  # Probability of Away win
# filtered_matches_with_predictions['prob_draw'] = predicted_probs[:, 1]  # Probability of Draw
# filtered_matches_with_predictions['prob_home'] = predicted_probs[:, 2]  # Probability of Home win

# # Display example predictions with actual results
# results_df = filtered_matches_with_predictions[['date', 'HomeElo', 'AwayElo', 'elo_diff', 'FTResult', 
#                              'prob_away', 'prob_draw', 'prob_home']].round(3)
# print(results_df.head())

# # Optional: Verify that probabilities sum to 1
# print("\nVerifying probabilities sum to 1:")
# print(results_df[['prob_away', 'prob_draw', 'prob_home']].sum(axis=1).head())

## Evaluation

### Standard

In [89]:
def evaluator(y_test, y_pred, output_dict = True):
    """Score predictions with a classification report and a confusion matrix.

    Both metrics use the fixed label order ['H', 'D', 'A'].

    Args:
        y_test: True outcomes.
        y_pred: Predicted outcomes.
        output_dict: Forwarded to `classification_report`; True yields a nested
            dict, False yields the formatted text report.

    Returns:
        Dict with keys 'classification_report' and 'confusion_matrix'.
    """
    outcome_labels = ['H', 'D', 'A']

    report = classification_report(
        y_test,
        y_pred,
        labels=outcome_labels,
        zero_division=0,  # classes that are never predicted score 0 rather than warning
        output_dict=output_dict,
    )
    matrix = confusion_matrix(y_test, y_pred, labels=outcome_labels)

    return {'classification_report': report, 'confusion_matrix': matrix}

In [90]:
def full_pipeline(filtered_matches, prediction_type, prediction_params, features, target = 'FTResult', output_dict = True):
    """Split the data, produce predictions for the test set and evaluate them.

    Args:
        filtered_matches: DataFrame with feature columns and the target.
        prediction_type: 'model' or 'heuristic'.
        prediction_params: For 'model', a dict with key 'model' (the estimator);
            for 'heuristic', a dict with key 'condition' (a row -> label callable).
        features: Model input columns. Not used by the heuristic branch.
        target: Name of the label column.
        output_dict: Forwarded to `evaluator`.

    Returns:
        Tuple (eval_dict, predicted_probabilities). The second element is None
        for heuristics.

    Raises:
        ValueError: If `prediction_type` is neither 'model' nor 'heuristic'.
    """
    # The validation part of the split is produced but not used here.
    X_train, _X_val, X_test, y_train, _y_val, y_test = train_test_split_pipeline(filtered_matches, target)

    if prediction_type == 'model':
        estimator = prediction_params['model']
        y_pred, predicted_probabilities = generate_model_predictions(X_train, X_test, y_train, features, estimator)
        return evaluator(y_test, y_pred, output_dict=output_dict), predicted_probabilities

    if prediction_type == 'heuristic':
        rule = prediction_params['condition']
        y_pred = generate_heuristic_predictions(X_test, rule)
        return evaluator(y_test, y_pred, output_dict=output_dict), None

    raise ValueError(f"Invalid prediction type: {prediction_type}")

#### Heuristic Methods Evaluation

In [106]:
def heuristic_function_generator(win_threshold=float('-inf'), loss_threshold=float('-inf')):
    """Build a row-wise rule that predicts the outcome from the row's 'EloDiff'.

    The returned callable answers "H" when EloDiff is strictly above
    `win_threshold`, otherwise "A" when it is strictly below `loss_threshold`,
    otherwise "D". The home-win check takes precedence.

    With the default thresholds (-inf) the rule answers "H" for every finite
    EloDiff; with only `win_threshold` given it never answers "A".

    Args:
        win_threshold: EloDiff must exceed this for a home-win prediction.
        loss_threshold: EloDiff must be below this for an away-win prediction.

    Returns:
        Callable taking a row (anything indexable by 'EloDiff') and returning
        "H", "D" or "A".
    """
    def predict_outcome(row):
        elo_diff = row['EloDiff']
        if elo_diff > win_threshold:
            return "H"
        if elo_diff < loss_threshold:
            return "A"
        return "D"

    return predict_outcome

In [122]:
# Threshold settings for the EloDiff heuristics, one entry per condition.
# A threshold that is omitted falls back to the generator's -inf default.
_heuristic_thresholds = [
    {},                                             # condition_1: "H" for any finite EloDiff
    {'win_threshold': 0},                           # condition_2: "H" if EloDiff > 0, else "D"
    {'win_threshold': 50, 'loss_threshold': -50},   # condition_3
    {'win_threshold': 100, 'loss_threshold': -100}, # condition_4
    {'win_threshold': 0, 'loss_threshold': -100},   # condition_5
    {'win_threshold': -50, 'loss_threshold': -100}, # condition_6
    {'win_threshold': 25, 'loss_threshold': -25},   # condition_7
    {'win_threshold': 25, 'loss_threshold': -50},   # condition_8
]

# Keys are 'condition_1' ... 'condition_8', in the order listed above.
conditions_dict = {
    f'condition_{position}': heuristic_function_generator(**thresholds)
    for position, thresholds in enumerate(_heuristic_thresholds, start=1)
}

In [123]:
# Test-set accuracy of each heuristic, printed in conditions_dict order.
for heuristic in conditions_dict.values():
    heuristic_report = full_pipeline(filtered_matches, 'heuristic', {'condition': heuristic}, features)[0]['classification_report']
    print(heuristic_report['accuracy'])

0.45614035087719296
0.45614035087719296
0.543859649122807
0.47368421052631576
0.543859649122807
0.5263157894736842
0.5964912280701754
0.5964912280701754


In [124]:
# Full per-class report (precision / recall / f1 / support) for each heuristic,
# printed in conditions_dict order.
for heuristic in conditions_dict.values():
    heuristic_eval = full_pipeline(filtered_matches, 'heuristic', {'condition': heuristic}, features)[0]
    print(heuristic_eval['classification_report'])

{'H': {'precision': 0.45614035087719296, 'recall': 1.0, 'f1-score': 0.6265060240963856, 'support': 26.0}, 'D': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 12.0}, 'A': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 19.0}, 'accuracy': 0.45614035087719296, 'macro avg': {'precision': 0.15204678362573099, 'recall': 0.3333333333333333, 'f1-score': 0.20883534136546186, 'support': 57.0}, 'weighted avg': {'precision': 0.2080640196983687, 'recall': 0.45614035087719296, 'f1-score': 0.2857746776580004, 'support': 57.0}}
{'H': {'precision': 0.625, 'recall': 0.7692307692307693, 'f1-score': 0.6896551724137931, 'support': 26.0}, 'D': {'precision': 0.24, 'recall': 0.5, 'f1-score': 0.32432432432432434, 'support': 12.0}, 'A': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 19.0}, 'accuracy': 0.45614035087719296, 'macro avg': {'precision': 0.28833333333333333, 'recall': 0.4230769230769231, 'f1-score': 0.3379931655793725, 'support': 57.0}, 'weighted avg': {'pr

#### ML Methods Eval

In [77]:
def compare_models(filtered_matches, features):
    """Fit a panel of classifiers through `full_pipeline` and report test accuracy.

    Each model is trained and evaluated independently. A model that raises is
    reported and skipped, so one failing estimator does not abort the comparison.

    Args:
        filtered_matches: DataFrame with the feature columns and the target.
        features: Column names used as model inputs.

    Returns:
        Dict mapping model name to test-set accuracy (float). Models that
        raised an error are absent from the dict.
    """
    models = {
        'Logistic Regression': LogisticRegression(random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
        'SVM': SVC(probability=True, random_state=42, class_weight='balanced'),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
        # NOTE(review): the recorded run shows XGB rejecting the string labels
        # ('A'/'D'/'H'); it needs integer-encoded targets to take part.
        'XGB': XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42, eval_metric='mlogloss'),
        'Neural Network': MLPClassifier(hidden_layer_sizes=(50,), random_state=42),
        'NB': GaussianNB(),
        'KNN': KNeighborsClassifier(n_neighbors=5),
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'CatBoost': CatBoostClassifier(
            loss_function='MultiClass',
            eval_metric='TotalF1',        # or 'MultiClass'
            class_names=['H','D','A'],    # keeps string labels consistent
            auto_class_weights='Balanced',
            random_seed=42,
            verbose=False
        ),
    }

    results = {}
    for name, model in models.items():
        try:
            # Request the dict form explicitly so the accuracy can be read by
            # key instead of being parsed out of the formatted text report.
            eval_dict, _ = full_pipeline(
                filtered_matches, 'model', {'model': model}, features, output_dict=True
            )
            accuracy = eval_dict['classification_report']['accuracy']
            results[name] = accuracy
            print(f"{name}: {accuracy:.3f}")
        except Exception as e:
            # Deliberate best-effort: report the failure and continue with the
            # remaining models.
            print(f"Error with {name}: {e}")

    return results

# Run comparison
model_comparison = compare_models(filtered_matches, features)

Logistic Regression: 0.560
Random Forest: 0.510
SVM: 0.440
Gradient Boosting: 0.630
Error with XGB: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got ['A' 'D' 'H']
Neural Network: 0.560
NB: 0.540
KNN: 0.560
Decision Tree: 0.490
CatBoost: 0.560


After evaluating many different models, I look at the detailed evaluation metrics for the models that had the highest accuracy.

In [73]:
# L2-regularised logistic regression fitted with lbfgs. `multi_class` is not
# passed: it is deprecated in recent scikit-learn, and with lbfgs a problem with
# three classes is fitted as a multinomial model anyway.
model = LogisticRegression(
    solver='lbfgs',
    penalty='l2',
    C=1.0,
    max_iter=200
)

# output_dict=False asks for the formatted text report, which is what print()
# is meant to show here (the default returns a nested dict).
print(full_pipeline(filtered_matches, 'model', {'model': model}, features, output_dict=False)[0]['classification_report'])

              precision    recall  f1-score   support

           H       0.57      0.81      0.67        26
           D       0.00      0.00      0.00        12
           A       0.55      0.58      0.56        19

    accuracy                           0.56        57
   macro avg       0.37      0.46      0.41        57
weighted avg       0.44      0.56      0.49        57





In [74]:
# Gradient boosting with the same settings used in the model comparison.
model = GradientBoostingClassifier(n_estimators=100, random_state=42)

# output_dict=False asks for the formatted text report, which is what print()
# is meant to show here (the default returns a nested dict).
print(full_pipeline(filtered_matches, 'model', {'model': model}, features, output_dict=False)[0]['classification_report'])

              precision    recall  f1-score   support

           H       0.61      0.85      0.71        26
           D       0.50      0.25      0.33        12
           A       0.73      0.58      0.65        19

    accuracy                           0.63        57
   macro avg       0.61      0.56      0.56        57
weighted avg       0.63      0.63      0.61        57



### K-Fold Cross Validation

In [28]:
# TO IMPLEMENT

### Nested Cross Validation

In [29]:
# TO IMPLEMENT

### Time-based cross validation

In [30]:
# TO IMPLEMENT

In [32]:
# # Sort data chronologically
# filtered_matches = filtered_matches.sort_values('date')

# # First, create a true held-out test set (e.g., last 20% of the data)
# test_size = int(len(filtered_matches) * 0.2)
# train_val_data = filtered_matches.iloc[:-test_size].copy()
# test_data = filtered_matches.iloc[-test_size:].copy()

# print(f"Full dataset size: {len(filtered_matches)}")
# print(f"Train+Validation set size: {len(train_val_data)}")
# print(f"Test set size: {len(test_data)}")
# print(f"Test set period: {test_data.date.min()} to {test_data.date.max()}")

## Output

### Load Elo data

In [33]:
from club_elo_api import get_daily_ranking

# Ask the Elo API for the ranking on a fixed date; when the call yields None,
# fall back to a CSV snapshot stored on disk.
# NOTE(review): the fallback path is absolute and specific to one machine.
ranking_date = date(2025, 7, 29)
ranking = get_daily_ranking(ranking_date)

if ranking is None:
    print("Failed to fetch daily ranking")
    ranking = pd.read_csv('/Users/heshamnawaz/Desktop/Projects/soccer-forecasting-project/data/elo_2025_07_29.csv')

Error making request: HTTPConnectionPool(host='api.clubelo.com', port=80): Read timed out. (read timeout=15)
Failed to fetch daily ranking


In [34]:
ranking

Unnamed: 0,Rank,Club,Country,Level,Elo,From,To
0,1.0,Liverpool,ENG,1,1993.417725,2025-05-29,2025-08-15
1,2.0,Arsenal,ENG,1,1993.340210,2025-05-29,2025-08-17
2,3.0,Paris SG,FRA,1,1974.937134,2025-06-01,2025-08-17
3,4.0,Man City,ENG,1,1959.944092,2025-06-01,2025-08-16
4,5.0,Barcelona,ESP,1,1945.430664,2025-06-01,2025-08-16
...,...,...,...,...,...,...,...
624,,St Josephs,GIB,0,945.951111,2025-07-25,2025-07-31
625,,FCB Magpies,GIB,0,910.830078,2025-07-23,2025-08-25
626,,Tre Fiori,SMR,0,707.442810,2025-07-18,2025-08-25
627,,SS Virtus,SMR,0,706.482422,2025-07-18,2025-08-25


### Load fixtures and pre-process

In [35]:
fixtures = pd.read_csv('/Users/heshamnawaz/Desktop/Projects/soccer-forecasting-project/data/epl_fixtures.csv')

fixtures

Unnamed: 0,home_team,away_team,date,time
0,Liverpool,AFC Bournemouth,2025-08-15,20:00
1,Aston Villa,Newcastle United,2025-08-16,15:00
2,Brighton & Hove Albion,Fulham,2025-08-16,15:00
3,Sunderland,West Ham United,2025-08-16,15:00
4,Tottenham Hotspur,Burnley,2025-08-16,15:00
...,...,...,...,...
375,Manchester City,Aston Villa,2026-05-24,TBD
376,Nottingham Forest,AFC Bournemouth,2026-05-24,TBD
377,Sunderland,Chelsea,2026-05-24,TBD
378,Tottenham Hotspur,Everton,2026-05-24,TBD


In [36]:
# Clubs in the ranking with Country 'ENG' and Level 1.
eng_level_one = (ranking['Country'] == 'ENG') & (ranking['Level'] == 1)
ranking_epl_teams = ranking.loc[eng_level_one, 'Club']

fixture_team_names = set(fixtures.home_team.unique())
elo_team_names = set(ranking_epl_teams)

# Names used in the fixtures that have no exact match in the Elo ranking ...
missing_teams_fixtures = fixture_team_names - elo_team_names
print(sorted(missing_teams_fixtures))

# ... and names in the Elo ranking that have no exact match in the fixtures.
missing_teams_elo = elo_team_names - fixture_team_names
print(sorted(missing_teams_elo))

['AFC Bournemouth', 'Brighton & Hove Albion', 'Leeds United', 'Manchester City', 'Manchester United', 'Newcastle United', 'Nottingham Forest', 'Tottenham Hotspur', 'West Ham United']
['Bournemouth', 'Forest', 'Leeds', 'Man City', 'Man United', 'Newcastle', 'Tottenham', 'West Ham']


In [37]:
name_differences_mapping = {
    'Tottenham': 'Tottenham Hotspur',
    'Forest': 'Nottingham Forest',
    'Man United': 'Manchester United',
    'Leeds': 'Leeds United',
    'West Ham': 'West Ham United',
    'Man City': 'Manchester City',
    'Bournemouth': 'AFC Bournemouth',
    'Newcastle': 'Newcastle United',
    'Brighton': 'Brighton & Hove Albion'
}

In [38]:
def _to_fixture_name(club_name):
    """Translate a club name via name_differences_mapping; unknown names pass through."""
    return name_differences_mapping.get(club_name, club_name)


# Work on a copy so the ranking as loaded stays available unchanged.
ranking_copy = ranking.copy()
ranking_copy['Club'] = ranking_copy['Club'].map(_to_fixture_name)

# The same translation is applied to both fixture team columns (in place).
for team_column in ('home_team', 'away_team'):
    fixtures[team_column] = fixtures[team_column].map(_to_fixture_name)

### Merge fixtures with features for prediction

In [39]:
# Attach each side's Elo rating to the fixtures with one left join per side.
# The left join keeps every fixture; a club without a ranking entry gets NaN.
elo_lookup = ranking_copy[['Club', 'Elo']]

fixtures_with_elo = fixtures
for team_column, elo_column in (('home_team', 'HomeElo'), ('away_team', 'AwayElo')):
    fixtures_with_elo = (
        fixtures_with_elo
        .merge(
            elo_lookup.rename(columns={'Elo': elo_column}),
            left_on=team_column,
            right_on='Club',
            how='left',
        )
        .drop(columns=['Club'])  # the join key from the ranking side is no longer needed
    )

# Same feature definition as in the training data: home rating minus away rating.
fixtures_with_elo['EloDiff'] = fixtures_with_elo['HomeElo'] - fixtures_with_elo['AwayElo']

fixtures_with_elo

Unnamed: 0,home_team,away_team,date,time,HomeElo,AwayElo,EloDiff
0,Liverpool,AFC Bournemouth,2025-08-15,20:00,1993.417725,1808.095215,185.322510
1,Aston Villa,Newcastle United,2025-08-16,15:00,1872.839233,1868.552612,4.286621
2,Brighton & Hove Albion,Fulham,2025-08-16,15:00,1827.129028,1781.720581,45.408447
3,Sunderland,West Ham United,2025-08-16,15:00,1547.099121,1750.132446,-203.033325
4,Tottenham Hotspur,Burnley,2025-08-16,15:00,1774.006714,1729.597656,44.409058
...,...,...,...,...,...,...,...
375,Manchester City,Aston Villa,2026-05-24,TBD,1959.944092,1872.839233,87.104858
376,Nottingham Forest,AFC Bournemouth,2026-05-24,TBD,1802.868042,1808.095215,-5.227173
377,Sunderland,Chelsea,2026-05-24,TBD,1547.099121,1902.801147,-355.702026
378,Tottenham Hotspur,Everton,2026-05-24,TBD,1774.006714,1793.979858,-19.973145


In [40]:
print(fixtures_with_elo.HomeElo.isna().sum())
print(fixtures_with_elo.AwayElo.isna().sum())

0
0


### Generate match predictions

In [41]:
# model.predict_proba(fixtures_with_elo[['HomeElo', 'AwayElo']])
# model.predict(fixtures_with_elo[['HomeElo', 'AwayElo']])

In [42]:
# NOTE(review): `model` is whichever estimator was most recently assigned and
# fitted via full_pipeline - confirm it is the one intended for the forecast.
fixture_features = fixtures_with_elo[features]

# Label the probability columns from model.classes_ rather than assuming a
# fixed class order, then reorder to Home, Draw, Away. The index is set to
# match fixtures_with_elo so the column assignments below align row by row.
prob_df = pd.DataFrame(
    model.predict_proba(fixture_features),
    columns=model.classes_,
    index=fixtures_with_elo.index,
)[['H', 'D', 'A']]

# Assign probabilities to fixtures_with_elo
fixtures_with_elo['home_win_prob'] = prob_df['H']
fixtures_with_elo['draw_prob'] = prob_df['D']
fixtures_with_elo['away_win_prob'] = prob_df['A']

fixtures_with_elo['predicted_outcome'] = model.predict(fixture_features)

fixtures_with_elo

Unnamed: 0,home_team,away_team,date,time,HomeElo,AwayElo,EloDiff,home_win_prob,draw_prob,away_win_prob,predicted_outcome
0,Liverpool,AFC Bournemouth,2025-08-15,20:00,1993.417725,1808.095215,185.322510,0.740977,0.196160,0.062864,H
1,Aston Villa,Newcastle United,2025-08-16,15:00,1872.839233,1868.552612,4.286621,0.514721,0.250171,0.235108,H
2,Brighton & Hove Albion,Fulham,2025-08-16,15:00,1827.129028,1781.720581,45.408447,0.555402,0.248072,0.196526,H
3,Sunderland,West Ham United,2025-08-16,15:00,1547.099121,1750.132446,-203.033325,0.154575,0.172835,0.672590,A
4,Tottenham Hotspur,Burnley,2025-08-16,15:00,1774.006714,1729.597656,44.409058,0.538520,0.250786,0.210694,H
...,...,...,...,...,...,...,...,...,...,...,...
375,Manchester City,Aston Villa,2026-05-24,TBD,1959.944092,1872.839233,87.104858,0.641178,0.230559,0.128263,H
376,Nottingham Forest,AFC Bournemouth,2026-05-24,TBD,1802.868042,1808.095215,-5.227173,0.480050,0.252376,0.267573,H
377,Sunderland,Chelsea,2026-05-24,TBD,1547.099121,1902.801147,-355.702026,0.056142,0.097267,0.846591,A
378,Tottenham Hotspur,Everton,2026-05-24,TBD,1774.006714,1793.979858,-19.973145,0.450254,0.252211,0.297535,H


In [43]:
# Create the output folder first: DataFrame.to_csv does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs('output_data', exist_ok=True)
fixtures_with_elo.to_csv('output_data/predictions.csv', index=False)

### Simulate season (to-do)