# Soccer Forecasting Project

## Setup

### Libraries

In [1]:
# IMPORT PACKAGES
import pandas as pd
import os
from dotenv import load_dotenv
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, log_loss, accuracy_score
from datetime import date
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import TimeSeriesSplit
# from eval import time_based_kfold

In [2]:
load_dotenv()

True

In [3]:
from data_processor import filter_matches, train_test_split_pipeline

### Load Data

In [4]:
os.environ.get("SOCCER_DATA_PATH")

'/Users/heshamnawaz/Desktop/Projects/soccer-forecasting-project/Match-Data'

In [5]:
# Get data path from environment variable or use default
data_path = os.environ.get("SOCCER_DATA_PATH")

# Print the data path to debug
print(f"Data path: {data_path}")

# Use the correct path to load the files
matches = pd.read_csv(os.path.join(data_path, "Matches.csv"), low_memory=False)

Data path: /Users/heshamnawaz/Desktop/Projects/soccer-forecasting-project/Match-Data


In [6]:
matches

Unnamed: 0,Division,MatchDate,MatchTime,HomeTeam,AwayTeam,HomeElo,AwayElo,Form3Home,Form5Home,Form3Away,...,MaxHome,MaxDraw,MaxAway,Over25,Under25,MaxOver25,MaxUnder25,HandiSize,HandiHome,HandiAway
0,F1,2000-07-28,,Marseille,Troyes,1686.34,1586.57,0.0,0.0,0.0,...,,,,,,,,,,
1,F1,2000-07-28,,Paris SG,Strasbourg,1714.89,1642.51,0.0,0.0,0.0,...,,,,,,,,,,
2,F2,2000-07-28,,Wasquehal,Nancy,1465.08,1633.80,0.0,0.0,0.0,...,,,,,,,,,,
3,F2,2000-07-29,,Ajaccio,Le Mans,1470.87,1477.89,0.0,0.0,0.0,...,,,,,,,,,,
4,F2,2000-07-29,,Beauvais,Montpellier,1422.21,1606.00,0.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228372,E0,2025-02-26,19:30:00,Brentford,Everton,1738.05,1731.52,6.0,9.0,5.0,...,2.06,3.82,3.67,1.85,1.99,1.89,2.04,-0.5,2.04,1.86
228373,E0,2025-02-26,19:30:00,Man United,Ipswich,1757.62,1584.51,1.0,4.0,1.0,...,1.56,4.75,6.50,1.64,2.29,1.69,2.34,-1.0,1.88,2.02
228374,E0,2025-02-26,19:30:00,Nott'm Forest,Arsenal,1788.28,1999.49,3.0,6.0,6.0,...,4.10,3.60,2.02,2.17,1.71,2.24,1.75,0.5,1.90,2.00
228375,E0,2025-02-26,19:30:00,Tottenham,Man City,1785.53,1926.48,9.0,9.0,3.0,...,3.35,4.25,2.10,1.34,3.29,1.37,3.40,0.5,1.84,2.06


In [7]:
# Our rows are going to be the matches. For each match, we have the following information:
# - Date of the match
# - Home Team
# - Away Team
# - Outcome of the match (H - Home Win, D - Draw, A - Away Win)

### Process and Filter Matches Data

In [8]:
matches.columns

Index(['Division', 'MatchDate', 'MatchTime', 'HomeTeam', 'AwayTeam', 'HomeElo',
       'AwayElo', 'Form3Home', 'Form5Home', 'Form3Away', 'Form5Away', 'FTHome',
       'FTAway', 'FTResult', 'HTHome', 'HTAway', 'HTResult', 'HomeShots',
       'AwayShots', 'HomeTarget', 'AwayTarget', 'HomeFouls', 'AwayFouls',
       'HomeCorners', 'AwayCorners', 'HomeYellow', 'AwayYellow', 'HomeRed',
       'AwayRed', 'OddHome', 'OddDraw', 'OddAway', 'MaxHome', 'MaxDraw',
       'MaxAway', 'Over25', 'Under25', 'MaxOver25', 'MaxUnder25', 'HandiSize',
       'HandiHome', 'HandiAway'],
      dtype='object')

In [9]:
matches.FTResult.value_counts()

FTResult
H    101928
A     65870
D     60576
Name: count, dtype: int64

In [10]:
matches = \
(matches.assign(
    date = pd.to_datetime(matches.MatchDate),
    day = pd.to_datetime(matches.MatchDate).dt.day,
    month = pd.to_datetime(matches.MatchDate).dt.month,
    year = pd.to_datetime(matches.MatchDate).dt.year
).sort_values(by='date', ascending=True)
)

In [11]:
# sorted(matches.Division.unique())

In [12]:
matches_rel_cols = ['Division', 'date', 'day', 'month', 'year', 'HomeElo', 'AwayElo', 'AwayTeam', 'FTResult']

filter_start_date, filter_end_date = '2020-08-01', '2024-07-31'
divisions_list = ['E0', 'F1', 'D1', 'I1', 'SP1']

filtered_matches = filter_matches(matches, matches_rel_cols, filter_start_date, filter_end_date, divisions_list)

filtered_matches

Unnamed: 0,Division,date,day,month,year,HomeElo,AwayElo,AwayTeam,FTResult
172856,I1,2020-08-01,1,8,2020,1761.08,1738.50,Lazio,H
172855,I1,2020-08-01,1,8,2020,1761.18,1588.63,Cagliari,H
172854,I1,2020-08-01,1,8,2020,1862.93,1739.94,Roma,A
172853,I1,2020-08-01,1,8,2020,1847.24,1804.70,Inter,A
172852,I1,2020-08-01,1,8,2020,1452.35,1592.28,Sampdoria,D
...,...,...,...,...,...,...,...,...,...
221372,I1,2024-05-26,26,5,2024,1605.42,1658.33,Udinese,A
221330,SP1,2024-05-26,26,5,2024,1650.86,1634.44,Mallorca,A
221331,SP1,2024-05-26,26,5,2024,1635.81,1673.65,Valencia,D
221332,SP1,2024-05-26,26,5,2024,1582.27,1636.37,Alaves,D


### Feature Generation

#### 1) Match Features (teams + match date)

These are features that are associated with a specific team or pair of teams on a specific date

##### Elo Difference
The elo difference between two teams on the day the match is played is a Match Feature since it is determined by both the identity of the teams playing as well as the date on which the match is played

In [13]:
filtered_matches['EloDiff'] = filtered_matches['HomeElo'] - filtered_matches['AwayElo']

filtered_matches

Unnamed: 0,Division,date,day,month,year,HomeElo,AwayElo,AwayTeam,FTResult,EloDiff
172856,I1,2020-08-01,1,8,2020,1761.08,1738.50,Lazio,H,22.58
172855,I1,2020-08-01,1,8,2020,1761.18,1588.63,Cagliari,H,172.55
172854,I1,2020-08-01,1,8,2020,1862.93,1739.94,Roma,A,122.99
172853,I1,2020-08-01,1,8,2020,1847.24,1804.70,Inter,A,42.54
172852,I1,2020-08-01,1,8,2020,1452.35,1592.28,Sampdoria,D,-139.93
...,...,...,...,...,...,...,...,...,...,...
221372,I1,2024-05-26,26,5,2024,1605.42,1658.33,Udinese,A,-52.91
221330,SP1,2024-05-26,26,5,2024,1650.86,1634.44,Mallorca,A,16.42
221331,SP1,2024-05-26,26,5,2024,1635.81,1673.65,Valencia,D,-37.84
221332,SP1,2024-05-26,26,5,2024,1582.27,1636.37,Alaves,D,-54.10


In [14]:
filtered_matches[filtered_matches.EloDiff.isna()]

Unnamed: 0,Division,date,day,month,year,HomeElo,AwayElo,AwayTeam,FTResult,EloDiff


##### Form

Recent results may be more relevant for prediction.

In [15]:
# TO IMPLEMENT

#### 2) Team+Season Features

Features in this category will look like this

Team name / Season / Feature value

In [16]:
# def team_season_feature_generator(team_name, season, feature_name, feature_value):


##### Team Wage Bill (start of season)

The start-of-season wage bills are a Team + Season feature since they are determined by a) the identity of the two teams playing b) the season the match is played during

##### Team Transfer Market Value (start of season)

The start-of-season team market values are a Team + Season feature since they are determined by a) the identity of the two teams playing b) the season the match is played during

## Modeling

**General**

We have a dataset of historical soccer matches. Each row is a single match and the columns contain the name of the home team, the name of the away team and the final result of the match (H for Home Win, D for Draw, A for Away Win).

We aim to use this historical data, as well as a set of features we think might be informative, to predict match outcomes for the upcoming 25/26 season.

**Current**

We have the Elo rating (add reference) for each team at the moment the games were played. We also have the current Elo rating for the teams whose match outcomes we hope to predict.

Therefore, the features we can use in the model are Home Elo, Away Elo and EloDiff, as well as any desired transformations of these.

### Baseline

#### 1. Heuristic baselines

In [17]:
def generate_heuristic_predictions(X_test, condition):
    # Apply the specified heuristic condition
    X_test['predicted_outcome'] = X_test.apply(lambda row: condition(row), axis=1)
    return X_test['predicted_outcome']

In [18]:
def heuristic_function_generator(win_threshold=float('-inf'), loss_threshold=float('-inf')):
    return lambda row: "H" if row['EloDiff'] > win_threshold else "A" if row['EloDiff'] < loss_threshold else "D"

#### 2. Rating Systems

In [19]:
# TO IMPLEMENT

### Non-ordinal

Technically, modeling soccer match outcomes is an ordinal regression problem i.e. the outcomes have an inherent ordering.

Treating this task as an ordinal regression problem instead of a multiclass classification problem seems more theoretically appropriate and enhances interpretability. However, only a full training and evaluation pipeline for both classes of methods can tell us whether it actually improves prediction performance.

In [20]:
# features = ['HomeElo', 'AwayElo', 'EloDiff']
features = ['EloDiff']

In [21]:
def generate_model_predictions(X_train, X_test, y_train, features, model):
    X_train, X_test = X_train[features], X_test[features]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    predicted_probabilities = model.predict_proba(X_test)
    return y_pred, predicted_probabilities

### Ordinal

In [22]:
# TO IMPLEMENT

In [23]:
# # Create outcome encoding (A=0, D=1, H=2)
# outcome_mapping = {'A': 0, 'D': 1, 'H': 2}
# filtered_matches['outcome_enc'] = filtered_matches['FTResult'].map(outcome_mapping)

# # Calculate Elo difference (Home - Away)
# filtered_matches['elo_diff'] = filtered_matches['HomeElo'] - filtered_matches['AwayElo']

# # Create and fit the ordinal logistic regression model
# model = OrderedModel(
#     filtered_matches['outcome_enc'],
#     filtered_matches[['elo_diff']], # Removed sm.add_constant()
#     distr='logit'
# )

# result = model.fit(method='bfgs')
# print(result.summary())

In [24]:
# filtered_matches_with_predictions = filtered_matches.copy()

# # Get predicted probabilities for each outcome
# predicted_probs = model.predict(result.params, exog=filtered_matches[['elo_diff']])

# # Add probabilities to the dataframe
# filtered_matches_with_predictions['prob_away'] = predicted_probs[:, 0]  # Probability of Away win
# filtered_matches_with_predictions['prob_draw'] = predicted_probs[:, 1]  # Probability of Draw
# filtered_matches_with_predictions['prob_home'] = predicted_probs[:, 2]  # Probability of Home win

# # Display example predictions with actual results
# results_df = filtered_matches_with_predictions[['date', 'HomeElo', 'AwayElo', 'elo_diff', 'FTResult', 
#                              'prob_away', 'prob_draw', 'prob_home']].round(3)
# print(results_df.head())

# # Optional: Verify that probabilities sum to 1
# print("\nVerifying probabilities sum to 1:")
# print(results_df[['prob_away', 'prob_draw', 'prob_home']].sum(axis=1).head())

## Evaluation

### Standard

In [25]:
# model.predict_proba(fixtures_with_elo[features])

In [26]:
def probability_predictions_to_dataframe(predicted_probabilities):
    """
    predicted_probabilities: predicted probabilities for each class. 
    This is a 2D 3xn Numpy array with the predicted probabilities for each class in the order A, D, H
    
    returns: a Pandas DataFrame with the predicted probabilities for each class in the order H, D, A as columns
    """
    return pd.DataFrame(predicted_probabilities, columns=['A', 'D', 'H'])[['H', 'D', 'A']]

In [27]:
def ranked_probability_score(y_test, predicted_probabilities, return_per_row=False):
    """
    y_test: true labels.
    This is a Pandas Series with the true labels
    
    predicted_probabilities: predicted probabilities for each class. 
    This is a 2D 3xn Numpy array with the predicted probabilities for each class in the order A, D, H
    
    returns: ranked probability score
    """

    prob_df = probability_predictions_to_dataframe(predicted_probabilities)

    # Cumulative predicted up to the first two cut points (K-1 = 2 for H/D/A)
    c1 = prob_df['H'].to_numpy()
    c2 = (prob_df['H'] + prob_df['D']).to_numpy()

    # Cumulative observed at the same cut points
    y = y_test.to_numpy()
    o1 = (y == 'H').astype(float)
    o2 = np.isin(y, ['H', 'D']).astype(float)

    # RPS per row and mean RPS
    rps_per_row = ((c1 - o1)**2 + (c2 - o2)**2) / 2.0
    mean_rps = float(rps_per_row.mean())

    return (mean_rps, rps_per_row) if return_per_row else mean_rps

In [28]:
y_test_sample = pd.Series(['H', 'D', 'A'])
predicted_probabilities_sample = np.array([[0.0, 0.0, 1.0], [0.0, 1.0, 0.0], [1.0, 0.0, 0.0]])

ranked_probability_score(y_test_sample, predicted_probabilities_sample)

0.0

In [29]:
def evaluator(y_test, y_pred, predicted_probabilities, output_dict = True):
    """
    Evaluates the performance of a model by calculating the classification report and confusion matrix
    
    y_test: true labels
    y_pred: predicted labels
    output_dict: if True, returns a dictionary with the classification report and confusion matrix
    else returns a string with the classification report
    """
    eval_dict = {}
    eval_dict['classification_report'] = classification_report(y_test, y_pred, labels=['H','D','A'], zero_division = 0, output_dict=output_dict)
    eval_dict['confusion_matrix'] = confusion_matrix(y_test, y_pred, labels=['H','D','A'])
    if predicted_probabilities is not None:
        eval_dict['rps'] = ranked_probability_score(y_test, predicted_probabilities)
    return eval_dict

In [30]:
def full_pipeline(filtered_matches, prediction_type, prediction_params, features, target = 'FTResult', output_dict = True):
    X_train, X_val, X_test, y_train, y_val, y_test = train_test_split_pipeline(filtered_matches, target)
    
    if prediction_type == 'model':
        model = prediction_params['model']
        y_pred, predicted_probabilities = generate_model_predictions(X_train, X_test, y_train, features, model)
        eval_dict = evaluator(y_test, y_pred, predicted_probabilities, output_dict=output_dict)
        return eval_dict, predicted_probabilities
    
    elif prediction_type == 'heuristic':
        condition = prediction_params['condition']
        y_pred = generate_heuristic_predictions(X_test, condition)
        eval_dict = evaluator(y_test, y_pred, predicted_probabilities=None, output_dict=output_dict)
        return eval_dict, None
    else:
        raise ValueError(f"Invalid prediction type: {prediction_type}")

#### Heuristic Methods Evaluation

In [31]:
conditions_dict = {
    'condition_1': heuristic_function_generator(),
    'condition_2': heuristic_function_generator(win_threshold=0),
    'condition_3': heuristic_function_generator(win_threshold=50, loss_threshold=-50),
    'condition_4': heuristic_function_generator(win_threshold=100, loss_threshold=-100),
    'condition_5': heuristic_function_generator(win_threshold=0, loss_threshold=-100),
    'condition_6': heuristic_function_generator(win_threshold=-50, loss_threshold=-100),
    'condition_7': heuristic_function_generator(win_threshold=25, loss_threshold=-25),
    'condition_8': heuristic_function_generator(win_threshold=25, loss_threshold=-50)
}

In [32]:
# Print accuracy
for condition in conditions_dict.values():
    print(full_pipeline(filtered_matches, 'heuristic', {'condition': condition}, features)[0]['classification_report']['accuracy'])

0.4281767955801105
0.43370165745856354
0.507366482504604
0.4631675874769797
0.5138121546961326
0.5267034990791897
0.5276243093922652
0.5202578268876611


In [33]:
# Detailed evaluation of each heuristic
for condition in conditions_dict.values():
    print(full_pipeline(filtered_matches, 'heuristic', {'condition': condition}, features)[0]['classification_report'])

{'H': {'precision': 0.4281767955801105, 'recall': 1.0, 'f1-score': 0.5996131528046421, 'support': 465.0}, 'D': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 277.0}, 'A': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 344.0}, 'accuracy': 0.4281767955801105, 'macro avg': {'precision': 0.1427255985267035, 'recall': 0.3333333333333333, 'f1-score': 0.1998710509348807, 'support': 1086.0}, 'weighted avg': {'precision': 0.18333536827325175, 'recall': 0.4281767955801105, 'f1-score': 0.2567404383555788, 'support': 1086.0}}
{'H': {'precision': 0.5988805970149254, 'recall': 0.6903225806451613, 'f1-score': 0.6413586413586414, 'support': 465.0}, 'D': {'precision': 0.2727272727272727, 'recall': 0.5415162454873647, 'f1-score': 0.36275695284159615, 'support': 277.0}, 'A': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 344.0}, 'accuracy': 0.43370165745856354, 'macro avg': {'precision': 0.29053595658073267, 'recall': 0.4106129420441753, 'f1-score': 0.33470519

#### ML Methods Evaluation

In [34]:
def compare_models(filtered_matches, features):
    models = {
        'Logistic Regression': LogisticRegression(random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
        'SVM': SVC(probability=True, random_state=42, class_weight='balanced'),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
        'XGB': XGBClassifier(n_estimators=100,max_depth=3,learning_rate=0.1,random_state=42,eval_metric='mlogloss'),
        'Neural Network': MLPClassifier(hidden_layer_sizes=(50,), random_state=42),
        'NB': GaussianNB(),
        'KNN': KNeighborsClassifier(n_neighbors=5),
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'CatBoost': CatBoostClassifier(
            loss_function='MultiClass',
            eval_metric='TotalF1',        # or 'MultiClass'
            class_names=['H','D','A'],    # keeps string labels consistent
            auto_class_weights='Balanced',
            random_seed=42,
            verbose=False
        ),
    }
    
    results = {}
    for name, model in models.items():
        try:
            eval_dict, _ = full_pipeline(filtered_matches, 'model', {'model': model}, features)
            # Extract accuracy from classification report
            report = eval_dict['classification_report']
            # Parse accuracy from the report
            accuracy = report['accuracy']
            rps = eval_dict['rps']
            results[name] = {}
            results[name]['Accuracy'] = accuracy
            results[name]['RPS'] = rps
            results[name]['Classification Report'] = report
        except Exception as e:
            results[name] = {}
            results[name]['Error'] = e
    
    return pd.DataFrame(results) 

# Run comparison
model_comparison = compare_models(filtered_matches, features)

In [35]:
model_comparison

Unnamed: 0,Logistic Regression,Random Forest,SVM,Gradient Boosting,XGB,Neural Network,NB,KNN,Decision Tree,CatBoost
Accuracy,0.541436,0.432781,0.502762,0.54512,,0.539595,0.543278,0.471455,0.43186,0.482505
RPS,0.194919,0.276714,0.197912,0.19718,,0.197708,0.195038,0.224199,0.385602,0.324275
Classification Report,"{'H': {'precision': 0.5524781341107872, 'recal...","{'H': {'precision': 0.5221238938053098, 'recal...","{'H': {'precision': 0.6354679802955665, 'recal...","{'H': {'precision': 0.5604229607250756, 'recal...",,"{'H': {'precision': 0.5472779369627507, 'recal...","{'H': {'precision': 0.554904831625183, 'recall...","{'H': {'precision': 0.5796766743648961, 'recal...","{'H': {'precision': 0.5211581291759465, 'recal...","{'H': {'precision': 0.6377551020408163, 'recal..."
Error,,,,,Invalid classes inferred from unique values of...,,,,,


After evaluating many different models, I look at the detailed evaluation metrics for the models that had the highest accuracy.

In [36]:
model_comparison['Logistic Regression']['Classification Report']

{'H': {'precision': 0.5524781341107872,
  'recall': 0.8150537634408602,
  'f1-score': 0.6585577758470895,
  'support': 465.0},
 'D': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 277.0},
 'A': {'precision': 0.5225,
  'recall': 0.6075581395348837,
  'f1-score': 0.5618279569892473,
  'support': 344.0},
 'accuracy': 0.5414364640883977,
 'macro avg': {'precision': 0.3583260447035957,
  'recall': 0.47420396765858125,
  'f1-score': 0.40679524427877894,
  'support': 1086.0},
 'weighted avg': {'precision': 0.40206476276382686,
  'recall': 0.5414364640883977,
  'f1-score': 0.4599430782441968,
  'support': 1086.0}}

In [37]:
model_comparison['Gradient Boosting']['Classification Report']

{'H': {'precision': 0.5604229607250756,
  'recall': 0.7978494623655914,
  'f1-score': 0.6583850931677019,
  'support': 465.0},
 'D': {'precision': 0.5384615384615384,
  'recall': 0.02527075812274368,
  'f1-score': 0.04827586206896552,
  'support': 277.0},
 'A': {'precision': 0.5206812652068127,
  'recall': 0.622093023255814,
  'f1-score': 0.5668874172185431,
  'support': 344.0},
 'accuracy': 0.5451197053406999,
 'macro avg': {'precision': 0.5398552547978089,
  'recall': 0.4817377479147164,
  'f1-score': 0.42451612415173684,
  'support': 1086.0},
 'weighted avg': {'precision': 0.5422328527828267,
  'recall': 0.5451197053406999,
  'f1-score': 0.4737852243455466,
  'support': 1086.0}}

### K-Fold Cross Validation

In [38]:
# TO IMPLEMENT

### Nested Cross Validation

In [39]:
# TO IMPLEMENT

### Time-based cross validation

In [40]:
# TO IMPLEMENT

In [41]:
# # Sort data chronologically
# filtered_matches = filtered_matches.sort_values('date')

# # First, create a true held-out test set (e.g., last 20% of the data)
# test_size = int(len(filtered_matches) * 0.2)
# train_val_data = filtered_matches.iloc[:-test_size].copy()
# test_data = filtered_matches.iloc[-test_size:].copy()

# print(f"Full dataset size: {len(filtered_matches)}")
# print(f"Train+Validation set size: {len(train_val_data)}")
# print(f"Test set size: {len(test_data)}")
# print(f"Test set period: {test_data.date.min()} to {test_data.date.max()}")

## Output

### Load Elo data

In [42]:
from club_elo_api import get_daily_ranking

ranking = get_daily_ranking(date(2025, 7, 29))
if ranking is None:
    print(f"Failed to fetch daily ranking")
    ranking = pd.read_csv('/Users/heshamnawaz/Desktop/Projects/soccer-forecasting-project/data/elo_2025_07_29.csv')

In [43]:
ranking

Unnamed: 0,Rank,Club,Country,Level,Elo,From,To
0,1.0,Liverpool,ENG,1,1993.417725,2025-05-29,2025-08-15
1,2.0,Arsenal,ENG,1,1993.340332,2025-05-29,2025-08-17
2,3.0,Paris SG,FRA,1,1974.937134,2025-06-01,2025-08-06
3,4.0,Man City,ENG,1,1959.944214,2025-06-01,2025-08-16
4,5.0,Barcelona,ESP,1,1945.430664,2025-06-01,2025-08-16
...,...,...,...,...,...,...,...
625,,St Josephs,GIB,0,943.413940,2025-07-25,2025-07-29
626,,FCB Magpies,GIB,0,909.725769,2025-07-25,2025-07-29
627,,Tre Fiori,SMR,0,707.442810,2025-07-18,2025-08-07
628,,SS Virtus,SMR,0,706.482422,2025-07-18,2025-08-07


### Load fixtures and pre-process

In [44]:
fixtures = pd.read_csv('/Users/heshamnawaz/Desktop/Projects/soccer-forecasting-project/data/epl_fixtures.csv')

fixtures

Unnamed: 0,home_team,away_team,date,time
0,Liverpool,AFC Bournemouth,2025-08-15,20:00
1,Aston Villa,Newcastle United,2025-08-16,15:00
2,Brighton & Hove Albion,Fulham,2025-08-16,15:00
3,Sunderland,West Ham United,2025-08-16,15:00
4,Tottenham Hotspur,Burnley,2025-08-16,15:00
...,...,...,...,...
375,Manchester City,Aston Villa,2026-05-24,TBD
376,Nottingham Forest,AFC Bournemouth,2026-05-24,TBD
377,Sunderland,Chelsea,2026-05-24,TBD
378,Tottenham Hotspur,Everton,2026-05-24,TBD


In [45]:
ranking_epl_teams = ranking[(ranking['Country'] == 'ENG') & (ranking['Level'] == 1)]['Club']

missing_teams_fixtures = set(fixtures.home_team.unique()) - set(ranking_epl_teams)
print(sorted(missing_teams_fixtures))

missing_teams_elo = set(set(ranking_epl_teams) - set(fixtures.home_team.unique()))
print(sorted(missing_teams_elo))

['AFC Bournemouth', 'Brighton & Hove Albion', 'Leeds United', 'Manchester City', 'Manchester United', 'Newcastle United', 'Nottingham Forest', 'Tottenham Hotspur', 'West Ham United']
['Bournemouth', 'Forest', 'Leeds', 'Man City', 'Man United', 'Newcastle', 'Tottenham', 'West Ham']


In [46]:
name_differences_mapping = {
    'Tottenham': 'Tottenham Hotspur',
    'Forest': 'Nottingham Forest',
    'Man United': 'Manchester United',
    'Leeds': 'Leeds United',
    'West Ham': 'West Ham United',
    'Man City': 'Manchester City',
    'Bournemouth': 'AFC Bournemouth',
    'Newcastle': 'Newcastle United',
    'Brighton': 'Brighton & Hove Albion'
}

In [47]:
ranking_copy = ranking.copy()

ranking_copy['Club'] = ranking_copy['Club'].apply(lambda x: name_differences_mapping.get(x, x))
fixtures['home_team'] = fixtures['home_team'].apply(lambda x: name_differences_mapping.get(x, x))
fixtures['away_team'] = fixtures['away_team'].apply(lambda x: name_differences_mapping.get(x, x))

### Merge fixtures with features for prediction

In [48]:
fixtures_with_elo = (
    fixtures
    .merge(ranking_copy[['Club', 'Elo']], left_on='home_team', right_on='Club', how='left')
    .rename(columns={'Elo': 'HomeElo'})
    .drop(columns=['Club'])
    .merge(ranking_copy[['Club', 'Elo']], left_on='away_team', right_on='Club', how='left')
    .rename(columns={'Elo': 'AwayElo'})
    .drop(columns=['Club'])
)

fixtures_with_elo['EloDiff'] = fixtures_with_elo['HomeElo'] - fixtures_with_elo['AwayElo']

fixtures_with_elo

Unnamed: 0,home_team,away_team,date,time,HomeElo,AwayElo,EloDiff
0,Liverpool,AFC Bournemouth,2025-08-15,20:00,1993.417725,1808.095215,185.322510
1,Aston Villa,Newcastle United,2025-08-16,15:00,1872.839233,1868.552734,4.286499
2,Brighton & Hove Albion,Fulham,2025-08-16,15:00,1827.129028,1781.720581,45.408447
3,Sunderland,West Ham United,2025-08-16,15:00,1547.098999,1750.132446,-203.033447
4,Tottenham Hotspur,Burnley,2025-08-16,15:00,1774.006714,1729.597656,44.409058
...,...,...,...,...,...,...,...
375,Manchester City,Aston Villa,2026-05-24,TBD,1959.944214,1872.839233,87.104980
376,Nottingham Forest,AFC Bournemouth,2026-05-24,TBD,1802.868042,1808.095215,-5.227173
377,Sunderland,Chelsea,2026-05-24,TBD,1547.098999,1902.801025,-355.702026
378,Tottenham Hotspur,Everton,2026-05-24,TBD,1774.006714,1793.979736,-19.973022


In [49]:
print(fixtures_with_elo.HomeElo.isna().sum())
print(fixtures_with_elo.AwayElo.isna().sum())

0
0


### Generate match predictions

In [50]:
# model.predict_proba(fixtures_with_elo[['HomeElo', 'AwayElo']])
# model.predict(fixtures_with_elo[['HomeElo', 'AwayElo']])

In [51]:
final_prediction_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
final_prediction_model = LogisticRegression(multi_class='multinomial',solver='lbfgs',penalty='l2',C=1.0,max_iter=200)

# Train on ALL available data for maximum performance
X_all = filtered_matches[features]
y_all = filtered_matches['FTResult']

final_prediction_model.fit(X_all, y_all)



In [52]:
# Get predictions with labeled columns and reorder to Home, Draw, Away
prob_df = pd.DataFrame(final_prediction_model.predict_proba(fixtures_with_elo[features]), columns=['A', 'D', 'H'])[['H', 'D', 'A']]

# Assign probabilities to fixtures_with_elo
fixtures_with_elo['home_win_prob'], fixtures_with_elo['draw_prob'], fixtures_with_elo['away_win_prob'] = prob_df['H'], prob_df['D'], prob_df['A']

fixtures_with_elo['predicted_outcome'] = final_prediction_model.predict(fixtures_with_elo[features])

fixtures_with_elo

Unnamed: 0,home_team,away_team,date,time,HomeElo,AwayElo,EloDiff,home_win_prob,draw_prob,away_win_prob,predicted_outcome
0,Liverpool,AFC Bournemouth,2025-08-15,20:00,1993.417725,1808.095215,185.322510,0.658751,0.220535,0.120713,H
1,Aston Villa,Newcastle United,2025-08-16,15:00,1872.839233,1868.552734,4.286499,0.429639,0.279193,0.291168,H
2,Brighton & Hove Albion,Fulham,2025-08-16,15:00,1827.129028,1781.720581,45.408447,0.484855,0.271010,0.244135,H
3,Sunderland,West Ham United,2025-08-16,15:00,1547.098999,1750.132446,-203.033447,0.184523,0.256282,0.559195,A
4,Tottenham Hotspur,Burnley,2025-08-16,15:00,1774.006714,1729.597656,44.409058,0.483519,0.271254,0.245227,H
...,...,...,...,...,...,...,...,...,...,...,...
375,Manchester City,Aston Villa,2026-05-24,TBD,1959.944214,1872.839233,87.104980,0.539862,0.259008,0.201130,H
376,Nottingham Forest,AFC Bournemouth,2026-05-24,TBD,1802.868042,1808.095215,-5.227173,0.416875,0.280507,0.302618,H
377,Sunderland,Chelsea,2026-05-24,TBD,1547.098999,1902.801025,-355.702026,0.079613,0.193447,0.726940,A
378,Tottenham Hotspur,Everton,2026-05-24,TBD,1774.006714,1793.979736,-19.973022,0.397179,0.282090,0.320731,H


In [53]:
fixtures_with_elo.to_csv('output_data/predictions.csv', index=False)

### Simulate season (to-do)