# Soccer Forecasting Project

## Setup

### Libraries

In [1]:
import pandas as pd
import os
from dotenv import load_dotenv
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from datetime import date

In [2]:
# Load environment variables from a .env file, if present; SOCCER_DATA_PATH is
# read from the environment in the cells that follow.
load_dotenv()

True

### Load Data

In [3]:
# Quick check of the configured data directory (None means SOCCER_DATA_PATH is not set).
os.environ.get("SOCCER_DATA_PATH")

'/Users/heshamnawaz/Desktop/Projects/soccer-forecasting-project/Match-Data'

In [4]:
# Resolve the data directory: prefer the SOCCER_DATA_PATH environment variable,
# otherwise fall back to a "Match-Data" folder relative to the working directory.
# (Without a fallback, an unset variable left data_path as None and
# os.path.join raised an opaque TypeError.)
data_path = os.environ.get("SOCCER_DATA_PATH") or "Match-Data"

# Print the data path to debug
print(f"Data path: {data_path}")

# Fail early, naming the offending path, if the directory is wrong
if not os.path.isdir(data_path):
    raise FileNotFoundError(
        f"Data directory not found: {data_path!r}. "
        "Set SOCCER_DATA_PATH to the folder containing EloRatings.csv and Matches.csv."
    )

# Load the Elo rating history and the match results from the data directory
elo_data = pd.read_csv(os.path.join(data_path, "EloRatings.csv"))
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, avoiding mixed-dtype columns.
matches = pd.read_csv(os.path.join(data_path, "Matches.csv"), low_memory=False)

Data path: /Users/heshamnawaz/Desktop/Projects/soccer-forecasting-project/Match-Data


In [5]:
# Our rows are going to be the matches. For each match, we have the following information:
# - Date of the match
# - Home Team
# - Away Team
# - Home Team Goals
# - Away Team Goals
# - Home Team Elo Rating on the most recent date before the match
# - Away Team Elo Rating on the most recent date before the match

### Matches

In [6]:
# Inspect the columns available in the match data.
matches.columns

Index(['Division', 'MatchDate', 'MatchTime', 'HomeTeam', 'AwayTeam', 'HomeElo',
       'AwayElo', 'Form3Home', 'Form5Home', 'Form3Away', 'Form5Away', 'FTHome',
       'FTAway', 'FTResult', 'HTHome', 'HTAway', 'HTResult', 'HomeShots',
       'AwayShots', 'HomeTarget', 'AwayTarget', 'HomeFouls', 'AwayFouls',
       'HomeCorners', 'AwayCorners', 'HomeYellow', 'AwayYellow', 'HomeRed',
       'AwayRed', 'OddHome', 'OddDraw', 'OddAway', 'MaxHome', 'MaxDraw',
       'MaxAway', 'Over25', 'Under25', 'MaxOver25', 'MaxUnder25', 'HandiSize',
       'HandiHome', 'HandiAway'],
      dtype='object')

In [7]:
# Class balance of the full-time result (H = home win, D = draw, A = away win).
matches.FTResult.value_counts()

FTResult
H    101928
A     65870
D     60576
Name: count, dtype: int64

In [8]:
# Parse MatchDate once and reuse the result; the date-part columns below are
# all derived from the same parsed series.
match_dates = pd.to_datetime(matches.MatchDate)

# Add the parsed date plus its day / month / year parts, then order the
# matches chronologically (oldest first).
matches = (
    matches
    .assign(
        date=match_dates,
        day=match_dates.dt.day,
        month=match_dates.dt.month,
        year=match_dates.dt.year,
    )
    .sort_values(by='date', ascending=True)
)

In [9]:
def filter_matches(matches, matches_rel_cols, filter_start_date, filter_end_date, divisions_list):
    """Keep the relevant columns of matches inside a date window and a set of divisions.

    Parameters
    ----------
    matches : pandas.DataFrame
        Match data. Must contain every column named in ``matches_rel_cols``.
    matches_rel_cols : list of str
        Columns to keep. Must include ``'date'`` and ``'Division'``, which the
        row filters rely on.
    filter_start_date, filter_end_date
        Inclusive lower and upper bounds compared against the ``date`` column.
    divisions_list : list
        ``Division`` values to keep.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the selected rows and columns. Callers can add
        columns to it without a SettingWithCopyWarning and without affecting
        ``matches``.
    """
    matches_col_filtered = matches[matches_rel_cols]

    # Build each condition separately, then combine with logical AND
    dates = matches_col_filtered['date']
    in_date_window = (dates >= filter_start_date) & (dates <= filter_end_date)
    in_divisions = matches_col_filtered['Division'].isin(divisions_list)

    # .copy() detaches the result from the intermediate column selection
    return matches_col_filtered.loc[in_date_window & in_divisions].copy()

In [10]:
# Columns carried forward into the modeling frame
matches_rel_cols = [
    'Division',
    'date', 'day', 'month', 'year',
    'HomeElo', 'AwayElo',
    'AwayTeam',
    'FTResult',
]

# Date window (both bounds inclusive) and the divisions to keep
filter_start_date, filter_end_date = '2023-08-01', '2024-07-31'
divisions_list = ['E0']

filtered_matches = filter_matches(
    matches,
    matches_rel_cols=matches_rel_cols,
    filter_start_date=filter_start_date,
    filter_end_date=filter_end_date,
    divisions_list=divisions_list,
)

filtered_matches

Unnamed: 0,Division,date,day,month,year,HomeElo,AwayElo,AwayTeam,FTResult
210362,E0,2023-08-11,11,8,2023,1726.42,2077.27,Man City,A
210454,E0,2023-08-12,12,8,2023,1876.02,1828.01,Aston Villa,H
210378,E0,2023-08-12,12,8,2023,1644.08,1757.41,Crystal Palace,A
210377,E0,2023-08-12,12,8,2023,1708.25,1736.65,Fulham,A
210376,E0,2023-08-12,12,8,2023,1828.20,1606.54,Luton,H
...,...,...,...,...,...,...,...,...,...
221144,E0,2024-05-19,19,5,2024,1798.52,1695.08,Bournemouth,H
221142,E0,2024-05-19,19,5,2024,1732.68,1759.74,Man United,A
221141,E0,2024-05-19,19,5,2024,1721.75,1798.56,Newcastle,A
221140,E0,2024-05-19,19,5,2024,1944.71,1709.04,Everton,H


In [11]:
# Elo gap from the home side's point of view (HomeElo - AwayElo): positive when
# the home team is rated higher. Built with .assign so a new frame is bound to
# the name instead of writing a column into a filtered slice, which triggers
# pandas' SettingWithCopyWarning.
filtered_matches = filtered_matches.assign(
    EloDiff=filtered_matches['HomeElo'] - filtered_matches['AwayElo']
)

filtered_matches

Unnamed: 0,Division,date,day,month,year,HomeElo,AwayElo,AwayTeam,FTResult,EloDiff
210362,E0,2023-08-11,11,8,2023,1726.42,2077.27,Man City,A,-350.85
210454,E0,2023-08-12,12,8,2023,1876.02,1828.01,Aston Villa,H,48.01
210378,E0,2023-08-12,12,8,2023,1644.08,1757.41,Crystal Palace,A,-113.33
210377,E0,2023-08-12,12,8,2023,1708.25,1736.65,Fulham,A,-28.40
210376,E0,2023-08-12,12,8,2023,1828.20,1606.54,Luton,H,221.66
...,...,...,...,...,...,...,...,...,...,...
221144,E0,2024-05-19,19,5,2024,1798.52,1695.08,Bournemouth,H,103.44
221142,E0,2024-05-19,19,5,2024,1732.68,1759.74,Man United,A,-27.06
221141,E0,2024-05-19,19,5,2024,1721.75,1798.56,Newcastle,A,-76.81
221140,E0,2024-05-19,19,5,2024,1944.71,1709.04,Everton,H,235.67


## Modeling

### Problem Formulation here (to-do)

### Introduction - describe model(s) considered here (to-do)

### Non-ordinal methods

In [12]:
def train_test_split_pipeline(filtered_matches, features, target, test_size=0.2, random_state=42):
    """Split the match data into stratified, randomly shuffled train and test sets.

    Parameters
    ----------
    filtered_matches : pandas.DataFrame
        Frame containing the feature columns and the target column.
    features : list of str
        Names of the columns used as model inputs.
    target : str
        Name of the column to predict.
    test_size : float, default 0.2
        Fraction of rows held out for testing (passed to ``train_test_split``).
    random_state : int, default 42
        Seed for the shuffle, so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Notes
    -----
    The split is stratified on the target so each outcome keeps its share in
    both sets. Rows are shuffled, so the test set is not guaranteed to be later
    in time than the training set.
    """
    X = filtered_matches[features]
    y = filtered_matches[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state, test_size=test_size, stratify=y
    )

    return X_train, X_test, y_train, y_test

In [13]:
# Single-feature model: the home-minus-away Elo gap.
# Alternative feature set kept for reference: ['HomeElo', 'AwayElo', 'EloDiff']
target = 'FTResult'
features = ['EloDiff']

X_train, X_test, y_train, y_test = train_test_split_pipeline(
    filtered_matches,
    features=features,
    target=target,
)

In [14]:
# Multinomial logistic regression over the three outcomes (H / D / A).
# `multi_class` is deliberately not passed: the argument is deprecated since
# scikit-learn 1.5, and with the lbfgs solver a multiclass target is already
# fit as a single multinomial model by default.
model = LogisticRegression(
    solver='lbfgs',
    penalty='l2',
    C=1.0,
    max_iter=200
)

model.fit(X_train, y_train)

# Hard class predictions for the held-out matches
y_pred = model.predict(X_test)




In [15]:
# Class probabilities for the first five held-out matches
test_probabilities = model.predict_proba(X_test)
print(test_probabilities[:5])

[[0.11397681 0.2253885  0.66063469]
 [0.83286746 0.10510992 0.06202262]
 [0.01009343 0.11063025 0.87927632]
 [0.65711074 0.17840662 0.16448264]
 [0.68077909 0.16997612 0.14924479]]


In [16]:
# Compare the first few actual outcomes with the model's predictions
sample_size = 20

# Actual results
print(list(y_test[:sample_size]))

# Predicted results
print(list(y_pred[:sample_size]))

['H', 'A', 'H', 'A', 'A', 'A', 'H', 'H', 'A', 'H', 'A', 'A', 'A', 'H', 'H', 'D', 'H', 'A', 'A', 'D']
['H', 'A', 'H', 'A', 'A', 'H', 'H', 'H', 'H', 'H', 'A', 'A', 'A', 'A', 'H', 'A', 'H', 'H', 'A', 'H']


In [17]:
# Per-class precision / recall / F1 on the held-out matches.
# zero_division=0 reports 0 instead of warning when a class is never predicted.
outcome_labels = ['H', 'D', 'A']
report = classification_report(y_test, y_pred, labels=outcome_labels, zero_division=0)
print(report)

              precision    recall  f1-score   support

           H       0.59      0.86      0.70        35
           D       0.00      0.00      0.00        16
           A       0.56      0.56      0.56        25

    accuracy                           0.58        76
   macro avg       0.38      0.47      0.42        76
weighted avg       0.46      0.58      0.51        76



### Ordinal methods (to-do)

In [18]:
# # Create outcome encoding (A=0, D=1, H=2)
# outcome_mapping = {'A': 0, 'D': 1, 'H': 2}
# filtered_matches['outcome_enc'] = filtered_matches['FTResult'].map(outcome_mapping)

# # Calculate Elo difference (Home - Away)
# filtered_matches['elo_diff'] = filtered_matches['HomeElo'] - filtered_matches['AwayElo']

# # Create and fit the ordinal logistic regression model
# model = OrderedModel(
#     filtered_matches['outcome_enc'],
#     filtered_matches[['elo_diff']], # Removed sm.add_constant()
#     distr='logit'
# )

# result = model.fit(method='bfgs')
# print(result.summary())

In [19]:
# filtered_matches_with_predictions = filtered_matches.copy()

# # Get predicted probabilities for each outcome
# predicted_probs = model.predict(result.params, exog=filtered_matches[['elo_diff']])

# # Add probabilities to the dataframe
# filtered_matches_with_predictions['prob_away'] = predicted_probs[:, 0]  # Probability of Away win
# filtered_matches_with_predictions['prob_draw'] = predicted_probs[:, 1]  # Probability of Draw
# filtered_matches_with_predictions['prob_home'] = predicted_probs[:, 2]  # Probability of Home win

# # Display example predictions with actual results
# results_df = filtered_matches_with_predictions[['date', 'HomeElo', 'AwayElo', 'elo_diff', 'FTResult', 
#                              'prob_away', 'prob_draw', 'prob_home']].round(3)
# print(results_df.head())

# # Optional: Verify that probabilities sum to 1
# print("\nVerifying probabilities sum to 1:")
# print(results_df[['prob_away', 'prob_draw', 'prob_home']].sum(axis=1).head())

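**To-do:** replace the random split used above with a training / validation / test split.
The split should be chronological: train on the earliest matches, validate on the next block, and test on the most recent ones.
**Why:** the matches are time-ordered and the goal is to forecast future fixtures. A random split lets the model train on matches played after the ones it is evaluated on, so the reported scores can overstate real out-of-sample performance.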

## Evaluation (to-do)

In [20]:
# from eval import time_based_kfold
# from sklearn.metrics import log_loss, accuracy_score
# import numpy as np

In [21]:
# # Sort data chronologically
# filtered_matches = filtered_matches.sort_values('date')

# # First, create a true held-out test set (e.g., last 20% of the data)
# test_size = int(len(filtered_matches) * 0.2)
# train_val_data = filtered_matches.iloc[:-test_size].copy()
# test_data = filtered_matches.iloc[-test_size:].copy()

# print(f"Full dataset size: {len(filtered_matches)}")
# print(f"Train+Validation set size: {len(train_val_data)}")
# print(f"Test set size: {len(test_data)}")
# print(f"Test set period: {test_data.date.min()} to {test_data.date.max()}")

### K-Fold Cross Validation

### Nested Cross Validation

## Output

### Load Elo data

In [22]:
from club_elo_api import get_daily_ranking

# Snapshot date shared by the live request and the cached CSV file name, so the
# two cannot drift apart when the date is changed.
ranking_date = date(2025, 7, 29)

ranking = get_daily_ranking(ranking_date)
if ranking is None:
    # A None result is treated as a failed fetch: fall back to the saved snapshot.
    # NOTE(review): the fallback lives at an absolute, machine-specific path.
    print("Failed to fetch daily ranking")
    ranking = pd.read_csv(
        '/Users/heshamnawaz/Desktop/Projects/soccer-forecasting-project/data/'
        f'elo_{ranking_date:%Y_%m_%d}.csv'
    )

In [23]:
# Display the Elo ranking table loaded above.
ranking

Unnamed: 0,Rank,Club,Country,Level,Elo,From,To
0,1.0,Liverpool,ENG,1,1993.417847,2025-05-29,2025-08-15
1,2.0,Arsenal,ENG,1,1993.340332,2025-05-29,2025-08-17
2,3.0,Paris SG,FRA,1,1974.937012,2025-06-01,2025-08-17
3,4.0,Man City,ENG,1,1959.944092,2025-06-01,2025-08-16
4,5.0,Barcelona,ESP,1,1945.430786,2025-06-01,2025-08-16
...,...,...,...,...,...,...,...
620,,St Josephs,GIB,0,943.413879,2025-07-25,2025-07-29
621,,FCB Magpies,GIB,0,909.725769,2025-07-25,2025-07-29
622,,Tre Fiori,SMR,0,707.442810,2025-07-18,2025-08-25
623,,SS Virtus,SMR,0,706.482422,2025-07-18,2025-08-07


### Load fixtures

In [24]:
# Fixture list to generate predictions for.
# NOTE(review): absolute, machine-specific path.
fixtures_path = '/Users/heshamnawaz/Desktop/Projects/soccer-forecasting-project/data/epl_fixtures.csv'
fixtures = pd.read_csv(fixtures_path)

fixtures

Unnamed: 0,home_team,away_team,date,time
0,Liverpool,AFC Bournemouth,2025-08-15,20:00
1,Aston Villa,Newcastle United,2025-08-16,15:00
2,Brighton & Hove Albion,Fulham,2025-08-16,15:00
3,Sunderland,West Ham United,2025-08-16,15:00
4,Tottenham Hotspur,Burnley,2025-08-16,15:00
...,...,...,...,...
375,Manchester City,Aston Villa,2026-05-24,TBD
376,Nottingham Forest,AFC Bournemouth,2026-05-24,TBD
377,Sunderland,Chelsea,2026-05-24,TBD
378,Tottenham Hotspur,Everton,2026-05-24,TBD


### Pre-processing

In [25]:
# Club names in the Elo ranking with Country == 'ENG' and Level == 1
is_eng_level_one = (ranking['Country'] == 'ENG') & (ranking['Level'] == 1)
ranking_epl_teams = ranking.loc[is_eng_level_one, 'Club']

fixture_names = set(fixtures.home_team.unique())
elo_names = set(ranking_epl_teams)

# Fixture names with no identically spelled club in the Elo ranking
missing_teams_fixtures = fixture_names - elo_names
print(sorted(missing_teams_fixtures))

# Elo club names with no identically spelled team in the fixtures
missing_teams_elo = elo_names - fixture_names
print(sorted(missing_teams_elo))

['AFC Bournemouth', 'Brighton & Hove Albion', 'Leeds United', 'Manchester City', 'Manchester United', 'Newcastle United', 'Nottingham Forest', 'Tottenham Hotspur', 'West Ham United']
['Bournemouth', 'Forest', 'Leeds', 'Man City', 'Man United', 'Newcastle', 'Tottenham', 'West Ham']


In [26]:
# Maps club names as spelled in the Elo ranking (keys) to the spelling used in
# the fixture list (values), for clubs whose names differ between the two sources.
name_differences_mapping = {
    'Tottenham': 'Tottenham Hotspur',
    'Forest': 'Nottingham Forest',
    'Man United': 'Manchester United',
    'Leeds': 'Leeds United',
    'West Ham': 'West Ham United',
    'Man City': 'Manchester City',
    'Bournemouth': 'AFC Bournemouth',
    'Newcastle': 'Newcastle United',
    'Brighton': 'Brighton & Hove Albion'
}

In [27]:
def _to_fixture_name(club_name):
    """Return the fixture-list spelling of a club name, or the name unchanged if unmapped."""
    return name_differences_mapping.get(club_name, club_name)


# Rename on a copy so the original ranking frame keeps its own spellings
ranking_copy = ranking.copy()
ranking_copy['Club'] = ranking_copy['Club'].apply(_to_fixture_name)

# Apply the same mapping to both team columns of the fixtures
for team_column in ('home_team', 'away_team'):
    fixtures[team_column] = fixtures[team_column].apply(_to_fixture_name)

### Merge fixtures with Elo

In [28]:
def _with_team_elo(frame, team_column, elo_column):
    """Left-join each row's club rating from ranking_copy onto `frame` as `elo_column`."""
    club_elo = ranking_copy[['Club', 'Elo']]
    joined = frame.merge(club_elo, left_on=team_column, right_on='Club', how='left')
    return joined.rename(columns={'Elo': elo_column}).drop(columns=['Club'])


# Attach the home rating first, then the away rating
fixtures_with_elo = _with_team_elo(fixtures, 'home_team', 'HomeElo')
fixtures_with_elo = _with_team_elo(fixtures_with_elo, 'away_team', 'AwayElo')

# Same feature definition as in training: home rating minus away rating
fixtures_with_elo['EloDiff'] = fixtures_with_elo['HomeElo'] - fixtures_with_elo['AwayElo']

fixtures_with_elo

Unnamed: 0,home_team,away_team,date,time,HomeElo,AwayElo,EloDiff
0,Liverpool,AFC Bournemouth,2025-08-15,20:00,1993.417847,1808.095337,185.322510
1,Aston Villa,Newcastle United,2025-08-16,15:00,1872.839233,1868.552612,4.286621
2,Brighton & Hove Albion,Fulham,2025-08-16,15:00,1827.129028,1781.720581,45.408447
3,Sunderland,West Ham United,2025-08-16,15:00,1547.099121,1750.132446,-203.033325
4,Tottenham Hotspur,Burnley,2025-08-16,15:00,1774.006714,1729.597656,44.409058
...,...,...,...,...,...,...,...
375,Manchester City,Aston Villa,2026-05-24,TBD,1959.944092,1872.839233,87.104858
376,Nottingham Forest,AFC Bournemouth,2026-05-24,TBD,1802.868042,1808.095337,-5.227295
377,Sunderland,Chelsea,2026-05-24,TBD,1547.099121,1902.801147,-355.702026
378,Tottenham Hotspur,Everton,2026-05-24,TBD,1774.006714,1793.979980,-19.973267


In [29]:
# Count fixtures whose team found no match in the Elo table (0 means every team matched)
for elo_column in ('HomeElo', 'AwayElo'):
    print(fixtures_with_elo[elo_column].isna().sum())

0
0


### Generate match predictions

In [30]:
# model.predict_proba(fixtures_with_elo[['HomeElo', 'AwayElo']])
# model.predict(fixtures_with_elo[['HomeElo', 'AwayElo']])

In [31]:
# Predicted class probabilities for every fixture, reordered to Home, Draw, Away.
# Columns are labelled from model.classes_ (the order predict_proba actually
# uses) rather than a hard-coded ['A', 'D', 'H'], so a different class order can
# never silently swap the probabilities. The frame shares the fixtures' index so
# the assignments below align row for row.
prob_df = pd.DataFrame(
    model.predict_proba(fixtures_with_elo[features]),
    columns=model.classes_,
    index=fixtures_with_elo.index,
)[['H', 'D', 'A']]

# Assign probabilities to fixtures_with_elo
fixtures_with_elo['home_win_prob'] = prob_df['H']
fixtures_with_elo['draw_prob'] = prob_df['D']
fixtures_with_elo['away_win_prob'] = prob_df['A']

# Most likely outcome according to the model
fixtures_with_elo['predicted_outcome'] = model.predict(fixtures_with_elo[features])

fixtures_with_elo

Unnamed: 0,home_team,away_team,date,time,HomeElo,AwayElo,EloDiff,home_win_prob,draw_prob,away_win_prob,predicted_outcome
0,Liverpool,AFC Bournemouth,2025-08-15,20:00,1993.417847,1808.095337,185.322510,0.715337,0.206390,0.078273,H
1,Aston Villa,Newcastle United,2025-08-16,15:00,1872.839233,1868.552612,4.286621,0.483811,0.252186,0.264003,H
2,Brighton & Hove Albion,Fulham,2025-08-16,15:00,1827.129028,1781.720581,45.408447,0.545099,0.248412,0.206489,H
3,Sunderland,West Ham United,2025-08-16,15:00,1547.099121,1750.132446,-203.033325,0.183072,0.187856,0.629072,A
4,Tottenham Hotspur,Burnley,2025-08-16,15:00,1774.006714,1729.597656,44.409058,0.543657,0.248566,0.207778,H
...,...,...,...,...,...,...,...,...,...,...,...
375,Manchester City,Aston Villa,2026-05-24,TBD,1959.944092,1872.839233,87.104858,0.602657,0.239666,0.157677,H
376,Nottingham Forest,AFC Bournemouth,2026-05-24,TBD,1802.868042,1808.095337,-5.227295,0.469163,0.252271,0.278566,H
377,Sunderland,Chelsea,2026-05-24,TBD,1547.099121,1902.801147,-355.702026,0.062455,0.105532,0.832013,A
378,Tottenham Hotspur,Everton,2026-05-24,TBD,1774.006714,1793.979980,-19.973267,0.446225,0.251779,0.301995,H


In [33]:
# Create the output folder first: to_csv does not create missing directories
# and would fail on a fresh checkout.
output_dir = 'output_data'
os.makedirs(output_dir, exist_ok=True)

fixtures_with_elo.to_csv(os.path.join(output_dir, 'predictions.csv'), index=False)

### Simulate season (to-do)