# Soccer Forecasting Project

## Setup

### Libraries

In [1]:
import pandas as pd
import os
from dotenv import load_dotenv

### Load Data

In [3]:
# Get data path from environment variable or use default
data_path = os.environ.get("SOCCER_DATA_PATH", "./Match-Data")

# Print the data path to debug
print(f"Data path: {data_path}")

# Use the correct path to load the files
elo_data = pd.read_csv(os.path.join(data_path, "EloRatings.csv"))
matches = pd.read_csv(os.path.join(data_path, "matches.csv"), low_memory=False)

Data path: /Users/heshamnawaz/Desktop/Projects/soccer-forecasting-project/Match-Data


In [5]:
matches.FTResult.value_counts()

FTResult
H    101928
A     65870
D     60576
Name: count, dtype: int64

### Matches

In [17]:
matches.columns

Index(['Division', 'MatchDate', 'MatchTime', 'HomeTeam', 'AwayTeam', 'HomeElo',
       'AwayElo', 'Form3Home', 'Form5Home', 'Form3Away', 'Form5Away', 'FTHome',
       'FTAway', 'FTResult', 'HTHome', 'HTAway', 'HTResult', 'HomeShots',
       'AwayShots', 'HomeTarget', 'AwayTarget', 'HomeFouls', 'AwayFouls',
       'HomeCorners', 'AwayCorners', 'HomeYellow', 'AwayYellow', 'HomeRed',
       'AwayRed', 'OddHome', 'OddDraw', 'OddAway', 'MaxHome', 'MaxDraw',
       'MaxAway', 'Over25', 'Under25', 'MaxOver25', 'MaxUnder25', 'HandiSize',
       'HandiHome', 'HandiAway'],
      dtype='object')

In [18]:
matches = \
(matches.assign(
    date = pd.to_datetime(matches.MatchDate),
    day = pd.to_datetime(matches.MatchDate).dt.day,
    month = pd.to_datetime(matches.MatchDate).dt.month,
    year = pd.to_datetime(matches.MatchDate).dt.year
).sort_values(by='date', ascending=True)
)

matches_rel_cols = ['Division', 'date', 'day', 'month', 'year', 'HomeElo', 'AwayElo', 'AwayTeam', 'FTResult']
matches_col_filtered = matches[matches_rel_cols]
matches_col_filtered

Unnamed: 0,Division,date,day,month,year,HomeElo,AwayElo,AwayTeam,FTResult
0,F1,2000-07-28,28,7,2000,1686.34,1586.57,Troyes,H
1,F1,2000-07-28,28,7,2000,1714.89,1642.51,Strasbourg,H
2,F2,2000-07-28,28,7,2000,1465.08,1633.80,Nancy,A
18,F1,2000-07-29,29,7,2000,1551.65,1656.37,Bastia,A
17,F1,2000-07-29,29,7,2000,1664.36,1730.89,Lens,A
...,...,...,...,...,...,...,...,...,...
228375,E0,2025-02-26,26,2,2025,1785.53,1926.48,Man City,A
228372,E0,2025-02-26,26,2,2025,1738.05,1731.52,Everton,D
228373,E0,2025-02-26,26,2,2025,1757.62,1584.51,Ipswich,H
228374,E0,2025-02-26,26,2,2025,1788.28,1999.49,Arsenal,D


In [19]:
filter_start_date = '2023-08-01'
filter_end_date = '2024-07-31'
divisions_list = ['E0']

# Apply both filters using logical AND (&)
filtered_matches = matches_col_filtered[
    (matches_col_filtered.date >= filter_start_date) & (matches_col_filtered.date <= filter_end_date) & 
    (matches_col_filtered.Division.isin(divisions_list))
]

filtered_matches

Unnamed: 0,Division,date,day,month,year,HomeElo,AwayElo,AwayTeam,FTResult
210362,E0,2023-08-11,11,8,2023,1726.42,2077.27,Man City,A
210454,E0,2023-08-12,12,8,2023,1876.02,1828.01,Aston Villa,H
210378,E0,2023-08-12,12,8,2023,1644.08,1757.41,Crystal Palace,A
210377,E0,2023-08-12,12,8,2023,1708.25,1736.65,Fulham,A
210376,E0,2023-08-12,12,8,2023,1828.20,1606.54,Luton,H
...,...,...,...,...,...,...,...,...,...
221144,E0,2024-05-19,19,5,2024,1798.52,1695.08,Bournemouth,H
221142,E0,2024-05-19,19,5,2024,1732.68,1759.74,Man United,A
221141,E0,2024-05-19,19,5,2024,1721.75,1798.56,Newcastle,A
221140,E0,2024-05-19,19,5,2024,1944.71,1709.04,Everton,H


In [21]:
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel

# Create outcome encoding (A=0, D=1, H=2)
outcome_mapping = {'A': 0, 'D': 1, 'H': 2}
filtered_matches['outcome_enc'] = filtered_matches['FTResult'].map(outcome_mapping)

# Calculate Elo difference (Home - Away)
filtered_matches['elo_diff'] = filtered_matches['HomeElo'] - filtered_matches['AwayElo']

# Create and fit the ordinal logistic regression model
model = OrderedModel(
    filtered_matches['outcome_enc'],
    filtered_matches[['elo_diff']], # Removed sm.add_constant()
    distr='logit'
)

result = model.fit(method='bfgs')
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.921934
         Iterations: 11
         Function evaluations: 15
         Gradient evaluations: 15
                             OrderedModel Results                             
Dep. Variable:            outcome_enc   Log-Likelihood:                -350.33
Model:                   OrderedModel   AIC:                             706.7
Method:            Maximum Likelihood   BIC:                             718.5
Date:                Sun, 13 Apr 2025                                         
Time:                        17:44:33                                         
No. Observations:                 380                                         
Df Residuals:                     377                                         
Df Model:                           1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_matches['outcome_enc'] = filtered_matches['FTResult'].map(outcome_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_matches['elo_diff'] = filtered_matches['HomeElo'] - filtered_matches['AwayElo']


In [22]:
# Get predicted probabilities for each outcome
predicted_probs = model.predict(result.params, exog=filtered_matches[['elo_diff']])

# Add probabilities to the dataframe
filtered_matches['prob_away'] = predicted_probs[:, 0]  # Probability of Away win
filtered_matches['prob_draw'] = predicted_probs[:, 1]  # Probability of Draw
filtered_matches['prob_home'] = predicted_probs[:, 2]  # Probability of Home win

# Display example predictions with actual results
results_df = filtered_matches[['date', 'HomeElo', 'AwayElo', 'elo_diff', 'FTResult', 
                             'prob_away', 'prob_draw', 'prob_home']].round(3)
print(results_df.head())

# Optional: Verify that probabilities sum to 1
print("\nVerifying probabilities sum to 1:")
print(results_df[['prob_away', 'prob_draw', 'prob_home']].sum(axis=1).head())

             date  HomeElo  AwayElo  elo_diff FTResult  prob_away  prob_draw  \
210362 2023-08-11  1726.42  2077.27   -350.85        A      0.803      0.123   
210454 2023-08-12  1876.02  1828.01     48.01        H      0.230      0.246   
210378 2023-08-12  1644.08  1757.41   -113.33        A      0.462      0.261   
210377 2023-08-12  1708.25  1736.65    -28.40        A      0.330      0.270   
210376 2023-08-12  1828.20  1606.54    221.66        H      0.087      0.138   

        prob_home  
210362      0.075  
210454      0.524  
210378      0.277  
210377      0.400  
210376      0.774  

Verifying probabilities sum to 1:
210362    1.001
210454    1.000
210378    1.000
210377    1.000
210376    0.999
dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_matches['prob_away'] = predicted_probs[:, 0]  # Probability of Away win
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_matches['prob_draw'] = predicted_probs[:, 1]  # Probability of Draw
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_matches['prob_home'] = predicted_probs

We need to do a training, validation and testing split. Explain why.