In [1]:
%load_ext autoreload
%autoreload 2

# Control figure size
figsize=(14, 4)

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import random
import pandas as pd
from util import util
import numpy as np
import os
data_folder = os.path.join('..', 'data')
file_name = "DataForModel"

# Regression Filter

### Load data

In [2]:
data = util.load_data(data_folder, file_name)
data = data[data['Div'] == 'E0']

data


Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,Season,ELO diff,Home_prob_ELO,Draw_prob_ELO,...,Diff_shots_on_target_attempted,Diff_shots_on_target_allowed,Diff_shots_attempted,Diff_shots_allowed,Diff_corners_awarded,Diff_corners_conceded,Diff_fouls_commited,Diff_fouls_suffered,Diff_yellow_cards,Diff_red_cards
0,E0,2005-09-17,Aston Villa,Tottenham,1.0,1.0,0506,-25.173204,0.412832,0.245673,...,-9,10,-14,16,20,18,-13,9,-6,0
1,E0,2005-09-17,Portsmouth,Birmingham,1.0,1.0,0506,6.045620,0.468846,0.222236,...,4,-2,4,-4,0,13,6,17,1,0
2,E0,2005-09-17,Sunderland,West Brom,1.0,1.0,0506,-32.751187,0.399092,0.251422,...,9,-1,-4,8,5,0,-1,-21,-3,1
3,E0,2005-09-18,Blackburn,Newcastle,0.0,3.0,0506,34.014412,0.517707,0.201792,...,1,-13,7,-15,5,-14,0,-2,1,0
4,E0,2005-09-18,Man City,Bolton,0.0,1.0,0506,33.333649,0.516538,0.202282,...,3,3,-8,18,-4,2,-6,-17,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32523,E0,2024-05-19,Crystal Palace,Aston Villa,5.0,0.0,2324,-90.969466,0.324565,0.233493,...,15,-16,28,-42,5,-5,9,-12,4,1
32524,E0,2024-05-19,Liverpool,Wolves,2.0,0.0,2324,243.886381,0.803305,0.082298,...,22,-6,56,-18,16,-9,-5,-14,-10,0
32525,E0,2024-05-19,Luton,Fulham,2.0,4.0,2324,-127.316525,0.287220,0.206627,...,-4,11,-14,35,7,4,18,10,-1,-1
32526,E0,2024-05-19,Man City,West Ham,3.0,1.0,2324,315.526105,0.862550,0.057509,...,14,-28,2,-59,0,-13,-12,-1,-4,0


In [3]:
data.drop(columns="Div", inplace=True)
from sklearn.preprocessing import LabelEncoder
data = data.copy()
label_encoder = LabelEncoder()

#Convert Date to numerial values
data["Year"] = data["Date"].dt.year
data["Month"] = data["Date"].dt.month
data["Day"] = data["Date"].dt.day
data["DayOfWeek"] = data[
    "Date"
].dt.dayofweek  # Optional

# Drop the original Date column as it’s no longer needed
data = data.drop(columns=["Date"])

In [4]:
data = pd.get_dummies(
    data, columns=["HomeTeam", "AwayTeam"]
)

In [5]:
data["Outcome"] = data.apply(
    lambda row: (
        row["FTHG"] - row["FTAG"]
    ),
    axis=1,
)

In [6]:
data

Unnamed: 0,FTHG,FTAG,Season,ELO diff,Home_prob_ELO,Draw_prob_ELO,Away_prob_ELO,Diff_goals_scored,Diff_goals_conceded,Diff_goal_diff,...,AwayTeam_Stoke,AwayTeam_Sunderland,AwayTeam_Swansea,AwayTeam_Tottenham,AwayTeam_Watford,AwayTeam_West Brom,AwayTeam_West Ham,AwayTeam_Wigan,AwayTeam_Wolves,Outcome
0,1.0,1.0,0506,-25.173204,0.412832,0.245673,0.341496,0,6,-6,...,False,False,False,True,False,False,False,False,False,0.0
1,1.0,1.0,0506,6.045620,0.468846,0.222236,0.308918,0,-1,1,...,False,False,False,False,False,False,False,False,False,0.0
2,1.0,1.0,0506,-32.751187,0.399092,0.251422,0.349487,-3,-1,-2,...,False,False,False,False,False,True,False,False,False,0.0
3,0.0,3.0,0506,34.014412,0.517707,0.201792,0.280500,2,-2,4,...,False,False,False,False,False,False,False,False,False,-3.0
4,0.0,1.0,0506,33.333649,0.516538,0.202282,0.281180,1,0,1,...,False,False,False,False,False,False,False,False,False,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32523,5.0,0.0,2324,-90.969466,0.324565,0.233493,0.441941,5,-3,8,...,False,False,False,False,False,False,False,False,False,5.0
32524,2.0,0.0,2324,243.886381,0.803305,0.082298,0.114397,8,-2,10,...,False,False,False,False,False,False,False,False,True,2.0
32525,2.0,4.0,2324,-127.316525,0.287220,0.206627,0.506153,1,8,-7,...,False,False,False,False,False,False,False,False,False,-2.0
32526,3.0,1.0,2324,315.526105,0.862550,0.057509,0.079941,10,-14,24,...,False,False,False,False,False,False,True,False,False,2.0


In [7]:
print("Outcome distribution:")
print(data["Outcome"].value_counts())

Outcome distribution:
Outcome
 0.0    1501
 1.0    1338
-1.0     984
 2.0     893
-2.0     504
 3.0     408
-3.0     231
 4.0     163
-4.0      91
 5.0      76
-5.0      27
 6.0      16
-6.0       7
 7.0       7
 8.0       5
-9.0       1
-7.0       1
 9.0       1
-8.0       1
Name: count, dtype: int64


In [8]:
X = data.copy().drop(
    columns=["Outcome", "FTHG", "FTAG", "Season"]
)  # Drop columns not needed for prediction
y = data.copy()["Outcome"]

from sklearn.model_selection import train_test_split

# Split data into training and test sets (e.g., 80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

### Random forest Regressor instead of classifier


In [9]:
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train,y_train)



In [10]:
predictions = rf.predict(X_test)

In [11]:
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 2.6738375299760193


### Turning expected goal difference into categories (Win,Loss,Draw)

In [12]:
thresholds = {"hw": 1, "hl": -1}

def categorize_preds(pred_arr, hw, hl):
    categories = np.where(pred_arr > hw, 1, 
                          np.where((pred_arr <= hw) & (pred_arr > hl), 0, 
                                   -1))
    return np.array(categories)


def categorize_goal_diff(y_test):
    categories = np.where(y_test > 0, 1, 
                        np.where((y_test == 0 ), 0, 
                                -1))
    return np.array(categories)



In [13]:
from sklearn.metrics import classification_report

categorized_preds = categorize_preds(predictions, thresholds['hw'], thresholds['hl'])
categorized_goal_diff = categorize_goal_diff(y_test)

report = classification_report(categorized_goal_diff, categorized_preds)
print(report)


              precision    recall  f1-score   support

          -1       0.74      0.20      0.31       368
           0       0.25      0.79      0.38       268
           1       0.76      0.38      0.50       615

    accuracy                           0.41      1251
   macro avg       0.59      0.45      0.40      1251
weighted avg       0.65      0.41      0.42      1251



### Filter out matches we are uncertain about (Where expected goal diff is within [-1,1])

In [14]:


def remove_uncertain(predictions, targets):
    # Create a boolean mask where predictions is not 0
    mask = predictions != 0
    
    # Use the mask to filter both predictions and targets
    filtered_predictions = predictions[mask]
    filtered_targets = targets[mask]
    
    return filtered_predictions, filtered_targets

filtered_predictions, filtered_targets = remove_uncertain(categorized_preds, categorized_goal_diff)


### Get much better results by filtering out

In [15]:
report = classification_report(filtered_predictions, filtered_targets)
print(report)




              precision    recall  f1-score   support

          -1       0.73      0.74      0.74        98
           0       0.00      0.00      0.00         0
           1       0.95      0.76      0.85       304

    accuracy                           0.76       402
   macro avg       0.56      0.50      0.53       402
weighted avg       0.89      0.76      0.82       402



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Now we only predict 30% of the matches, but with higher confidence

In [16]:
print(f'Bet Accceptance Rate: {len(filtered_predictions)/len(categorized_preds)}')

Bet Accceptance Rate: 0.3213429256594724
