Notebook to explore other approaches, such as using a neural network to predict the target variable.

# Testing Random Forest

In [75]:
%load_ext autoreload
%autoreload 2

# Default (width, height) used for figures in this notebook
figsize = (14, 4)

import pandas as pd
from util import util
import numpy as np
import os
# Name of the prepared dataset and the folder (relative to this notebook) that holds it
file_name = "DataForModel"
data_folder = os.path.join("..", "data")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [76]:
# Load the modelling dataset through the project helper util.load_data,
# passing the folder and file name configured above.
# NOTE(review): the helper's file format and parsing (e.g. of the Date column) are not visible here.
data = util.load_data(data_folder, file_name)
# Bare expression: shows the loaded frame as the cell output
data

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,Season,ELO diff,Home_prob_ELO,Draw_prob_ELO,...,Diff_change_in_ELO,Diff_opposition_mean_ELO,Diff_shots_on_target_attempted,Diff_shots_on_target_allowed,Diff_shots_attempted,Diff_shots_allowed,Diff_corners_awarded,Diff_corners_conceded,Diff_yellow_cards,Diff_red_cards
0,E0,2005-09-17,Aston Villa,Tottenham,1.0,1.0,0506,-25.173204,0.423508,0.224320,...,-4.599564,-15.115140,-9,10,-14,16,20,18,-6,0
1,E0,2005-09-17,Portsmouth,Birmingham,1.0,1.0,0506,6.045620,0.478503,0.202921,...,26.619260,-5.363651,4,-2,4,-4,0,13,1,0
2,E0,2005-09-17,Sunderland,West Brom,1.0,1.0,0506,-32.751187,0.410018,0.229569,...,-12.177547,17.786764,9,-1,-4,8,5,0,-3,1
3,E0,2005-09-18,Blackburn,Newcastle,0.0,3.0,0506,34.014412,0.526477,0.184254,...,34.014412,3.552154,1,-13,7,-15,5,-14,1,0
4,E0,2005-09-18,Man City,Bolton,0.0,1.0,0506,33.333649,0.525329,0.184700,...,37.907289,3.634728,3,3,-8,18,-4,2,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,E0,2024-05-19,Crystal Palace,Aston Villa,5.0,0.0,2324,-90.655703,0.335050,0.213414,...,72.723069,-65.930720,15,-16,28,-42,5,-5,4,1
6251,E0,2024-05-19,Liverpool,Wolves,2.0,0.0,2324,243.468628,0.806489,0.075298,...,4.430033,-47.974268,22,-6,56,-18,16,-9,-10,0
6252,E0,2024-05-19,Luton,Fulham,2.0,4.0,2324,-112.212233,0.312037,0.198755,...,-15.373054,-58.254969,-4,11,-14,35,7,4,-1,-1
6253,E0,2024-05-19,Man City,West Ham,3.0,1.0,2324,314.894768,0.864611,0.052681,...,33.032141,-20.597507,14,-28,2,-59,0,-13,-4,0


In [77]:
# Summarise the frame: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6255 entries, 0 to 6254
Data columns (total 25 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   Div                             6255 non-null   object        
 1   Date                            6255 non-null   datetime64[ns]
 2   HomeTeam                        6255 non-null   object        
 3   AwayTeam                        6255 non-null   object        
 4   FTHG                            6255 non-null   float64       
 5   FTAG                            6255 non-null   float64       
 6   Season                          6255 non-null   object        
 7   ELO diff                        6255 non-null   float64       
 8   Home_prob_ELO                   6255 non-null   float64       
 9   Draw_prob_ELO                   6255 non-null   float64       
 10  Away_prob_ELO                   6255 non-null   float64       
 11  Diff

### Handling Non-numeric Data
The first step is to convert the non-numeric data into numeric data. This can be done using the `LabelEncoder` class from the `sklearn.preprocessing` module. For this first test I will remove the column `Div`, since all data is from E0 anyway.

In [78]:
# Remove the Div column: all rows are from division E0, so it carries no information.
# Rebinding `data` (instead of inplace=True) leaves the originally loaded frame untouched,
# and errors="ignore" makes the cell safe to re-run once Div is already gone
# (the in-place version raised KeyError on a second execution).
data = data.drop(columns="Div", errors="ignore")


In [79]:
from sklearn.preprocessing import LabelEncoder


In [80]:
# Encoder instance created here; it is not used by any cell visible in this part of the notebook
label_encoder = LabelEncoder()

# Replace the Date timestamp with numeric calendar features.
# .assign() returns a new frame, so the previous `data` object is left unmodified.
match_dates = data["Date"].dt
data = data.assign(
    Year=match_dates.year,
    Month=match_dates.month,
    Day=match_dates.day,
    DayOfWeek=match_dates.dayofweek,  # Optional
).drop(columns=["Date"])  # the original Date column is no longer needed

In [81]:
# One-hot encode the home and away team names:
# each team becomes its own HomeTeam_<name> / AwayTeam_<name> indicator column
data = pd.get_dummies(data=data, columns=["HomeTeam", "AwayTeam"])

In [82]:
# Print the dtype of every column to check which ones are numeric at this point
print(data.dtypes)

FTHG                  float64
FTAG                  float64
Season                 object
ELO diff              float64
Home_prob_ELO         float64
                       ...   
AwayTeam_Watford         bool
AwayTeam_West Brom       bool
AwayTeam_West Ham        bool
AwayTeam_Wigan           bool
AwayTeam_Wolves          bool
Length: 111, dtype: object


### Defining target variable

In [83]:
# Add the target column Outcome: 1 if the home team wins, 0 for a draw, -1 if the away team wins.
# The sign of the full-time goal difference (FTHG - FTAG) is exactly that encoding, and it is
# computed on whole columns instead of calling a Python lambda once per row.
# If a score were missing, the integer cast fails loudly; the row-wise version would have
# silently labelled such a match as an away win (-1).
data["Outcome"] = np.sign(data["FTHG"] - data["FTAG"]).astype("int64")

In [84]:
# Display the frame to inspect the columns added so far (Outcome is the last one)
data


Unnamed: 0,FTHG,FTAG,Season,ELO diff,Home_prob_ELO,Draw_prob_ELO,Away_prob_ELO,Diff_goals_scored,Diff_goals_conceded,Matchrating,...,AwayTeam_Stoke,AwayTeam_Sunderland,AwayTeam_Swansea,AwayTeam_Tottenham,AwayTeam_Watford,AwayTeam_West Brom,AwayTeam_West Ham,AwayTeam_Wigan,AwayTeam_Wolves,Outcome
0,1.0,1.0,0506,-25.173204,0.423508,0.224320,0.352172,0,6,-6,...,False,False,False,True,False,False,False,False,False,0
1,1.0,1.0,0506,6.045620,0.478503,0.202921,0.318576,0,-1,1,...,False,False,False,False,False,False,False,False,False,0
2,1.0,1.0,0506,-32.751187,0.410018,0.229569,0.360413,-3,-1,-2,...,False,False,False,False,False,True,False,False,False,0
3,0.0,3.0,0506,34.014412,0.526477,0.184254,0.289270,2,-2,4,...,False,False,False,False,False,False,False,False,False,-1
4,0.0,1.0,0506,33.333649,0.525329,0.184700,0.289971,1,0,1,...,False,False,False,False,False,False,False,False,False,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,5.0,0.0,2324,-90.655703,0.335050,0.213414,0.451536,5,-3,8,...,False,False,False,False,False,False,False,False,False,1
6251,2.0,0.0,2324,243.468628,0.806489,0.075298,0.118214,8,-2,10,...,False,False,False,False,False,False,False,False,True,1
6252,2.0,4.0,2324,-112.212233,0.312037,0.198755,0.489208,1,8,-7,...,False,False,False,False,False,False,False,False,False,-1
6253,3.0,1.0,2324,314.894768,0.864611,0.052681,0.082707,10,-14,24,...,False,False,False,False,False,False,True,False,False,1


In [85]:
# Class balance of the target: number of matches per Outcome value
print("Outcome distribution:", data["Outcome"].value_counts(), sep="\n")

Outcome distribution:
Outcome
 1    2907
-1    1847
 0    1501
Name: count, dtype: int64


### Split data into training and testing sets

In [90]:
# Feature matrix: drop the target itself, the full-time goals it is derived from
# (FTHG / FTAG would leak the result), and Season, which is not needed for prediction.
# DataFrame.drop already returns a new frame, so no prior full-frame copy is required.
X = data.drop(columns=["Outcome", "FTHG", "FTAG", "Season"])

# Target vector: copy only the Outcome column so y is independent of later edits to `data`
y = data["Outcome"].copy()

In [91]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the matches for testing and train on the remaining 80%;
# the fixed random_state makes the shuffled split repeatable.
# NOTE(review): the rows appear to be in date order and the split is shuffled, so the
# model can train on matches played after some test matches. Consider a time-based
# split if that matters for the evaluation; confirm.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [92]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees; the fixed seed keeps training repeatable
model = RandomForestClassifier(random_state=42, n_estimators=100)

# Train on the training split (as the last expression, the fitted estimator is shown as cell output)
model.fit(X=X_train, y=y_train)

In [93]:
from sklearn.metrics import classification_report, accuracy_score

# Predict the outcome class for every held-out test row
y_pred = model.predict(X_test)

# Print evaluation metrics.
# `labels` pins each display name to its class value (-1 away win, 0 draw, 1 home win)
# instead of relying on the implicit sorted order of whatever labels happen to occur.
# The report therefore stays correctly labelled, and does not error on a name/label
# count mismatch, if a class is absent from the test split or the predictions.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(
    "Classification Report:\n",
    classification_report(
        y_test,
        y_pred,
        labels=[-1, 0, 1],
        target_names=["Away Win", "Draw", "Home Win"],
    ),
)

Accuracy: 0.574740207833733
Classification Report:
               precision    recall  f1-score   support

    Away Win       0.51      0.57      0.54       368
        Draw       0.36      0.06      0.10       268
    Home Win       0.62      0.80      0.70       615

    accuracy                           0.57      1251
   macro avg       0.50      0.48      0.45      1251
weighted avg       0.53      0.57      0.52      1251

