In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw match tables and immediately discard the columns that are not
# used as model inputs: the referee, the match date and both team names.
seasons_15To24 = pd.read_csv("Premier League 2015-2024.csv").drop(
    columns=['Referee', 'Date', 'HomeTeam', 'AwayTeam']
)
season_24To25 = pd.read_csv("24-25 Season.csv").drop(
    columns=['Referee', 'Date', 'HomeTeam', 'AwayTeam']
)

# TODO: experiment with removing further columns later on.

In [None]:
# Column-name groups for the match data.
# NOTE(review): the codes look like football-data.co.uk style abbreviations
# (home/away pairs of per-match statistics) -- confirm against the CSV header.
numeric_cols = [
    'HTHG', 'HTAG',
    'HS', 'AS',
    'HST', 'AST',
    'HF', 'AF',
    'HC', 'AC',
    'HY', 'AY',
    'HR', 'AR',
]
# These three columns are dropped from both frames by the data-loading cell,
# so this list only describes the raw CSV layout.
categorical_cols = [
    'HomeTeam',
    'AwayTeam',
    'Referee',
]

In [None]:
# Pre-processing: turn the result columns into integer class labels.
#
# A fixed hand-written mapping is used instead of a LabelEncoder, so the codes
# are guaranteed to be the same in both data sets.
# NOTE(review): 'H' / 'A' / 'D' presumably stand for home win / away win /
# draw -- confirm against the data source.
result_mapping = {'H': 0, 'A': 1, 'D': 2}

# FTR and HTR are encoded in both frames. astype(int) raises if a column still
# contains a value that is not a key of result_mapping (it stays a string).
for _frame in (seasons_15To24, season_24To25):
    for _col in ('FTR', 'HTR'):
        _frame[_col] = _frame[_col].replace(result_mapping).astype(int)

In [None]:
# Write the processed 2015-24 table to disk (without the row index) so the
# result of the steps above can be inspected outside the notebook.
seasons_15To24.to_csv(path_or_buf="Output.csv", index=False)

In [None]:
# Separate the target (FTR) from the feature columns.
# The 2015-24 frame is the training set; the 2024-25 frame is the test set.
# NOTE(review): every remaining column is used as a feature -- if the CSVs
# carry full-time goal counts they would leak the label; confirm the column set.
y_seasons_15To24 = seasons_15To24.loc[:, 'FTR']
X_seasons_15To24 = seasons_15To24.drop('FTR', axis="columns")

y_season_24To25 = season_24To25.loc[:, 'FTR']
X_season_24To25 = season_24To25.drop('FTR', axis="columns")

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max from the training data only, then apply that
# same scaling to both sets so no statistic of the test season is used for
# fitting. Both outputs are NumPy arrays (column names are not kept).
scaler = MinMaxScaler()
scaler.fit(X_seasons_15To24)

X_seasons_15To24_scaled = scaler.transform(X_seasons_15To24)
X_season_24To25_scaled = scaler.transform(X_season_24To25)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# Baseline model: a random forest with default hyper-parameters, fitted on the
# scaled 2015-24 features and scored on both that training data and the
# 2024-25 data.
#
# random_state is pinned because the forest is grown from random bootstrap
# samples and random feature subsets; without a seed every run of this cell
# prints different scores, so results cannot be compared between runs.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_seasons_15To24_scaled, y_seasons_15To24)

# Predictions on the data the model was fitted on, and on the other season.
y_train_pred = rf_model.predict(X_seasons_15To24_scaled)
y_test_pred = rf_model.predict(X_season_24To25_scaled)

# Macro-averaged F1 is the unweighted mean of the per-class F1 scores, so each
# of the three result classes counts equally regardless of how often it occurs.
f1_train_rf = f1_score(y_seasons_15To24, y_train_pred, average='macro')
accuracy_train_rf = accuracy_score(y_seasons_15To24, y_train_pred)

f1_test_rf = f1_score(y_season_24To25, y_test_pred, average='macro')
accuracy_test_rf = accuracy_score(y_season_24To25, y_test_pred)

# Scores are fractions in [0, 1]; they are reported as percentages.
print("RF F1 Score (Training):", f1_train_rf * 100)
print("RF Accuracy (Training):", accuracy_train_rf * 100)
print("RF F1 Score (Testing):", f1_test_rf * 100)
print("RF Accuracy (Testing):", accuracy_test_rf * 100)

RF F1 Score (Training): 100.0
RF Accuracy (Training): 100.0
RF F1 Score (Testing): 51.81864919899287
RF Accuracy (Testing): 59.040590405904055


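Extreme overfitting: the default random forest scores 100% on the training seasons but only about 59% accuracy (about 52% macro F1) on the 2024-25 season, so the next cell limits the tree depth.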

In [None]:
# Regularised random forest: 100 shallow trees (depth <= 4) split on entropy,
# to reduce the gap between training and testing scores.
#
# random_state is pinned because the forest is grown from random bootstrap
# samples and random feature subsets; without a seed the printed scores change
# on every run and cannot be compared with the other models.
rf_model = RandomForestClassifier(
    max_depth=4,
    criterion='entropy',
    n_estimators=100,
    random_state=42,
)
rf_model.fit(X_seasons_15To24_scaled, y_seasons_15To24)

# Predictions on the data the model was fitted on, and on the other season.
y_train_pred = rf_model.predict(X_seasons_15To24_scaled)
y_test_pred = rf_model.predict(X_season_24To25_scaled)

# Macro-averaged F1 (unweighted mean over the classes) and plain accuracy.
f1_train_rf = f1_score(y_seasons_15To24, y_train_pred, average='macro')
accuracy_train_rf = accuracy_score(y_seasons_15To24, y_train_pred)

f1_test_rf = f1_score(y_season_24To25, y_test_pred, average='macro')
accuracy_test_rf = accuracy_score(y_season_24To25, y_test_pred)

# Scores are fractions in [0, 1]; they are reported as percentages.
print("RF F1 Score (Training):", f1_train_rf * 100)
print("RF Accuracy (Training):", accuracy_train_rf * 100)
print("RF F1 Score (Testing):", f1_test_rf * 100)
print("RF Accuracy (Testing):", accuracy_test_rf * 100)

RF F1 Score (Training): 51.90278875315132
RF Accuracy (Training): 65.78947368421053
RF F1 Score (Testing): 49.17002747694121
RF Accuracy (Testing): 62.36162361623616


In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# AdaBoost over 100 decision stumps (depth-1 trees split on entropy).
#
# The base estimator is passed positionally on purpose: its keyword name was
# changed between scikit-learn releases, while the positional form works in
# all of them.
# random_state is pinned so the boosted ensemble, and therefore the printed
# scores, are the same on every run of this cell.
ada_boost_model = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1, criterion='entropy'),
    n_estimators=100,
    random_state=42,
)
ada_boost_model.fit(X_seasons_15To24_scaled, y_seasons_15To24)

# Predictions on the data the model was fitted on, and on the other season.
y_train_pred = ada_boost_model.predict(X_seasons_15To24_scaled)
y_test_pred = ada_boost_model.predict(X_season_24To25_scaled)

# Macro-averaged F1 (unweighted mean over the classes) and plain accuracy.
f1_train_ab = f1_score(y_seasons_15To24, y_train_pred, average='macro')
accuracy_train_ab = accuracy_score(y_seasons_15To24, y_train_pred)
f1_test_ab = f1_score(y_season_24To25, y_test_pred, average='macro')
accuracy_test_ab = accuracy_score(y_season_24To25, y_test_pred)

# Scores are fractions in [0, 1]; they are reported as percentages.
print("AdaBoost F1 Score (Training):", f1_train_ab * 100)
print("AdaBoost Accuracy (Training):", accuracy_train_ab * 100)
print("AdaBoost F1 Score (Testing):", f1_test_ab * 100)
print("AdaBoost Accuracy (Testing):", accuracy_test_ab * 100)


AdaBoost F1 Score (Training): 57.789805831579564
AdaBoost Accuracy (Training): 65.05847953216374
AdaBoost F1 Score (Testing): 58.834322942437744
AdaBoost Accuracy (Testing): 65.68265682656826
