<a href="https://colab.research.google.com/github/jake321southall/football-data-analysis/blob/main/Football_GBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split

In [8]:
url = "https://raw.githubusercontent.com/tara-nguyen/english-premier-league-datasets-for-10-seasons/main/epldat10seasons/epl-allseasons-matchstats.csv"
df = pd.read_csv(url)

#Let's drop the columns we won't use
df.drop(columns = ['Referee', 'HomeTeam', 'AwayTeam'], inplace=True)
df = pd.get_dummies(df, columns = ['FullTime'])
df_num = df.drop(columns = ['Season', 'Date', 'Halftime'])
df_homewin = df_num.drop(columns = ['HomeGoals', 'AwayGoals', 'FullTime_Draw', 'FullTime_AwayWin'])
df_homewin.drop(index=2070, inplace=True)

X = df_homewin.iloc[:, 0:14]
y = df_homewin.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0)

X_train_init, X_val_init, y_train_init, y_val_init = train_test_split(X_train, y_train, test_size=0.2, random_state=0)



In [9]:
#creating inital model
gbm_init = lgb.LGBMClassifier()
gbm_init.fit(X_train_init, y_train_init)

print(f'The percentage of correctly predicted games of inital model in training set is is {round(100*gbm_init.score(X_train_init, y_train_init), 5)}%.')
print(f'The percentage of correctly predicted games of inital model in validation set is is {round(100*gbm_init.score(X_val_init, y_val_init), 5)}%.')

The percentage of correctly predicted games of inital model in training set is is 96.38009%.
The percentage of correctly predicted games of inital model in validation set is is 72.36842%.


In [10]:
y_pred_init = gbm_init.score(X_test, y_test)
print(f'The percentage of correctly predicted games of inital model is {round(100*y_pred_init, 5)}%.')

The percentage of correctly predicted games of inital model is 73.55263%.


We are overfitting!


In [11]:
#creating final model with hyperparameter tuning
estimator = lgb.LGBMClassifier()

param_grid = {
    'lambda_l1': np.logspace(-1,1,5),
    'num_leaves': np.arange(3,24,3),
    'max_depth': np.arange(2,6),
    'bagging_fraction': [0.4,0.5,0.6],
    'n_estimators': [50, 75,100],
}

cv = GridSearchCV(estimator, param_grid, cv=10)
cv.fit(X_train, y_train)

print(f'Best parameters found by grid search are: {cv.best_params_}')
print(f'Best score found by grid search are: {cv.best_score_}')

Best parameters found by grid search are: {'bagging_fraction': 0.4, 'lambda_l1': 0.31622776601683794, 'max_depth': 2, 'n_estimators': 50, 'num_leaves': 3}
Best score found by grid search are: 0.7792014069828036


In [12]:
y_pred_cv = cv.score(X_test, y_test)
print(f'The percentage of correctly predicted games of lightgbm cv model on test set is is {round(100*y_pred_cv, 5)}%.')


The percentage of correctly predicted games of lightgbm cv model on test set is is 76.57895%.
