In [27]:
import pandas as pd
import numpy as np
import optuna
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score

In [12]:
# Columns kept for modelling: the two outcome flags, the rating/form
# differentials and the season identifier.
features = [
    'home_win', 'away_win', 'pi_diff', 'gd_diff',
    'elo_gls_diff', 'home_ip_scaled', 'away_ip_scaled',
    'home_ip', 'away_ip', 'ip_diff', 'season',
]

# Load the CSV, drop every 2013 row, keep only the modelling columns and
# cast everything to float in one chained expression.
df = (
    pd.read_csv("df_opt.csv")
    .loc[lambda frame: frame['season'] != 2013, features]
    .astype(float)
)

In [28]:
class Optimiser:
    """Tune, cross-validate and evaluate a boosted-tree classifier for ``home_win``.

    The frame passed in is split by season (see ``process_df``): every season
    except ``TEST_SEASON`` is used for training and hyperparameter search, and
    ``TEST_SEASON`` is kept as a hold-out that is only touched for the final
    report printed by ``optimise``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``home_win``, ``away_win`` and ``season`` plus the
        feature columns; every column must be castable to float.
    model_name : str
        One of ``SUPPORTED_MODELS`` ("XGB", "LGBM", "GBC", "CAT").
    verbosity : bool or int, default False
        Forwarded to CatBoost, the Optuna progress bar and ``cross_val_score``.
    n_trials : int, default 100
        Number of Optuna trials run by ``optimise``.
    cv_splits : int, default 5
        Number of stratified folds used for tuning and for the CV report.
    random_state : int, default 42
        Seed shared by the models, the fold shuffling and the Optuna sampler.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of ``SUPPORTED_MODELS``.
    """

    SUPPORTED_MODELS = ("XGB", "LGBM", "GBC", "CAT")
    TARGET = 'home_win'
    NON_FEATURE_COLUMNS = ['home_win', 'away_win', 'season']
    EXCLUDED_SEASON = 2013
    TEST_SEASON = 2022

    def __init__(self, df, model_name, verbosity=False,
                 n_trials=100, cv_splits=5, random_state=42):
        # Fail fast: an unknown name used to leave the instance without a
        # model and only blew up much later inside `objective`.
        if model_name not in self.SUPPORTED_MODELS:
            raise ValueError(
                f"Unsupported model_name {model_name!r}; "
                f"expected one of {self.SUPPORTED_MODELS}"
            )

        self.df = df
        self.model_name = model_name
        self.verbosity = verbosity
        self.n_trials = n_trials
        self.cv_splits = cv_splits
        self.random_state = random_state

        # Untuned estimator with library defaults, kept for callers that
        # want a baseline to compare against.
        self.model = self._build_model({})

    def process_df(self, df):
        """Split ``df`` into train/test features and targets by season.

        Rows from ``EXCLUDED_SEASON`` and rows containing NaN are dropped.
        ``TEST_SEASON`` becomes the test set, everything else the training
        set. The target is ``home_win``; ``home_win``, ``away_win`` and
        ``season`` are removed from the feature matrices.

        Returns
        -------
        tuple
            ``(X_train, X_test, y_train, y_test)``.
        """
        cleaned = df.astype(float)
        cleaned = cleaned[cleaned['season'] != self.EXCLUDED_SEASON].dropna()

        in_test = cleaned['season'] == self.TEST_SEASON
        train, test = cleaned[~in_test], cleaned[in_test]

        X_train = train.drop(self.NON_FEATURE_COLUMNS, axis=1)
        X_test = test.drop(self.NON_FEATURE_COLUMNS, axis=1)
        y_train = train[self.TARGET]
        y_test = test[self.TARGET]

        return X_train, X_test, y_train, y_test

    def _build_model(self, params):
        """Instantiate the estimator selected by ``model_name`` with ``params``."""
        if self.model_name == "XGB":
            return XGBClassifier(**params)
        if self.model_name == "LGBM":
            return LGBMClassifier(**params)
        if self.model_name == "GBC":
            return GradientBoostingClassifier(**params)
        # CatBoost logs every iteration unless told otherwise.
        return CatBoostClassifier(**{'verbose': self.verbosity, **params})

    def _fixed_params(self):
        """Return the settings that are not tuned but must accompany every model.

        They are merged with the tuned values both inside each trial and when
        the final model is rebuilt, so the final model is configured exactly
        like the winning trial.
        """
        if self.model_name == "XGB":
            return {
                'booster': 'gbtree',
                # NOTE(review): presumably mirrors the class balance of
                # home_win -- confirm against the data.
                'scale_pos_weight': (0.45 / 0.55),
                'random_state': self.random_state,
            }
        if self.model_name == "LGBM":
            return {
                'boosting_type': 'gbdt',
                # LightGBM ignores `subsample` unless bagging is enabled
                # with a positive frequency.
                'subsample_freq': 1,
                'random_state': self.random_state,
            }
        if self.model_name == "GBC":
            # `loss` is left at its default (binomial log-loss): the old
            # spelling 'deviance' is rejected by recent scikit-learn.
            return {'random_state': self.random_state}
        return {
            'random_seed': self.random_state,
            'loss_function': 'Logloss',
            'eval_metric': 'Accuracy',
            # Bayesian bootstrap cannot be combined with `subsample`;
            # Bernoulli is the bootstrap type that honours it.
            'bootstrap_type': 'Bernoulli',
        }

    def _suggest_params(self, trial):
        """Sample the tunable hyperparameters for ``model_name`` from ``trial``."""
        if self.model_name == "XGB":
            return {
                'max_depth': trial.suggest_int('max_depth', 1, 10),
                'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
                'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
                'subsample': trial.suggest_float('subsample', 0.2, 1.0, step=0.1),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0, step=0.1),
                'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 10, log=True),
                'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 10, log=True),
                'gamma': trial.suggest_float('gamma', 1e-5, 10, log=True),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            }
        if self.model_name == "LGBM":
            return {
                'max_depth': trial.suggest_int('max_depth', 1, 10),
                'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
                'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
                'subsample': trial.suggest_float('subsample', 0.2, 1.0, step=0.1),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0, step=0.1),
                'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 10, log=True),
                'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 10, log=True),
                'min_child_weight': trial.suggest_float('min_child_weight', 1e-5, 10, log=True),
            }
        if self.model_name == "GBC":
            return {
                'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
                'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
                'subsample': trial.suggest_float('subsample', 0.2, 1.0, step=0.1),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
                'max_depth': trial.suggest_int('max_depth', 1, 10),
                'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
            }
        return {
            'iterations': trial.suggest_int('iterations', 100, 1000, step=100),
            'depth': trial.suggest_int('depth', 1, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
            'subsample': trial.suggest_float('subsample', 0.2, 1.0, step=0.1),
            'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.2, 1.0, step=0.1),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-5, 10, log=True),
            'min_child_samples': trial.suggest_int('min_child_samples', 1, 20),
        }

    def _cv_splitter(self):
        """Return the seeded, shuffled stratified splitter used for every CV run."""
        return StratifiedKFold(n_splits=self.cv_splits, shuffle=True,
                               random_state=self.random_state)

    def objective(self, trial):
        """Optuna objective: mean cross-validated ROC-AUC on the training seasons.

        The hold-out season is deliberately not used here. Scoring trials on
        it would select hyperparameters on the test set and make the final
        test metrics optimistically biased.
        """
        # Split once and reuse it; `optimise` refreshes the split, but the
        # objective may also be handed to a study directly.
        if getattr(self, 'X_train', None) is None:
            self.X_train, self.X_test, self.y_train, self.y_test = self.process_df(self.df)

        candidate = self._build_model({**self._fixed_params(), **self._suggest_params(trial)})
        fold_auc = cross_val_score(candidate, self.X_train, self.y_train,
                                   cv=self._cv_splitter(), scoring='roc_auc')
        return float(fold_auc.mean())

    def optimise(self):
        """Run the search, report CV and hold-out metrics, return the fitted model.

        Steps: split the data, run ``n_trials`` Optuna trials maximising
        ``objective``, rebuild the winning configuration, print its k-fold CV
        scores on the training data, fit it on the full training data and
        print accuracy, F1, AUC and precision on the hold-out season.

        Returns
        -------
        estimator
            The tuned model fitted on the training seasons. It is also stored
            as ``self.best_model``, next to ``self.best_params`` and
            ``self.study``.

        Raises
        ------
        ValueError
            If the training or the hold-out split is empty.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = self.process_df(self.df)
        if self.X_train.empty or self.X_test.empty:
            raise ValueError(
                "Both a training split and a hold-out split are required; "
                f"check that season {self.TEST_SEASON} and at least one other season are present."
            )

        study = optuna.create_study(
            direction='maximize',
            sampler=optuna.samplers.TPESampler(seed=self.random_state),
        )
        study.optimize(self.objective, n_trials=self.n_trials,
                       show_progress_bar=bool(self.verbosity))
        best_params = study.best_params

        # `study.best_params` only holds the sampled values; the fixed
        # settings have to be merged back in.
        best_model = self._build_model({**self._fixed_params(), **best_params})

        # Cross-validate the tuned model. cross_val_score clones the
        # estimator, so `best_model` itself is still unfitted afterwards.
        results = cross_val_score(best_model, self.X_train, self.y_train,
                                  cv=self._cv_splitter(), verbose=int(self.verbosity))

        print("Results of K-fold CV for", best_model, ":", results)
        print("Mean:", results.mean())
        print("Std:", results.std())

        # Plain fit: the early-stopping keywords are not accepted by every
        # estimator's fit(), a patience of 1000 rounds could never trigger
        # with at most 1000 estimators, and the eval set was the hold-out.
        best_model.fit(self.X_train, self.y_train)

        y_pred_test = best_model.predict(self.X_test)
        y_prob_test = best_model.predict_proba(self.X_test)[:, 1]

        test_accuracy = accuracy_score(self.y_test, y_pred_test)
        test_f1 = f1_score(self.y_test, y_pred_test)
        test_roc_auc_score = roc_auc_score(self.y_test, y_prob_test)
        test_precision_score = precision_score(self.y_test, y_pred_test)

        print("Best Hyperparameters:", best_params)
        print("Test Accuracy:", test_accuracy)
        print("Test F1 Score:", test_f1)
        print("Test AUC:", test_roc_auc_score)
        print("Test Precision Score:", test_precision_score)

        self.study = study
        self.best_params = best_params
        self.best_model = best_model

        return best_model
