# Model Training

In [1]:
#%pip install pandas
#!pip3 install scikit-learn
#%pip install xgboost 

import sys
import pandas as pd
import sqlite3
import pathlib
import numpy as np

import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.utils import class_weight

sys.path.append("model-training") 
import modelling_functions as mf
import training_config as tc

In [2]:
# Locate the project root: two levels above the current working directory.
# NOTE: this depends on where the kernel was started, not on the notebook file.
project_root = pathlib.Path().absolute().parent.parent

# Path to the project's SQLite database
db_path = project_root / "data" / "footy-tipper-db.sqlite"

# sqlite3.connect() silently creates an empty database when the file is
# missing, which would only surface later as a confusing "no such table"
# error. Fail early with a clear message instead.
if not db_path.is_file():
    raise FileNotFoundError(f"SQLite database not found at {db_path}")

# Connect to the SQLite database
con = sqlite3.connect(str(db_path))
try:
    # Read the full modelling table into a pandas DataFrame
    footy_tipping_data = pd.read_sql_query("select * from footy_tipping_data", con)
finally:
    # Always release the connection, even if the query fails
    con.close()

## Modelling

In [3]:
# Base XGBoost classifier handed to the tuning pipeline
xgb_estimator = xgb.XGBClassifier(n_jobs=-1)

# Hyper-parameter search space: three candidate values for each of the six
# parameters (3**6 = 729 combinations).
xgb_param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.3, 0.5, 0.7],
    gamma=[0, 0.1, 0.2],
)

# Run the project's training pipeline. It hands back four objects: the tuned
# model, the inference feature set, the label encoder and the inference game ids.
pipeline_outputs = mf.train_model_pipeline(
    footy_tipping_data,
    tc.predictors,
    tc.outcome_var,
    xgb_estimator,
    xgb_param_grid,
    use_rfe=False,
    num_folds=tc.num_folds,
    opt_metric=tc.opt_metric,
)
tuned_model, X_inference, label_encoder, game_id_inference = pipeline_outputs


Fitting 5 folds for each of 729 candidates, totalling 3645 fits
{'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}
0.7341048900010427


## Make Predictions

In [4]:
# Score the inference rows with the tuned model. The result is a DataFrame
# with one row per game: game_id, the predicted home-team result and the
# win/lose probabilities.
predictions_df = mf.model_predictions(
    tuned_model,
    X_inference,
    label_encoder,
    game_id_inference,
)

# Bare expression so the notebook renders the predictions
predictions_df

Unnamed: 0,game_id,home_team_result,home_team_win_prob,home_team_lose_prob
0,20231110000.0,Loss,0.454515,0.545485
1,20231110000.0,Loss,0.411844,0.588156
2,20231110000.0,Win,0.633277,0.366723
3,20231110000.0,Win,0.622936,0.377064
4,20231110000.0,Win,0.622603,0.377397
5,20231110000.0,Win,0.591078,0.408922
6,20231110000.0,Win,0.546383,0.453617


## Write the predictions back to the database

In [5]:
# DDL for the predictions table; game_id is the primary key, so each game
# holds at most one (latest) prediction.
create_predictions_table_sql = '''
    CREATE TABLE IF NOT EXISTS predictions_table (
        game_id INTEGER PRIMARY KEY,
        home_team_result TEXT,
        home_team_win_prob REAL,
        home_team_lose_prob REAL
    )
'''

# Convert every prediction to plain Python values before touching the database.
# game_id is a float in predictions_df but the column is an INTEGER PRIMARY KEY,
# so cast it explicitly. A missing (NaN) game_id raises here instead of being
# stored as NULL and silently given an auto-generated row id.
prediction_rows = [
    (int(game_id), result, float(win_prob), float(lose_prob))
    for game_id, result, win_prob, lose_prob in predictions_df[
        ['game_id', 'home_team_result', 'home_team_win_prob', 'home_team_lose_prob']
    ].itertuples(index=False, name=None)
]

# Connect to the SQLite database
con = sqlite3.connect(str(db_path))
try:
    # Create the table if it does not exist
    con.execute(create_predictions_table_sql)

    # Upsert all rows in one batch: insert new games, overwrite the stored
    # prediction for games that already have one.
    con.executemany('''
        INSERT INTO predictions_table (
            game_id,  
            home_team_result, 
            home_team_win_prob, 
            home_team_lose_prob
        ) VALUES (?, ?, ?, ?)
        ON CONFLICT(game_id) DO UPDATE SET
            home_team_result= excluded.home_team_result,
            home_team_win_prob = excluded.home_team_win_prob,
            home_team_lose_prob = excluded.home_team_lose_prob
    ''', prediction_rows)

    # Commit the transaction
    con.commit()
finally:
    # Always close the connection; if anything above failed, the uncommitted
    # rows are discarded rather than half-written.
    con.close()


In [6]:
# Connect to the SQLite database
con = sqlite3.connect(str(db_path))
try:
    # Read everything stored in predictions_table into a pandas DataFrame
    predictions_table = pd.read_sql_query("SELECT * FROM predictions_table", con)
finally:
    # Close the connection even if the query fails
    con.close()

In [7]:
# Display the rows read back from predictions_table
predictions_table

Unnamed: 0,game_id,home_team_result,home_team_win_prob,home_team_lose_prob
0,20231111610,Loss,0.443841,0.556159
1,20231111620,Win,0.743889,0.256111
2,20231111630,Win,0.738226,0.261774
3,20231111640,Loss,0.311119,0.688881
4,20231111650,Win,0.783201,0.216799
5,20231111710,Loss,0.454515,0.545485
6,20231111720,Loss,0.411844,0.588156
7,20231111730,Win,0.633277,0.366723
8,20231111740,Win,0.622936,0.377064
9,20231111750,Win,0.622603,0.377397


## Other models

In [8]:
# # RandomForest
# rf_estimator = RandomForestClassifier(n_jobs=-1, class_weight='balanced')
# rf_param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_features': ['sqrt', 'log2'],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'bootstrap': [True, False]
# }
# cv_rf = mf.training_pipeline(train_df, rf_estimator, rf_param_grid, tc.outcome_var, tc.predictors, tc.opt_metric)

In [9]:
# # GradientBoosting
# gb_estimator = GradientBoostingClassifier()
# gb_param_grid = {
#     'n_estimators': [50, 100, 200],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': [3, 4, 5],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'subsample': [0.8, 0.9, 1.0],
#     'max_features': ['sqrt', 'log2']
# }
# cv_gb = mf.training_pipeline(train_df, gb_estimator, gb_param_grid, tc.outcome_var, tc.predictors, tc.opt_metric)

In [10]:
# # LightGBM
# # NOTE: needs `import lightgbm as lgb`, which the import cell does not include.
# lgb_estimator = lgb.LGBMClassifier(n_jobs=-1)
# lgb_param_grid = {
#     'n_estimators': [50, 100, 200, 500],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': [3, 4, 5],
#     'subsample': [0.8, 0.9, 1.0],
#     'colsample_bytree': [0.3, 0.5, 0.7],
#     'gamma': [0, 0.1, 0.2]
# }
# cv_lgb = mf.training_pipeline(train_df, lgb_estimator, lgb_param_grid, tc.outcome_var, tc.predictors, tc.opt_metric)