# Model Training

In [1]:
#%pip install pandas
#!pip3 install scikit-learn
#%pip install xgboost 

import sys
import pandas as pd
import sqlite3
import pathlib
import numpy as np

sys.path.append("model-training") 
import modelling_functions as mf
import training_config as tc

In [2]:
# Resolve the project root as the grandparent of the current working directory.
# NOTE(review): this depends on where the kernel was started — confirm the
# notebook is always launched from its own folder.
project_root = pathlib.Path().absolute().parent.parent

# Location of the SQLite database relative to the project root
db_path = project_root / "data" / "footy-tipper-db.sqlite"

# sqlite3.connect() creates a new, empty database when the file is missing,
# which would only surface later as a confusing "no such table" error.
# Fail early with the resolved path instead.
if not db_path.is_file():
    raise FileNotFoundError(f"SQLite database not found: {db_path}")

# Read the modelling table into a pandas DataFrame. The try/finally ensures the
# connection is closed even when the query raises.
con = sqlite3.connect(str(db_path))
try:
    footy_tipping_data = pd.read_sql_query("select * from footy_tipping_data", con)
finally:
    con.close()

## Modelling

In [3]:
# # Xgboost
# xgb_estimator = xgb.XGBClassifier(n_jobs=-1)
# xgb_param_grid = {
#     'n_estimators': [50, 100, 200],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': [3, 4, 5],
#     'subsample': [0.8, 0.9, 1.0],
#     'colsample_bytree': [0.3, 0.5, 0.7],
#     'gamma': [0, 0.1, 0.2]
# }

# # Train the model and get the label encoder and game_id_inference
# tuned_model, X_inference, label_encoder, game_id_inference = mf.train_model_pipeline(
#     footy_tipping_data, tc.predictors, tc.outcome_var,
#     xgb_estimator, xgb_param_grid,
#     use_rfe=tc.use_rfe, num_folds=tc.num_folds, 
#     opt_metric=tc.opt_metric
# )

In [4]:
# # RandomForest
# rf_estimator = RandomForestClassifier(n_jobs=-1, class_weight='balanced')
# rf_param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_features': ['sqrt', 'log2'],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'bootstrap': [True, False]
# }

# # Train the model and get the label encoder and game_id_inference
# tuned_model, X_inference, label_encoder, game_id_inference = mf.train_model_pipeline(
#     footy_tipping_data, tc.predictors, tc.outcome_var,
#     rf_estimator, rf_param_grid,
#     use_rfe=tc.use_rfe, num_folds=tc.num_folds, 
#     opt_metric=tc.opt_metric
# )

In [5]:
# # GradientBoosting
# gb_estimator = GradientBoostingClassifier()
# gb_param_grid = {
#     'n_estimators': [50, 100, 200],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': [3, 4, 5],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'subsample': [0.8, 0.9, 1.0],
#     'max_features': ['sqrt', 'log2']
# }

# # Train the model and get the label encoder and game_id_inference
# tuned_model, X_inference, label_encoder, game_id_inference = mf.train_model_pipeline(
#     footy_tipping_data, tc.predictors, tc.outcome_var,
#     gb_estimator, gb_param_grid,
#     use_rfe=tc.use_rfe, num_folds=tc.num_folds, 
#     opt_metric=tc.opt_metric
# )

In [6]:
# Call the training helper with the loaded data and the predictors, outcome
# variable, use_rfe, num_folds and opt_metric settings from training_config,
# then unpack its four return values.
training_outputs = mf.train_and_select_best_model(
    footy_tipping_data,
    tc.predictors,
    tc.outcome_var,
    tc.use_rfe,
    tc.num_folds,
    tc.opt_metric,
)
best_model, X_inference, label_encoder, game_id_inference = training_outputs

# Show the selected model as the cell output
best_model

Fitting 5 folds for each of 729 candidates, totalling 3645 fits
{'colsample_bytree': 0.7, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.9}
0.7776161452390961
Fitting 5 folds for each of 432 candidates, totalling 2160 fits
{'bootstrap': True, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 100}
0.7617700514421826
Fitting 5 folds for each of 1458 candidates, totalling 7290 fits
{'learning_rate': 0.01, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 200, 'subsample': 0.8}
0.7671731638125081


## Make Predictions

In [7]:
# Build the predictions frame from the selected model, the inference features,
# the label encoder and the inference game ids returned by the training step.
predictions_df = mf.model_predictions(
    best_model,
    X_inference,
    label_encoder,
    game_id_inference,
)

# Show the predictions as the cell output
predictions_df

Unnamed: 0,game_id,home_team_result,home_team_win_prob,home_team_lose_prob
0,20231110000.0,Loss,0.337083,0.662917
1,20231110000.0,Loss,0.258418,0.741582
2,20231110000.0,Win,0.861045,0.138955
3,20231110000.0,Win,0.822576,0.177424
4,20231110000.0,Win,0.910979,0.089021
5,20231110000.0,Win,0.786747,0.213253
6,20231110000.0,Win,0.631769,0.368231


## Write predictions back to the database

In [5]:
# Upsert statement: insert a prediction, or overwrite the stored values when a
# row with the same game_id already exists.
upsert_sql = '''
    INSERT INTO predictions_table (
        game_id,
        home_team_result,
        home_team_win_prob,
        home_team_lose_prob
    ) VALUES (?, ?, ?, ?)
    ON CONFLICT(game_id) DO UPDATE SET
        home_team_result = excluded.home_team_result,
        home_team_win_prob = excluded.home_team_win_prob,
        home_team_lose_prob = excluded.home_team_lose_prob
'''

# Build all parameter tuples up front. game_id is converted to a built-in int
# because the column is INTEGER PRIMARY KEY and the DataFrame may hold it as a
# NumPy float/integer scalar; a missing (NaN) game_id raises here instead of
# reaching the database. Probabilities are passed as built-in floats.
prediction_columns = [
    "game_id",
    "home_team_result",
    "home_team_win_prob",
    "home_team_lose_prob",
]
prediction_rows = [
    (int(game_id), result, float(win_prob), float(lose_prob))
    for game_id, result, win_prob, lose_prob in predictions_df[
        prediction_columns
    ].itertuples(index=False, name=None)
]

# Connect to the SQLite database
con = sqlite3.connect(str(db_path))
try:
    # Using the connection as a context manager commits when the block
    # succeeds and rolls back if any statement raises.
    with con:
        # Create the table if it does not exist
        con.execute('''
            CREATE TABLE IF NOT EXISTS predictions_table (
                game_id INTEGER PRIMARY KEY,
                home_team_result TEXT,
                home_team_win_prob REAL,
                home_team_lose_prob REAL
            )
        ''')

        # Write every prediction in one batched call instead of a Python loop
        con.executemany(upsert_sql, prediction_rows)
finally:
    # The context manager above does not close the connection; do it here so
    # it is released even when the write fails.
    con.close()


In [6]:
# Connect to the SQLite database
con = sqlite3.connect(str(db_path))
try:
    # Read the data from predictions_table into a pandas DataFrame
    predictions_table = pd.read_sql_query("SELECT * FROM predictions_table", con)
finally:
    # Close the connection even if the query raises
    con.close()

In [10]:
# Display the rows read back from predictions_table in the database
predictions_table

Unnamed: 0,game_id,home_team_result,home_team_win_prob,home_team_lose_prob
0,20231111610,Loss,0.443841,0.556159
1,20231111620,Win,0.743889,0.256111
2,20231111630,Win,0.738226,0.261774
3,20231111640,Loss,0.311119,0.688881
4,20231111650,Win,0.783201,0.216799
