# Model Training

In [None]:
#%pip install pandas
#%pip install scikit-learn
#%pip install matplotlib
#%pip install xgboost 
#%pip install git+https://github.com/Microsoft/LightGBM


import sys
import pandas as pd
import sqlite3
import pathlib

import xgboost as xgb
#import lightgbm as lgb
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

sys.path.append("model-training") 
import modelling_functions as mf
import training_config as tc

In [None]:
# Resolve the project root: two directories above the current working directory
project_root = pathlib.Path().absolute().parent.parent

# Build the path to the SQLite database relative to the project root
db_path = project_root / "data" / "footy-tipper-db.sqlite"

# sqlite3.connect() would silently create an empty database file at a wrong path
# and the query below would then fail with a misleading "no such table" error,
# so fail early with a clear message instead.
if not db_path.is_file():
    raise FileNotFoundError(f"SQLite database not found: {db_path}")

# Connect to the SQLite database
con = sqlite3.connect(str(db_path))

try:
    # Read the whole 'train_df' table into a pandas DataFrame
    train_df = pd.read_sql_query("SELECT * FROM train_df", con)
finally:
    # Always release the connection, even when the query raises
    con.close()

In [None]:
# RandomForest
# A fixed random_state makes the forest's bootstrap and feature sampling repeatable,
# so re-running this cell on the same data does not change the fitted trees.
rf_estimator = RandomForestClassifier(n_jobs=-1, random_state=42)

# Candidate hyper-parameter values passed to mf.training_pipeline
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# NOTE(review): presumably returns the fitted search/CV result — confirm in modelling_functions
cv_rf = mf.training_pipeline(train_df, rf_estimator, rf_param_grid, tc.outcome_var, tc.predictors, tc.opt_metric)

In [None]:
# GradientBoosting
# The grid below includes subsample < 1.0 and max_features, both of which make the
# fit stochastic; a fixed random_state keeps results repeatable across re-runs.
gb_estimator = GradientBoostingClassifier(random_state=42)

# Candidate hyper-parameter values passed to mf.training_pipeline
gb_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 0.9, 1.0],
    'max_features': ['sqrt', 'log2']
}

# NOTE(review): presumably returns the fitted search/CV result — confirm in modelling_functions
cv_gb = mf.training_pipeline(train_df, gb_estimator, gb_param_grid, tc.outcome_var, tc.predictors, tc.opt_metric)

In [None]:
# XGBoost
xgb_estimator = xgb.XGBClassifier(n_jobs=-1)

# Candidate values per XGBClassifier argument, passed to mf.training_pipeline
xgb_param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.3, 0.5, 0.7],
    gamma=[0, 0.1, 0.2],
)

# Outcome column, predictor list and optimisation metric all come from training_config
cv_xgb = mf.training_pipeline(
    train_df,
    xgb_estimator,
    xgb_param_grid,
    tc.outcome_var,
    tc.predictors,
    tc.opt_metric,
)

In [None]:
# # LightGBM
# lgb_estimator = lgb.LGBMClassifier(n_jobs=-1)
# lgb_param_grid = {
#     'n_estimators': [50, 100, 200, 500],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': [3, 4, 5],
#     'subsample': [0.8, 0.9, 1.0],
#     'colsample_bytree': [0.3, 0.5, 0.7],
#     'gamma': [0, 0.1, 0.2]
# }
# cv_lgb = mf.training_pipeline(train_df, lgb_estimator, lgb_param_grid, tc.outcome_var, tc.predictors, tc.opt_metric)