In [1]:
import os
import sys

root_path = os.path.abspath(os.path.join('../..')) # <- adjust such that root_path always points at the root project dir (i.e. if current file is two folders deep, use '../..'). 
if root_path not in sys.path:
    sys.path.append(root_path)

import pandas as pd
import numpy as np

import itertools
import pickle as pkl

import time

# feature generation, model training, prediction & evaluation modules
from models.trad_ml.feature_generation import FeatureGen
# from models.trad_ml.??? import ???


## Hyperparameter Tuning 

We try to find optimal parameters for two steps of the modelling process:
1. Feature generation
    - Parameters are defined in the ``params`` dictionary of the ``FeatureGen`` class. 
    - Includes parameters for the moving average computations, data preparation steps (scaling, encoding, pca, NA value treatment) and more.
2. Model training
    - Parameters are defined in the ``params`` dictionary of the ``Training`` class.
    - Includes model parameters for the specific model.

### Types of Models

We include a range of different model types (which can accept our feature dataframe as input) in the optimization process. 

#### Linear Models
- Linear Regression
- Logistic Regression
- Ridge Regression
#### Tree-based Models
- xgboost
- Random Forest

### Prediction Tasks

We train (and tune) models for the following prediction tasks:
- Number of goals scored by each team
- Win/Loss/Draw (can be derived from number of goals, or predicted directly)




### Evaluation Metrics

### Optimization Process: Description

Optimization is separated by model type and prediction task. For each combination of model type & prediction task, we perform a grid search over a range of parameter value combinations ('sweep'). Promising parameter combinations might be fine-tuned further in a second round.

Results for each iteration ('run') over the parameter combinations are saved in dictionaries in a subfolder for the current sweep. (Saving each run individually ensures we don't lose intermediate results if the optimization process is interrupted.)

### Optimization Process: Implementation

#### 1. XGBoost Model

In [2]:
### 1) feature generation search space

# parameters to be varied
fg_varied = {
    'ma_alpha': [0.25, 0.5, 0.75], # the higher alpha, the more weight is put on recent observations vs. older observations
    'ma_min_periods': [0, 5], # note: we would strongly prefer low / no min_periods here (to enable predictions also for teams with only few matches in our database)
    'ma_restart_each_season': [True, False], # we would prefer False here, see above

    'h2h_feature_cols': [['result_score'], ['result_score', 'xg', 'xga']], # list of columns of which h2h features should be generated
    'h2h_alpha': [0.35, 0.75], # head2head feature EWMA alpha

    'pca_n_components': [0.95, 0.99], # only relevant when fitting new pca (note: n_components can be a fraction between zero and one, in which case the number of components is determined via the explained variance threshold)
}

# fixed parameters
fg_fixed = {
    'min_non_na_share': 0.9,

    'merge_type': 'wide', # how should feature rows of two teams be combined? -> one of ['wide', 'diff_or_ratio']

    'apply_ohe': False, # True -> one-hot encode selected features, False -> drop all categorical features
    'ohe_name': None, # load fitted ohe from file <- must not be None when generating prediction features!

    'tt_split_cutoff_date': None, # cutoff date is the most recent date to be included in training set
    'tt_split_test_season': '2022-2023',

    'apply_scaler': True,
    'scaler_name': None, # load fitted scaler from file <- must not be None when generating prediction features!
    'apply_pca': True,
    'pca_name': None, # load fitted pca from file (provide filename without .pkl suffix) <- must not be None when generating prediction features!

    'targets': ['gf', 'ga'], # one of [['gf', 'ga'], ['xg', 'xga']] or list of any single stat column.
    'target_as_diff': False # if True (and two target columns were specified), target is provided as difference between the two columns
}

### 2) model training search space: xgb params

# xgboost params to be varied
model_varied = {
    ### varied params
}
model_fixed = {
    ### fixed params
}

### instantiate feature gen and model training objects

fg = FeatureGen(params_dict={**fg_fixed, **fg_varied}) # note: loads full data set from db during first feature gen run

# modeller = ... # <- instantiate modeller object here 

# predictor = ... # <- instantiate predictor object here

# evaluator = ... # <- instantiate evaluator object here (if implemented as separate class)


# optimization sweep name
sweep_name = 'xgb_coarse_grid_search' ### <- CHANGE FOR EVERY NEW SWEEP (will create directory)
# should models be saved during the sweep (or just the result dicts)?
save_models = True

In [3]:
# model save path (same for all sweeps/runs)
model_save_path = os.path.join(root_path, 'models', 'trad_ml', 'saved_models')
if not os.path.exists(model_save_path):
    os.makedirs(model_save_path)

# define path for sweep results (new folder for each sweep)
results_save_path = os.path.join(root_path, 'models', 'trad_ml', 'sweep_results', sweep_name)
if not os.path.exists(results_save_path):
    os.makedirs(results_save_path)

### sweep procedure

# get number of runs to be executed for this sweep
n_runs = len(list(itertools.product(*fg_varied.values()))) * len(list(itertools.product(*model_varied.values()))) # (note: still works if one of the varied param dicts is empty, since itertools returns an empty tuple (which counts as a list element))
counter = 0 
print(f"Starting sweep '{sweep_name}' with {n_runs} runs total.")
  
# iterate over all combinations of fg_space_varied using itertools.product
for fg_params in itertools.product(*fg_varied.values()): # yields fg_varied value combinations
    for model_params in itertools.product(*model_varied.values()): # yields model_varied value combinations
        run_start_time = time.time()
        counter += 1
        # create new run name (random 6-character string)
        run_name = ''.join(np.random.choice(list('abcdefghijklmnopqrstuvwxyz'), size=6, replace=True))

        # assemble feature & model params dicts for current run
        fg_config = fg_fixed.copy() # start with fixed params
        fg_config.update(dict(zip(fg_varied.keys(), fg_params))) # add varied params
        model_config = model_fixed.copy() 
        model_config.update(dict(zip(model_varied.keys(), model_params))) 

        ### generate features
        # update feature gen object with new params
        fg.set_params(new_params_dict=fg_config, run_name=run_name)
        # generate features (& labels)
        X_train, X_test, y_train, y_test = fg.generate_features(incl_non_feature_cols=False, print_logs=True) # logs false?

        ### create and train model
        # ...
        # model = modeller.train_xgb(X_train, X_test, params_dict=model_config) # <- sth like this

        ### evaluate model (i.e. predict test set and compute metric(s))
        # ...
        # preds = predictor.predict_proba(...)
        # ... evaluate (maybe third class and rename to train_pred_eval.py? idk...)

        ### save results
        # create results dict, containing all relevant info for current run (except the model itself)
        results_dict = {
            'run_name': run_name, # <- to be able to identify corresponding model and data prep objects later (if saved during the process)
            'fg_config': fg_config,
            'model_config': model_config,
            'train_test_split': {'train_size': len(X_train), 'test_size': len(X_test)}, # should be considered since different fg configs can yield differently sized train and test sets
            'task': None, # <- maybe description / type of task (classification/regression, etc.)
            'metrics': None # <- maybe dict of multiple metrics of performance on test set (accuracy, logloss, etc.)
        }
        # save
        with open(os.path.join(results_save_path, f"{sweep_name}_{counter}_{run_name}.pkl"), 'wb') as f:
            pkl.dump(results_dict, f)

        ### save model (separately from results dict)
        with open(os.path.join(model_save_path, f"{model.__class__.__name__}_{run_name}.pkl"), 'wb') as f:
            pkl.dump(results_dict, f)
        
        # print progress
        print(f"Run {run_name} ({counter}/{n_runs}) finished in {time.time() - run_start_time} seconds.\n")

Starting sweep 'xgb_coarse_grid_search_1' with 96 runs total.
************************************************************
Starting training feature generation (run_name: qkrove).
 - training data set loaded from db, shape: (21708, 159)
 - df shape after feature additions: (21708, 161)
 - number of h2h_ cols: 1
 - df shape after ma computation: (21708, 163)
 - df shape after encoding and dropping non-encoded categoricals: (21708, 156)
 - df shape after merge: (10854, 302)
 - n rows with any na after merge: 2948
 - df shape after dropping na rows over na threshold: (10854, 302)
 - X shape after feature/target split: (10854, 300)
 - X_train, X_test, y_train, y_test shapes after train/test split: (9027, 300), (1827, 300), (9027, 2), (1827, 2)
 - X_train, X_test, y_train, y_test shapes after final NA row drop: ((7641, 300), (1765, 300), (7641, 2), (1765, 2))
 - X_train, X_test shapes post scaling: ((7641, 300), (1765, 300))
 - X_train, X_test shapes post pca: ((7641, 128), (1765, 128))
Fea

NameError: name 'model' is not defined