In [16]:
import os
import sys

# Make the project root importable from this notebook.
# Adjust the number of `os.pardir` hops such that root_path always points at the root
# project dir (i.e. if the current file is two folders deep, go up two levels).
root_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if root_path not in sys.path:
    sys.path.append(root_path)

import datetime as dt
import glob
import itertools
import pickle as pkl
import time

import numpy as np
import pandas as pd

# feature generation, model training, prediction & evaluation modules
import database_server.db_utilities as dbu
from models.trad_ml.feature_generation import FeatureGen
from models.trad_ml.training_prediction_evaluation import ModelTrainer
from models.trad_ml.training_prediction_evaluation import ModelPrediction
from models.trad_ml.training_prediction_evaluation import ModelEvaluation

## Hyperparameter Tuning 

We try to find optimal parameters for two steps of the modelling process:
1. Feature generation
    - Parameters are defined in the ``params`` dictionary of the ``FeatureGen`` class. 
    - Includes parameters for the moving average computations, data preparation steps (scaling, encoding, pca, NA value treatment) and more.
2. Model training
    - Parameters are passed as keyword arguments to the training methods of the ``ModelTrainer`` class (e.g. ``train_logreg``).
    - Includes model parameters for the specific model.

### Types of Models

We include a range of different model types (which can accept our feature dataframe as input) in the optimization process. 

#### Linear Model
- Logistic Regression

#### Tree-based Models
- xgboost
- Random Forest

### Prediction Tasks

We train (and tune) models for the following prediction tasks:
- Number of goals scored by each team
- Win/Loss/Draw (can be derived from number of goals, or goal difference)


### Evaluation Metrics

We use two different metrics to evaluate the models:
- Logarithmic loss function
- Accuracy based on "argmax"

### Optimization Process: Description

Optimization is separated by model type and prediction task. For each combination of model type & prediction task, we perform a grid search over a range of parameter value combinations ('sweep'). Promising parameter combinations might be fine-tuned further in a second round.

Results for each iteration ('run') over the parameter combinations are saved in dictionaries in a subfolder for the current sweep. (Saving each run individually ensures we don't lose intermediate results if the optimization process is interrupted.)

### Optimization Process: Implementation

#### 1. Logreg Model Prediction

In [29]:
### 1) feature generation search space

# Feature-generation parameters to be varied (example values).
# Every key maps to a list of candidate values; the sweep iterates over all combinations
# of one candidate per key.
fg_varied = {
    # the higher alpha, the more weight is put on recent observations vs. older observations
    'ma_alpha': [0.05],
    # note: we would strongly prefer low / no min_periods here (to enable predictions also
    # for teams with only few matches in our database)
    'ma_min_periods': [10],
    # we would prefer False here, see above
    'ma_restart_each_season': [False],

    # columns of which h2h features should be generated
    # NOTE(review): the only candidate here is the bare string 'result_score', not a list
    # of columns -- confirm that FeatureGen accepts a single column name.
    'h2h_feature_cols': ['result_score'],
    # head2head feature EWMA alpha
    'h2h_alpha': [0.1],
    # only relevant when fitting new pca (note: n_components can be a fraction between zero
    # and one, in which case the number of components is determined via the explained
    # variance threshold)
    'pca_n_components': [0.925],
}
# Feature-generation parameters held constant for the whole sweep (example values).
fg_fixed = {
    # presumably the NA threshold used when dropping rows -- TODO confirm in FeatureGen
    'min_non_na_share': 0.9,

    # how should feature rows of two teams be combined? -> one of ['wide', 'diff_or_ratio']
    'merge_type': 'wide',

    # True -> one-hot encode selected features, False -> drop all categorical features
    'apply_ohe': False,
    # load fitted ohe from file <- must not be None when generating prediction features!
    'ohe_name': None,

    # cutoff date is the most recent date to be included in training set.
    # Format: pd.to_datetime('yyyy-mm-dd').date()
    'tt_split_cutoff_date': None,
    # season in format yyyy-yyyy
    'tt_split_test_season': '2022-2023',

    'apply_scaler': True,
    # load fitted scaler from file <- must not be None when generating prediction features!
    'scaler_name': None,
    'apply_pca': True,
    # load fitted pca from file (provide filename without .pkl suffix)
    # <- must not be None when generating prediction features!
    'pca_name': None,

    # one of [['gf', 'ga'], ['xg', 'xga']] or list of any single stat column.
    'targets': ['gf', 'ga'],
    # if True (and two target columns were specified), target is provided as difference
    # between the two columns
    'target_as_diff': False,
}

### 2) model training search space: logreg params

# Logreg params to be varied: one list of candidate values per parameter.
# note: the recorded run with max_iter=20 stopped at the solver's iteration limit
# (see the warning in the sweep output), so treat these as example values only.
model_varied = {
    'max_iter': [20],
    'C': [0.001],
    # besides training, the sweep also forwards 'dif' to the prediction step
    'dif': [False],
    'class_weight': [None],
}

# Logreg params to be fixed (none for this sweep).
model_fixed = dict()

### instantiate feature gen and model training/predicting/evaluation objects

# note: loads full data set from db during first feature gen run
# NOTE(review): the initial params still hold the list-wrapped candidates from fg_varied;
# the sweep loop calls fg.set_params(...) with concrete values before generating features.
fg = FeatureGen(params_dict=dict(fg_fixed, **fg_varied))

modeller = ModelTrainer()
predictor = ModelPrediction()
evaluator = ModelEvaluation()

# Optimization sweep name, also used as the name of the results directory.
sweep_name = 'logreg_example' ### <- CHANGE FOR EVERY NEW SWEEP (will create directory)

# Should models be saved during the sweep (or just the result dicts)?
save_models = True

In [32]:
### output locations

# model save path (same for all sweeps/runs)
model_save_path = os.path.join(root_path, 'models', 'trad_ml', 'saved_models')
# path for sweep results (new folder for each sweep)
results_save_path = os.path.join(root_path, 'models', 'trad_ml', 'sweep_results', sweep_name)
# exist_ok=True creates missing directories and is a no-op for existing ones, which avoids
# the check-then-create pattern.
os.makedirs(model_save_path, exist_ok=True)
os.makedirs(results_save_path, exist_ok=True)

### sweep procedure

# Enumerate both parameter grids once and reuse them for counting and for iterating.
# (note: this still works if one of the varied param dicts is empty, since
# itertools.product() then yields a single empty tuple, i.e. exactly one combination)
fg_combinations = list(itertools.product(*fg_varied.values()))
model_combinations = list(itertools.product(*model_varied.values()))
n_runs = len(fg_combinations) * len(model_combinations)
counter = 0
print(f"Starting sweep '{sweep_name}' with {n_runs} runs total.")

# NOTE(review): features are regenerated for every model config although they only depend
# on fg_config. Hoisting generate_features() into the outer loop would save time, but
# run_name is handed to fg.set_params(), so anything FeatureGen stores under that name
# would then be shared between runs -- confirm before changing.
for fg_params in fg_combinations:  # one value per key of fg_varied
    for model_params in model_combinations:  # one value per key of model_varied
        # perf_counter is monotonic, so the elapsed time is unaffected by clock adjustments
        run_start_time = time.perf_counter()
        counter += 1
        # create new run name (random 6-character string)
        run_name = ''.join(np.random.choice(list('abcdefghijklmnopqrstuvwxyz'), size=6, replace=True))

        # assemble feature & model params dicts for current run: fixed params first,
        # then the varied values of this combination
        fg_config = fg_fixed.copy()
        fg_config.update(dict(zip(fg_varied.keys(), fg_params)))
        model_config = model_fixed.copy()
        model_config.update(dict(zip(model_varied.keys(), model_params)))

        ### generate features
        # update feature gen object with new params
        fg.set_params(new_params_dict=fg_config, run_name=run_name)
        # generate features (& labels)
        # TODO: consider print_logs=False to shorten the output of large sweeps
        X_train, X_test, y_train, y_test = fg.generate_features(incl_non_feature_cols=False, print_logs=True)

        ### create and train model
        model = modeller.train_logreg(X_train, y_train, **model_config)

        ### evaluate model
        preds = predictor.predict_prob(X_test, model, dif=model_config['dif'])
        accuracy = evaluator.accuracy(y_test, preds)
        lnloss = evaluator.lnloss(y_test, preds)

        ### save results
        # results dict containing all relevant info for current run (except the model itself)
        results_dict = {
            # run_name identifies the corresponding model and data prep objects later
            # (if saved during the process)
            'run_name': run_name,
            'fg_config': fg_config,
            'model_config': model_config,
            # should be considered since different fg configs can yield differently sized
            # train and test sets
            'train_test_split': {'train_size': len(X_train), 'test_size': len(X_test)},
            'task': f"targets: {fg_config['targets']}, predicting diff: {model_config['dif']}",
            # performance on the test set
            'metrics': {'accuracy': accuracy,
                        'lnloss': lnloss},
        }
        # each run is written to its own file so that an interrupted sweep keeps its
        # intermediate results
        with open(os.path.join(results_save_path, f"{sweep_name}_{counter}_{run_name}.pkl"), 'wb') as f:
            pkl.dump(results_dict, f)

        ### save model (separately from results dict)
        if save_models:
            with open(os.path.join(model_save_path, f"{model.__class__.__name__}_{run_name}.pkl"), 'wb') as f:
                pkl.dump(model, f)

        # print parameters of iteration
        print(f"Run {run_name} used {fg_config} and {model_config}")

        # print progress
        print(f"Run {run_name} ({counter}/{n_runs}) finished in {time.perf_counter() - run_start_time} seconds.\n")

Starting sweep 'logreg_example' with 1 runs total.
************************************************************
Starting training feature generation (run_name: mbywna).
 - df shape after feature additions: (21708, 161)
 - number of h2h_ cols: 1
 - df shape after ma computation: (21708, 163)
 - df shape after encoding and dropping non-encoded categoricals: (21708, 156)
 - df shape after merge: (10854, 302)
 - n rows with any na after merge: 2028
 - df shape after dropping na rows over na threshold: (10854, 302)
 - X shape after feature/target split: (10854, 300)
 - X_train, X_test, y_train, y_test shapes after train/test split: (9027, 300), (1827, 300), (9027, 2), (1827, 2)
 - X_train, X_test, y_train, y_test shapes after final NA row drop: ((7701, 300), (1759, 300), (7701, 2), (1759, 2))
 - X_train, X_test shapes post scaling: ((7701, 300), (1759, 300))
 - X_train, X_test shapes post pca: ((7701, 84), (1759, 84))
Feature generation complete (run: mbywna)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Run mbywna used {'min_non_na_share': 0.9, 'merge_type': 'wide', 'apply_ohe': False, 'ohe_name': None, 'tt_split_cutoff_date': None, 'tt_split_test_season': '2022-2023', 'apply_scaler': True, 'scaler_name': None, 'apply_pca': True, 'pca_name': None, 'targets': ['gf', 'ga'], 'target_as_diff': False, 'ma_alpha': 0.05, 'ma_min_periods': 10, 'ma_restart_each_season': False, 'h2h_feature_cols': 'result_score', 'h2h_alpha': 0.1, 'pca_n_components': 0.925} and {'max_iter': 20, 'C': 0.001, 'dif': False, 'class_weight': None}
Run mbywna (1/1) finished in 11.695378541946411 seconds.



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [33]:
def load_sweep_results(results_dir):
    """Collect the per-run result dicts of a sweep into one DataFrame.

    Reads every ``*.pkl`` file in ``results_dir`` (in sorted filename order, so the row
    order is reproducible) and flattens the run name, the swept feature-generation and
    model parameters, and the evaluation metrics into one row per run.

    Parameters
    ----------
    results_dir : str
        Directory containing the pickled result dicts written by the sweep loop.

    Returns
    -------
    pandas.DataFrame
        One row per result file; empty if the directory contains no ``*.pkl`` files.

    Raises
    ------
    KeyError
        If a result dict lacks one of the expected keys.

    Notes
    -----
    Unpickling can execute arbitrary code -- only point this at result files that were
    produced by this notebook.
    """
    records = []
    # glob returns paths in arbitrary order, hence the explicit sort
    for path in sorted(glob.glob(os.path.join(results_dir, "*.pkl"))):
        with open(path, 'rb') as f:
            run = pkl.load(f)
        fg_cfg = run['fg_config']
        model_cfg = run['model_config']
        metrics = run['metrics']
        records.append({
            'name': run['run_name'],
            'ma_alpha': fg_cfg['ma_alpha'],
            'h2h_alpha': fg_cfg['h2h_alpha'],
            'ma_min_periods': fg_cfg['ma_min_periods'],
            'ma_restart_each_season': fg_cfg['ma_restart_each_season'],
            'h2h_feature_cols': fg_cfg['h2h_feature_cols'],
            'max_iter': model_cfg['max_iter'],
            'C': model_cfg['C'],
            'accuracy': metrics['accuracy'],
            'lnloss': metrics['lnloss'],
            'dif': model_cfg['dif'],
            'pca_n_components': fg_cfg['pca_n_components'],
            'class_weight': model_cfg['class_weight'],
        })
    # build the frame once from the collected records
    return pd.DataFrame(records)


# One row per run of the current sweep
df = load_sweep_results(results_save_path)
df


     name  ma_alpha  h2h_alpha  ma_min_periods  ma_restart_each_season  \
0  mbywna      0.05        0.1              10                   False   

  h2h_feature_cols  max_iter      C  accuracy    lnloss    dif  \
0     result_score        20  0.001  0.528709  0.989105  False   

   pca_n_components class_weight  
0             0.925         None  


In [35]:
# Best run per metric: the row with the highest accuracy and the row with the lowest lnloss
best_accuracy_idx = df['accuracy'].idxmax()
max_accuracy_row = df.loc[best_accuracy_idx]
lowest_lnloss_idx = df['lnloss'].idxmin()
min_lnloss_row = df.loc[lowest_lnloss_idx]
print(max_accuracy_row)
print(min_lnloss_row)

name                            mbywna
ma_alpha                          0.05
h2h_alpha                          0.1
ma_min_periods                      10
ma_restart_each_season           False
h2h_feature_cols          result_score
max_iter                            20
C                                0.001
accuracy                      0.528709
lnloss                        0.989105
dif                              False
pca_n_components                 0.925
class_weight                      None
Name: 0, dtype: object
name                            mbywna
ma_alpha                          0.05
h2h_alpha                          0.1
ma_min_periods                      10
ma_restart_each_season           False
h2h_feature_cols          result_score
max_iter                            20
C                                0.001
accuracy                      0.528709
lnloss                        0.989105
dif                              False
pca_n_components                 0.925
cl

In [36]:
# group by parameter of choice and print mean of lnloss or accuracy
# example: grouped_df_pca_components = df.groupby('pca_n_components')['lnloss'].mean()
#          print(grouped_df_pca_components)
