In [17]:
import os
import sys

# Make the project packages importable from this notebook.
# The path is resolved relative to the current working directory; two parent
# segments assume the notebook sits two folders below the project root, so
# adjust the number of `os.pardir` segments if the notebook is moved.
root_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(root_path) == 0:  # avoid duplicate entries when the cell is re-run
    sys.path.append(root_path)

import database_server.db_utilities as dbu
from cleaning.data_cleaning import DataCleaning
from models.trad_ml.feature_generation import FeatureGen

import pandas as pd
import numpy as np

In [18]:
# Parameter set handed to FeatureGen (constructor and set_params).
# Keys are grouped by the processing stage they configure.
feature_params_dict = dict(
    # --- moving-average (ma_*) features ---
    # higher alpha -> more weight on recent observations vs. older observations
    ma_alpha=0.35,
    ma_min_periods=0,
    ma_restart_each_season=True,

    # --- head-to-head (h2h_*) features ---
    # columns for which h2h features should be generated
    h2h_feature_cols=['result_score'],
    # EWMA alpha used for the head-to-head features
    h2h_alpha=0.35,

    min_non_na_share=0.9,

    # how the feature rows of the two teams are combined:
    # one of ['wide', 'diff_or_ratio']
    merge_type='wide',

    # --- categorical encoding ---
    # True -> one-hot encode selected features, False -> drop all categorical features
    apply_ohe=False,
    # name of a fitted ohe to load from file; must not be None when generating prediction features
    ohe_name=None,

    # --- train/test split ---
    # most recent date that is still included in the training set
    tt_split_cutoff_date=None,
    tt_split_test_season='2022-2023',

    # --- scaling ---
    apply_scaler=True,
    # name of a fitted scaler to load from file; must not be None when generating prediction features
    scaler_name=None,

    # --- PCA ---
    apply_pca=True,
    # name of a fitted pca to load from file (filename without .pkl suffix);
    # must not be None when generating prediction features
    pca_name=None,
    # only relevant if no fitted pca is loaded
    pca_n_components=0.98,

    # --- targets ---
    # one of [['gf', 'ga'], ['xg', 'xga']] or a list holding any single stat column
    targets=['gf', 'ga'],
    # if True (and two target columns were specified), the target is the
    # difference between the two columns
    target_as_diff=False,
)


In [19]:
# Create the feature generator from the parameter dict defined in the previous cell.
fg = FeatureGen(feature_params_dict)
# Disabled: explicit data load on the instance.
#fg.load_data()
# Disabled shortcut for this testing notebook only: fill fg.db_full_data from a
# local CSV instead of running the db query (faster).
#fg.db_full_data = pd.read_csv('db_full_data.csv', index_col=0)
#print(fg.db_full_data.shape)

In [20]:
# Apply the parameter dict to the existing instance. The FeatureGen constructor
# already calls set_params, so this cell presumably only matters when
# feature_params_dict was edited after fg was created.
fg.set_params(feature_params_dict)

In [21]:
# Training feature generation: no team ids are passed.
# Unpacks the train/test feature frames, the train/test targets, and nfc
# (returned because incl_non_feature_cols=True; presumably the list of
# non-feature column names). print_logs=True enables the step-by-step shape log.
X_train, X_test, y_train, y_test, nfc = fg.generate_features(incl_non_feature_cols=True, print_logs=True)

************************************************************
Starting training feature generation (run_name: zvvriu).


 - training data set loaded from db, shape: (21708, 159)
 - df shape after feature additions: (21708, 161)
 - number of h2h_ cols: 1
 - df shape after ma computation: (21708, 163)
 - df shape after encoding and dropping non-encoded categoricals: (21708, 156)
 - df shape after merge: (10854, 302)
 - n rows with any na after merge: 2948
 - df shape after dropping na rows over na threshold: (10854, 302)
 - X shape after feature/target split: (10854, 300)
 - X_train, X_test, y_train, y_test shapes after train/test split: (9027, 300), (1827, 300), (9027, 2), (1827, 2)
 - X_train, X_test, y_train, y_test shapes after final NA row drop: ((7641, 300), (1765, 300), (7641, 2), (1765, 2))
 - X_train, X_test shapes post scaling: ((7641, 300), (1765, 300))
 - X_train, X_test shapes post pca: ((7641, 164), (1765, 164))
Feature generation complete (run: zvvriu)


In [22]:
# Prediction feature generation for a single fixture: passing home_team_id and
# away_team_id returns only X_pred plus nfc instead of the train/test split.
# Note: this rebinds nfc from the training cell.
# NOTE(review): ohe_name/scaler_name/pca_name are None in feature_params_dict, so
# this presumably reuses the transformers fitted by the training run on the same
# fg instance -- confirm, and run the training cell first.
X_pred, nfc = fg.generate_features(incl_non_feature_cols=True, home_team_id=142, away_team_id=143, print_logs=True)

************************************************************
Starting prediction feature generation (run_name: zvvriu).
 - prediction data set (home team id: 142, away team id: 143) filtered from full data set, shape: (374, 159)
 - df shape after feature additions: (374, 161)
 - number of h2h_ cols: 1
 - df shape after ma computation: (374, 163)
 - df shape after filtering for most recent feature rows: (2, 163)
 - df shape after encoding and dropping non-encoded categoricals: (2, 156)
 - df shape after merge: (1, 302)
 - n rows with any na after merge: 1
 - df shape after dropping na rows over na threshold: (1, 302)
 - X shape after feature/target split: (1, 300)
 - X shape post scaling: (1, 300)
 - X shape post pca: (1, 164)
Feature generation complete (run: zvvriu)


In [23]:
# Show nfc as returned by the most recent generate_features call (the prediction run).
nfc

['schedule_date',
 'season_str',
 'league_id',
 'team_id',
 'opponent_id',
 'match_id']

In [24]:
# Show the prediction feature row for the requested home/away team pair.
X_pred

Unnamed: 0,schedule_date,season_str,league_id,team_id,opponent_id,match_id,0,1,2,3,...,148,149,150,151,152,153,154,155,156,157
0,,,,142,143,-1,8.432806,-0.244092,-0.55958,6.054282,...,-0.389422,-0.288284,-0.048904,-0.529675,-0.100874,0.842716,-0.168572,0.208106,0.268128,0.571674


In [25]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,schedule_date,season_str,league_id,team_id,opponent_id,match_id,0,1,2,3,...,148,149,150,151,152,153,154,155,156,157
0,2017-08-19,2017-2018,1,1,14,36,-0.484197,-3.573553,-6.450842,4.405964,...,0.333186,-1.392819,-0.573703,0.166484,-0.359142,-1.818897,0.186105,0.556133,1.891095,0.130279
1,2017-08-27,2017-2018,1,1,29,114,20.228267,3.538829,-5.933919,-1.945977,...,-0.423988,-0.58517,0.326511,0.392549,2.121723,-0.530028,-0.481979,-0.758697,0.675586,-0.090937
2,2017-09-16,2017-2018,1,1,5,184,-5.259397,-8.686431,-1.60397,-0.415826,...,-0.710084,-0.465227,0.33137,-0.174114,0.234759,0.782016,-0.402097,0.38102,0.341967,0.42903
3,2017-10-14,2017-2018,1,1,28,355,10.641385,-2.061179,-3.160838,-5.02468,...,0.896033,-0.441121,-0.518083,0.414847,0.430476,-0.623705,-0.705226,0.274121,-0.034198,0.254241
4,2017-10-28,2017-2018,1,1,21,464,-0.081084,-11.75153,-1.821951,-6.071949,...,0.06574,-0.395631,0.581169,0.101231,0.002327,-0.23551,-0.246484,0.618889,0.541525,-0.015456


In [26]:
# Show the training targets (the columns selected via 'targets' in feature_params_dict).
y_train

Unnamed: 0,gf,ga
0,1,0
1,4,0
2,1,1
3,0,0
4,3,0
...,...,...
7636,3,2
7637,3,2
7638,0,2
7639,1,1
