# Datasets Benchmark

**Summary of this Article** 
- Loading best hyperparameters for each model
- Model training
- Model evaluation
- Results discussion


## Loading best hyperparameters for each model

As explained in another notebook, the hyperparameters for each model were tuned using the Optuna library. The tuned values differ for each dataset and model. The value of each hyperparameter is shown below.


In [13]:
# Import hyperparameters dataset.
import os 
import pandas as pd

In [29]:
# Bucket every tuned-hyperparameter CSV found in 'hyper_params_results' by the
# dataset variant encoded in its file name:
#   * name contains 'classifier'          -> boolean_hyper_params
#   * name has the token 'sparse.csv'     -> sparse_hyper_params
#   * name has the token 'focused.csv'    -> focused_hyper_params
# Files matching none of the above are ignored.
sparse_hyper_params = {}
focused_hyper_params = {}
boolean_hyper_params = {}
for file in os.listdir('hyper_params_results'):
    if not file.endswith('.csv'):
        continue
    name_tokens = file.split('_')
    # A classifier file always lands in the boolean bucket, whatever its suffix.
    if 'classifier' in file:
        bucket = boolean_hyper_params
    elif 'sparse.csv' in name_tokens:
        bucket = sparse_hyper_params
    elif 'focused.csv' in name_tokens:
        bucket = focused_hyper_params
    else:
        continue
    df = pd.read_csv(os.path.join('hyper_params_results', file))
    bucket[file] = df

# Dump each bucket, one file name followed by its parameter table.
print('Sparse hyper params:\n')
for key, table in sparse_hyper_params.items():
    print(key, ':\n ', table)
print('Focused hyper params:\n')
for key, table in focused_hyper_params.items():
    print(key, ':\n', table)
print('Boolean hyper params:\n')
for key, table in boolean_hyper_params.items():
    print(key, ':\n', table)

Sparse hyper params:

params_gradient_boost_regression_sparse.csv :
            params                 value
0   n_estimators                   727
1  learning_rate     0.347390412877283
2           loss         squared_error
3          value  0.006486628478827696
params_support_vector_regression_sparse.csv :
     params                  value
0  kernel                    rbf
1       C   0.013225080938087181
2  degree                      5
3   gamma  7.229703411231879e-07
4   value   0.005822757995746922
params_xgboost_regression_sparse.csv :
               params                   value
0           booster                gblinear
1            lambda  1.4617144209122512e-06
2             alpha     0.09865530031009674
3         subsample      0.3609716863271228
4  colsample_bytree      0.6484747255416106
5             value                     0.0
Focused hyper params:

params_gradient_boost_regression_focused.csv :
           params                 value
0   n_estimators              

In [44]:
def get_hyper_params_from_df(df: pd.DataFrame) -> dict:
    """Turn a two-column hyperparameter table into a plain dictionary.

    Args:
        df: DataFrame with a 'params' column (names) and a 'value' column
            (the matching values), one hyperparameter per row.

    Returns:
        dict mapping each 'params' entry to its 'value' entry. Values are
        returned exactly as stored in the frame (no type conversion), and
        when a name appears more than once the last row wins.
    """
    return {record['params']: record['value'] for _, record in df.iterrows()}

## Loading the data

In [47]:
import sys
sys.path.append('..')
from thesis_package import aimodels as my_ai, utils, metrics

import sklearn.metrics
from sklearn.model_selection import train_test_split

# Load the exogenous features and every ground-truth target, then build the
# train/test splits (plain and scaled) used by the benchmark cells below.
#
# Paths are assembled with os.path.join instead of backslash literals: the
# original strings only resolved on Windows and relied on invalid escape
# sequences ('\d', '\p', '\g', ...) being kept verbatim by the parser.
DATA_DIR = os.path.join('..', 'data')
GROUND_TRUTH_DIR = os.path.join(DATA_DIR, 'ground_truth')


def _load_ground_truth(file_name):
    """Read one ground-truth CSV and drop its 'timestamps' column."""
    return pd.read_csv(os.path.join(GROUND_TRUTH_DIR, file_name)).drop(columns=['timestamps'])


def _make_split(features, targets, scaling=False):
    """Split features/targets and pack the four parts into a dict.

    Args:
        features: feature table handed to utils.split_and_suffle.
        targets: target table handed to utils.split_and_suffle.
        scaling: when True, forwards scaling=True to the splitter; otherwise
            the splitter is called without the keyword so its own default
            applies (presumably scaling toggles feature scaling -- confirm
            in thesis_package.utils).

    Returns:
        dict with keys 'X_train', 'X_test', 'y_train', 'y_test'.
    """
    if scaling:
        parts = utils.split_and_suffle(features, targets, scaling=True)
    else:
        parts = utils.split_and_suffle(features, targets)
    train_x, test_x, train_y, test_y = parts
    return {'X_train': train_x, 'X_test': test_x, 'y_train': train_y, 'y_test': test_y}


exogenous_data = pd.read_csv(
    os.path.join(DATA_DIR, 'processed', 'production', 'exogenous_data_extended.csv')
).drop(columns=['date'])

# The split calls below keep the original order, in case the splitter draws
# from a shared random state.

# Regression data sparse
y_max_u_sparse = _load_ground_truth('res_bus_vm_pu_max_constr.csv')
y_min_u_sparse = _load_ground_truth('res_bus_vm_pu_min_constr.csv')

data_max_u_sparse = _make_split(exogenous_data, y_max_u_sparse)
data_max_u_scaled_sparse = _make_split(exogenous_data, y_max_u_sparse, scaling=True)
data_min_u_sparse = _make_split(exogenous_data, y_min_u_sparse)
data_min_u_scaled_sparse = _make_split(exogenous_data, y_min_u_sparse, scaling=True)

# Regression data focused
y_max_u_focused = _load_ground_truth('res_bus_vm_pu_max_bal_constr.csv')
y_min_u_focused = _load_ground_truth('res_bus_vm_pu_min_bal_constr.csv')

data_max_u_focused = _make_split(exogenous_data, y_max_u_focused)
data_max_u_scaled_focused = _make_split(exogenous_data, y_max_u_focused, scaling=True)
data_min_u_focused = _make_split(exogenous_data, y_min_u_focused)
data_min_u_scaled_focused = _make_split(exogenous_data, y_min_u_focused, scaling=True)

# Classification data
y_max_u = _load_ground_truth('res_bus_vm_pu_max_bool_constr.csv')
y_min_u = _load_ground_truth('res_bus_vm_pu_min_bool_constr.csv')

data_max_u_bool = _make_split(exogenous_data, y_max_u)
data_max_u_bool_scaled = _make_split(exogenous_data, y_max_u, scaling=True)
data_min_u_bool = _make_split(exogenous_data, y_min_u)
data_min_u_bool_scaled = _make_split(exogenous_data, y_min_u, scaling=True)

## Training models
In this section the models are trained with the hyperparameters loaded above. All the models are stored in the same `Context` object for later evaluation. The `Context` class, defined in the `aimodels.py` file, stores all the models and their respective hyperparameters.

### Max Voltage

In [None]:
# max_u regression sparse
# Fit (or reload from the pickle cache) the four max-voltage regressors on the
# sparse dataset, then predict with each of them.
#
# The cache name is now the same for the existence check, the save and the
# load. Previously the cell checked for 'max_u_regressor_sparse.pickle', saved
# as 'max_u_regressor_spare' and loaded 'regressor_max_u_sparse' through a path
# whose '\r' was parsed as a carriage return, so the cache could never be hit.
pickle_dir = os.path.join('pickles', 'dataset_benchmark')
# Base path is passed without extension; the existence check looks for
# '<name>.pickle', so utils.serialize_object presumably appends it -- TODO confirm.
pickle_path = os.path.join(pickle_dir, 'max_u_regressor_sparse')
if not os.path.isfile(pickle_path + '.pickle'):
    # NOTE(review): every hyper_params dict below is empty, so the tuned values
    # loaded earlier in the notebook are not applied here -- confirm intent.
    # Linear Regression
    regressor_max_u = my_ai.Context(strategy=my_ai.LinearRegressionStrategy())
    regressor_max_u.fit(data=data_max_u_sparse)
    # Gradient Boost Regression
    hyper_params = {}
    regressor_max_u.strategy = my_ai.GradientBoostRegressorStrategy(hyper_params)
    regressor_max_u.fit(data=data_max_u_sparse)
    # Extreme GBoost Regression
    hyper_params = {}
    regressor_max_u.strategy = my_ai.XGBoostRegressorStrategy(hyper_params)
    regressor_max_u.fit(data=data_max_u_sparse)
    # Support Vector Regression (fitted on the scaled split)
    hyper_params = {}
    regressor_max_u.strategy = my_ai.SupportVectorRegressorStrategy(hyper_params)
    regressor_max_u.fit(data=data_max_u_scaled_sparse)
    # Make sure the cache directory exists before writing into it.
    os.makedirs(pickle_dir, exist_ok=True)
    utils.serialize_object(pickle_path, regressor_max_u)
else:
    regressor_max_u = utils.deserialize_object(pickle_path)

# strategies[i] is read in the order the strategies were fitted above
# (presumably Context keeps each assigned strategy -- confirm in aimodels.py).
# Predictions are labelled with the columns of the regression target the
# models were trained on (y_max_u_sparse), not the boolean target.
# Linear Regression
prediction_lr_max_u = regressor_max_u.strategies[0].predict(data=data_max_u_sparse)
prediction_lr_max_u = pd.DataFrame(prediction_lr_max_u, columns=y_max_u_sparse.columns)
# Gradient Boost Regression
prediction_gb_max_u = regressor_max_u.strategies[1].predict(data=data_max_u_sparse)
prediction_gb_max_u = pd.DataFrame(prediction_gb_max_u, columns=y_max_u_sparse.columns)
# Extreme GBoost Regression
prediction_xgb_max_u = regressor_max_u.strategies[2].predict(data=data_max_u_sparse)
prediction_xgb_max_u = pd.DataFrame(prediction_xgb_max_u, columns=y_max_u_sparse.columns)
# Support Vector Regression
prediction_svr_max_u = regressor_max_u.strategies[3].predict(data=data_max_u_scaled_sparse)
prediction_svr_max_u = pd.DataFrame(prediction_svr_max_u, columns=y_max_u_sparse.columns)

In [None]:
# max_u classification
# Fit (or reload from the pickle cache) the three max-voltage classifiers on
# the boolean dataset, then predict with each of them.
#
# The cache check previously listed 'pickles\dataset_benchmark\dataset_benchmark'
# (duplicated directory segment) and the load used 'classifier_max_u' while the
# save used 'max_u_classifier'; one portable path is now used for all three.
pickle_dir = os.path.join('pickles', 'dataset_benchmark')
# Base path is passed without extension; the existence check looks for
# '<name>.pickle', so utils.serialize_object presumably appends it -- TODO confirm.
pickle_path = os.path.join(pickle_dir, 'max_u_classifier')
if not os.path.isfile(pickle_path + '.pickle'):
    # NOTE(review): every hyper_params dict below is empty, so the tuned values
    # loaded earlier in the notebook are not applied here -- confirm intent.
    # Gradient Boost Classifier
    hyper_params = {}
    classifier_max_u = my_ai.Context(strategy=my_ai.GradientBoostClassifierStrategy(hyper_params))
    classifier_max_u.fit(data=data_max_u_bool)
    # Extreme GBoost Classifier
    hyper_params = {}
    classifier_max_u.strategy = my_ai.XGBoostClassifierStrategy(hyper_params)
    classifier_max_u.fit(data=data_max_u_bool)
    # Support Vector Classifier (fitted on the scaled split)
    hyper_params = {}
    classifier_max_u.strategy = my_ai.SupportVectorClassifierStrategy(hyper_params)
    classifier_max_u.fit(data=data_max_u_bool_scaled)
    # Make sure the cache directory exists before writing into it.
    os.makedirs(pickle_dir, exist_ok=True)
    utils.serialize_object(pickle_path, classifier_max_u)
else:
    classifier_max_u = utils.deserialize_object(pickle_path)

# strategies[i] is read in the order the strategies were fitted above
# (presumably Context keeps each assigned strategy -- confirm in aimodels.py).
# NOTE(review): these prediction_*_max_u names are also assigned by the max_u
# regression cell, so running this cell overwrites the regression predictions.
# Gradient Boost Classifier
prediction_gb_max_u = classifier_max_u.strategies[0].predict(data=data_max_u_bool)
prediction_gb_max_u = pd.DataFrame(prediction_gb_max_u, columns=y_max_u.columns)
# Extreme GBoost Classifier
prediction_xgb_max_u = classifier_max_u.strategies[1].predict(data=data_max_u_bool)
prediction_xgb_max_u = pd.DataFrame(prediction_xgb_max_u, columns=y_max_u.columns)
# Support Vector Classifier
prediction_svr_max_u = classifier_max_u.strategies[2].predict(data=data_max_u_bool_scaled)
prediction_svr_max_u = pd.DataFrame(prediction_svr_max_u, columns=y_max_u.columns)

### Min Voltage


In [None]:
# min_u regression sparse
# Fit (or reload from the pickle cache) the four min-voltage regressors on the
# sparse dataset, then predict with each of them.
#
# The cache path is built with os.path.join: the original backslash literals
# only resolved on Windows and relied on invalid escapes ('\d', '\m') being
# kept verbatim by the parser.
pickle_dir = os.path.join('pickles', 'dataset_benchmark')
# Base path is passed without extension; the existence check looks for
# '<name>.pickle', so utils.serialize_object presumably appends it -- TODO confirm.
pickle_path = os.path.join(pickle_dir, 'min_u_regressor_sparse')
if not os.path.isfile(pickle_path + '.pickle'):
    # NOTE(review): every hyper_params dict below is empty, so the tuned values
    # loaded earlier in the notebook are not applied here -- confirm intent.
    # Linear Regression
    regressor_min_u = my_ai.Context(strategy=my_ai.LinearRegressionStrategy())
    regressor_min_u.fit(data=data_min_u_sparse)
    # Gradient Boost Regression
    hyper_params = {}
    regressor_min_u.strategy = my_ai.GradientBoostRegressorStrategy(hyper_params)
    regressor_min_u.fit(data=data_min_u_sparse)
    # Extreme GBoost Regression
    hyper_params = {}
    regressor_min_u.strategy = my_ai.XGBoostRegressorStrategy(hyper_params)
    regressor_min_u.fit(data=data_min_u_sparse)
    # Support Vector Regression (fitted on the scaled split)
    hyper_params = {}
    regressor_min_u.strategy = my_ai.SupportVectorRegressorStrategy(hyper_params)
    regressor_min_u.fit(data=data_min_u_scaled_sparse)
    # Make sure the cache directory exists before writing into it.
    os.makedirs(pickle_dir, exist_ok=True)
    utils.serialize_object(pickle_path, regressor_min_u)
else:
    regressor_min_u = utils.deserialize_object(pickle_path)

# strategies[i] is read in the order the strategies were fitted above
# (presumably Context keeps each assigned strategy -- confirm in aimodels.py).
# Predictions are labelled with the columns of the regression target the
# models were trained on (y_min_u_sparse), not the boolean target.
# Linear Regression
prediction_lr_min_u = regressor_min_u.strategies[0].predict(data=data_min_u_sparse)
prediction_lr_min_u = pd.DataFrame(prediction_lr_min_u, columns=y_min_u_sparse.columns)
# Gradient Boost Regression
prediction_gb_min_u = regressor_min_u.strategies[1].predict(data=data_min_u_sparse)
prediction_gb_min_u = pd.DataFrame(prediction_gb_min_u, columns=y_min_u_sparse.columns)
# Extreme GBoost Regression
prediction_xgb_min_u = regressor_min_u.strategies[2].predict(data=data_min_u_sparse)
prediction_xgb_min_u = pd.DataFrame(prediction_xgb_min_u, columns=y_min_u_sparse.columns)
# Support Vector Regression
prediction_svr_min_u = regressor_min_u.strategies[3].predict(data=data_min_u_scaled_sparse)
prediction_svr_min_u = pd.DataFrame(prediction_svr_min_u, columns=y_min_u_sparse.columns)


In [None]:

# min_u classification
# Fit (or reload from the pickle cache) the three min-voltage classifiers on
# the boolean dataset, then predict with each of them.
#
# The load previously used 'classifier_min_u' while the existence check and the
# save used 'min_u_classifier', so a cached model could never be read back;
# one portable path is now used for all three.
pickle_dir = os.path.join('pickles', 'dataset_benchmark')
# Base path is passed without extension; the existence check looks for
# '<name>.pickle', so utils.serialize_object presumably appends it -- TODO confirm.
pickle_path = os.path.join(pickle_dir, 'min_u_classifier')
if not os.path.isfile(pickle_path + '.pickle'):
    # NOTE(review): every hyper_params dict below is empty, so the tuned values
    # loaded earlier in the notebook are not applied here -- confirm intent.
    # Gradient Boost Classifier
    hyper_params = {}
    classifier_min_u = my_ai.Context(strategy=my_ai.GradientBoostClassifierStrategy(hyper_params))
    classifier_min_u.fit(data=data_min_u_bool)
    # Extreme GBoost Classifier
    hyper_params = {}
    classifier_min_u.strategy = my_ai.XGBoostClassifierStrategy(hyper_params)
    classifier_min_u.fit(data=data_min_u_bool)
    # Support Vector Classifier (fitted on the scaled split)
    hyper_params = {}
    classifier_min_u.strategy = my_ai.SupportVectorClassifierStrategy(hyper_params)
    classifier_min_u.fit(data=data_min_u_bool_scaled)
    # Make sure the cache directory exists before writing into it.
    os.makedirs(pickle_dir, exist_ok=True)
    utils.serialize_object(pickle_path, classifier_min_u)
else:
    classifier_min_u = utils.deserialize_object(pickle_path)

# strategies[i] is read in the order the strategies were fitted above
# (presumably Context keeps each assigned strategy -- confirm in aimodels.py).
# NOTE(review): these prediction_*_min_u names are also assigned by the min_u
# regression cell, so running this cell overwrites the regression predictions.
# Gradient Boost Classifier
prediction_gb_min_u = classifier_min_u.strategies[0].predict(data=data_min_u_bool)
prediction_gb_min_u = pd.DataFrame(prediction_gb_min_u, columns=y_min_u.columns)
# Extreme GBoost Classifier
prediction_xgb_min_u = classifier_min_u.strategies[1].predict(data=data_min_u_bool)
prediction_xgb_min_u = pd.DataFrame(prediction_xgb_min_u, columns=y_min_u.columns)
# Support Vector Classifier
prediction_svr_min_u = classifier_min_u.strategies[2].predict(data=data_min_u_bool_scaled)
prediction_svr_min_u = pd.DataFrame(prediction_svr_min_u, columns=y_min_u.columns)