# Datasets Benchmark

**Summary of this Article** 
- Loading best hyperparameters for each model
- Model training
- Results discussion


## Loading best hyperparameters for each model

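**TODO:** explain this model benchmark in more detail. The cells below read the best hyperparameters found for each model from the CSV files in `hyper_params_results` and group them into sparse, focused (which also receives the balanced results) and classifier sets.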

In [1]:
# Import hyperparameters dataset.
import os 
import pandas as pd

In [2]:
# Group the tuned-hyperparameter CSV files by the dataset variant named in the file name.
sparse_hyper_params = {}
focused_hyper_params = {}
boolean_hyper_params = {}
results_dir = 'hyper_params_results'
for file_name in os.listdir(results_dir):
    if not file_name.endswith('.csv'):
        continue
    name_tokens = file_name.split('_')
    if 'classifier' in file_name:
        # Any CSV whose name mentions "classifier" goes to the boolean (classification) group.
        target_group = boolean_hyper_params
    elif 'sparse' in name_tokens:
        target_group = sparse_hyper_params
    elif 'focused' in name_tokens or 'balanced' in name_tokens:
        # NOTE(review): "balanced" results are stored together with the "focused" ones;
        # confirm this is intended and not a copy-paste leftover.
        target_group = focused_hyper_params
    else:
        continue
    target_group[file_name] = pd.read_csv(os.path.join(results_dir, file_name))

# Print every group under its heading (the sparse group uses a slightly different separator).
for heading, separator, group in (
    ('Sparse hyper params:\n', ':\n ', sparse_hyper_params),
    ('Focused hyper params:\n', ':\n', focused_hyper_params),
    ('Boolean hyper params:\n', ':\n', boolean_hyper_params),
):
    print(heading)
    for name, params_df in group.items():
        print(name, separator, params_df)

Sparse hyper params:

params_gradient_boost_regression_sparse_max_u.csv :
            params                value
0   n_estimators                   19
1  learning_rate  0.12236030175251943
2           loss        squared_error
3          value   0.5471179901945081
params_gradient_boost_regression_sparse_min_u.csv :
            params                value
0   n_estimators                  119
1  learning_rate  0.17057256340998259
2           loss        squared_error
3          value   0.5605410379664836
params_mlp_regression_sparse_max_u.csv :
          params                value
0  hidden_size                   31
1     n_layers                    2
2      dropout  0.10137160649535842
3   activation              sigmoid
4    optimizer                  sgd
5           lr  0.04276419439103241
6       epochs                   60
7   batch_size                   32
8   classifier                False
9        value   0.9920083183694078
params_support_vector_regression_sparse_max_u.csv :

In [3]:
import ast
def get_hyper_params_from_df(df):
    """Convert a hyperparameter table into a plain dictionary.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``params`` column (hyperparameter names) and a ``value``
        column (their values, typically stored as text in the CSV).

    Returns
    -------
    dict
        Maps each hyperparameter name to its value. Values that are valid
        Python literals (e.g. ``'19'``, ``'0.1'``, ``'False'``) are converted
        to the matching Python object; anything else (e.g. ``'relu'``) is
        kept unchanged. The row whose name is ``'value'`` is skipped.
    """
    output = {}
    for name, raw_value in zip(df['params'].tolist(), df['value'].tolist()):
        if name == 'value':
            # Not a hyperparameter (presumably the score reached during tuning).
            continue
        try:
            output[name] = ast.literal_eval(raw_value)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a Python literal (plain word, or already a non-string value):
            # keep the raw value. Only the errors literal_eval documents are
            # caught, so KeyboardInterrupt and the like still propagate.
            output[name] = raw_value
    return output
# Example: parse the stored MLP hyperparameters of the focused max_u experiment.
get_hyper_params_from_df(focused_hyper_params['params_mlp_regression_focused_max_u.csv'])

{'hidden_size': 34,
 'n_layers': 3,
 'dropout': 0.0030412321477918842,
 'activation': 'relu',
 'optimizer': 'sgd',
 'lr': 9.741292351005151e-05,
 'epochs': 55,
 'batch_size': 8,
 'classifier': False}

## Loading the data

In [4]:
import sys
sys.path.append('..')
from thesis_package import aimodels as my_ai, utils, metrics
from copy import deepcopy
import sklearn.metrics
from sklearn.model_selection import train_test_split

# Exogenous input data; the 'date' column is dropped before use.
# The path is assembled with os.path.join instead of a backslash literal, so it
# also resolves outside Windows and no longer contains invalid escape sequences.
exogenous_data = pd.read_csv(
    os.path.join('..', 'data', 'processed', 'production', 'exogenous_data_extended.csv')
).drop(columns=['date'])

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Regression data sparse
# Paths are assembled with os.path.join instead of backslash literals, so they
# also resolve outside Windows and contain no invalid escape sequences.
ground_truth_dir = os.path.join('..', 'data', 'ground_truth')
y_max_u_sparse = pd.read_csv(os.path.join(ground_truth_dir, 'res_bus_vm_pu_max_constr.csv')).drop(columns=['timestamps'])
y_min_u_sparse = pd.read_csv(os.path.join(ground_truth_dir, 'res_bus_vm_pu_min_constr.csv')).drop(columns=['timestamps'])

# Each split is stored as independent copies because train_x/test_x/... are reused below.
train_x, test_x, train_y, test_y, scaler = utils.split_and_suffle(exogenous_data, y_max_u_sparse, test_size=0.2, scaling=True)
data_max_u_sparse = {'X_train': deepcopy(train_x), 'X_test': deepcopy(test_x), 'y_train': deepcopy(train_y), 'y_test': deepcopy(test_y), 'scaler': deepcopy(scaler)}

train_x, test_x, train_y, test_y, scaler = utils.split_and_suffle(exogenous_data, y_min_u_sparse, test_size=0.2, scaling=True)
data_min_u_sparse = {'X_train': deepcopy(train_x), 'X_test': deepcopy(test_x), 'y_train': deepcopy(train_y), 'y_test': deepcopy(test_y), 'scaler': deepcopy(scaler)}

In [6]:
# Classification data
# Paths are assembled with os.path.join instead of backslash literals, so they
# also resolve outside Windows and contain no invalid escape sequences.
ground_truth_dir = os.path.join('..', 'data', 'ground_truth')
y_max_u_bool = pd.read_csv(os.path.join(ground_truth_dir, 'res_bus_vm_pu_max_bool_constr.csv')).drop(columns=['timestamps'])
y_min_u_bool = pd.read_csv(os.path.join(ground_truth_dir, 'res_bus_vm_pu_min_bool_constr.csv')).drop(columns=['timestamps'])
# Keep only the columns selected by utils.cols_with_positive_values.
y_max_u_bool = y_max_u_bool[utils.cols_with_positive_values(y_max_u_bool)]
y_min_u_bool = y_min_u_bool[utils.cols_with_positive_values(y_min_u_bool)]

# Each split is stored as independent copies because train_x/test_x/... are reused below.
train_x, test_x, train_y, test_y, scaler = utils.split_and_suffle(exogenous_data, y_max_u_bool, test_size=0.2, scaling=True)
data_max_u_bool = {'X_train': deepcopy(train_x), 'X_test': deepcopy(test_x), 'y_train': deepcopy(train_y), 'y_test': deepcopy(test_y), 'scaler': deepcopy(scaler)}

train_x, test_x, train_y, test_y, scaler = utils.split_and_suffle(exogenous_data, y_min_u_bool, test_size=0.2, scaling=True)
data_min_u_bool = {'X_train': deepcopy(train_x), 'X_test': deepcopy(test_x), 'y_train': deepcopy(train_y), 'y_test': deepcopy(test_y), 'scaler': deepcopy(scaler)}

In [7]:
# Filtered data: the sparse regression targets restricted to the columns that
# utils.cols_with_positive_values selects from the boolean targets.
max_u_positive_cols = utils.cols_with_positive_values(y_max_u_bool)
min_u_positive_cols = utils.cols_with_positive_values(y_min_u_bool)
y_max_u_filtered = deepcopy(y_max_u_sparse[max_u_positive_cols])
y_min_u_filtered = deepcopy(y_min_u_sparse[min_u_positive_cols])

split_keys = ('X_train', 'X_test', 'y_train', 'y_test', 'scaler')

train_x, test_x, train_y, test_y, scaler = utils.split_and_suffle(
    exogenous_data, y_max_u_filtered, test_size=0.2, scaling=True
)
data_max_u_filtered = {
    key: deepcopy(part)
    for key, part in zip(split_keys, (train_x, test_x, train_y, test_y, scaler))
}

train_x, test_x, train_y, test_y, scaler = utils.split_and_suffle(
    exogenous_data, y_min_u_filtered, test_size=0.2, scaling=True
)
data_min_u_filtered = {
    key: deepcopy(part)
    for key, part in zip(split_keys, (train_x, test_x, train_y, test_y, scaler))
}

In [8]:
# Print the size of the classification testing data and of the filtered regression
# testing data, together with the number of positives found in each.
bool_y_test = data_max_u_bool['y_test']
filtered_y_test = data_max_u_filtered['y_test']
print('Classification data size: ', bool_y_test.shape)
print('Regression data size: ', filtered_y_test.shape)
print('Positive in classification data: ', utils.count_positives_class(bool_y_test))
# The regression targets are unscaled with the stored 'y' scaler before counting.
unscaled_y_test = utils.unscale_df(filtered_y_test, data_max_u_filtered['scaler']['y'])
print(
    'Positive in regression data: ',
    utils.count_positives_reg(unscaled_y_test, utils.compute_threshold(y_max_u_sparse)),
)
print('Theshhold: ', utils.compute_threshold(y_max_u_sparse))

Classification data size:  (9044, 10)
Regression data size:  (9044, 10)
Positive in classification data:  5036.0
Positive in regression data:  5036
Theshhold:  0.001591058368850724


In [None]:
# Regression data focused
# Paths are assembled with os.path.join instead of backslash literals, so they
# also resolve outside Windows and contain no invalid escape sequences.
ground_truth_dir = os.path.join('..', 'data', 'ground_truth')
# NOTE(review): unlike the sparse and balanced loaders, no 'timestamps'/'date'
# column is dropped here; confirm the focused CSV files do not contain one.
y_max_u_focused = pd.read_csv(os.path.join(ground_truth_dir, 'res_bus_vm_pu_max_focused_constr.csv'))
exogenous_data_focused_max_u = pd.read_csv(os.path.join(ground_truth_dir, 'exogenous_data_vm_pu_max_focused.csv'))
y_min_u_focused = pd.read_csv(os.path.join(ground_truth_dir, 'res_bus_vm_pu_min_focused_constr.csv'))
exogenous_data_focused_min_u = pd.read_csv(os.path.join(ground_truth_dir, 'exogenous_data_vm_pu_min_focused.csv'))

# Each split is stored as independent copies because train_x/test_x/... are reused below.
train_x, test_x, train_y, test_y, scaler = utils.split_and_suffle(exogenous_data_focused_max_u, y_max_u_focused, test_size=0.2, scaling=True)
data_max_u_focused = {'X_train': deepcopy(train_x), 'X_test': deepcopy(test_x), 'y_train': deepcopy(train_y), 'y_test': deepcopy(test_y), 'scaler': deepcopy(scaler)}

train_x, test_x, train_y, test_y, scaler = utils.split_and_suffle(exogenous_data_focused_min_u, y_min_u_focused, test_size=0.2, scaling=True)
data_min_u_focused = {'X_train': deepcopy(train_x), 'X_test': deepcopy(test_x), 'y_train': deepcopy(train_y), 'y_test': deepcopy(test_y), 'scaler': deepcopy(scaler)}

In [None]:
# Regression data balanced
# Paths are assembled with os.path.join instead of backslash literals, so they
# also resolve outside Windows and contain no invalid escape sequences.
ground_truth_dir = os.path.join('..', 'data', 'ground_truth')
y_max_u_balanced = pd.read_csv(os.path.join(ground_truth_dir, 'res_bus_vm_pu_max_balanced_constr.csv'))
exogenous_data_balanced_max_u = pd.read_csv(os.path.join(ground_truth_dir, 'exogenous_data_vm_pu_max_balanced.csv')).drop(columns=['date'])
y_min_u_balanced = pd.read_csv(os.path.join(ground_truth_dir, 'res_bus_vm_pu_min_balanced_constr.csv'))
exogenous_data_balanced_min_u = pd.read_csv(os.path.join(ground_truth_dir, 'exogenous_data_vm_pu_min_balanced.csv')).drop(columns=['date'])

# Each split is stored as independent copies because train_x/test_x/... are reused below.
train_x, test_x, train_y, test_y, scaler = utils.split_and_suffle(exogenous_data_balanced_max_u, y_max_u_balanced, test_size=0.2, scaling=True)
data_max_u_balanced = {'X_train': deepcopy(train_x), 'X_test': deepcopy(test_x), 'y_train': deepcopy(train_y), 'y_test': deepcopy(test_y), 'scaler': deepcopy(scaler)}

train_x, test_x, train_y, test_y, scaler = utils.split_and_suffle(exogenous_data_balanced_min_u, y_min_u_balanced, test_size=0.2, scaling=True)
data_min_u_balanced = {'X_train': deepcopy(train_x), 'X_test': deepcopy(test_x), 'y_train': deepcopy(train_y), 'y_test': deepcopy(test_y), 'scaler': deepcopy(scaler)}

Now for a quick sanity check:

In [10]:
# Compare, per experiment, the counts obtained from the unscaled filtered regression
# targets against those of the boolean targets: positives first, then negatives.
sanity_cases = (
    ('max_u', data_max_u_filtered, data_max_u_bool, y_max_u_sparse),
    ('min_u', data_min_u_filtered, data_min_u_bool, y_min_u_sparse),
)
for run_check in (utils.check_positive_count, utils.check_negative_count):
    for label, filtered_data, bool_data, sparse_targets in sanity_cases:
        run_check(
            utils.unscale_df(filtered_data['y_test'], filtered_data['scaler']['y']),
            bool_data['y_test'],
            utils.compute_threshold(sparse_targets),
            experiment=label,
        )

Positive count in classification data max_u : 5036.0
Positive count in regression data max_u with threshold 0.001591058368850724 : 5036


Positive count in classification data min_u : 6018.0
Positive count in regression data min_u with threshold 0.0020242378560612192 : 6018


Negative count in classification data max_u : 85404.0
Negative count in regression data max_u with threshold 0.001591058368850724 : 85404


Negative count in classification data min_u : 84422.0
Negative count in regression data min_u with threshold 0.0020242378560612192 : 84422




## Training models
In this section the models are trained with the hyperparameters loaded above. All the models of one experiment are stored in the same `Context` object for later evaluation. `Context` is a class, defined in the `aimodels.py` file, that stores the trained models and their respective hyperparameters. The labels used to identify the models are listed below:

In [11]:
# Short labels for the trained models. The training cells pair them positionally
# with `Context.strategies` via zip(), so the order here has to match the order
# in which the strategies are fitted.
models = ['lr', 'gb', 'xgb', 'svr', 'mlp']

### Max Voltage

In [12]:
# List the available sparse hyperparameter files (these are the keys used when training below).
sparse_hyper_params.keys()

dict_keys(['params_gradient_boost_regression_sparse_max_u.csv', 'params_gradient_boost_regression_sparse_min_u.csv', 'params_mlp_regression_sparse_max_u.csv', 'params_support_vector_regression_sparse_max_u.csv', 'params_support_vector_regression_sparse_min_u.csv', 'params_xgboost_regression_sparse_max_u.csv', 'params_xgboost_regression_sparse_min_u.csv'])

In [None]:
# max_u regression sparse
# Train every regression model on the sparse max_u data, or load the cached
# Context if it was pickled by a previous run.
# Paths are assembled with os.path.join so they also resolve outside Windows.
pickle_dir = os.path.join('pickles', 'dataset_benchmark')
pickle_path = os.path.join(pickle_dir, 'max_u_regressor_sparse')
if 'max_u_regressor_sparse.pickle' not in os.listdir(pickle_dir):
    print('Training max_u regression sparse')
    # Linear Regression
    regressor_max_u = my_ai.Context(strategy=my_ai.LinearRegressionStrategy())
    regressor_max_u.fit(data=data_max_u_sparse)
    # Gradient Boost Regression
    hyper_params = get_hyper_params_from_df(sparse_hyper_params['params_gradient_boost_regression_sparse_max_u.csv'])
    regressor_max_u.strategy = my_ai.GradientBoostRegressorStrategy(hyper_params)
    regressor_max_u.fit(data=data_max_u_sparse)
    # Extreme GBoost Regression
    hyper_params = get_hyper_params_from_df(sparse_hyper_params['params_xgboost_regression_sparse_max_u.csv'])
    regressor_max_u.strategy = my_ai.XGBoostRegressorStrategy(hyper_params)
    regressor_max_u.fit(data=data_max_u_sparse)
    # Support Vector Regression
    hyper_params = get_hyper_params_from_df(sparse_hyper_params['params_support_vector_regression_sparse_max_u.csv'])
    regressor_max_u.strategy = my_ai.SupportVectorRegressorStrategy(hyper_params)
    regressor_max_u.fit(data=data_max_u_sparse)
    # MLP Regression
    # The file is named "..._regression_..." like the other regressors
    # ("..._regressor_..." raised a KeyError).
    hyper_params = get_hyper_params_from_df(sparse_hyper_params['params_mlp_regression_sparse_max_u.csv'])
    hyper_params['input_size'] = data_max_u_sparse['X_train'].shape[1]
    hyper_params['output_size'] = data_max_u_sparse['y_train'].shape[1]
    regressor_max_u.strategy = my_ai.MultilayerPerceptronStrategy(hyper_params)
    regressor_max_u.fit(data=data_max_u_sparse)
    utils.serialize_object(pickle_path, regressor_max_u)
else:
    print('Loading max_u regression sparse')
    regressor_max_u = utils.deserialize_object(pickle_path)

# This cell creates `testing_data`; the following training cells add their results to it.
testing_data = {'max_u_regressor_sparse': {}}
for model, strategy in zip(models, regressor_max_u.strategies):
    prediction = strategy.predict(data=data_max_u_sparse)
    prediction = pd.DataFrame(prediction, columns=data_max_u_sparse['y_test'].columns)
    testing_data['max_u_regressor_sparse'][model] = {
        'real': deepcopy(data_max_u_sparse['y_test']),
        'predicted': deepcopy(prediction),
    }

In [None]:
# max_u regression focused
# Train every regression model on the focused max_u data, or load the cached
# Context if it was pickled by a previous run.
# Paths are assembled with os.path.join so they also resolve outside Windows.
pickle_dir = os.path.join('pickles', 'dataset_benchmark')
pickle_path = os.path.join(pickle_dir, 'max_u_regressor_focused')
if 'max_u_regressor_focused.pickle' not in os.listdir(pickle_dir):
    print('Training max_u regression focused')
    # Linear Regression
    regressor_max_u_focused = my_ai.Context(strategy=my_ai.LinearRegressionStrategy())
    regressor_max_u_focused.fit(data=data_max_u_focused)
    # Gradient Boost Regression
    hyper_params = get_hyper_params_from_df(focused_hyper_params['params_gradient_boost_regression_focused_max_u.csv'])
    regressor_max_u_focused.strategy = my_ai.GradientBoostRegressorStrategy(hyper_params)
    regressor_max_u_focused.fit(data=data_max_u_focused)
    # Extreme GBoost Regression
    hyper_params = get_hyper_params_from_df(focused_hyper_params['params_xgboost_regression_focused_max_u.csv'])
    regressor_max_u_focused.strategy = my_ai.XGBoostRegressorStrategy(hyper_params)
    regressor_max_u_focused.fit(data=data_max_u_focused)
    # Support Vector Regression
    hyper_params = get_hyper_params_from_df(focused_hyper_params['params_support_vector_regression_focused_max_u.csv'])
    regressor_max_u_focused.strategy = my_ai.SupportVectorRegressorStrategy(hyper_params)
    regressor_max_u_focused.fit(data=data_max_u_focused)
    # MLP Regression
    # The file is named "..._regression_..." like the other regressors
    # ("..._regressor_..." raised a KeyError).
    hyper_params = get_hyper_params_from_df(focused_hyper_params['params_mlp_regression_focused_max_u.csv'])
    hyper_params['input_size'] = data_max_u_focused['X_train'].shape[1]
    hyper_params['output_size'] = data_max_u_focused['y_train'].shape[1]
    regressor_max_u_focused.strategy = my_ai.MultilayerPerceptronStrategy(hyper_params)
    regressor_max_u_focused.fit(data=data_max_u_focused)
    utils.serialize_object(pickle_path, regressor_max_u_focused)
else:
    print('Loading max_u regression focused')
    regressor_max_u_focused = utils.deserialize_object(pickle_path)

# NOTE(review): predictions are made on the *sparse* data set, presumably so that the
# focused-trained models are scored on the same test data as the sparse ones; confirm.
testing_data['max_u_regressor_focused'] = {}
for model, strategy in zip(models, regressor_max_u_focused.strategies):
    prediction = strategy.predict(data=data_max_u_sparse)
    prediction = pd.DataFrame(prediction, columns=data_max_u_sparse['y_test'].columns)
    testing_data['max_u_regressor_focused'][model] = {
        'real': deepcopy(data_max_u_sparse['y_test']),
        'predicted': deepcopy(prediction),
    }

In [None]:
# max_u classification
# Train every classifier on the boolean max_u data, or load the cached Context
# if it was pickled by a previous run.
# Paths are assembled with os.path.join so they also resolve outside Windows.
pickle_dir = os.path.join('pickles', 'dataset_benchmark')
pickle_path = os.path.join(pickle_dir, 'max_u_classifier')
if 'max_u_classifier.pickle' not in os.listdir(pickle_dir):
    print('Training max_u classification')
    # Gradient Boost Classifier
    hyper_params = get_hyper_params_from_df(boolean_hyper_params['params_gradient_boost_classifier_max_u.csv'])
    classifier_max_u = my_ai.Context(strategy=my_ai.GradientBoostClassifierStrategy(hyper_params))
    classifier_max_u.fit(data=data_max_u_bool)
    # Extreme GBoost Classifier
    hyper_params = get_hyper_params_from_df(boolean_hyper_params['params_xgboost_classifier_max_u.csv'])
    classifier_max_u.strategy = my_ai.XGBoostClassifierStrategy(hyper_params)
    classifier_max_u.fit(data=data_max_u_bool)
    # Support Vector Classifier
    hyper_params = get_hyper_params_from_df(boolean_hyper_params['params_support_vector_classifier_max_u.csv'])
    classifier_max_u.strategy = my_ai.SupportVectorClassifierStrategy(hyper_params)
    classifier_max_u.fit(data=data_max_u_bool)
    # MLP Classifier
    hyper_params = get_hyper_params_from_df(boolean_hyper_params['params_mlp_classifier_max_u.csv'])
    hyper_params['input_size'] = data_max_u_bool['X_train'].shape[1]
    hyper_params['output_size'] = data_max_u_bool['y_train'].shape[1]
    # Fit the MLP on the classifier context (it was previously attached to
    # `regressor_max_u_focused`, so it never became part of `classifier_max_u`).
    classifier_max_u.strategy = my_ai.MultilayerPerceptronStrategy(hyper_params)
    classifier_max_u.fit(data=data_max_u_bool)
    utils.serialize_object(pickle_path, classifier_max_u)
else:
    print('Loading max_u classification')
    classifier_max_u = utils.deserialize_object(pickle_path)

# No linear model is fitted for classification, so the labels start at 'gb'.
# Zipping against the full `models` list labelled the gradient-boost classifier 'lr',
# the XGBoost classifier 'gb', and so on. This assumes `Context.strategies` keeps the
# fitting order, which the regression cells already rely on.
classifier_models = models[1:]
testing_data['max_u_classifier'] = {}
for model, strategy in zip(classifier_models, classifier_max_u.strategies):
    prediction = strategy.predict(data=data_max_u_bool)
    prediction = pd.DataFrame(prediction, columns=data_max_u_bool['y_test'].columns)
    testing_data['max_u_classifier'][model] = {
        'real': deepcopy(data_max_u_bool['y_test']),
        'predicted': deepcopy(prediction),
    }

### Min Voltage


In [None]:
# min_u regression sparse
# Train every regression model on the sparse min_u data, or load the cached
# Context if it was pickled by a previous run.
# Paths are assembled with os.path.join so they also resolve outside Windows.
pickle_dir = os.path.join('pickles', 'dataset_benchmark')
pickle_path = os.path.join(pickle_dir, 'min_u_regressor_sparse')
if 'min_u_regressor_sparse.pickle' not in os.listdir(pickle_dir):
    print('Training min_u regression sparse')
    # Linear Regression
    regressor_min_u = my_ai.Context(strategy=my_ai.LinearRegressionStrategy())
    regressor_min_u.fit(data=data_min_u_sparse)
    # Gradient Boost Regression
    hyper_params = get_hyper_params_from_df(sparse_hyper_params['params_gradient_boost_regression_sparse_min_u.csv'])
    regressor_min_u.strategy = my_ai.GradientBoostRegressorStrategy(hyper_params)
    regressor_min_u.fit(data=data_min_u_sparse)
    # Extreme GBoost Regression
    hyper_params = get_hyper_params_from_df(sparse_hyper_params['params_xgboost_regression_sparse_min_u.csv'])
    regressor_min_u.strategy = my_ai.XGBoostRegressorStrategy(hyper_params)
    regressor_min_u.fit(data=data_min_u_sparse)
    # Support Vector Regression
    hyper_params = get_hyper_params_from_df(sparse_hyper_params['params_support_vector_regression_sparse_min_u.csv'])
    regressor_min_u.strategy = my_ai.SupportVectorRegressorStrategy(hyper_params)
    regressor_min_u.fit(data=data_min_u_sparse)
    # MLP Regression
    # The key follows the "..._regression_..." naming of the other regressor files.
    # NOTE(review): the hyperparameter listing printed earlier in this notebook shows
    # no MLP file for sparse min_u; make sure that tuning result exists before running.
    hyper_params = get_hyper_params_from_df(sparse_hyper_params['params_mlp_regression_sparse_min_u.csv'])
    hyper_params['input_size'] = data_min_u_sparse['X_train'].shape[1]
    hyper_params['output_size'] = data_min_u_sparse['y_train'].shape[1]
    # Fit the MLP on this experiment's context (it was previously attached to
    # `regressor_max_u_focused`, so it never became part of `regressor_min_u`).
    regressor_min_u.strategy = my_ai.MultilayerPerceptronStrategy(hyper_params)
    regressor_min_u.fit(data=data_min_u_sparse)
    utils.serialize_object(pickle_path, regressor_min_u)
else:
    print('Loading min_u regression sparse')
    regressor_min_u = utils.deserialize_object(pickle_path)

testing_data['min_u_regressor_sparse'] = {}
for model, strategy in zip(models, regressor_min_u.strategies):
    prediction = strategy.predict(data=data_min_u_sparse)
    prediction = pd.DataFrame(prediction, columns=data_min_u_sparse['y_test'].columns)
    testing_data['min_u_regressor_sparse'][model] = {
        'real': deepcopy(data_min_u_sparse['y_test']),
        'predicted': deepcopy(prediction),
    }

In [None]:
# min_u regression focused
# Train every regression model on the focused min_u data, or load the cached
# Context if it was pickled by a previous run.
# Paths are assembled with os.path.join so they also resolve outside Windows.
pickle_dir = os.path.join('pickles', 'dataset_benchmark')
pickle_path = os.path.join(pickle_dir, 'min_u_regressor_focused')
if 'min_u_regressor_focused.pickle' not in os.listdir(pickle_dir):
    print('Training min_u regression focused')
    # Linear Regression
    regressor_min_u_focused = my_ai.Context(strategy=my_ai.LinearRegressionStrategy())
    regressor_min_u_focused.fit(data=data_min_u_focused)
    # Gradient Boost Regression
    hyper_params = get_hyper_params_from_df(focused_hyper_params['params_gradient_boost_regression_focused_min_u.csv'])
    regressor_min_u_focused.strategy = my_ai.GradientBoostRegressorStrategy(hyper_params)
    regressor_min_u_focused.fit(data=data_min_u_focused)
    # Extreme GBoost Regression
    hyper_params = get_hyper_params_from_df(focused_hyper_params['params_xgboost_regression_focused_min_u.csv'])
    regressor_min_u_focused.strategy = my_ai.XGBoostRegressorStrategy(hyper_params)
    regressor_min_u_focused.fit(data=data_min_u_focused)
    # Support Vector Regression
    hyper_params = get_hyper_params_from_df(focused_hyper_params['params_support_vector_regression_focused_min_u.csv'])
    regressor_min_u_focused.strategy = my_ai.SupportVectorRegressorStrategy(hyper_params)
    regressor_min_u_focused.fit(data=data_min_u_focused)
    # MLP Regression
    # The key follows the "..._regression_..." naming of the other regressor files
    # ("..._regressor_..." does not match that naming).
    hyper_params = get_hyper_params_from_df(focused_hyper_params['params_mlp_regression_focused_min_u.csv'])
    hyper_params['input_size'] = data_min_u_focused['X_train'].shape[1]
    hyper_params['output_size'] = data_min_u_focused['y_train'].shape[1]
    # Fit the MLP on this experiment's context (it was previously attached to
    # `regressor_max_u_focused`, so it never became part of `regressor_min_u_focused`).
    regressor_min_u_focused.strategy = my_ai.MultilayerPerceptronStrategy(hyper_params)
    regressor_min_u_focused.fit(data=data_min_u_focused)
    utils.serialize_object(pickle_path, regressor_min_u_focused)
else:
    print('Loading min_u regression focused')
    regressor_min_u_focused = utils.deserialize_object(pickle_path)

# NOTE(review): predictions are made on the *sparse* data set, presumably so that the
# focused-trained models are scored on the same test data as the sparse ones; confirm.
testing_data['min_u_regressor_focused'] = {}
for model, strategy in zip(models, regressor_min_u_focused.strategies):
    prediction = strategy.predict(data=data_min_u_sparse)
    prediction = pd.DataFrame(prediction, columns=data_min_u_sparse['y_test'].columns)
    testing_data['min_u_regressor_focused'][model] = {
        'real': deepcopy(data_min_u_sparse['y_test']),
        'predicted': deepcopy(prediction),
    }

In [None]:
# min_u classification
# Train every classifier on the boolean min_u data, or load the cached Context
# if it was pickled by a previous run.
# Paths are assembled with os.path.join so they also resolve outside Windows.
pickle_dir = os.path.join('pickles', 'dataset_benchmark')
pickle_path = os.path.join(pickle_dir, 'min_u_classifier')
if 'min_u_classifier.pickle' not in os.listdir(pickle_dir):
    print('Training min_u classification')
    # Gradient Boost Classifier
    # Uses the min_u tuning result (the max_u file was referenced here before).
    # NOTE(review): confirm 'params_gradient_boost_classifier_min_u.csv' exists.
    hyper_params = get_hyper_params_from_df(boolean_hyper_params['params_gradient_boost_classifier_min_u.csv'])
    classifier_min_u = my_ai.Context(strategy=my_ai.GradientBoostClassifierStrategy(hyper_params))
    classifier_min_u.fit(data=data_min_u_bool)
    # Extreme GBoost Classifier
    hyper_params = get_hyper_params_from_df(boolean_hyper_params['params_xgboost_classifier_min_u.csv'])
    classifier_min_u.strategy = my_ai.XGBoostClassifierStrategy(hyper_params)
    classifier_min_u.fit(data=data_min_u_bool)
    # Support Vector Classifier
    hyper_params = get_hyper_params_from_df(boolean_hyper_params['params_support_vector_classifier_min_u.csv'])
    classifier_min_u.strategy = my_ai.SupportVectorClassifierStrategy(hyper_params)
    classifier_min_u.fit(data=data_min_u_bool)
    # MLP Classifier
    hyper_params = get_hyper_params_from_df(boolean_hyper_params['params_mlp_classifier_min_u.csv'])
    hyper_params['input_size'] = data_min_u_bool['X_train'].shape[1]
    hyper_params['output_size'] = data_min_u_bool['y_train'].shape[1]
    # Fit the MLP on the classifier context (it was previously attached to
    # `regressor_max_u_focused`, so it never became part of `classifier_min_u`).
    classifier_min_u.strategy = my_ai.MultilayerPerceptronStrategy(hyper_params)
    classifier_min_u.fit(data=data_min_u_bool)
    # Serialize only after every model has been trained, so the cache is complete.
    utils.serialize_object(pickle_path, classifier_min_u)
else:
    print('Loading min_u classification')
    classifier_min_u = utils.deserialize_object(pickle_path)

# No linear model is fitted for classification, so the labels start at 'gb'.
# Zipping against the full `models` list labelled the gradient-boost classifier 'lr',
# the XGBoost classifier 'gb', and so on. This assumes `Context.strategies` keeps the
# fitting order, which the regression cells already rely on.
classifier_models = models[1:]
testing_data['min_u_classifier'] = {}
for model, strategy in zip(classifier_models, classifier_min_u.strategies):
    prediction = strategy.predict(data=data_min_u_bool)
    prediction = pd.DataFrame(prediction, columns=data_min_u_bool['y_test'].columns)
    testing_data['min_u_classifier'][model] = {
        'real': deepcopy(data_min_u_bool['y_test']),
        'predicted': deepcopy(prediction),
    }

## Results Discussion
In this section the results of the training and testing are presented and compared. The main objective of this experiment is to compare the performance of the regression models in terms of the hybrid-metrics confusion matrix and the hybrid-metrics RMSE. The comparisons are the following:
- Compare the confusion matrices of the classification models and of the regression models evaluated with the hybrid metrics.
- Compare the error results of the regression models trained with the focused dataset and the sparse dataset. 

In [None]:
# Confusion counts of the 'gb' entry of the max_u classifier, accumulated over all
# buses, followed by a check that they add up to the available positives/negatives.
gb_results = testing_data['max_u_classifier']['gb']
gb_predicted = gb_results['predicted']
gb_real = gb_results['real']

tp = tn = fp = fn = 0
for bus in gb_predicted.columns:
    predicted_pos = gb_predicted[bus] == 1
    predicted_neg = gb_predicted[bus] == 0
    real_pos = gb_real[bus] == 1
    real_neg = gb_real[bus] == 0
    tp += (predicted_pos & real_pos).sum()
    tn += (predicted_neg & real_neg).sum()
    fp += (predicted_pos & real_neg).sum()
    fn += (predicted_neg & real_pos).sum()

total_positives = gb_real.sum().sum()
total_negatives = gb_real.shape[0] * gb_real.shape[1] - gb_real.sum().sum()
print('{} + {} = {} = {} possible positive values.'.format(tp, fn, tp+fn, total_positives))
print('{} + {} = {} = {} possible negative values.'.format(tn, fp, tn+fp, total_negatives))


In [None]:
# Build a multi-index dataframe with the results of the metrics. The first index level
# is the experiment (testing_data.keys()), the second is the model label, and the
# columns hold tp, tn, fp, fn and the derived (hybrid) scores.
columns = ['tp', 'tn', 'fp', 'fn', '(hybrid)accuracy', '(hybrid)precision', '(hybrid)recall', '(hybrid)f1']
# Take the model labels from the stored results (first-seen order) instead of a
# hard-coded list, so every evaluated model (e.g. 'mlp') has a row from the start.
model_labels = list(dict.fromkeys(
    model for results_by_model in testing_data.values() for model in results_by_model
))
index = pd.MultiIndex.from_product([list(testing_data.keys()), model_labels], names=['experiment', 'class'])
df = pd.DataFrame(index=index, columns=columns)
classifier_experiments = [experiment for experiment in testing_data.keys() if 'classifier' in experiment.split('_')]
regressor_experiments = [experiment for experiment in testing_data.keys() if 'regressor' in experiment.split('_')]

# Classifier experiments
for experiment in classifier_experiments:
    for model, results in testing_data[experiment].items():
        predicted, real = results['predicted'], results['real']
        # Start every model from zero. The counters used to be reset only *after* each
        # model, so values left over from an earlier cell leaked into the first row.
        tp, tn, fp, fn = 0, 0, 0, 0
        for bus in predicted.columns:
            try:
                predicted_pos = predicted[bus] == 1
                predicted_neg = predicted[bus] == 0
                real_pos = real[bus] == 1
                real_neg = real[bus] == 0
                bus_tp = (predicted_pos & real_pos).sum()
                bus_tn = (predicted_neg & real_neg).sum()
                bus_fp = (predicted_pos & real_neg).sum()
                bus_fn = (predicted_neg & real_pos).sum()
            except Exception:
                # Best effort: report the bus and carry on with the remaining ones.
                print('In the experiment ', experiment, ' and model ', model, ' there was a problem with bus: ', bus)
                if not testing_data[experiment][model]['real'][bus].any():
                    print('Bus {} has no positive data points. Just ignore the little shit.'.format(bus))
            else:
                # Counts are added only when all four were computed for this bus.
                tp += bus_tp
                tn += bus_tn
                fp += bus_fp
                fn += bus_fn
        print('Experiment: {}, model: {}, tp: {}, tn: {}, fp: {}, fn: {}'.format(experiment, model, tp, tn, fp, fn))
        # Every ratio falls back to 0 when its denominator is 0.
        total = tp + tn + fp + fn
        accuracy = (tp + tn) / total if total != 0 else 0
        precision = tp / (tp + fp) if (tp + fp) != 0 else 0
        recall = tp / (tp + fn) if (tp + fn) != 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
        for column, value in zip(columns, (tp, tn, fp, fn, accuracy, precision, recall, f1)):
            df.loc[(experiment, model), column] = value

# Regressor experiments.
for experiment in regressor_experiments:
    for model, results in testing_data[experiment].items():
        test_data = results['real']
        # Threshold: 10% of the mean of the per-column maxima, using only the columns
        # whose maximum is non-zero.
        threshold = test_data.loc[:, test_data.max(axis=0) != 0].max(axis=0).mean() * 0.1
        hybrid_metrics = metrics.Metrics()
        hybrid_metrics.get_prediction_scores(results['predicted'], results['real'], threshold=threshold)
        regressor_scores = (
            hybrid_metrics.true_positives_ctr,
            hybrid_metrics.true_negatives_ctr,
            hybrid_metrics.false_positives_ctr,
            hybrid_metrics.false_negatives_ctr,
            hybrid_metrics.hybrid_accuracy,
            hybrid_metrics.hybrid_precision,
            hybrid_metrics.hybrid_recall,
            hybrid_metrics.hybrid_f1,
        )
        for column, value in zip(columns, regressor_scores):
            df.loc[(experiment, model), column] = value

In [None]:
# Display the metrics table built in the previous cell.
df