# Joint Training Hybrid Model
The second approach is to have the regression model to incorporate in its training and testing the result of the classification model. The objective is to incorporate the ground truth of of the classification data set into the training data of the regression data set and demonstrated, that is, allow the model to learn to predict the amplitude of a constraint, given that that constraint exists. 

During testing, the truthful constraint violation Boolean feature is substituted by the prediction of the classification model, and the 
model will predict the amplitude of the constraint violation, given the prediction of its occurrence. 

## Model selection
Considering the datasets benchmark, the best regression model for max_u and min_u are: 
- max_u: gb balanced
- min_u: gb sparse

The best classification model for max_u and min_u are:
- max_u: xgb balanced
- min_u: gb sparse


In [1]:
import sys;sys.path.append('..');from thesis_package import aimodels as my_ai, utils, metrics
import pandas as pd
import matplotlib.pyplot as plt
from copy import deepcopy
import numpy as np
import os

  from .autonotebook import tqdm as notebook_tqdm


## Load data

In [2]:
exogenous_data = pd.read_csv('..\data\processed\production\exogenous_data_extended.csv').drop(columns=['date'])

In [3]:
# Classification data balanced
y_max_u_balanced_class = pd.read_csv('..\data\ground_truth\\res_bus_vm_pu_max_balanced_bool_constr.csv')
exogenous_data_balanced_max_u = pd.read_csv('..\data\ground_truth\exogenous_data_vm_pu_max_balanced.csv').drop(columns=['date'])
y_min_u_balanced_class = pd.read_csv('..\data\ground_truth\\res_bus_vm_pu_min_balanced_bool_constr.csv')
exogenous_data_balanced_min_u = pd.read_csv('..\data\ground_truth\exogenous_data_vm_pu_min_balanced.csv').drop(columns=['date'])
y_max_u_balanced_class = y_max_u_balanced_class[utils.cols_with_positive_values(y_max_u_balanced_class)]
y_min_u_balanced_class = y_min_u_balanced_class[utils.cols_with_positive_values(y_min_u_balanced_class)]

train_x, test_x, train_y, test_y, scaler = utils.split_and_suffle(exogenous_data_balanced_max_u, y_max_u_balanced_class, test_size=0.2, scaling=True)
data_max_u_bool_balanced = {'X_train': deepcopy(train_x), 'X_test': deepcopy(test_x), 'y_train': deepcopy(train_y), 'y_test': deepcopy(test_y), 'scaler': deepcopy(scaler)}

train_x, test_x, train_y, test_y, scaler = utils.split_and_suffle(exogenous_data_balanced_min_u, y_min_u_balanced_class, test_size=0.2, scaling=True)
data_min_u_bool_balanced = {'X_train': deepcopy(train_x), 'X_test': deepcopy(test_x), 'y_train': deepcopy(train_y), 'y_test': deepcopy(test_y), 'scaler': deepcopy(scaler)}

In [4]:
# Regresison data balanced
y_max_u_balanced = pd.read_csv('..\data\ground_truth\\res_bus_vm_pu_max_balanced_constr.csv')
exogenous_data_balanced_max_u = pd.read_csv('..\data\ground_truth\exogenous_data_vm_pu_max_balanced.csv').drop(columns=['date'])
y_min_u_balanced = pd.read_csv('..\data\ground_truth\\res_bus_vm_pu_min_balanced_constr.csv')
exogenous_data_balanced_min_u = pd.read_csv('..\data\ground_truth\exogenous_data_vm_pu_min_balanced.csv').drop(columns=['date'])

train_x, test_x, train_y, test_y, scaler = utils.split_and_suffle(exogenous_data_balanced_max_u, y_max_u_balanced, test_size=0.2, scaling=True)
data_max_u_balanced = {'X_train': deepcopy(train_x), 'X_test': deepcopy(test_x), 'y_train': deepcopy(train_y), 'y_test': deepcopy(test_y), 'scaler': deepcopy(scaler)}

train_x, test_x, train_y, test_y, scaler = utils.split_and_suffle(exogenous_data_balanced_min_u, y_min_u_balanced, test_size=0.2, scaling=True)
data_min_u_balanced = {'X_train': deepcopy(train_x), 'X_test': deepcopy(test_x), 'y_train': deepcopy(train_y), 'y_test': deepcopy(test_y), 'scaler': deepcopy(scaler)}

In [5]:
# Classification data sparse
y_max_u_bool = pd.read_csv('..\data\ground_truth\\res_bus_vm_pu_max_sparse_bool_constr.csv').drop(columns=['timestamps'])
y_min_u_bool = pd.read_csv('..\data\ground_truth\\res_bus_vm_pu_min_sparse_bool_constr.csv').drop(columns=['timestamps'])
y_max_u_bool = y_max_u_bool[utils.cols_with_positive_values(y_max_u_bool)]
y_min_u_bool = y_min_u_bool[utils.cols_with_positive_values(y_min_u_bool)]

train_x, test_x, train_y, test_y, scaler = utils.split_and_suffle(exogenous_data, y_max_u_bool, test_size=0.2, scaling=True)
data_max_u_bool = {'X_train': deepcopy(train_x), 'X_test': deepcopy(test_x), 'y_train': deepcopy(train_y), 'y_test': deepcopy(test_y), 'scaler': deepcopy(scaler)}

train_x, test_x, train_y, test_y, scaler = utils.split_and_suffle(exogenous_data, y_min_u_bool, test_size=0.2, scaling=True)
data_min_u_bool = {'X_train': deepcopy(train_x), 'X_test': deepcopy(test_x), 'y_train': deepcopy(train_y), 'y_test': deepcopy(test_y), 'scaler': deepcopy(scaler)}

In [6]:
# Regression data sparse
y_max_u_sparse = pd.read_csv('..\data\ground_truth\\res_bus_vm_pu_max_constr.csv').drop(columns=['timestamps'])
y_min_u_sparse = pd.read_csv('..\data\ground_truth\\res_bus_vm_pu_min_constr.csv').drop(columns=['timestamps'])

train_x, test_x, train_y, test_y, scaler = utils.split_and_suffle(exogenous_data, y_max_u_sparse, test_size=0.2, scaling=True)
data_max_u_sparse = {'X_train': deepcopy(train_x), 'X_test': deepcopy(test_x), 'y_train': deepcopy(train_y), 'y_test': deepcopy(test_y), 'scaler': deepcopy(scaler)}

train_x, test_x, train_y, test_y, scaler = utils.split_and_suffle(exogenous_data, y_min_u_sparse, test_size=0.2, scaling=True)
data_min_u_sparse = {'X_train': deepcopy(train_x), 'X_test': deepcopy(test_x), 'y_train': deepcopy(train_y), 'y_test': deepcopy(test_y), 'scaler': deepcopy(scaler)}

In [16]:
max_u_threshold = utils.compute_threshold(y_max_u_sparse)
min_u_threshold = utils.compute_threshold(y_min_u_sparse)

## Prepared Hybrid data sets

### Training


In [7]:
# max u
data_max_u_hybrid = {}
data_max_u_hybrid['X_train'] = pd.concat([data_max_u_balanced['X_train'], data_max_u_bool_balanced['y_train']], axis=1)
data_max_u_hybrid['y_train'] = deepcopy(data_max_u_balanced['y_train'])
# min u 
data_min_u_hybrid = {}
data_min_u_hybrid['X_train'] = pd.concat([data_min_u_sparse['X_train'], data_min_u_bool['y_train']], axis=1)
data_min_u_hybrid['y_train'] = deepcopy(data_min_u_sparse['y_train'])

### Testing

In [8]:
# max u
classifier_max_u_balanced = utils.deserialize_object('pickles\dataset_benchmark\max_u_classifier_balanced')
max_u_class_xgb = deepcopy(classifier_max_u_balanced.strategies[1])
print(classifier_max_u_balanced.strategies[1])
class_result_max_u = max_u_class_xgb.predict(data_max_u_bool)
data_max_u_hybrid['X_test'] = pd.concat([data_max_u_sparse['X_test'], class_result_max_u], axis=1)
data_max_u_hybrid['y_test'] = deepcopy(data_max_u_sparse['y_test'])
# min u 
classifier_min_u_sparse = utils.deserialize_object('pickles\dataset_benchmark\min_u_classifier')
min_u_class_gb = deepcopy(classifier_min_u_sparse.strategies[1])
print(classifier_min_u_sparse.strategies[0])
class_result_min_u = min_u_class_gb.predict(data_min_u_bool)
data_min_u_hybrid['X_test'] = pd.concat([data_min_u_sparse['X_test'], class_result_min_u], axis=1)
data_min_u_hybrid['y_test'] = deepcopy(data_min_u_sparse['y_test'])

<thesis_package.aimodels.XGBoostClassifierStrategy object at 0x000001C2B2D163A0>
<thesis_package.aimodels.GradientBoostClassifierStrategy object at 0x000001C2B51B50D0>


In [9]:
print('data_max_u_hybrid X_train shape: ', data_max_u_hybrid['X_train'].shape)
print('data_max_u_hybrid y_train shape: ', data_max_u_hybrid['y_train'].shape)
print('data_max_u_hybrid X_test shape: ', data_max_u_hybrid['X_test'].shape)
print('data_max_u_hybrid y_test shape: ', data_max_u_hybrid['y_test'].shape)
print('\n')
print('data_min_u_hybrid X_train shape: ', data_min_u_hybrid['X_train'].shape)
print('data_min_u_hybrid y_train shape: ', data_min_u_hybrid['y_train'].shape)
print('data_min_u_hybrid X_test shape: ', data_min_u_hybrid['X_test'].shape)
print('data_min_u_hybrid y_test shape: ', data_min_u_hybrid['y_test'].shape)


data_max_u_hybrid X_train shape:  (5561, 22)
data_max_u_hybrid y_train shape:  (5561, 34)
data_max_u_hybrid X_test shape:  (9044, 22)
data_max_u_hybrid y_test shape:  (9044, 34)


data_min_u_hybrid X_train shape:  (36172, 21)
data_min_u_hybrid y_train shape:  (36172, 34)
data_min_u_hybrid X_test shape:  (9044, 21)
data_min_u_hybrid y_test shape:  (9044, 34)


## Training

In [12]:
balanced_hyper_params = {}
sparse_hyper_params = {}
for file in os.listdir('hyper_params_results_mcc'):
    if file.endswith('.csv') and 'regression_sparse' in file:
        df = pd.read_csv(os.path.join('hyper_params_results', file))
        sparse_hyper_params[file] = df
    elif file.endswith('.csv') and 'regression_balanced' in file:
        df = pd.read_csv(os.path.join('hyper_params_results', file))
        balanced_hyper_params[file] = df
    else:
        pass
import ast
def get_hyper_params_from_df(df):
    output = {}
    for row in df.iterrows():
        if row[1]['params'] != 'value':
            try:
                output[row[1]['params']] = ast.literal_eval(row[1]['value'])
            except :
                output[row[1]['params']] = row[1]['value']
    return output


# max_u
if 'hybrid_regressor_max_u.pickle' not in os.listdir('pickles\hybrid_models_benchmark'):
    print('training max u hybrid model... wait for it...')
    hyper_params = get_hyper_params_from_df(balanced_hyper_params['params_gradient_boost_regression_balanced_max_u.csv'])
    hybrid_regressor_max_u = my_ai.Context(strategy=my_ai.GradientBoostRegressorStrategy(hyper_params)) 
    hybrid_regressor_max_u.fit(data=data_max_u_hybrid)
    utils.serialize_object('pickles\hybrid_models_benchmark\hybrid_regressor_max_u', hybrid_regressor_max_u)
else: 
    print('loading max u hybrid model...')
    hybrid_regressor_max_u = utils.deserialize_object('pickles\hybrid_models_benchmark\hybrid_regressor_max_u')

# min_u 
if 'hybrid_regressor_min_u.pickle' not in os.listdir('pickles\hybrid_models_benchmark'):
    hyper_params = get_hyper_params_from_df(sparse_hyper_params['params_gradient_boost_regression_sparse_min_u.csv'])
    hybrid_regressor_min_u = my_ai.Context(my_ai.GradientBoostRegressorStrategy(hyper_params))
    hybrid_regressor_min_u.fit(data=data_min_u_hybrid)
    utils.serialize_object('pickles\hybrid_models_benchmark\hybrid_regressor_min_u', hybrid_regressor_min_u)
else:
    print('loading min u hybrid model...')
    hybrid_regressor_min_u = utils.deserialize_object('pickles\hybrid_models_benchmark\hybrid_regressor_min_u')

training max u hybrid model... wait for it...


## Testing

In [14]:
# Load models
# max_u
regression_max_u_focused = utils.deserialize_object('pickles\dataset_benchmark\max_u_regressor_focused')
max_u_reg_lr = deepcopy(regression_max_u_focused.strategies[0])
print(max_u_reg_lr)
reg_result = max_u_reg_lr.predict(data_max_u_sparse)

<thesis_package.aimodels.LinearRegressionStrategy object at 0x000001C2B51F7E20>
<thesis_package.aimodels.GradientBoostRegressorStrategy object at 0x000001C2C0D1D760>


In [17]:
# max_u
metric = metrics.Metrics()
_threshold = lambda experiment: max_u_threshold / data_max_u_sparse['scaler']['y'] if 'max_u' in experiment else min_u_threshold/ data_min_u_sparse['scaler']['y']
scaled_threshold = _threshold('max_u')
metric.get_prediction_scores(reg_result, data_max_u_sparse['y_test'], threshold=scaled_threshold)
reg_accuracy = metric.hybrid_accuracy
reg_precision = metric.hybrid_precision
reg_recall = metric.hybrid_recall
reg_f1 = metric.hybrid_f1
reg_mcc = metric.hybrid_mcc
reg_evaluation = pd.Series({'recall': reg_recall, 'precision': reg_precision, 'f1': reg_f1, 'accuracy': reg_accuracy, 'mcc': reg_mcc})
# same for hybrid model 
# max_u
hybrid_reg_result = hybrid_regressor_max_u.predict(data_max_u_hybrid['X_test'])
metric.get_prediction_scores(hybrid_reg_result, data_max_u_hybrid['y_test'], threshold=scaled_threshold)
hybrid_reg_accuracy = metric.hybrid_accuracy
hybrid_reg_precision = metric.hybrid_precision
hybrid_reg_recall = metric.hybrid_recall
hybrid_reg_f1 = metric.hybrid_f1
hybrid_reg_mcc = metric.hybrid_mcc
hybrid_reg_evaluation = pd.Series({'recall': hybrid_reg_recall, 'precision': hybrid_reg_precision, 'f1': hybrid_reg_f1, 'accuracy': hybrid_reg_accuracy, 'mcc': hybrid_reg_mcc})
pd.DataFrame([reg_evaluation, hybrid_reg_evaluation], index=['regression', 'hybrid_regression'])


true_positives_ctr:  2
true_negatives_ctr:  288882
false_positives_ctr:  13665
false_negatives_ctr:  4947
6012819352381992129


KeyError: 'X_test'

In [None]:
# min_u
regression_min_u_sparse = utils.deserialize_object('pickles\dataset_benchmark\min_u_regressor_sparse')
min_u_reg_gb = deepcopy(regression_min_u_sparse.strategies[1])
print(min_u_reg_gb)
reg_result = min_u_reg_gb.predict(data_min_u_sparse)

In [None]:
# min_u
scaled_threshold = _threshold('min_u')
metric.get_prediction_scores(reg_result, data_min_u_sparse['y_test'], threshold=scaled_threshold)
reg_accuracy = metric.hybrid_accuracy
reg_precision = metric.hybrid_precision
reg_recall = metric.hybrid_recall
reg_f1 = metric.hybrid_f1
reg_mcc = metric.hybrid_mcc
reg_evaluation = pd.Series({'recall': reg_recall, 'precision': reg_precision, 'f1': reg_f1, 'accuracy': reg_accuracy, 'mcc': reg_mcc})
# same for hybrid model 
# min_u
hybrid_reg_result = hybrid_regressor_min_u.predict(data_min_u_hybrid['X_test'])
metric.get_prediction_scores(hybrid_reg_result, data_min_u_hybrid['y_test'], threshold=scaled_threshold)
hybrid_reg_accuracy = metric.hybrid_accuracy
hybrid_reg_precision = metric.hybrid_precision
hybrid_reg_recall = metric.hybrid_recall
hybrid_reg_f1 = metric.hybrid_f1
hybrid_reg_mcc = metric.hybrid_mcc
hybrid_reg_evaluation = pd.Series({'recall': hybrid_reg_recall, 'precision': hybrid_reg_precision, 'f1': hybrid_reg_f1, 'accuracy': hybrid_reg_accuracy, 'mcc': hybrid_reg_mcc})
pd.DataFrame([reg_evaluation, hybrid_reg_evaluation], index=['regression', 'hybrid_regression'])