# Import libraries

In [None]:
import warnings
warnings.filterwarnings( 'ignore' )

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Basic libraries
import os
import math
import json
import pickle
import numpy    as np
import pandas   as pd
from   datetime import datetime
import matplotlib
import matplotlib.pyplot as plt

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Sklearn
from sklearn                 import metrics
from sklearn.model_selection import train_test_split
from sklearn                 import preprocessing
from sklearn.metrics         import ConfusionMatrixDisplay
from sklearn.metrics         import PrecisionRecallDisplay
import skopt

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# XGBoost
import xgboost

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# User libraries
from utils.Logger   import *

# Parameters

In [None]:
# Hyperparameter-optimisation budget: total objective evaluations and how
# many of them are random initial points
n_calls, n_random_starts = 10, 1

# XGBoost settings: boosting-round cap, early-stopping patience, random seed
n_estimators, early_stopping_rounds, seed = 1000, 50, 42

# Toggles the logger calls throughout the notebook
VERBOSE = True

# Output figures are written below 'Figures'; make sure the folder exists
os.makedirs('Figures', exist_ok=True)

In [None]:
# Initiate logger
#
# The logger writes to 'logs.log'. `init_logger` is presumably provided by the
# star-import from utils.Logger — TODO confirm.
# NOTE: `logger` is only bound when VERBOSE is truthy, so any later use of it
# must be guarded by the same flag, otherwise a NameError is raised.
if VERBOSE:
    logger = init_logger( log_file = 'logs.log' ) 

# Loading data

## Training data

In [None]:
# Read the training set from disk
df_train = pd.read_csv('Data/train_data.csv')

if VERBOSE:
    # The second figure counts every column of the frame, i.e. it includes
    # the last column, which is later split off as the label.
    logger.info('Training data were loaded')
    logger.info('Number of instances:  {}'.format(len(df_train)))
    logger.info('Number of features:   {}'.format(len(df_train.columns)))

# Preview the first rows
df_train.head(3)

## Testing data

In [None]:
# Read the held-out testing set from disk
df_test = pd.read_csv('Data/test_data.csv')

if VERBOSE:
    # The second figure counts every column of the frame, i.e. it includes
    # the last column, which is later split off as the label.
    logger.info('Testing data were loaded')
    logger.info('Number of instances:  {}'.format(len(df_test)))
    logger.info('Number of features:   {}'.format(len(df_test.columns)))

# Preview the first rows
df_test.head(3)

## Training/Testing sets

In [None]:
# Training/Validation data
# Every column but the last is a feature; the last column is the label.
# 20% of the training file is held out as a validation set, with a fixed
# random_state so the split is repeatable.
trainX, validX, trainY, validY = train_test_split(
    df_train.iloc[:, :-1],
    df_train.iloc[:, -1],
    test_size=0.2,
    random_state=seed,
)

# Testing data (same column layout as the training file)
testX, testY = df_test.iloc[:, :-1], df_test.iloc[:, -1]

# Hyperparameter optimization

In [None]:
# Initiate mlflow server
# Command: mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./artifacts --host 127.0.0.1 --port 1983
# NOTE: the host/port passed to `mlflow server` must match the tracking URI
# configured below (127.0.0.1:1983), otherwise the calls here cannot reach it.
# 
import mlflow
from   mlflow.models.signature import infer_signature

# Point the client at the tracking server and select the experiment under
# which all runs of this notebook are recorded
mlflow.set_tracking_uri("http://127.0.0.1:1983/")
mlflow.set_experiment("NVCR-Experiments")

if VERBOSE:
    logger.info('MLFlow server is connected')

In [None]:
class Parameter_Evaluation():
    """Train, score and track one hyperparameter configuration per call.

    An instance holds the training/validation data and the estimator to tune.
    `evaluate_params` is meant to be used as the objective of a minimiser: it
    fits the estimator with the given hyperparameters inside an MLflow run,
    logs metrics, figures and the model, and returns the negated validation
    AUC.

    The class relies on these module-level names being defined by the
    surrounding notebook: `mlflow`, `infer_signature`, `metrics`,
    `ConfusionMatrixDisplay`, `PrecisionRecallDisplay`, `plt`,
    `early_stopping_rounds` and (when VERBOSE is truthy) `logger`.
    """

    def __init__(self, trainX, trainY, validX, validY, VERBOSE = True):
        """Store the data splits and reset the bookkeeping counters.

        Parameters
        ----------
        trainX, trainY : features / labels used to fit the model.
        validX, validY : features / labels used for early stopping and scoring.
            The label containers must expose `.values` (see `evaluate_params`).
        VERBOSE : when truthy, each trial is reported through the global `logger`.
        """
        # Data
        self.trainX = trainX
        self.trainY = trainY
        self.validX = validX
        self.validY = validY
        # Estimator to tune; assigned through select_model()
        self.model = None
        #
        self.VERBOSE = VERBOSE
        # Number of iterations (1-based index of the next trial)
        self.Iter = 1
        # Best (highest) validation AUC seen so far
        self.best_score = 0

    def select_model(self, model):
        """Register the estimator that `evaluate_params` configures and fits."""
        self.model = model

    @staticmethod
    def _geometric_mean(CM):
        """Return the geometric mean of the per-class recalls of a 2x2 confusion matrix.

        Computed as sqrt(prod(diagonal)) / sqrt(row0_total * row1_total).
        Returns 0.0 when the matrix is not 2x2 or when one of the rows is
        empty, instead of failing with an IndexError / ZeroDivisionError;
        this mirrors the 0.0 fallback used for an undefined AUC.
        """
        CM = np.asarray(CM)
        if CM.shape != (2, 2):
            return 0.0

        denominator = math.sqrt(CM[0, :].sum() * CM[1, :].sum())
        if denominator == 0:
            return 0.0

        return math.sqrt(np.diag(CM).prod()) / denominator

    def get_performance_evaluation(self, y, pred):
        """Compute classification metrics for true labels `y` and predictions `pred`.

        Returns
        -------
        tuple
            (Accuracy in percent, AUC, Precision, Recall, GM, confusion matrix).
            AUC falls back to 0.0 when it cannot be computed.
        """
        Accuracy  = 100.0 * metrics.accuracy_score(y, pred)

        # NOTE(review): callers pass the hard labels from predict(), so this
        # AUC is computed from class labels rather than predicted
        # probabilities — confirm that this is intended.
        try:
            AUC   = metrics.roc_auc_score(y, pred)
        except ValueError:
            # The score is undefined for this input (e.g. a single class in y)
            AUC   = 0.0
        if math.isnan(AUC):
            # Guard against an undefined score being reported as NaN
            AUC   = 0.0

        Recall    = metrics.recall_score(y, pred)
        Precision = metrics.precision_score(y, pred)

        # Calculate Confusion Matrix (CM) and the geometric mean derived from it
        CM = metrics.confusion_matrix(y, pred)
        GM = self._geometric_mean(CM)

        return Accuracy, AUC, Precision, Recall, GM, CM

    def _evaluate_split(self, model, X, y, metric_prefix, title_suffix, file_suffix):
        """Score `model` on one data split, log the metrics and save its figures.

        Metrics are logged to the active MLflow run as `<metric_prefix>_<name>`.
        The confusion matrix and PR curve are written to
        `Figures/CM_<file_suffix>.png` and `Figures/PR-curve_<file_suffix>.png`.

        Returns (Accuracy, AUC, GM, pred) for the split.
        """
        pred = model.predict(X)
        Accuracy, AUC, Precision, Recall, GM, CM = self.get_performance_evaluation(y.values, pred)

        # Log metrics to MLflow
        mlflow.log_metric(metric_prefix + "_Accuracy", Accuracy)
        mlflow.log_metric(metric_prefix + "_AUC", AUC)
        mlflow.log_metric(metric_prefix + "_Recall", Recall)
        mlflow.log_metric(metric_prefix + "_Precision", Precision)
        mlflow.log_metric(metric_prefix + "_GM", GM)

        # Confusion matrix
        disp = ConfusionMatrixDisplay(confusion_matrix=CM, display_labels=model.classes_)
        ax = plt.figure(figsize=(4, 4)).gca()
        cm_display = disp.plot(ax=ax, cmap='Blues', colorbar=False)
        plt.title('Confusion Matrix - ' + title_suffix)
        cm_display.figure_.savefig('Figures/CM_' + file_suffix + '.png', dpi=100)
        plt.show()
        # Release the figure; otherwise open figures pile up across trials
        plt.close(cm_display.figure_)

        # PR-curve
        ax = plt.figure(figsize=(4, 3)).gca()
        pr_display = PrecisionRecallDisplay.from_estimator(model, X, y, ax=ax)
        pr_display.ax_.set_title("2-class Precision-Recall curve")
        pr_display.figure_.savefig('Figures/PR-curve_' + file_suffix + '.png', dpi=100)
        plt.show()
        plt.close(pr_display.figure_)

        return Accuracy, AUC, GM, pred

    def evaluate_params(self, params):
        """Fit the selected model with `params` and return the negated validation AUC.

        Parameters
        ----------
        params : dict
            Hyperparameters forwarded to the model's `set_params`.

        Returns
        -------
        float
            `-AUC` on the validation split, so that a minimiser maximises AUC.

        Raises
        ------
        RuntimeError
            If no model was registered through `select_model`.
        """
        if self.model is None:
            raise RuntimeError("No model selected; call select_model() before evaluate_params().")

        tag     = {"Simulation" : "sample-" + str(self.Iter), "model": "XGBoost"}
        runname = "XGBoost-test-run-" + str(self.Iter)

        # The context manager ends the run on exit, so no explicit end_run()
        # call is needed afterwards.
        with mlflow.start_run(run_name = runname):
            # Tags to help in tracking
            mlflow.set_tags(tag)

            # Log params/hyperparameters used in experiment
            mlflow.log_params(params)

            # Setup model (set_params configures the shared estimator in place)
            #
            model = self.model.set_params( **params )

            # Train model; the validation split is the last entry of eval_set
            # NOTE(review): recent XGBoost releases expect `eval_metric` and
            # `early_stopping_rounds` on the estimator rather than in fit() —
            # confirm against the installed version.
            model.fit(self.trainX, self.trainY,
                      eval_metric = 'auc',
                      eval_set = [ (self.trainX, self.trainY), (self.validX, self.validY) ],
                      early_stopping_rounds = early_stopping_rounds,
                      verbose = 100)

            # Evaluation on Training set
            self._evaluate_split(model, self.trainX, self.trainY, "train", "Train", "train")

            # Evaluation on Validation set (figure titles / file names keep
            # the historical "Test" / "test" wording)
            Accuracy, AUC, GM, pred = self._evaluate_split(model, self.validX, self.validY,
                                                           "valid", "Test", "test")

            # Export results
            if AUC > self.best_score:
                self.best_score = AUC
            if self.VERBOSE:
                logger.info( "Iteration {:3.0f} with Accuracy = {:6.3f}% AUC = {:6.3f} GM = {:6.3f}".format(self.Iter, Accuracy, AUC, GM) )

            # Include model signature
            signature = infer_signature(self.validX, pred)

            # Log model created
            mlflow.sklearn.log_model(model, artifact_path = "models", signature = signature)

            # Attach the data and figure folders to the run
            mlflow.log_artifacts("Data", artifact_path="Data")
            mlflow.log_artifacts("Figures", artifact_path="Figures")

        # Update Iteration counter
        self.Iter += 1

        return -AUC

In [None]:
# Forward the notebook-level VERBOSE flag: the evaluator logs through the
# global `logger`, which only exists when VERBOSE is set. Relying on the
# class default (True) would raise a NameError with VERBOSE = False.
evaluator = Parameter_Evaluation(trainX, trainY, validX, validY, VERBOSE = VERBOSE)

# Prediction model

In [None]:
# Setup model
#
# Base estimator for the search; the hyperparameters listed in `search_space`
# are overridden on every trial through set_params().
# `random_state` receives the notebook-wide `seed` so that the seed declared
# under the XGBoost parameters is actually used by the model (it is only
# expected to matter once a stochastic option such as row/column subsampling
# is enabled).
model = xgboost.XGBClassifier(n_estimators        = n_estimators, 
                              n_jobs              = -1, 
                              objective           = 'binary:logistic', 
                              validate_parameters = True, 
                              verbosity           = 1,
                              tree_method         = 'hist',
                              gamma               = 1.5,
                              reg_alpha           = 20,
                              reg_lambda          = 0.7,
                              random_state        = seed) 

if VERBOSE:
    logger.info('Model was setup')
model

## Parameters

In [None]:
# XGBoost
# Dimensions explored by the optimiser; each `name` must be a valid
# XGBClassifier parameter because the values are passed to set_params().
search_space = [ 
                 # NOTE(review): the original list contained 0.05 twice, which
                 # only made that value more likely to be drawn. The duplicate
                 # was removed; if it was a typo for another rate, add it here.
                 skopt.space.Categorical([0.1, 0.05, 0.01], name='learning_rate'),
                 skopt.space.Integer(3, 15,   name='max_depth'),
                 #
                 skopt.space.Integer(2, 10,   name='min_child_weight'),
                 skopt.space.Integer(2, 5,    name='max_leaves'),
                ]

In [None]:
# Keyword arguments forwarded to skopt.forest_minimize
HPO_params = {
              'n_calls':         n_calls,           # total number of objective evaluations
              'n_random_starts': n_random_starts,   # random evaluations before the surrogate is used
              'base_estimator':  'ET',              # extra-trees surrogate model
              'acq_func':        'EI',              # expected-improvement acquisition
              'random_state':    seed,              # makes the search reproducible
             }

## Hyperparameter optimization process

In [None]:
# Register the estimator that evaluate_params() configures (via set_params)
# and fits on every optimisation trial
evaluator.select_model( model )

In [None]:
@skopt.utils.use_named_args( search_space )
def objective( **params ):
    """Objective handed to the optimiser.

    The decorator maps the optimiser's positional point onto keyword
    arguments named after the dimensions of `search_space`; they are
    collected into `params` and scored by the shared `evaluator`.

    Returns the value of `evaluator.evaluate_params(params)`.
    """
    score = evaluator.evaluate_params( params )
    return score

In [None]:
%%time
# Minimise `objective` over `search_space` with the settings in HPO_params.
# `objective` returns the negated validation AUC, so minimising it searches
# for the configuration with the highest AUC.
results = skopt.forest_minimize(objective, search_space, **HPO_params)

## Get optimized hyperparameters

In [None]:
def to_named_params(results, search_space):
    """Map the optimiser's best point onto the names of the search dimensions.

    Parameters
    ----------
    results : object exposing `.x`, the sequence of best parameter values.
    search_space : sequence of dimensions, each exposing `.name`, in the same
        order as the values in `results.x`.

    Returns
    -------
    dict
        `{dimension.name: value}` for each (dimension, value) pair.
    """
    return {
        dimension.name: value
        for dimension, value in zip(search_space, results.x)
    }

In [None]:
# Name the values of the best point found by the optimiser
best_params = to_named_params(results, search_space)

# Report them: floats with three decimals, everything else as-is
print('[INFO] Optimized hyperparameters\n')
for parameter, value in best_params.items():
    print((' >%25s: %.3f' if isinstance(value, float) else ' >%25s: %s') % (parameter, value))




# Store optimized hyperparameters
#
def np_encoder(object):
    """`default=` hook for json: convert NumPy scalars to native Python values.

    Parameters
    ----------
    object : the value the JSON encoder could not serialise on its own.
        (The name shadows the builtin; it is kept so the signature stays
        unchanged.)

    Returns
    -------
    The plain Python equivalent of a NumPy scalar (via `.item()`).

    Raises
    ------
    TypeError
        For any other type. Falling through would return None and make the
        encoder silently write `null` in place of the unsupported value.
    """
    if isinstance(object, np.generic):
        return object.item()
    raise TypeError(f'Object of type {type(object).__name__} is not JSON serializable')

# The 'checkpoint' folder is not created by any of the visible code, so make
# sure it exists; otherwise open() fails with FileNotFoundError.
os.makedirs('checkpoint', exist_ok=True)

# Serialise the best hyperparameters; np_encoder handles NumPy scalar values
with open('checkpoint/Hyperparameters.json', 'w', encoding='utf-8') as f:
    json.dump(best_params, f, default=np_encoder)
