# Import libraries

In [None]:
import warnings
warnings.filterwarnings( 'ignore' )
# Basic libraries
import os
import numpy    as np
import pandas   as pd
# Optuna library
import optuna
# XGBoost
import xgboost
# User libraries
from utils.Logger import *
from utils.utils import *
from utils.mlflow_logging import MLflow_log_performance

# Parameters

In [None]:
# Number of Optuna trials
n_trials = 5
# Random seed (passed to the data split, the cross-validation and the Optuna sampler)
seed = 42
# Enable logging; the logger is only created and used when this is True
VERBOSE = True
# Number of splits for Stratified Cross-Validation
n_splits = 10
# Hold-out fraction (share of the data kept aside as the hold-out set)
test_size = 0.2

# Create temp directory for storing output figures.
# exist_ok=True keeps the cell re-runnable and avoids the separate
# "check, then create" step of the isdir()/mkdir() pair.
os.makedirs('Performance', exist_ok=True)

In [None]:
# Create the logger (given 'logs.log' as its log file); skipped when VERBOSE is off
if VERBOSE:
    logger = init_logger(log_file='logs.log')

# Loading data

In [None]:
# Load the dataset and discard every row that contains a missing value
df = pd.read_csv('Data/dataset.csv').dropna()

if VERBOSE:
    logger.info('Training data were loaded')
    logger.info(f'Number of instances:  {df.shape[0]}')
    # One column of the frame is the label, so it is excluded from the feature count
    logger.info(f'Number of features:   {df.shape[1] - 1}')

# Histogram of the 'Class' column
df['Class'].hist(figsize=(4, 2))

# Preview of the first rows
df.head(3)

## Split data (training/holdout set)

In [None]:
from sklearn.model_selection import train_test_split

# Name of the label column; used both for stratification and for building X / y
target_col = 'Class'

# Split into training and hold-out sets, stratified on the label column
df_train, df_test = train_test_split(df, test_size=test_size, stratify=df[target_col], random_state=seed)

# Get X & y by column name rather than by position, so the label cannot end up
# among the features if the column order of the CSV ever changes
trainX = df_train.drop(columns=target_col).values
trainY = df_train[target_col].values.astype('int')

testX = df_test.drop(columns=target_col).values
testY = df_test[target_col].values.astype('int')

# Hyperparameter optimization

In [None]:
# Initiate mlflow server before running this cell.
# Command: mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./artifacts --host 127.0.0.1 --port 5000
import mlflow

tracking_uri    = "http://127.0.0.1:5000/"
experiment_name = "Experiment1 XGBoost"

# Point MLflow at the tracking server and select the experiment to log into
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

if VERBOSE:
    logger.info('MLFlow server is connected')

## Prediction model

In [None]:
def objective(trial: optuna.trial.Trial) -> float:
    """Evaluate one Optuna trial of the XGBoost classifier.

    Samples a set of hyperparameters from the trial, evaluates the model with
    `cross_validation` on the training data and with `single_run` on the
    training / hold-out data, logs the outcome through
    `MLflow_log_performance`, and returns the value Optuna optimizes.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean of the 'AUC' entry of the cross-validation test results.
    """
    if VERBOSE:
        logger.info(f'Trial: {trial.number} started [{trial.datetime_start}]')

    # Hyperparameters: the first four are sampled by the trial, the rest are fixed
    params = {
        'n_estimators'      : trial.suggest_categorical('n_estimators', [50, 100, 200]),
        'learning_rate'     : trial.suggest_categorical('learning_rate' , [1e-3, 1e-4, 1e-5]),
        'max_depth'         : trial.suggest_int('max_depth', 3, 5),
        'reg_alpha'         : trial.suggest_categorical('reg_alpha', [10, 20, 30]),
        'reg_lambda'        : 1.0,
        'gamma'             : 1,
        'min_child_weight'  : 2,
        'max_leaves'        : 2,
    }

    # Setup model
    model = xgboost.XGBClassifier(
        objective           = 'multi:softmax',
        n_jobs              = -1,
        validate_parameters = True,
        verbosity           = 1,
        tree_method         = 'hist',
        **params,
    )

    # Cross-Validation on the training data (project helper; results are
    # indexed by metric name below, e.g. test_CV_results['AUC'])
    train_CV_results, test_CV_results, CM_cv = cross_validation(
        model=model, X=trainX, Y=trainY, n_splits=n_splits, seed=seed, VERBOSE=VERBOSE
    )

    # Single-run on the training / hold-out sets (project helper; also returns
    # the model and the predictions used for the signature below)
    model, train_results, test_results, CM, predictions = single_run(
        model=model, trainX=trainX, trainY=trainY, testX=testX, testY=testY, VERBOSE=VERBOSE
    )

    # Include model signature
    signature = mlflow.models.infer_signature(testX, predictions)
    # Log performance to MLflow
    MLflow_log_performance(trial.number, model,
                           train_CV_results, test_CV_results, CM_cv,
                           train_results, test_results, CM,
                           signature, params)

    # Report completion once, after the MLflow logging has actually finished
    if VERBOSE:
        logger.info(f'Trial: {trial.number} completed')

    # float() so the returned value matches the annotated return type
    return float(np.mean(test_CV_results['AUC']))

## Hyperparameter optimization process

In [None]:
# TPE sampler seeded with the notebook-wide seed
tpe_sampler = optuna.samplers.TPESampler(seed=seed)

# The objective value is maximized
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)

# Run the configured number of trials with a single job
study.optimize(objective, n_trials=n_trials, n_jobs=1)

In [None]:
# Delete temp directory
import shutil

# Only remove the folder when it is present, so re-running this cell
# does not fail with FileNotFoundError
if os.path.isdir('Performance'):
    shutil.rmtree('Performance')

### Visualization of the optimization history of the trials


In [None]:
# Plot the optimization history of the study; the figure is the cell output
history_fig = optuna.visualization.plot_optimization_history(study)
history_fig

### Visualization of hyperparameter relationships


In [None]:
# Use every hyperparameter recorded in the best trial as an axis of the plot
best_param_names = list(study.best_params)
optuna.visualization.plot_parallel_coordinate(study, params=best_param_names)

### Hyperparameter importances (with respect to trial duration)


In [None]:
def _trial_duration_seconds(t):
    """Return the duration of trial `t` in seconds."""
    return t.duration.total_seconds()

# Importances are computed against the trial duration, not the objective value
optuna.visualization.plot_param_importances(
    study,
    target=_trial_duration_seconds,
    target_name="duration",
)

### Get optimized hyperparameters

In [None]:
print('[INFO] Optimized hyperparameters\n')
for parameter, value in study.best_params.items():
    if isinstance(value, float):
        # %g instead of %.3f: with three fixed decimals the learning rates
        # 1e-4 and 1e-5 from the search space would both be printed as 0.000
        print(' >%25s: %g' % (parameter, value))
    else:
        print(' >%25s: %s' % (parameter, value))