# Import libraries

In [None]:
import warnings
warnings.filterwarnings( 'ignore' )

In [None]:
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Basic libraries
#
import math
import json
import pickle
import numpy    as np
import pandas   as pd
from   datetime import datetime

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Sklearn
#
from sklearn                 import metrics
from sklearn.model_selection import train_test_split
from sklearn                 import preprocessing


# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Sklearn-Optimization
#
import skopt


# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# XGBoost
#
import xgboost


# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# User libraries
#
from utils.Logger   import *

# Parameters

In [None]:
# ---------------------------------------------------------------
# Hyperparameter-optimisation budget (forwarded to the skopt optimiser)
#   n_calls         : value passed as `n_calls`
#   n_random_starts : value passed as `n_random_starts`
# ---------------------------------------------------------------
n_calls, n_random_starts = 100, 10

# ---------------------------------------------------------------
# XGBoost settings
#   n_estimators          : value passed to the XGBClassifier constructor
#   early_stopping_rounds : value passed to `fit` during the search
#   seed                  : random state used for the train/validation split
# ---------------------------------------------------------------
n_estimators = 1_000
early_stopping_rounds = 50
seed = 42

# ---------------------------------------------------------------
# Miscellaneous
#   VERBOSE : switch for the progress messages written through the logger
# ---------------------------------------------------------------
VERBOSE = True

In [None]:
# Initiate logger
#
# `logger` is called further down in this notebook outside of any
# `if VERBOSE:` guard, so the name has to exist even when verbose output is
# switched off; otherwise those cells fail with a NameError.
if VERBOSE:
    logger = init_logger( log_file = 'logs.log' )
else:
    import logging

    # Silent stand-in exposing the same `.info(...)` call: records go to a
    # NullHandler and are not propagated, so nothing is written anywhere.
    logger = logging.getLogger('notebook-silent')
    logger.addHandler(logging.NullHandler())
    logger.propagate = False

# Import data

**Dataset**

- Irrigation 


**Context**

The goal is to predict whether a region is 'irrigated' or 'drainaged', based on multi-temporal satellite data (indices).

# Loading data

## Training data

In [None]:
# Read the training split from disk
df_train = pd.read_csv('Data/Irrigation_train.csv')

if VERBOSE:
    # Report the frame size. `shape[1]` counts every column of the CSV,
    # i.e. the 'Irrigation' target column is included in that number.
    for message in (
        'Training data were loaded',
        f'Number of instances:  {df_train.shape[0]}',
        f'Number of features:   {df_train.shape[1]}',
    ):
        logger.info(message)

# Preview of the first rows (rendered as the cell output)
df_train.head(3)

## Testing data

In [None]:
# Read the testing split from disk
df_test = pd.read_csv('Data/Irrigation_test.csv')

if VERBOSE:
    # Report the frame size. `shape[1]` counts every column of the CSV,
    # i.e. the 'Irrigation' target column is included in that number.
    for message in (
        'Testing data were loaded',
        f'Number of instances:  {df_test.shape[0]}',
        f'Number of features:   {df_test.shape[1]}',
    ):
        logger.info(message)

# Preview of the first rows (rendered as the cell output)
df_test.head(3)

## Pre-processing data

In [None]:
# Encode the 'Irrigation' target column as integer class ids.
# The encoder learns its classes from the training labels only and is then
# re-used unchanged on the test labels, so both frames share one mapping.
LabelEncoding = preprocessing.LabelEncoder()

df_train['Irrigation'] = LabelEncoding.fit_transform(df_train['Irrigation'])
df_test['Irrigation'] = LabelEncoding.transform(df_test['Irrigation'])

if VERBOSE:
    logger.info('Target class was transformed using Label-Encoding')

# Training/Testing sets

In [None]:
# Training/Validation data
# Features are every column except the last one; the last column is the target.
# 10% of the training rows are held out for validation, with a fixed
# random state so that the split is the same on every run.
trainX, validX, trainY, validY = train_test_split(
    df_train.iloc[:, :-1],
    df_train.iloc[:, -1],
    test_size=0.1,
    random_state=seed,
)

# # Convert dataset to special XGBoost optimised data structure
# dtrain = xgboost.DMatrix(trainX, label = trainY)
# dvalid = xgboost.DMatrix(validX, label = validY)


# Testing data (same column layout as the training frame)
testX, testY = df_test.iloc[:, :-1], df_test.iloc[:, -1]

# Hyperparameter optimization

In [None]:
# Connect this notebook to the MLflow tracking server.
# The server has to be started separately beforehand, e.g.:
# Command: mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./artifacts --host 0.0.0.0 --port 5000
# 
import mlflow
from   mlflow.models.signature import infer_signature

# Tracking endpoint, and the experiment under which all runs are grouped.
# NOTE(review): 0.0.0.0 is the address the server binds to; used as a client
# URL it may not be reachable on every OS - consider 127.0.0.1 or the real
# host name. Confirm against the deployment.
mlflow.set_tracking_uri("http://0.0.0.0:5000/")
mlflow.set_experiment("Irrigation-Experiment")

if VERBOSE:
    # NOTE(review): this message is logged without an explicit connectivity
    # check of its own - presumably set_experiment fails first if the server
    # is unreachable; confirm.
    logger.info('MLFlow server is connected')

In [None]:
class Parameter_Evaluation():
    """Train, score and track one hyperparameter configuration per call.

    Each call to :meth:`evaluate_params` applies one set of hyperparameters to
    the selected model, fits it on the training split, scores it on the
    validation split, records parameters/metrics/model in an MLflow run and
    returns the negated AUC (so that a minimiser maximises the AUC).

    The class relies on names defined at notebook level: ``mlflow``,
    ``infer_signature``, ``metrics``, ``math``, ``np``, ``logger`` and
    ``early_stopping_rounds``.
    """

    def __init__(self, trainX, trainY, validX, validY, VERBOSE = True):
        """Store the data splits and reset the bookkeeping counters.

        Parameters
        ----------
        trainX, trainY
            Training features / labels handed to ``model.fit``.
        validX, validY
            Validation features / labels used as ``eval_set`` and for scoring.
        VERBOSE : bool
            When true, one summary line per evaluation is written to ``logger``.
        """
        # Data
        self.trainX = trainX
        self.trainY = trainY
        self.validX = validX
        self.validY = validY
        # Logging switch
        self.VERBOSE = VERBOSE
        # Number of iterations (1-based, used in run names and tags)
        self.Iter        = 1
        # Best (highest) AUC seen so far
        self.best_score  = 0

    def select_model(self, model):
        """Register the estimator whose hyperparameters will be evaluated."""
        self.model = model

    @staticmethod
    def _geometric_mean(CM):
        """Return sqrt(prod(diag)) / sqrt(row0_total * row1_total) of a 2x2 matrix.

        Returns 0.0 when the value is undefined (matrix is not 2x2, or one of
        the two rows sums to zero) instead of raising, mirroring the 0.0
        fallback used for the AUC.
        """
        if CM.shape != (2, 2):
            return 0.0
        denominator = math.sqrt( CM[0, :].sum() * CM[1, :].sum() )
        if denominator == 0:
            return 0.0
        return math.sqrt( np.diag( CM ).prod() ) / denominator

    def evaluate_params(self, params):
        """Fit and score the model with ``params``; return the negated AUC.

        Parameters
        ----------
        params : dict
            Hyperparameters forwarded to ``self.model.set_params(**params)``
            and logged to MLflow.

        Returns
        -------
        float
            ``-AUC`` measured on the validation split.
        """
        tag     = {"Simulation" : "sample-" + str(self.Iter), "model": "XGBoost"}
        runname = "XGBoost-test-run-" + str(self.Iter)

        # The context manager closes the MLflow run on exit (also when an
        # exception is raised), so no explicit mlflow.end_run() is needed.
        with mlflow.start_run(run_name = runname):
            # Tags to help in tracking
            mlflow.set_tags(tag)

            # Log params/hyperparameters used in experiment
            mlflow.log_params(params)

            # Setup model
            #
            model = self.model.set_params( **params )

            # Train model
            #
            # NOTE(review): newer XGBoost releases expect `eval_metric` and
            # `early_stopping_rounds` on the estimator rather than in fit() -
            # confirm against the installed version.
            model.fit(self.trainX, self.trainY,
                      eval_metric = 'auc',
                      eval_set = [ (self.validX, self.validY) ],
                      early_stopping_rounds = early_stopping_rounds)

            # Evaluation
            #
            pred = model.predict( self.validX )

            # scikit-learn metrics take (y_true, y_pred) in that order.
            # Swapping them exchanges recall with precision and changes AUC.
            Accuracy  = 100.0 * metrics.accuracy_score( self.validY, pred )
            try:
                # NOTE(review): AUC is computed from hard class predictions;
                # scoring predicted probabilities would be the usual choice.
                AUC   = metrics.roc_auc_score( self.validY, pred )
            except ValueError:
                # roc_auc_score is undefined when y_true holds a single class
                AUC   = 0.0
            Recall    = metrics.recall_score( self.validY, pred )
            Precision = metrics.precision_score( self.validY, pred )

            # Calculate Confusion Matrix (CM) and the geometric mean derived from it
            CM = metrics.confusion_matrix(self.validY, pred)
            GM = self._geometric_mean( CM )

            # Keep track of the best AUC seen so far
            if (AUC > self.best_score): self.best_score = AUC

            if self.VERBOSE:
                logger.info( "Iteration {:3.0f} with Accuracy = {:6.3f}% AUC = {:6.3f} GM = {:6.3f}".format(self.Iter, Accuracy, AUC, GM) )

            mlflow.log_metric("Accuracy",       Accuracy)
            mlflow.log_metric("AUC",            AUC)
            mlflow.log_metric("Recall",         Recall)
            mlflow.log_metric("Precision",      Precision)
            mlflow.log_metric("Geometric Mean", GM)

            signature = infer_signature(self.validX, pred)

            # Log model created
            mlflow.sklearn.log_model(model, artifact_path = "models", signature = signature)

        # Update Iteration counter
        self.Iter += 1

        # Negated because the optimiser minimises its objective
        return( -AUC )

In [None]:
# Forward the notebook-level VERBOSE switch: without it the evaluator keeps its
# default (VERBOSE = True) and logs on every iteration regardless of the setting.
evaluator = Parameter_Evaluation(trainX, trainY, validX, validY, VERBOSE = VERBOSE)

# Prediction model

In [None]:
# Setup model
#
# `seed` is declared with the XGBoost parameters but was never handed to the
# estimator; it is passed as `random_state` so that the declared seed is the
# one the model actually uses.
model = xgboost.XGBClassifier(n_estimators        = n_estimators, 
                              n_jobs              = -1, 
                              objective           = 'binary:logistic', 
                              random_state        = seed,
                              validate_parameters = True, 
                              verbosity           = 1) 

if VERBOSE:
    logger.info('Model was setup')

# Display the configured estimator as the cell output
model

In [None]:
# import xgboost



# # Convert dataset to special XGBoost optimised data structure
# dtrain_matrix = xgboost.DMatrix(trainX, label = trainY)
# # dcomp_matrix  = xgb.DMatrix(cd_enc)


# # List of parameters
# params = {
#     'booster': 'gbtree',
#     'objective': 'reg:squarederror',
#     'learning_rate': 0.3,
#     'n_jobs': -1,
# }

# # Fit the model
# xgb_cv = xgboost.cv(params                = params,
#                     dtrain                = dtrain_matrix,
#                     num_boost_round       = num_boost_round,
#                     nfold                 = nfold, 
#                     show_stdv             = False,
#                     metrics               = ['auc', 'aucpr'], 
#                     as_pandas             = True,
#                     stratified            = True,
#                     seed                  = seed,
#                     early_stopping_rounds = early_stopping_rounds, 
# )



## Parameters

In [None]:
# XGBoost search space.
# Every dimension's `name` is used as a keyword when the sampled point is
# forwarded to `model.set_params(**params)`, so it must be a valid parameter
# name of the estimator. The order of the list is the order of `results.x`.
_dims = skopt.space

search_space = [
    # Step size and tree depth
    _dims.Real(0.01, 0.1, name='learning_rate'),
    _dims.Integer(3, 15, name='max_depth'),
    # Regularisation terms
    _dims.Real(1, 9, name='gamma'),
    _dims.Integer(40, 180, name='reg_alpha'),
    _dims.Real(0, 1, name='reg_lambda'),
    # Leaf constraints
    _dims.Integer(2, 10, name='min_child_weight'),
    _dims.Integer(2, 5, name='max_leaves'),
    # Booster type
    _dims.Categorical(categories=['gbtree', 'dart'], name="booster"),
]

In [None]:
# Keyword arguments unpacked into the skopt optimiser call
# (the evaluation budget comes from the notebook-level parameters).
HPO_params = dict(
    n_calls=n_calls,
    n_random_starts=n_random_starts,
    base_estimator='ET',
    acq_func='EI',
)

## Hyperparameter optimization process

In [None]:
# Register the estimator that the evaluator re-parameterises on every iteration
evaluator.select_model(model=model)

In [None]:
@skopt.utils.use_named_args(search_space)
def objective(**hyperparameters):
    """Objective handed to the optimiser.

    The decorator turns the sampled point into keyword arguments named after
    the dimensions of `search_space`; they are passed on as one dict and the
    evaluator's return value is returned unchanged.
    """
    score = evaluator.evaluate_params(hyperparameters)
    return score

In [None]:
%%time
results = skopt.forest_minimize(
    func=objective,
    dimensions=search_space,
    **HPO_params,
)

## Get optimized hyperparameters

In [None]:
def to_named_params(results, search_space):
    """Map the optimiser's best point onto the names of the search dimensions.

    Parameters
    ----------
    results
        Object exposing ``x``, the sequence of best parameter values.
    search_space
        Sequence of dimensions, each exposing ``name``, in the same order as
        the values in ``results.x``.

    Returns
    -------
    dict
        ``{dimension.name: value}`` in search-space order. Pairs are formed
        with ``zip``, so surplus entries on either side are ignored.
    """
    return {
        dimension.name: value
        for dimension, value in zip(search_space, results.x)
    }

In [None]:
# Name the values of the best point found by the optimiser
best_params = to_named_params(results, search_space)

# Console report: floats are shown with three decimals, everything else as-is
print('[INFO] Optimized hyperparameters\n')
for name, value in best_params.items():
    shown = '%.3f' % value if isinstance(value, float) else str(value)
    print(' >%25s: %s' % (name, shown))




# Store optimized hyperparameters
#
def np_encoder(object):
    """``default=`` hook for ``json.dumps`` that unwraps NumPy scalars.

    Parameters
    ----------
    object
        The value the JSON encoder could not serialise on its own.

    Returns
    -------
    The equivalent built-in Python scalar (``object.item()``) when ``object``
    is a NumPy scalar.

    Raises
    ------
    TypeError
        For any other type. Returning ``None`` here (as an implicit fall-through
        would) makes ``json.dumps`` silently write ``null`` instead of
        reporting the unsupported value.
    """
    if isinstance(object, np.generic):
        return object.item()
    raise TypeError(f'Object of type {type(object).__name__} is not JSON serializable')

import os

# Serialise first: if a value cannot be encoded, the error is raised before
# the target file is opened (and therefore before it is truncated).
hyperparameters_json = json.dumps( best_params, default = np_encoder )

# Make sure the output directory exists, so the results of the search are not
# lost to a FileNotFoundError on a fresh checkout.
os.makedirs('checkpoint', exist_ok = True)

with open('checkpoint/Hyperparameters.json', 'w', encoding='utf-8') as f:
    f.write( hyperparameters_json )


# Optimized (best) model setup

In [None]:
# Define model
#
# Apply the optimised hyperparameters to the estimator
model.set_params( **best_params )
if VERBOSE:
    # Guarded like every other progress message: `logger` is only created
    # when VERBOSE is on.
    logger.info('Optimized-Model was loaded')

# Train model
#
# The shared `early_stopping_rounds` parameter is used (instead of a separate
# literal) so that the final model is trained under the same stopping rule as
# the configurations evaluated during the search.
# NOTE(review): newer XGBoost releases expect `eval_metric` and
# `early_stopping_rounds` on the estimator rather than in fit() - confirm
# against the installed version.
model.fit(trainX, trainY,
          eval_metric = 'auc',
          eval_set = [ (validX, validY) ],
          early_stopping_rounds = early_stopping_rounds)

if VERBOSE:
    logger.info('Optimized-Model trained')


# Save trained model
#
import os
import pickle
filename = 'checkpoint/model.pkl'

# Create the output directory if needed, and close the file deterministically
# (the handle is no longer left to the garbage collector).
os.makedirs(os.path.dirname(filename), exist_ok = True)
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

if VERBOSE:
    logger.info(f'Model saved in {filename}')

In [None]:
# Get predictions
#
pred = model.predict( testX )

# Calculate Confusion Matrix (CM)
#
CM  = metrics.confusion_matrix(testY, pred)

# `logger` is only created when VERBOSE is on; fall back to print so that the
# evaluation is still reported (and does not fail with a NameError) otherwise.
report = logger.info if VERBOSE else print

# scikit-learn metrics take (y_true, y_pred) in that order; roc_auc_score in
# particular is not symmetric, so the ground truth has to come first.
# NOTE(review): AUC is computed from hard class predictions; scoring predicted
# probabilities would be the usual choice.
report( 30*"-" )
report( "*** Evaluation ***")
report( "> Accuracy:  %.2f%%" % (100*metrics.accuracy_score( testY, pred )) )
report( "> AUC:       %.3f"   % metrics.roc_auc_score(testY, pred) )
report( "> Recall:    %.3f"   % metrics.recall_score(testY, pred) )
report( "> Precision: %.3f"   % metrics.precision_score(testY, pred) )
report( "> GM:        %.3f\n" % (math.sqrt( np.diag( CM ).prod() ) / math.sqrt( CM[0, :].sum() * CM[1, :].sum() )) )


# Display the confusion matrix as the cell output
CM

In [None]:
# ModelID = '0a7e804a25b74a18b560817b8d871e48'
# logged_model = 'runs:/{}/models'.format( ModelID )
# loaded_model = mlflow.pyfunc.load_model(logged_model)

# # Get predictions
# #
# pred = loaded_model.predict( testX )

# # Calculate Confusion Matrix (CM)
# #
# CM  = metrics.confusion_matrix(testY, pred)
# #
# #
# logger.info( 30*"-" )
# logger.info( "*** Evaluation ***")
# logger.info( "> Accuracy:  %.2f%%" % metrics.accuracy_score( pred, testY ) )
# logger.info( "> AUC:       %.3f"   % metrics.roc_auc_score(pred, testY) )
# logger.info( "> Recall:    %.3f"   % metrics.recall_score(testY, pred) )
# logger.info( "> Precision: %.3f"   % metrics.precision_score(testY, pred) )
# logger.info( "> GM:        %.3f\n" % (math.sqrt( np.diag( CM ).prod() ) / math.sqrt( CM[0, :].sum() * CM[1, :].sum() )) )


# CM