# Import libraries

In [1]:
import warnings
warnings.filterwarnings( 'ignore' )

In [2]:
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Basic libraries
#
import math
import json
import numpy    as np
import pandas   as pd
from   datetime import datetime

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Sklearn
#
from sklearn                 import metrics
from sklearn.model_selection import train_test_split
from sklearn                 import preprocessing


# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Sklearn-Optimization
#
import skopt


# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# XGBoost
#
import xgboost


# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# User libraries
#
from utils.Logger   import *

# Parameters

In [3]:
# HPO parameters 
#
n_calls         = 10
n_random_starts = 2


# XGBoost - parameters
#
n_estimators          = 10
early_stopping_rounds = 10
seed                  = 42


# Other parameters
#
VERBOSE = True

In [4]:
# Initiate logger
#
if VERBOSE:
    logger = init_logger( log_file = 'logs.log' ) 

# Import data

**Dataset**

- Irrigation 


**Context**

The score is to predict if a region is 'irrigated' or 'drainaged' based on satellite multi-temporal data (indices)

# Loading data

## Training data

In [5]:
df_train = pd.read_csv('Data/Irrigation_train.csv')

logger.info(f'Training data were loaded')
logger.info(f'Number of instances:  {df_train.shape[0]}')
logger.info(f'Number of features:   {df_train.shape[1]}')

df_train.head( 3 )

[INFO] Training data were loaded
[INFO] Number of instances:  82503
[INFO] Number of features:   29


Unnamed: 0,B11_5,B12_5,B05_5,B06_5,NDVI_5,NDBI_5,LSWI2_5,B11_7,B12_7,B05_7,...,NDBI_9,LSWI2_9,B11_11,B12_11,B05_11,B06_11,NDVI_11,NDBI_11,LSWI2_11,Irrigation
0,0.325158,0.211388,0.162878,0.199771,0.16384,-0.267323,0.014312,0.324716,0.20425,0.17465,...,-0.264708,0.041466,0.31536,0.201311,0.161962,0.19163,0.159782,-0.262815,0.039018,0
1,0.28381,0.185874,0.204715,0.292734,0.321123,-0.359163,0.263899,0.283403,0.185829,0.195995,...,-0.35367,0.28756,0.262824,0.169689,0.170315,0.27006,0.288574,-0.334296,0.24833,1
2,0.323059,0.200238,0.199566,0.288144,0.34712,-0.39276,0.259642,0.350821,0.223944,0.210791,...,-0.439905,0.268981,0.323004,0.19542,0.183496,0.256162,0.310311,-0.361256,0.211741,1


## Testing data

In [6]:
df_test = pd.read_csv('Data/Irrigation_test.csv')

logger.info(f'Testing data were loaded')
logger.info(f'Number of instances:  {df_test.shape[0]}')
logger.info(f'Number of features:   {df_test.shape[1]}')

df_test.head( 3 )

[INFO] Testing data were loaded
[INFO] Number of instances:  9168
[INFO] Number of features:   29


Unnamed: 0,B11_5,B12_5,B05_5,B06_5,NDVI_5,NDBI_5,LSWI2_5,B11_7,B12_7,B05_7,...,NDBI_9,LSWI2_9,B11_11,B12_11,B05_11,B06_11,NDVI_11,NDBI_11,LSWI2_11,Irrigation
0,0.4211,0.31128,0.31016,0.35398,0.105571,-0.175377,0.091689,0.42082,0.30632,0.31042,...,-0.193568,0.095023,0.38828,0.27398,0.28598,0.33056,0.105511,-0.177666,0.116607,0
1,0.423193,0.290619,0.254927,0.281671,0.13756,-0.244459,0.019792,0.431956,0.301435,0.257873,...,-0.239738,0.043115,0.379723,0.252394,0.238974,0.265378,0.135794,-0.245699,0.070134,1
2,0.319944,0.240518,0.192145,0.214765,0.104318,-0.246053,-0.044005,0.343052,0.246591,0.211662,...,-0.236559,0.004437,0.333574,0.236871,0.211484,0.236254,0.110759,-0.240793,0.024197,0


## Pre-processing data

In [7]:
# Setup Label-Encoder
#
LabelEncoding = preprocessing.LabelEncoder()

# Fit encoder
#
LabelEncoding.fit( df_train[ 'Irrigation' ] )

# Apply encoder
df_train[ 'Irrigation' ] = LabelEncoding.transform( df_train['Irrigation' ] )
df_test[ 'Irrigation' ]  = LabelEncoding.transform( df_test[ 'Irrigation']  )

logger.info('Target class was transformed using Label-Encoding')

[INFO] Target class was transformed using Label-Encoding


# Training/Testing sets

In [8]:
# Training/Validation data
trainX = df_train.iloc[:, :-1]
trainY = df_train.iloc[:,  -1]
trainX, validX, trainY, validY = train_test_split(trainX, trainY, test_size = 0.2, random_state = seed)

# # Convert dataset to special XGBoost optimised data structure
# dtrain = xgboost.DMatrix(trainX, label = trainY)
# dvalid = xgboost.DMatrix(validX, label = validY)


# Testing data
testX  = df_test.iloc[:, :-1]
testY  = df_test.iloc[:,  -1]

# Hyperparameter optimization

In [9]:
# Initiate mlflow server
# Command: mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./artifacts --host 0.0.0.0 --port 5000
# 
import mlflow
from   mlflow.models.signature import infer_signature

mlflow.set_tracking_uri("http://0.0.0.0:5000/")
mlflow.set_experiment("Irrigation-Experiment")

logger.info('MLFlow server is connected')

2022/09/21 02:10:53 INFO mlflow.tracking.fluent: Experiment with name 'Irrigation-Experiment' does not exist. Creating a new experiment.
[INFO] MLFlow server is connected


In [10]:
class Parameter_Evaluation():
    def __init__(self, trainX, trainY, validX, validY):
        # Data
        self.trainX = trainX
        self.trainY = trainY
        self.validX = validX
        self.validY = validY
        # Number of iterations
        self.Iter        = 1
        # Best score
        self.best_score  = 0
        
    def select_model(self, model):
        self.model = model

        
    def evaluate_params(self, params):
        
        tag     = {"Simulation" : "sample-" + str(self.Iter), "model": "XGBoost"}
        runname = "XGBoost-test-run-" + str(self.Iter)

        with mlflow.start_run(run_name = runname) as run:
            # Tags to help in tracking
            mlflow.set_tags(tag)

            # Log params/hyperparameters used in experiement
            mlflow.log_params(params)
            

            # Setup model
            #
            model =  self.model.set_params( **params )
            
            # Train model
            #
            model.fit(self.trainX, self.trainY, 
                    eval_metric = 'auc', 
                    eval_set = [ (self.validX, self.validY) ],
                    early_stopping_rounds = early_stopping_rounds);

        
            # Evaluation
            #
            pred = model.predict( self.validX )
            #
            Accuracy  = 100.0 * metrics.accuracy_score( pred, self.validY )
            try:
                AUC   = metrics.roc_auc_score( pred, self.validY )
            except:
                AUC   = 0.0
            Recall    = metrics.recall_score( pred, self.validY )
            Precision = metrics.precision_score( pred, self.validY )        
            
            # Calculate Confusion Matrix (CM)
            CM = metrics.confusion_matrix(self.validY, pred)
            GM = math.sqrt( np.diag( CM ).prod() ) / math.sqrt( CM[0, :].sum() * CM[1, :].sum() )

            
            # Export results
            if (AUC > self.best_score): self.best_score = AUC
            
            logger.info( "Iteration {:3.0f} with Accuracy = {:6.3f}% AUC = {:6.3f} GM = {:6.3f}".format(self.Iter, Accuracy, AUC, GM) )
               
            

            mlflow.log_metric("Accuracy",       Accuracy)
            mlflow.log_metric("AUC",            AUC)
            mlflow.log_metric("Recall",         Recall)
            mlflow.log_metric("Precision",      Precision)
            mlflow.log_metric("Geometric Mean", GM)
            
            signature = infer_signature(self.validX, pred)
            
            # Log model created
            mlflow.sklearn.log_model(model, artifact_path = "models", signature = signature) 

        mlflow.end_run()
          
        # Update Iteration counter
        self.Iter += 1
        
        
        return( -AUC )

In [11]:
evaluator = Parameter_Evaluation(trainX, trainY, validX, validY)

# Prediction model

In [12]:
# Setup model
#
model = xgboost.XGBClassifier(n_estimators        = n_estimators, 
                              n_jobs              = -1, 
                              objective           = 'binary:logistic', 
                              validate_parameters = True, 
                              verbosity           = 1) 

logger.info('Model was setup')
model

[INFO] Model was setup


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, gamma=None,
              gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=10, n_jobs=-1,
              num_parallel_tree=None, predictor=None, random_state=None,
              reg_alpha=None, reg_lambda=None, ...)

In [13]:
# import xgboost



# # Convert dataset to special XGBoost optimised data structure
# dtrain_matrix = xgboost.DMatrix(trainX, label = trainY)
# # dcomp_matrix  = xgb.DMatrix(cd_enc)


# # List of parameters
# params = {
#     'booster': 'gbtree',
#     'objective': 'reg:squarederror',
#     'learning_rate': 0.3,
#     'n_jobs': -1,
# }

# # Fit the model
# xgb_cv = xgboost.cv(params                = params,
#                     dtrain                = dtrain_matrix,
#                     num_boost_round       = num_boost_round,
#                     nfold                 = nfold, 
#                     show_stdv             = False,
#                     metrics               = ['auc', 'aucpr'], 
#                     as_pandas             = True,
#                     stratified            = True,
#                     seed                  = seed,
#                     early_stopping_rounds = early_stopping_rounds, 
# )



## Parameters

In [14]:
# XGBoost
search_space = [ 
                 skopt.space.Real(0.01, 0.1,  name='learning_rate'),
                 skopt.space.Integer(3, 15,   name='max_depth'),
                 #
                 skopt.space.Real(1, 9,       name='gamma'),
                 skopt.space.Integer(40, 180, name='reg_alpha'),
                 skopt.space.Real(0, 1,       name='reg_lambda'),
                 #
                 skopt.space.Integer(2, 10,   name='min_child_weight'),
                 skopt.space.Integer(2, 5,    name='max_leaves'),
                 #
                 skopt.space.Categorical(categories = ['gbtree', 'dart'], name = "booster")
]

In [15]:
HPO_params = {
              'n_calls':         n_calls,
              'n_random_starts': n_random_starts,
              'base_estimator':  'ET',
              'acq_func':        'EI',
             }

## Hyperparameter optimization process

In [16]:
evaluator.select_model( model )

In [17]:
@skopt.utils.use_named_args( search_space )
def objective( **params ):
    return  evaluator.evaluate_params( params )

In [18]:
%%time
results = skopt.forest_minimize(objective, search_space, **HPO_params)

[0]	validation_0-auc:0.70497
[1]	validation_0-auc:0.70797
[2]	validation_0-auc:0.71720
[3]	validation_0-auc:0.72041
[4]	validation_0-auc:0.72707
[5]	validation_0-auc:0.73654
[6]	validation_0-auc:0.73951
[7]	validation_0-auc:0.74619
[8]	validation_0-auc:0.74816
[9]	validation_0-auc:0.74942


[INFO] Iteration   1 with Accuracy = 76.117%% AUC =  0.763 GM =  0.529


[0]	validation_0-auc:0.68881
[1]	validation_0-auc:0.69394
[2]	validation_0-auc:0.70241
[3]	validation_0-auc:0.70606
[4]	validation_0-auc:0.71273
[5]	validation_0-auc:0.71773
[6]	validation_0-auc:0.72036
[7]	validation_0-auc:0.72080
[8]	validation_0-auc:0.72318
[9]	validation_0-auc:0.73020


[INFO] Iteration   2 with Accuracy = 75.256%% AUC =  0.751 GM =  0.499


[0]	validation_0-auc:0.73792
[1]	validation_0-auc:0.74479
[2]	validation_0-auc:0.74971
[3]	validation_0-auc:0.75563
[4]	validation_0-auc:0.75660
[5]	validation_0-auc:0.76463
[6]	validation_0-auc:0.76522
[7]	validation_0-auc:0.76656
[8]	validation_0-auc:0.76919
[9]	validation_0-auc:0.77058


[INFO] Iteration   3 with Accuracy = 76.868%% AUC =  0.771 GM =  0.554


[0]	validation_0-auc:0.70383
[1]	validation_0-auc:0.70481
[2]	validation_0-auc:0.70899
[3]	validation_0-auc:0.71086
[4]	validation_0-auc:0.71119
[5]	validation_0-auc:0.71292
[6]	validation_0-auc:0.71592
[7]	validation_0-auc:0.71657
[8]	validation_0-auc:0.71708
[9]	validation_0-auc:0.71826


[INFO] Iteration   4 with Accuracy = 75.874%% AUC =  0.743 GM =  0.545


[0]	validation_0-auc:0.70546
[1]	validation_0-auc:0.70841
[2]	validation_0-auc:0.70923
[3]	validation_0-auc:0.71091
[4]	validation_0-auc:0.71144
[5]	validation_0-auc:0.71317
[6]	validation_0-auc:0.71335
[7]	validation_0-auc:0.71449
[8]	validation_0-auc:0.71452
[9]	validation_0-auc:0.71534


[INFO] Iteration   5 with Accuracy = 75.789%% AUC =  0.728 GM =  0.573


[0]	validation_0-auc:0.69332
[1]	validation_0-auc:0.69372
[2]	validation_0-auc:0.69692
[3]	validation_0-auc:0.70155
[4]	validation_0-auc:0.70489
[5]	validation_0-auc:0.70612
[6]	validation_0-auc:0.70757
[7]	validation_0-auc:0.70855
[8]	validation_0-auc:0.70980
[9]	validation_0-auc:0.71016


[INFO] Iteration   6 with Accuracy = 74.577%% AUC =  0.715 GM =  0.523


[0]	validation_0-auc:0.69559
[1]	validation_0-auc:0.70346
[2]	validation_0-auc:0.70508
[3]	validation_0-auc:0.71487
[4]	validation_0-auc:0.72016
[5]	validation_0-auc:0.72135
[6]	validation_0-auc:0.72461
[7]	validation_0-auc:0.72645
[8]	validation_0-auc:0.73308
[9]	validation_0-auc:0.73721


[INFO] Iteration   7 with Accuracy = 75.141%% AUC =  0.753 GM =  0.492


[0]	validation_0-auc:0.70609
[1]	validation_0-auc:0.71013
[2]	validation_0-auc:0.71367
[3]	validation_0-auc:0.71526
[4]	validation_0-auc:0.72187
[5]	validation_0-auc:0.72586
[6]	validation_0-auc:0.73152
[7]	validation_0-auc:0.73417
[8]	validation_0-auc:0.74000
[9]	validation_0-auc:0.74078


[INFO] Iteration   8 with Accuracy = 75.941%% AUC =  0.764 GM =  0.519


[0]	validation_0-auc:0.74420
[1]	validation_0-auc:0.74915
[2]	validation_0-auc:0.75385
[3]	validation_0-auc:0.76163
[4]	validation_0-auc:0.76240
[5]	validation_0-auc:0.76485
[6]	validation_0-auc:0.76493
[7]	validation_0-auc:0.77232
[8]	validation_0-auc:0.77445
[9]	validation_0-auc:0.77563


[INFO] Iteration   9 with Accuracy = 78.032%% AUC =  0.781 GM =  0.594


[0]	validation_0-auc:0.71535
[1]	validation_0-auc:0.72958
[2]	validation_0-auc:0.73003
[3]	validation_0-auc:0.73104
[4]	validation_0-auc:0.73575
[5]	validation_0-auc:0.73761
[6]	validation_0-auc:0.74603
[7]	validation_0-auc:0.74789
[8]	validation_0-auc:0.75055
[9]	validation_0-auc:0.75440


[INFO] Iteration  10 with Accuracy = 76.983%% AUC =  0.767 GM =  0.565


CPU times: user 2min 10s, sys: 5.48 s, total: 2min 16s
Wall time: 42.2 s


## Get optimized hyperparameters

In [19]:
def to_named_params(results, search_space):
    params       = results.x
    param_dict   = {}
    
    params_list  =[(dimension.name, param) for dimension, param in zip(search_space, params)]
    
    for item in params_list:
        param_dict[item[0]] = item[1]
    
    return( param_dict )

In [20]:
best_params = to_named_params(results, search_space)


print('[INFO] Optimized hyperparameters\n')
for (parameter,value) in best_params.items():
    if ( isinstance(value, float) ):
        print(' >%25s: %.3f' % (parameter,value))
    else:
        print(' >%25s: %s' % (parameter,value))

[INFO] Optimized hyperparameters

 >            learning_rate: 0.056
 >                max_depth: 10
 >                    gamma: 2.793
 >                reg_alpha: 51
 >               reg_lambda: 0.549
 >         min_child_weight: 2
 >                  booster: dart


# Optimized (best) model setup

In [21]:
# Define model
#
model.set_params( **best_params )
logger.info('Optimized-Model was loaded')

# Train model
#
model.fit(trainX, trainY,
          eval_metric = 'auc',           
          eval_set = [ (validX, validY) ],
          early_stopping_rounds = 10);

logger.info('Optimized-Model trained')

[INFO] Optimized-Model was loaded


[0]	validation_0-auc:0.74420
[1]	validation_0-auc:0.74915
[2]	validation_0-auc:0.75385
[3]	validation_0-auc:0.76163
[4]	validation_0-auc:0.76240
[5]	validation_0-auc:0.76485
[6]	validation_0-auc:0.76493
[7]	validation_0-auc:0.77232
[8]	validation_0-auc:0.77445
[9]	validation_0-auc:0.77563


[INFO] Optimized-Model trained


In [23]:
# Get predictions
#
pred = model.predict( testX )

# Calculate Confusion Matrix (CM)
#
CM  = metrics.confusion_matrix(testY, pred)
#
#
logger.info( 30*"-" )
logger.info( "*** Evaluation ***")
logger.info( "> Accuracy:  %.2f%%" % metrics.accuracy_score( pred, testY ) )
logger.info( "> AUC:       %.3f"   % metrics.roc_auc_score(pred, testY) )
logger.info( "> Recall:    %.3f"   % metrics.recall_score(testY, pred) )
logger.info( "> Precision: %.3f"   % metrics.precision_score(testY, pred) )
logger.info( "> GM:        %.3f\n" % (math.sqrt( np.diag( CM ).prod() ) / math.sqrt( CM[0, :].sum() * CM[1, :].sum() )) )


CM

[INFO] ------------------------------
[INFO] *** Evaluation ***
[INFO] > Accuracy:  0.78%
[INFO] > AUC:       0.775
[INFO] > Recall:    0.954
[INFO] > Precision: 0.780
[INFO] > GM:        0.587



array([[ 980, 1736],
       [ 294, 6158]])

In [24]:
# ModelID = '0a7e804a25b74a18b560817b8d871e48'
# logged_model = 'runs:/{}/models'.format( ModelID )
# loaded_model = mlflow.pyfunc.load_model(logged_model)

# # Get predictions
# #
# pred = loaded_model.predict( testX )

# # Calculate Confusion Matrix (CM)
# #
# CM  = metrics.confusion_matrix(testY, pred)
# #
# #
# logger.info( 30*"-" )
# logger.info( "*** Evaluation ***")
# logger.info( "> Accuracy:  %.2f%%" % metrics.accuracy_score( pred, testY ) )
# logger.info( "> AUC:       %.3f"   % metrics.roc_auc_score(pred, testY) )
# logger.info( "> Recall:    %.3f"   % metrics.recall_score(testY, pred) )
# logger.info( "> Precision: %.3f"   % metrics.precision_score(testY, pred) )
# logger.info( "> GM:        %.3f\n" % (math.sqrt( np.diag( CM ).prod() ) / math.sqrt( CM[0, :].sum() * CM[1, :].sum() )) )


# CM