# XGB Hyperparameter tuning

## Load modules

In [184]:
# %load_ext autoreload
# %autoreload 2

# Load modules
import os
import sys
sys.path.insert(1, os.path.abspath(os.path.join(os.getcwd(), '..')))

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

import multiprocessing
import random
import time

import argparse 
import numpy as np
import pandas as pd
import pickle
import joblib

# Load custom functions
from utils import ds_general as ds
from utils.BigQuery import BigQuery

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
def load_blob_list(config, train_or_test='train', verbose = 1):    
    """
    Return the list of GCS URIs of the train or test data blobs.

    Args:
        config: configuration mapping. Reads 'BUCKET', 'TRAIN_DATA_BLOB' or
            'TEST_DATA_BLOB', and from 'MODEL_RUN_PARAMS' either
            'MAX_N_FILES' (train) or 'TOTAL_FILES_TO_TEST' (test).
        train_or_test: 'train' or 'test' (case-insensitive).
        verbose: currently unused; kept so existing callers keep working.

    Returns:
        A list of "gs://<bucket>/<blob>" strings, truncated to the configured
        maximum when that value is truthy. None if the selector is invalid or
        listing fails and ds.terminate_prog returns control.
    """
    split = train_or_test.lower()
    if split == 'train':
        blob = config['TRAIN_DATA_BLOB']
        max_files = config['MODEL_RUN_PARAMS']['MAX_N_FILES']
    elif split == 'test':
        blob = config['TEST_DATA_BLOB']
        max_files = config['MODEL_RUN_PARAMS']['TOTAL_FILES_TO_TEST']
    else:
        # There is no caught exception in this branch, so build one to report
        # (the original referenced an undefined name here).
        msg = "select 'train' or 'test' for data blob"
        ds.terminate_prog(
            msg, error=ValueError(f"{msg}; got {train_or_test!r}"))
        return None

    try:
        bucket = config['BUCKET']
        # `bq` is the module-level BigQuery client created in the entry point.
        blobs_list = [
            f"gs://{bucket}/" + b for b in bq.list_blobs(bucket, f"{blob}")]

        total = len(blobs_list)
        # A falsy max_files (None / 0) means "use every blob". Clamp to the
        # real total so the reported count matches what is returned.
        n_blobs = min(max_files, total) if max_files else total
        print(f"total num of {train_or_test} blobs {total}")
        print(f"num of the selected {train_or_test} blobs {n_blobs}")
        return blobs_list[:n_blobs]

    except Exception as e:
        print(e)
        ds.terminate_prog(
            f"Failed to load {train_or_test} blob list", error=e)
    

In [4]:
# load data from blob
def load_data_from_blob(blob_path):
    """
    Load model data from one or more parquet blobs into a single dataframe.

    Args:
        blob_path: a single path/URI string, or an iterable of them, each
            readable by pandas.read_parquet.

    Returns:
        A dataframe with the rows of every blob stacked in input order and a
        fresh 0..n-1 index. An empty dataframe when no blob is given.
        On failure the error is reported through ds.terminate_prog.
    """
    try:
        if isinstance(blob_path, str):
            blob_path = [blob_path]

        frames = []
        for i, blob in enumerate(blob_path):
            print(f"{i+1}th blob loading......")
            frames.append(pd.read_parquet(blob))

        if not frames:
            # pd.concat raises on an empty list; keep the empty-frame result.
            return pd.DataFrame()

        # Concatenate once instead of growing the frame inside the loop, which
        # re-copied all earlier rows on every iteration. ignore_index gives
        # one consistent row index instead of a partly reset one.
        return pd.concat(frames, axis=0, ignore_index=True)

    except Exception as e:
        msg = "Failed to load the blob."
        print(msg)
        print(e)
        ds.terminate_prog(msg, error=e)
        

In [6]:
def make_train_data(df, config, get_sample_weight = True):
    """
    Turn a raw dataframe into features, labels and optional sample weights.

    Rows whose label (config['Y_LABEL_COL']) is missing are discarded, and any
    column listed in config['COLS_TO_DROP'] that exists in the frame is
    removed before the features are split from the label.

    Args:
        df: raw pandas dataframe containing the label column.
        config: mapping providing 'Y_LABEL_COL', 'COLS_TO_DROP' and, when
            weights are requested, 'WEIGHT_LABEL' and 'POS_SAMPLE_WEIGHT'.
        get_sample_weight: when True, also build the per-row weight vector.

    Returns:
        (X, y, sample_weight_vector)
            X: feature dataframe without the label column
            y: label series cast to int
            sample_weight_vector: numpy array holding POS_SAMPLE_WEIGHT where
                y equals WEIGHT_LABEL and 1 elsewhere, or None if not requested
        An empty tuple is returned if processing fails and
        ds.terminate_prog returns control.
    """
    result = ()
    try:
        label_col = config['Y_LABEL_COL']
        labelled = df.dropna(subset=[label_col], axis=0)

        # Only drop the configured columns that are really present.
        present = [
            name for name in config['COLS_TO_DROP']
            if name in labelled.columns
        ]
        labelled = labelled.drop(present, axis=1)

        y = labelled[label_col].astype(int)
        X = labelled.drop(label_col, axis=1)

        weights = None
        if get_sample_weight:
            weights = np.where(
                y == config['WEIGHT_LABEL'],
                config['POS_SAMPLE_WEIGHT'], 1)
        result = (X, y, weights)

    except Exception as e:
        msg = "Failed to make train data from the dataframe!"
        print(msg)
        print(e)
        ds.terminate_prog(msg, error = e)

    return result

In [23]:
def make_model_id(config, prefix='model_'):
    """
    Build a model identifier from the configured version and a time id.

    The id is config['MODEL_VERSION'] followed by str(ds.name_time_id()).
    A truthy prefix is prepended with an extra "_" separator; a falsy prefix
    is left out entirely.
    """
    stem = config['MODEL_VERSION'] + str(ds.name_time_id())
    if not prefix:
        return stem
    return prefix + "_" + stem
    

In [8]:
def get_config(f_path, verbose=2):
    """
    Load the configuration at f_path through ds.load_config and return it.

    verbose > 1 prints the whole config, verbose == 1 prints only its keys,
    and verbose <= 0 prints nothing.
    """
    config = ds.load_config(f_path)
    if verbose <= 0:
        return config

    print(config if verbose > 1 else f"config keys: {config.keys()}")
    return config

In [13]:
def get_model_data_multi_processing(blobs_list, config, get_sample_weight = True):
    """
    Load blobs in parallel worker processes and convert them to model data.

    The blob list is split into chunks of
    config['MODEL_RUN_PARAMS']['N_FILES_EACH_LOADING'] entries, each chunk is
    loaded by load_data_from_blob in a pool of
    config['MODEL_RUN_PARAMS']['N_CPUS'] processes, and the resulting frames
    are merged and passed to make_train_data.

    Args:
        blobs_list: a blob path or a list of blob paths.
        config: configuration mapping (see above and make_train_data).
        get_sample_weight: forwarded to make_train_data.

    Return: outputs of make_train_data()
        On failure the error is reported through ds.terminate_prog.
    """
    if not isinstance(blobs_list, list):
        blobs_list = [blobs_list]
    # Used only for messages; guarded so an empty list cannot raise here or
    # inside the exception handler.
    first_blob = blobs_list[0] if blobs_list else None
    try: 
        print(f"Data loading total {len(blobs_list)} blobs starting {first_blob}...")

        run_params = config['MODEL_RUN_PARAMS']
        splited_loading_list = ds.split_list_n_size(
                blobs_list, 
                run_params['N_FILES_EACH_LOADING'])

        pool = multiprocessing.Pool(run_params['N_CPUS'])
        try:
            pool_results = pool.map(
                func=load_data_from_blob, 
                iterable=splited_loading_list)
        finally:
            # Release the worker processes even when a worker raised.
            pool.close()
            pool.join()

        data_df = merge_df_list(pool_results)

        # Honor the caller's choice instead of always building weights.
        return make_train_data(
            df=data_df,
            config=config,
            get_sample_weight=get_sample_weight)

    except Exception as e:
        msg = f"Failed in geting model data by multi processing starting {first_blob}."
        print(e)
        ds.terminate_prog(msg=msg, error=e)
    

In [9]:
def merge_df_list(df_list):
    """
    Stack a list of dataframes row-wise into one dataframe.

    Args:
        df_list: list of pandas dataframes.

    Returns:
        A single dataframe with the rows in list order and a fresh 0..n-1
        index, or an empty dataframe for an empty list.

    Raises:
        TypeError: if df_list is not a list.
    """
    if not isinstance(df_list, list):
        raise TypeError("ERROR: should be a lsit of df to merge!!!")
    print(f"merging {len(df_list)} dataframes started.....")
    try:
        if not df_list:
            # pd.concat raises on an empty list; keep the empty-frame result.
            return pd.DataFrame()
        # One concat avoids re-copying the accumulated rows for every input
        # frame; ignore_index yields a unique, contiguous row index.
        return pd.concat(df_list, axis=0, ignore_index=True)
    except Exception as e:
        msg = "failed to merge dataframes"
        print(msg)
        print(e)
        ds.terminate_prog(msg=msg, error=e)

In [28]:
## Load best parameters
def load_best_params(bucket_name=None, blob_name=None, 
                     param_fname = None,
                     param_dict = None,
                     verbose = 1):
    """
    Fetch hyper-parameters from the first available source.

    Sources are tried in this order:
        1. blob_name   - downloaded from bucket_name to a fixed temp file via
                         the module-level `bq` client, then unpickled
        2. param_fname - unpickled from the local file
        3. param_dict  - returned as given
    If none is truthy, the source is reported as "NO SOURCE".

    Returns:
        The loaded parameters, or None when there is no source or loading
        failed and ds.terminate_prog returned control.
    """
    def _unpickle(path):
        # NOTE(security): pickle can execute arbitrary code while loading;
        # only use parameter files from trusted locations.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    params, source = None, None
    try:
        if blob_name:
            param_fname = './tmp/temp_best_params.pkl'
            bq.download_file_from_blob(
                bucket_name, blob_name, param_fname, verbose)
            params, source = _unpickle(param_fname), "BLOB"
        elif param_fname:
            params, source = _unpickle(param_fname), "FILE"
        elif param_dict:
            params, source = param_dict, "CONFIG/MANUAL"
        else:
            source = "NO SOURCE"

    except Exception as e:
        msg = "Failed to load the best_params"
        print(e)
        ds.terminate_prog(msg=msg, error=e)

    print(f"Hyper Parameters Loaded from {source}.....")
    if verbose > 0:
        print(params)

    return params

In [11]:
def print_metrcs_inline(eval_y, y_pred, loss, best_iter):
    """
    Print a one-line summary of prediction quality and the loss history.

    Args:
        eval_y: true labels.
        y_pred: predicted labels, scored against eval_y with the
            scikit-learn metric functions using their default settings.
        loss: sequence of per-iteration loss values; its length is reported
            as the number of epochs.
        best_iter: best iteration number to report.
    """
    # Scores computed from the predictions.
    acc = accuracy_score(eval_y, y_pred)
    bal_acc = balanced_accuracy_score(eval_y, y_pred)
    f1_pos = f1_score(eval_y, y_pred)
    recall_pos = recall_score(eval_y, y_pred)
    auc = roc_auc_score(eval_y, y_pred)

    # Summary of the loss curve.
    lowest_loss = min(loss)
    final_loss = loss[-1]
    n_epochs = len(loss)

    fields = [
        "| Accu: %.2f%%" % (acc * 100.0),
        "| Bal Accu: %.2f%%" % (bal_acc * 100.0),
        "| ROC AUC: %.3f" % auc,
        "| Recall Pos: %.3f" % recall_pos,
        "| F1 Pos: %.3f" % f1_pos,
        "| Epochs: %.0i" % n_epochs,
        "| Best Iter Num: %.0i" % best_iter,
        "| Min loss: %.3f" % lowest_loss,
        "| Last loss: %.3f" % final_loss,
    ]
    # print joins its arguments with single spaces, giving one line.
    print(*fields)

In [196]:
def get_rand_search_cv(config):
    """
    Build an unfitted RandomizedSearchCV over an XGBClassifier.

    The search space is config['TUNE_PARAMS'] (passed through
    load_best_params). The evaluation metric, scoring, number of iterations
    and CV setting come from config['MODEL_RUN_PARAMS'], and verbosity from
    config['VERBOSE']. Both the classifier seed and the search random state
    are fixed to 123.
    """
    search_space = load_best_params(param_dict=config['TUNE_PARAMS'])

    run_params = config['MODEL_RUN_PARAMS']
    base_model = XGBClassifier(
        eval_metric=run_params['EVAL_METRIC'],
        seed=123)

    search_kwargs = {
        'estimator': base_model,
        'param_distributions': search_space,
        'scoring': run_params['CV_MODEL_SCORE'],
        'n_iter': run_params['RSCV_N_ITERS'],
        'cv': run_params['RSCV_CV'],
        'verbose': config['VERBOSE'],
        'random_state': 123,
    }
    return RandomizedSearchCV(**search_kwargs)

In [14]:
def _save_and_upload(obj, file_name, label, config):
    """Pickle obj to file_name and upload it to the artifact blob if enabled."""
    blob_name = config['ARTIFACT_BLOB'] + '/' + file_name

    with open(file_name, 'wb') as file:
        pickle.dump(obj, file)

    if config['UPLOAD_MODEL']:
        print(f"Uploading the best {label}, {file_name} to GCS") 
        # `bq` is the module-level BigQuery client created in the entry point.
        bq.upload_file_to_blob(file_name, config['BUCKET'], blob_name)
        print(f"uploaded the best {label} to {blob_name}")
    else:
        print(f"best {label} NOT uploaded to BLOB. Set True in config to upload!")


def main(config_file_name):
    """
    Run randomized hyper-parameter search and persist the best result.

    Steps:
        1. Load the configuration and the train/test blob lists.
        2. Load the test data once.
        3. For each batch of train blobs, load the data and fit a fresh
           RandomizedSearchCV, with train and test data passed as eval_set.
        4. Report metrics of the best estimator on the test data.
        5. Pickle the best parameters and best model, optionally upload them
           to the artifact blob, and optionally delete the local files.

    NOTE: a new search object is created for every batch, so the reported and
    saved results come from the last batch only.

    Args:
        config_file_name: path of the configuration file.
    """
    logger = ds.get_logger(level='INFO')

    ds.make_folder('tmp')

    main_time = ds.time_start_end(msg="MAIN")
    # load configuration from a yaml file
    config = get_config(config_file_name)

    # load list of blobs
    # actual train data will be loaded in each batch process
    train_blobs = load_blob_list(config, train_or_test='train')

    # load test blobs and data
    test_data_time = ds.time_start_end(msg="test_data_load")
    test_blobs_list = load_blob_list(config, train_or_test='test')
    test_X, test_y, _ = get_model_data_multi_processing(
            test_blobs_list, config, get_sample_weight=False)

    ds.df_info(test_X, label="test data feature dataframe")
    ds.time_start_end(started=test_data_time, msg="test_data_load")

    #############################
    # Tune in batch processing, in serial:
    # one search per batch of train blobs
    #############################
    split_batch_blob_list = ds.split_list_n_size(
        train_blobs, 
        config['MODEL_RUN_PARAMS']['N_FILES_IN_BATCH'])
    total_batch = len(split_batch_blob_list)

    # Stays None when there is no batch, so that case can be reported below
    # instead of failing with an unbound name.
    tuning_search = None

    # Iterate each batch
    for n_batch, batch_blobs in enumerate(split_batch_blob_list, start=1):
        batch_idx = f"Training Batch {n_batch}/{total_batch}"
        print(f"**** Starting Batch Training: {batch_idx} ****")
        # load train data from the blobs (multi processing)
        batch_time = ds.time_start_end(
            msg=f"{batch_idx} train data loading from blobs")
        # NOTE(review): the sample weights are built but not passed to fit.
        X, y, sample_weight_vector = get_model_data_multi_processing(
            batch_blobs, config, get_sample_weight=True)

        ds.df_info(X, label=f"{batch_idx} feature dataframe")
        ds.time_start_end(started=batch_time, msg=f"{batch_idx} train data loading from blobs")

        # tune on the data
        batch_train_time = ds.time_start_end(msg=f"{batch_idx} Tuning")
        eval_set = [(X, y), (test_X, test_y)]
        try: 
            tuning_search = get_rand_search_cv(config)
            tuning_search.fit(
                X, y,
                eval_set=eval_set,
                verbose=config['EVAL_VERBOSE'])

        except Exception as e:
            msg = "Failed in tuning"
            print(msg, e)
            ds.terminate_prog(msg=msg, error=e)

        ds.time_start_end(started=batch_train_time, msg=f"{batch_idx} Tuning")
        ds.time_start_end(started=batch_time, msg=f"{batch_idx}")

    if tuning_search is None:
        msg = "No training batches to tune on"
        print(msg)
        ds.terminate_prog(msg=msg, error=ValueError(msg))
        return

    logger.info("Hyper-parameters tunning DONE")
    logger.info("Best parameters:" + str(tuning_search.best_params_))

    ds.time_start_end(started=main_time, msg="MAIN")

    ########## results
    best_model = tuning_search.best_estimator_
    best_params = tuning_search.best_params_

    # Evaluation history of the best estimator; 'validation_1' is the second
    # eval_set entry, i.e. the test data.
    test_history = best_model.evals_result()['validation_1']
    # Prefer 'logloss' as before, but fall back to the last recorded metric
    # when a different eval metric is configured.
    metric_name = 'logloss' if 'logloss' in test_history \
        else list(test_history)[-1]
    loss_values = test_history[metric_name]
    best_iter = best_model.best_iteration
    y_pred = best_model.predict(test_X)
    print_metrcs_inline(test_y, y_pred, loss_values, best_iter)

    # Save best parameters and best model, and upload them to blob
    model_id = make_model_id(config, prefix=config["MODEL_ID_PREFIX"])    

    params_file_name = 'best_params_' + model_id +'.pkl'
    _save_and_upload(best_params, params_file_name, 'parameters', config)

    model_file_name = 'best_model_' + model_id +'.pkl'
    _save_and_upload(best_model, model_file_name, 'model', config)

    ## Remove the local artifact files
    if config['REMOVE_LOCAL_TEMP_MODEL']:
        os.remove(params_file_name)
        print("Removed local best parameter file")
        os.remove(model_file_name)
        print("Removed local best model file")
    
    
    
    
    

In [None]:
if __name__ == "__main__":
    # Read the config file path from the command line.
    cli = argparse.ArgumentParser()
    cli.add_argument("config_file_name", help="config file name")
    config_path = cli.parse_args().config_file_name

    # Module-level client; the functions above read it as the global `bq`.
    bq = BigQuery()

    main(config_path)
