In [1]:
import os
import sys
sys.path.insert(1, os.path.abspath(os.path.join(os.getcwd(), '..')))

import argparse 
import pickle
import joblib

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

import multiprocessing
import random
import time

import numpy as np
import pandas as pd

from utils import ds_general as ds
from utils.BigQuery import BigQuery

In [3]:
def load_blob_list(config, train_or_test='train', verbose = 1):
    """
    Return the list of `gs://` paths of the train or test data blobs.

    Parameters:
        config: mapping that provides 'BUCKET' plus, for the train split,
            'TRAIN_DATA_BLOB' / 'MAX_N_FILES' and, for the test split,
            'TEST_DATA_BLOB' / 'TOTAL_FILES_TO_TEST'.
        train_or_test: 'train' or 'test' (case-insensitive).
        verbose: currently unused; kept for interface compatibility.

    Return:
        list of "gs://<bucket>/<blob>" strings, truncated to the configured
        maximum number of files (a falsy maximum means "all blobs").

    Raises:
        ValueError: when `train_or_test` is neither 'train' nor 'test' and
            `ds.terminate_prog` returned instead of stopping the program.

    Relies on the module-level `bq` client for listing blobs.
    """
    split = train_or_test.lower()
    if split == 'train':
        blob = config['TRAIN_DATA_BLOB']
        max_files = config['MAX_N_FILES']
    elif split == 'test':
        blob = config['TEST_DATA_BLOB']
        max_files = config['TOTAL_FILES_TO_TEST']
    else:
        # There is no caught exception in this branch, so build one to report.
        err = ValueError(f"unknown data split: {train_or_test!r}")
        ds.terminate_prog("select 'train' or 'test' for data blob", error = err)
        # Only reached if terminate_prog returns; `blob` would be unbound below.
        raise err

    try:
        blobs_list = bq.list_blobs(config['BUCKET'], f"{blob}")
        blobs_list = [f"gs://{config['BUCKET']}/" + b for b in blobs_list]

        # Cap the reported count at what is actually available so the log
        # matches the length of the returned slice.
        n_blobs = min(max_files, len(blobs_list)) if max_files \
            else len(blobs_list)
        print(f"total num of {train_or_test} blobs {len(blobs_list)}")
        print(f"num of the selected {train_or_test} blobs {n_blobs}")
        return blobs_list[:n_blobs]

    except Exception as e:
        print(e)
        ds.terminate_prog(f"Failed to load {train_or_test} blob list", error = e)
    

In [4]:
# load data from blob
def load_data_from_blob(blob_path):
    """
    Load model data from one or more parquet blobs.

    Parameters:
        blob_path: a single path string, or an iterable of path strings,
            each readable by `pd.read_parquet`.

    Return:
        One data frame holding the rows of every blob in the given order,
        with a fresh 0..n-1 index. An empty iterable gives an empty frame.

    On any failure the error is printed and `ds.terminate_prog` is called.
    """
    try:
        if isinstance(blob_path, str):
            blob_path = [blob_path]

        # Collect the frames and concatenate once: concatenating inside the
        # loop re-copies all previously loaded rows on every iteration.
        frames = []
        for i, blob in enumerate(blob_path):
            print(f"{i+1}th blob loading......")
            frames.append(pd.read_parquet(blob))

        if not frames:
            # pd.concat raises on an empty list; keep returning an empty frame.
            return pd.DataFrame()

        return pd.concat(frames, axis=0, ignore_index=True)

    except Exception as e:
        msg = "Failed to load the blob."
        print(msg)
        print(e)
        ds.terminate_prog(msg, error=e)
        

In [5]:
def load_model_from_blob(
    bucket, blob, 
    model_fname = "./tmp/temp_pretrained_xgb_model",
    file_type = 'xgb',
    verbose = 1):
    
    """
    Download a previously saved model from a blob and load it.

    Parameters:
        bucket: bucket name handed to `bq.download_file_from_blob`.
        blob: blob name of the saved model; a falsy value means "no
            pre-trained model" and None is returned without any download.
        model_fname: local path the blob is downloaded to.
        file_type:
            'xgb', load xgb model saved on blob by xgb_model.save().
            'joblib', load a model saved by joblib
            'pickle',  load a model saved by pickle
        verbose: forwarded to `bq.download_file_from_blob`.

    Return:
        The loaded model, or None when no blob is given, the file_type is
        unknown, or loading fails (training then continues without a
        pre-trained model).

    Relies on the module-level `bq` client.
    NOTE: joblib / pickle loading executes code embedded in the file, so the
    blob must come from a trusted location.
    """
    import logging

    # main() creates its logger as a local variable, so a module-level
    # `logger` is not guaranteed to exist; use it when present, otherwise
    # fall back to a standard-library logger.
    log = globals().get("logger") or logging.getLogger(__name__)

    if not blob:
        print("No pretrained model is provided. Continue with a blank model.")
        return None
    
    try:
        bq.download_file_from_blob(
            bucket,
            blob,
            model_fname,
            verbose
        )
        if file_type == 'xgb':
            model = xgb.XGBClassifier()
            # load_model() fills the estimator in place and returns None,
            # so its result must not be assigned back to `model`.
            model.load_model(model_fname)
        elif file_type == 'joblib':
            model = joblib.load(model_fname)
        elif file_type == 'pickle':
            with open(model_fname, 'rb') as f:
                model = pickle.load(f)

        else:
            log.critical("file_type: wrong file_type. Should be either 'xgb', 'joblib', or 'pickle'")
            return None

        log.info("Succefully loaded a pre-trained model. ")

        return model

    except Exception as e:
        msg = f"Failed to load pretrained xgb model from {blob} \nContinue without a pre-trained model"
        print(msg)
        print(e)
        log.critical(e)
        return None

In [None]:
def feature_eng(df, config):
    """
    Light feature engineering applied before training.

    Steps, in order:
        1. drop every row that contains a null value;
        2. drop rows whose `event_id` is listed in config['DROP_EVENTS'];
        3. label-encode each column in config['LABEL_ENCODER_COLS'];
        4. cast the columns in config['CATEGORICAL_COLS'] to "category".
    Steps 2-4 are skipped when the matching config value is falsy.

    Parameters:
        df: input data frame; it is not modified.
        config: mapping with the three keys named above.

    Return:
        a new data frame with the transformations applied.
    """
    print("*"*5, "feature engineering...")
    
    # drop any rows with Null data
    print('drop rows with any NA....')
    print(f'df before drop na: {df.shape}')
    df = df.dropna()
    print(f'df afer drop na: {df.shape}')
    
    # drop a certain events
    if config['DROP_EVENTS']:
        exclude_events = config['DROP_EVENTS']
        
        print(f"removing events: {exclude_events}")
        print(f'df before removing events: {df.shape}')
        df = df[~df.event_id.isin(exclude_events)]
        print(f'df after revmoved events: {df.shape}')

    label_cols = config['LABEL_ENCODER_COLS']
    categorical_cols = config['CATEGORICAL_COLS']

    if label_cols or categorical_cols:
        # The row filters above can leave `df` as a slice of another frame;
        # take an explicit copy before assigning columns so the writes are
        # unambiguous and do not trigger chained-assignment warnings.
        df = df.copy()
        
    if label_cols:
        # NOTE(review): a new encoder is fitted on whatever frame is passed
        # in, so the integer codes depend on the values present in this call.
        # Confirm that codes stay comparable between separately processed
        # train batches and the test data.
        for col in label_cols:
            label_encoder = LabelEncoder()
            df[col] = label_encoder.fit_transform(df[col])
        print(f"Encorded string feature to numeric int features: {config['LABEL_ENCODER_COLS']}")
        
    if categorical_cols:
        df[categorical_cols] = df[categorical_cols].astype("category")
        print(f"Converted numeric features to categorical features: {config['CATEGORICAL_COLS']}")
    
    print("*"*5, "feature engineering DONE")
    
    return df
    

In [6]:
def make_train_data(df, config, feature_modification, get_sample_weight = False):
    """
    Turn a raw data frame into the (X, y, sample_weight) triple used for training.

    Parameters:
        df: raw data frame.
        config: mapping providing 'COLS_TO_DROP', 'Y_LABEL_COL' and, when
            weights are requested, 'LABEL_MULTI_WEIGHT'.
        feature_modification: when truthy, run feature_eng() on the frame first.
        get_sample_weight: when truthy, build per-row weights by mapping the
            integer-cast `label_multi` column through config['LABEL_MULTI_WEIGHT'].

    Return: (X, y, sample_weight_vector)
        X: feature data frame without the label column
        y: label column cast to int (pandas Series)
        sample_weight_vector: pandas Series of weights, or None

    On failure the error is printed, ds.terminate_prog is called and the
    empty tuple is returned if execution continues.
    """
    result = ()
    try:
        # optional light feature engineering
        if feature_modification:
            df = feature_eng(df, config)

        # Weights are taken before the column drop below, which may remove
        # the `label_multi` column.
        sample_weight_vector = None
        if get_sample_weight:
            print('Applying multi label weights....')
            print(config['LABEL_MULTI_WEIGHT'])
            sample_weight_vector = df.label_multi.astype(int).map(config['LABEL_MULTI_WEIGHT'])

        # Only drop the configured columns that are actually present.
        cols_to_drop = [name for name in config['COLS_TO_DROP'] if name in df.columns]
        print(f"dropping columns before training: {cols_to_drop}")
        df = df.drop(cols_to_drop, axis = 1)

        labels = df[config['Y_LABEL_COL']].astype(int)
        features = df.drop(config['Y_LABEL_COL'], axis=1)

        result = (features, labels, sample_weight_vector)

    except Exception as e:
        msg = "Failed to make train data from the dataframe!"
        print(msg)
        print(e)
        ds.terminate_prog(msg, error = e)

    return result

In [23]:
def make_model_id(config, prefix='model_'):
    """
    Build a model id from config['MODEL_VERSION'] and ds.name_time_id().

    When `prefix` is truthy the id is "<prefix>_<version><time id>";
    otherwise it is just "<version><time id>".
    """
    versioned_stamp = config['MODEL_VERSION'] + str(ds.name_time_id())
    if not prefix:
        return versioned_stamp
    return prefix + "_" + versioned_stamp
    

In [8]:
def get_config(f_path, verbose=2):
    """
    Load the configuration at `f_path` through ds.load_config and return it.

    verbose > 1 prints the whole config, verbose == 1 prints only its keys,
    and verbose <= 0 prints nothing.
    """
    loaded = ds.load_config(f_path)

    if verbose > 0:
        print(loaded if verbose > 1 else f"config keys: {loaded.keys()}")

    return loaded

In [9]:
def merge_df_list(df_list):
    """
    Concatenate a list of data frames row-wise into one data frame.

    Parameters:
        df_list: list of pandas data frames.

    Return:
        the concatenated data frame; each input keeps its own index labels.

    Raises:
        TypeError: when `df_list` is not a list.

    A failure during concatenation is printed and reported through
    ds.terminate_prog together with the caught exception.
    """
    if not isinstance(df_list, list):
        raise TypeError("ERROR: should be a lsit of df to merge!!!")
    print(f"merging {len(df_list)} dataframes started.....")
    try:
        return pd.concat(df_list)
    except Exception as e:
        msg = "failed to merge dataframes"
        print(msg)
        print(e)
        # Forward the caught exception, as the other helpers in this file do.
        ds.terminate_prog(msg=msg, error=e)

In [28]:
## Load best parameters
def load_best_params(bucket_name=None, blob_name=None,
                     param_fname = None,
                     param_dict = None,
                     verbose = 1):
    """
    Fetch hyper-parameters from the first available source.

    Sources are tried in this order:
        1. `blob_name`: downloaded with the module-level `bq` client to
           './tmp/temp_best_params.pkl' and unpickled;
        2. `param_fname`: a local pickle file;
        3. `param_dict`: returned as given;
        4. nothing supplied: None.

    The chosen source is printed, followed by the parameters themselves when
    verbose > 0. A loading failure is printed and handed to ds.terminate_prog.

    NOTE: unpickling executes code stored in the file; only load parameter
    files from trusted locations.
    """
    def _unpickle(path):
        # Read one pickled object from `path`.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    params = source = None
    try:
        if blob_name:
            local_copy = './tmp/temp_best_params.pkl'
            bq.download_file_from_blob(bucket_name, blob_name, local_copy, verbose)
            params = _unpickle(local_copy)
            source = "BLOB"
        elif param_fname:
            params = _unpickle(param_fname)
            source = "FILE"
        elif param_dict:
            params, source = param_dict, "CONFIG/MANUAL"
        else:
            source = "NO SOURCE"

    except Exception as e:
        msg = "Failed to load the best_params"
        print(e)
        ds.terminate_prog(msg=msg, error=e)

    print(f"Hyper Parameters Loaded from {source}.....")
    if verbose > 0:
        print(params)

    return params

In [11]:
def print_metrcs_inline(eval_y, y_pred, loss, best_iter):
    """
    Print a one-line summary of evaluation metrics and the loss history.

    Parameters:
        eval_y: true labels.
        y_pred: predicted labels; every metric, including ROC AUC, is
            computed from these predictions as passed in.
        loss: sequence of per-iteration loss values (must not be empty).
        best_iter: best iteration number to report.
    """
    # classification metrics
    acc = accuracy_score(eval_y, y_pred)
    bal_acc = balanced_accuracy_score(eval_y, y_pred)
    f1_pos = f1_score(eval_y, y_pred)
    recall_pos = recall_score(eval_y, y_pred)
    auc = roc_auc_score(eval_y, y_pred)

    # loss history summary
    lowest_loss = min(loss)
    final_loss = loss[-1]
    n_epochs = len(loss)

    # (format template, value) pairs, printed space-separated on one line
    fields = (
        ("|Accu: %.2f%%", acc * 100.0),
        ("|BalAccu: %.2f%%", bal_acc * 100.0),
        ("|ROCAUC: %.3f", auc),
        ("|RecallPos: %.3f", recall_pos),
        ("|F1Pos: %.3f", f1_pos),
        ("|Epochs: %.0i", n_epochs),
        ("|BestIterNum: %.0i", best_iter),
        ("|MinLoss: %.3f", lowest_loss),
        ("|LastLoss: %.3f", final_loss),
    )
    print(*[template % value for template, value in fields])

In [12]:
# xgb model training

def xgb_train(
    train_X, train_y, params,
    eval_set_list=None,
    sample_weight_vector = None,
    trained_xbg_model = None,
    verbose = 1
):
    """
    Fit an XGBClassifier on one batch of data and return the fitted model.

    Parameters:
        train_X, train_y: training features and labels.
        params: hyper-parameter dictionary passed to XGBClassifier, or None
            to use the library defaults. A 'seed' entry overrides the
            default seed of 123.
        eval_set_list: optional list of (X, y) pairs forwarded to fit() as
            `eval_set`; the sets are evaluated during training, they are not
            cross-validation folds.
        sample_weight_vector: optional per-row weights for the training data.
        trained_xbg_model: optional previously trained model forwarded to
            fit() as `xgb_model` so training continues from it.
        verbose: forwarded to fit().

    Return:
        the fitted XGBClassifier.
    """
    # Start from the default seed and let caller-supplied parameters win;
    # this also makes params=None valid and avoids a duplicate-keyword
    # error when params already carries a seed.
    model_kwargs = {"seed": 123}
    model_kwargs.update(params or {})

    model = XGBClassifier(**model_kwargs)
    
    model.fit(
        train_X, train_y,
        eval_set=eval_set_list,
        sample_weight = sample_weight_vector,
        xgb_model=trained_xbg_model,
        verbose = verbose
    )
    
    return model

In [13]:
def get_model_data_multi_processing(blobs_list, config, feature_modification, get_sample_weight = False):
    """
    Load the given blobs in parallel and convert them into model data.

    The blob list is split into chunks of config['N_FILES_EACH_LOADING'],
    each chunk is loaded by load_data_from_blob in a pool of
    config['N_CPUS'] worker processes, the resulting frames are merged and
    passed to make_train_data.

    Parameters:
        blobs_list: list of blob paths to load.
        config: configuration mapping (see the keys above and those read by
            make_train_data).
        feature_modification, get_sample_weight: forwarded to make_train_data.

    Return: outputs of make_train_data()

    Any failure is printed and reported through ds.terminate_prog.
    """
    # Resolved up front so the error message can be built even when the
    # list is empty.
    first_blob = blobs_list[0] if blobs_list else None
    try: 
        print(f"Data loading total {len(blobs_list)} blobs starting {first_blob}...")
        
        splited_loading_list = ds.split_list_n_size(
                blobs_list, 
                config['N_FILES_EACH_LOADING'])

        # The context manager releases the worker processes even when a
        # worker raises; map() has returned every result before it exits.
        with multiprocessing.Pool(config['N_CPUS']) as pool:
            pool_results = pool.map(
                func=load_data_from_blob, 
                iterable=splited_loading_list)
        
        data_df = merge_df_list(pool_results)

        results = make_train_data(
            df=data_df,
            config=config,
            get_sample_weight=get_sample_weight, 
            feature_modification=feature_modification)
        
        return results
        
    except Exception as e:
        msg = f"Failed in geting model data by multi processing starting {first_blob}."
        print(e)
        ds.terminate_prog(msg=msg, error=e)
    

In [14]:
def main(config_file_name):
    """
    Train an XGBoost classifier batch by batch and save the final model.

    Workflow:
        1. load the config file and derive a model id;
        2. optionally load a previous model (config['PREVIOUS_MODEL_BLOB'])
           and the hyper-parameters, then apply config['OVERRIDE_PARAMS'];
        3. load the complete test data once;
        4. split the train blobs into batches of config['N_FILES_IN_BATCH']
           and train on them one after another, each batch continuing from
           the model produced by the previous one;
        5. pickle the last model locally and, depending on the config,
           upload it to the bucket and remove the local file.

    Parameters:
        config_file_name: path of the config file read by get_config().

    Relies on the module-level `bq` client for blob access.
    """
    # Publish the logger at module level: load_model_from_blob refers to a
    # module-level `logger`, which a plain local assignment would not provide.
    global logger
    logger = ds.get_logger(level='INFO')
    
    ds.make_folder('tmp')
    
    main_time = ds.time_start_end(msg="MAIN")
    # load configuration from a yaml file
    config = get_config(config_file_name)
    
    # model id and temp local model file name
    # The model is saved in an XGBoost internal format 
    # which is universal among the various XGBoost interfaces. 
    # Auxiliary attributes of the Python Booster object (such as feature_names) 
    # will not be saved when using binary format. 
    # To save those attributes, use JSON/UBJ instead.
    model_id = make_model_id(config, prefix=config["MODEL_ID_PREFIX"])
    temp_model_fname = f'./tmp/temp_training_model_save_{model_id}.json'
    
    # print note if exist
    if config['NOTE']:
        print("#"*40)
        print(f"## NOTE: {config['NOTE']}")
        print("#"*40)
    
    # load a base model
    trained_model = load_model_from_blob(
        bucket=config['BUCKET'], 
        blob=config['PREVIOUS_MODEL_BLOB'], 
        file_type=config['PREVIOUS_MODEL_TYPE'])
    
    # load parameters
    loaded_params = load_best_params(
        bucket_name=config['BUCKET'],
        blob_name=config['HYPER_PARAMETERS_BLOB'],
        param_dict=config['PARAMS']
    )
    # load_best_params returns None when no source is configured; work on a
    # copy so the overrides below never modify config['PARAMS'] itself.
    params = dict(loaded_params) if loaded_params else {}
    
    overriding_params = config['OVERRIDE_PARAMS']
    
    is_overriding_params = False
    
    if overriding_params:
        for param_name, param_value in overriding_params.items():
            params[param_name] = param_value
            is_overriding_params = True
            
    print(f"overriding parameters: {is_overriding_params}")
    print(f"Final training parameters: {params}")
    
    # load list of blobs
    # actual train test will be loaded in each batch process
    blobs_list = load_blob_list(config, train_or_test='train')
    
    #############################
    # load test blobs and data
    #############################
    test_data_time = ds.time_start_end(msg="test_data_load")
    test_blobs_list = load_blob_list(config, train_or_test='test')
    test_X, test_y, test_sample_weight_vector = get_model_data_multi_processing(
        test_blobs_list, config, 
        get_sample_weight=False, 
        feature_modification = config['FEATURE_MODIFICATION'])
    
    ds.df_info(test_X, label="test data feature dataframe")
    ds.time_start_end(started=test_data_time, msg="test_data_load")
    
    
    #############################
    # Train in batch processing in serial: 
    # continuous training batch by batch
    #############################
    split_batch_blob_list = ds.split_list_n_size(
        blobs_list, 
        config['N_FILES_IN_BATCH'])
    total_batch = len(split_batch_blob_list)

    # Stays None when there is no batch to train on.
    model = None
    
    # Iterate each batch
    for i, batch_blobs in enumerate(split_batch_blob_list):
        n_batch = i + 1
        batch_idx =f"Training Batch {str(n_batch)}/{str(total_batch)}"
        print(f"**** Starting Batch Training: {batch_idx} ****")
        #############################
        # load train data from the blobs 
        #############################
        batch_time = ds.time_start_end(
            msg=f"{batch_idx} train data loading from blobs")
        # multi processing - loading from blobs
        X, y, sample_weight_vector = get_model_data_multi_processing(
            batch_blobs, config, 
            get_sample_weight=config['USE_SAMPLE_WEIGHT'], 
            feature_modification = config['FEATURE_MODIFICATION'])
            
        ############
        ds.df_info(X, label=f"{batch_idx} feature dataframe")
        ds.time_start_end(started=batch_time, msg=f"{batch_idx} train data loading from blobs")
    
        #############################
        # train on the data
        #############################
        batch_train_time = ds.time_start_end(msg=f"{batch_idx} Training")

        # Evaluation sets: the test data first, the training batch second.
        eval_set = ([(test_X, test_y), (X, y)])
        
        try: 
            model = xgb_train(
                train_X = X, 
                train_y = y, 
                eval_set_list = eval_set, 
                params = params,
                sample_weight_vector = sample_weight_vector,
                trained_xbg_model = trained_model,
                verbose = config['N_VALID_OUTPUT'])
            
        except Exception as e:
            msg = "Failed in training"
            print(msg, e)
            ds.terminate_prog(msg=msg, error=e)
        
        #############################
        # Get the evaluation results of the batch
        #############################
        eval_results = model.evals_result()
        # NOTE(review): 'validation_1' is the second entry of eval_set, i.e.
        # the training batch, while the predictions below are made on the
        # test data ('validation_0'). Confirm which loss should be reported.
        # The 'logloss' key has to match the metric used during training.
        loss_values = eval_results['validation_1']['logloss']
        best_iter = model.best_iteration
        y_pred = model.predict(test_X)
        print_metrcs_inline(test_y, y_pred, loss_values, best_iter)
        
        #############################
        # update trained model with the current batch trained model
        #############################
        try:
            # Round-trip through a file so the next batch continues from a
            # fresh estimator that holds the current state.
            trained_model = xgb.XGBClassifier()
            model.save_model(temp_model_fname)
            trained_model.load_model(temp_model_fname)
            print(f"Saved the trained model of the batch: {temp_model_fname}")
            
        except Exception as e:
            msg= f"Failed to save the trained model of {batch_idx}."
            print(msg, e)
            ds.terminate_prog(msg, e)
            
        ds.time_start_end(started=batch_train_time, msg=f"{batch_idx} Training")
        ds.time_start_end(started=batch_time, msg=f"{batch_idx}")
    
    
    ds.time_start_end(started=main_time, msg="MAIN")

    if model is None:
        # No batch was trained, so there is nothing to pickle or upload.
        msg = "No training batch was processed; there is no trained model to save."
        print(msg)
        ds.terminate_prog(msg=msg)
        return
    
    #############################
    # Save the model of the last batch and upload to blob
    #############################
    file_name = 'trained_model_' + model_id +'.pickle'
    blob_name = config['ARTIFACT_BLOB'] + '/' + file_name

    with open(file_name, 'wb') as file:
        pickle.dump(model, file)
    
    if config['UPLOAD_MODEL']:
        print(f"Uploading the trained model, {file_name} to GCS") 
        bq.upload_file_to_blob(file_name, config['BUCKET'], blob_name)
        print(f"uploaded the trained model to {blob_name}")
    else:
        print("NOT uploaded to BLOB. Set True in config to upload!")
              
    ## Remove the local model file
    if config['REMOVE_LOCAL_TEMP_MODEL']:
        os.remove(file_name)
        print("Removed local model file")
    
    
    
    
    

In [None]:
if __name__ == "__main__":
    # Command-line entry point: the only argument is the config file path.
    cli = argparse.ArgumentParser()
    cli.add_argument("config_file_name", help="config file name")
    cli_args = cli.parse_args()

    # Module-level client that the blob helper functions in this file use.
    bq = BigQuery()

    main(config_file_name=cli_args.config_file_name)
