In [None]:
%%writefile etl.py
import pandas as pd
import os

if __name__ == '__main__':
    # ETL step: combine the train split with the test split (plus its
    # separately stored labels), apply readable column names, replace the
    # numeric codes in columns 0 and 4 with their labels, and write one CSV.
    input_path = '/opt/ml/processing/input'
    output_path = '/opt/ml/processing/output'

    def _load(file_name):
        """Read one CSV file from the processing input directory."""
        return pd.read_csv(os.path.join(input_path, file_name))

    # Final column names, in positional order, for the combined frame.
    col_names = [
        'zip_agg_customer_subtype',
        'zip_agg_number_of_houses',
        'zip_agg_avg_size_household',
        'zip_agg_avg_age',
        'zip_agg_customer_main_type',
        'zip_agg_roman_catholic',
        'zip_agg_protestant',
        'zip_agg_other_religion',
        'zip_agg_no_religion',
        'zip_agg_married',
        'zip_agg_living_together',
        'zip_agg_other_relation',
        'zip_agg_singles',
        'zip_agg_household_without_children',
        'zip_agg_household_with_children',
        'zip_agg_high_level_education',
        'zip_agg_medium_level_education',
        'zip_agg_lower_level_education',
        'zip_agg_high_status',
        'zip_agg_entrepreneur',
        'zip_agg_farmer',
        'zip_agg_middle_management',
        'zip_agg_skilled_labourers',
        'zip_agg_unskilled_labourers',
        'zip_agg_social_class_a',
        'zip_agg_social_class_b1',
        'zip_agg_social_class_b2',
        'zip_agg_social_class_c',
        'zip_agg_social_class_d',
        'zip_agg_rented_house',
        'zip_agg_home_owners',
        'zip_agg_1_car',
        'zip_agg_2_cars',
        'zip_agg_no_car',
        'zip_agg_national_health_service',
        'zip_agg_private_health_insurance',
        'zip_agg_income_<_30.000',
        'zip_agg_income_30-45.000',
        'zip_agg_income_45-75.000',
        'zip_agg_income_75-122.000',
        'zip_agg_income_>123.000',
        'zip_agg_average_income',
        'zip_agg_purchasing_power_class',
        'contri_private_third_party_ins',
        'contri_third_party_ins_(firms)',
        'contri_third_party_ins_(agriculture)',
        'contri_car_policies',
        'contri_delivery_van_policies',
        'contri_motorcycle/scooter_policies',
        'contri_lorry_policies',
        'contri_trailer_policies',
        'contri_tractor_policies',
        'contri_agricultural_machines_policies',
        'contri_moped_policies',
        'contri_life_ins',
        'contri_private_accident_ins_policies',
        'contri_family_accidents_ins_policies',
        'contri_disability_ins_policies',
        'contri_fire_policies',
        'contri_surfboard_policies',
        'contri_boat_policies',
        'contri_bicycle_policies',
        'contri_property_ins_policies',
        'contri_ss_ins_policies',
        'nbr_private_third_party_ins',
        'nbr_third_party_ins_(firms)',
        'nbr_third_party_ins_(agriculture)',
        'nbr_car_policies',
        'nbr_delivery_van_policies',
        'nbr_motorcycle/scooter_policies',
        'nbr_lorry_policies',
        'nbr_trailer_policies',
        'nbr_tractor_policies',
        'nbr_agricultural_machines_policies',
        'nbr_moped_policies',
        'nbr_life_ins',
        'nbr_private_accident_ins_policies',
        'nbr_family_accidents_ins_policies',
        'nbr_disability_ins_policies',
        'nbr_fire_policies',
        'nbr_surfboard_policies',
        'nbr_boat_policies',
        'nbr_bicycle_policies',
        'nbr_property_ins_policies',
        'nbr_ss_ins_policies',
        'nbr_mobile_home_policies',
    ]

    train = _load('train.csv')
    test = _load('test.csv')
    ground_truth = _load('gt.csv')
    columns = _load('col_info.csv')

    # col_info.csv is carved up by fixed row positions of its first column:
    # a feature table, the "L0" value table and the "L2" value table. Each
    # table's cells are obtained by whitespace-splitting the text lines.
    info_lines = columns.iloc[:, 0]

    feat_info = info_lines.iloc[1:87].str.split(n=2, expand=True)
    feat_info.columns = info_lines.iloc[0].split(maxsplit=2)

    l0_table = info_lines.iloc[89:130].str.split(n=1, expand=True)
    l0_table.columns = info_lines.iloc[88].split()

    l2_table = info_lines.iloc[138:148].str.split(n=1, expand=True)
    l2_table.columns = ['Value', 'Bin']

    data_dict = {'feat_info': feat_info, 'L0': l0_table, 'L2': l2_table}

    # Both splits need identical column labels so the row-wise concat aligns;
    # the test split first gets its label column appended on the right.
    original_names = feat_info['Name'].to_list()
    test_df = pd.concat([test, ground_truth], axis=1)
    test_df.columns = original_names
    train.columns = original_names

    df = pd.concat([train, test_df], ignore_index=True)
    df.columns = col_names

    # Build {numeric code: label} lookups from the two value tables.
    l0_table['Value'] = pd.to_numeric(l0_table['Value'])
    l0_dict = l0_table.set_index('Value')['Label'].to_dict()
    l2_table['Value'] = pd.to_numeric(l2_table['Value'])
    l2_dict = l2_table.set_index('Value')['Bin'].to_dict()

    # Column 0 is decoded with the L0 lookup, column 4 with the L2 lookup.
    for position, lookup in ((0, l0_dict), (4, l2_dict)):
        target = df.columns[position]
        df[target] = df[target].replace(lookup)

    df.to_csv(os.path.join(output_path, 'full_data.csv'), index=False)

In [None]:
%%writefile preprocess.py
import subprocess
import sys

def install(package):
    """Install `package` into the running interpreter's environment via pip.

    Args:
        package: requirement specifier handed verbatim to ``pip install``.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    # "-q" has to follow the pip sub-command. Placed before "-m" it is parsed
    # by the Python interpreter (where it only hides the startup banner) and
    # pip's own output is not quietened.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])
def upgrade(package):
    """Install or upgrade `package` in the running interpreter's environment.

    Args:
        package: requirement specifier handed verbatim to
            ``pip install --upgrade``.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    # "-q" has to follow the pip sub-command. Placed before "-m" it is parsed
    # by the Python interpreter (where it only hides the startup banner) and
    # pip's own output is not quietened.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package, '--upgrade'])
    
# Runtime dependency bootstrap: runs at import time, before the third-party
# imports below, so those imports pick up the freshly installed packages.
# NOTE(review): numpy and pyarrow are upgraded without a version pin while
# pandas is pinned to 1.3.5 — confirm the resulting combination is compatible.
for _requirement in ('pandas==1.3.5', 'numpy', 'pyarrow'):
    upgrade(_requirement)
install('category_encoders')

import numpy as np
import pandas as pd
import sys
import os

from sklearn.base import BaseEstimator, TransformerMixin
import category_encoders as ce
from sklearn.pipeline import Pipeline

# `session` is never defined or imported in this script, so referencing it
# directly raises NameError as soon as the file is executed. Look it up
# defensively: when a `session` object is present in the module namespace its
# default bucket is used exactly as before, otherwise `bucket` is None.
_session = globals().get('session')
bucket = _session.default_bucket() if _session is not None else None
prefix = '1_ins_dataset'
    
class OneHotTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode selected DataFrame columns with category_encoders.

    The selected columns are removed from the frame and replaced by the
    indicator columns produced by ``ce.OneHotEncoder``; all other columns are
    passed through unchanged. Selected columns that contain only NaN at fit
    time are dropped without being encoded.

    Args:
        feature_names: iterable of column labels to encode.

    Attributes (set by ``fit``/``transform``):
        encoded_cols_: the subset of ``feature_names`` that is actually encoded.
        encoder_: the fitted ``ce.OneHotEncoder``.
        feature_names_out_: column labels of the most recent transform output.
    """

    def __init__(self, feature_names):
        # Stored under the constructor argument's own name because
        # BaseEstimator.get_params()/clone() look parameters up by that name.
        self.feature_names = feature_names

    def fit(self, ori_df, y=None):
        """Fit the encoder on the selected columns of `ori_df`.

        Fitting once here (rather than on every transform call) keeps the
        indicator columns identical across every frame that is transformed
        afterwards.

        Returns:
            self
        """
        selected = ori_df[list(self.feature_names)]
        # Columns that are entirely NaN have no categories to encode.
        self.encoded_cols_ = selected.dropna(axis=1, how='all').columns.tolist()
        self.encoder_ = ce.OneHotEncoder(cols=self.encoded_cols_,
                                         use_cat_names=True,
                                         handle_missing='return_nan')
        self.encoder_.fit(selected[self.encoded_cols_])
        return self

    def transform(self, ori_df, y=None):
        """Return `ori_df` with the selected columns replaced by indicators.

        Raises:
            ValueError: from ``astype(int)`` if the encoder emits NaN (it is
                configured with ``handle_missing='return_nan'``).
        """
        print('Running OneHotTransformer')
        if not hasattr(self, 'encoder_'):
            # fit() used to be a no-op, so callers could transform without
            # fitting; keep that working by fitting on first use.
            self.fit(ori_df)
        ce_one_hot = pd.DataFrame(self.encoder_.transform(ori_df[self.encoded_cols_]),
                                  index=ori_df.index)
        ce_one_hot = ce_one_hot.astype(int)
        df = ori_df.drop(list(self.feature_names), axis=1).merge(
            ce_one_hot, left_index=True, right_index=True, how='outer')
        # Remembered so get_feature_names_out() can report it.
        self.feature_names_out_ = df.columns.tolist()
        return df

    def get_feature_names_out(self, input_features=None):
        """Return the column labels of the most recent transform output.

        Args:
            input_features: accepted and ignored; present for compatibility
                with the scikit-learn method signature.

        Raises:
            AttributeError: if ``transform`` has not been called yet.
        """
        return list(self.feature_names_out_)
    
# Single-step pipeline that one-hot encodes the columns named by the keys of
# `cat_cols`.
# NOTE(review): `cat_cols` is not defined or imported anywhere in this script,
# so this statement raises NameError as written. It is presumably a mapping
# keyed by categorical column label — define it in this file before this line.
preprocessor = Pipeline(steps=[('onehot', OneHotTransformer(cat_cols.keys()))])


if __name__ == '__main__':
    # Processing entry point: read the raw table, run it through the
    # preprocessing pipeline and store the encoded frame as JSON.
    input_path = '/opt/ml/processing/input'
    output_path = '/opt/ml/processing/output'

    # exist_ok=True creates each directory independently. The earlier bare
    # try/except skipped 'encoder' whenever 'data' already existed and hid
    # every other failure (e.g. permissions).
    for subdir in ('data', 'encoder'):
        os.makedirs(os.path.join(output_path, subdir), exist_ok=True)

    print('Reading data')
    df = pd.read_table(os.path.join(input_path, 'ticdata2000.txt'), header=None)
    print('Preprocessing data')
    processed_df = pd.DataFrame(preprocessor.fit_transform(df))
    print('Saving dataframe')
    # Persist the preprocessed frame; writing `df` here would discard the
    # encoding that was just computed.
    processed_df.to_json(os.path.join(output_path, 'data', 'train_data.json'))
    # TODO: nothing is written to the 'encoder' directory yet; dumping the
    # fitted `preprocessor` there (e.g. with joblib) is still disabled:
#     print('Saving joblib')
#     joblib.dump(preprocessor, os.path.join(output_path, 'encoder', 'preprocess.joblib'))    

In [None]:
%%writefile evaluate.py
import json
import logging
import pathlib
import pickle
import tarfile
import numpy as np
import pandas as pd
import xgboost

from sklearn.metrics import(
    accuracy_score,
    precision_score,
    recall_score,
    confusion_matrix,
    roc_curve)

# Root logger: attach a stream handler and let INFO and above through.
logger = logging.getLogger()
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)

if __name__ == "__main__":
    # Evaluation entry point: unpack the trained model, score the test set and
    # write a JSON report with accuracy, precision, recall, the confusion
    # matrix and the ROC curve points.
    model_path = "/opt/ml/processing/model/model.tar.gz"
    # Extract into the same directory the model file is opened from, so the
    # load does not depend on the working directory being the filesystem root
    # (the only case where ".." and "." coincide).
    model_dir = pathlib.Path(".")
    with tarfile.open(model_path) as tar:
        tar.extractall(path=model_dir)

    # Debug-level messages in this script are dropped while the root logger
    # stays at INFO (configured at module level).
    logger.debug("Loading xgboost model")
    # NOTE: pickle.load executes code embedded in the file; only load model
    # artefacts that come from a trusted training job.
    with open(model_dir / "xgboost-model", "rb") as model_file:
        model = pickle.load(model_file)

    logger.debug("Loading test input data")
    test_path = "/opt/ml/processing/test/test_feats.csv"
    # NOTE(review): read_csv treats the first row as a header — confirm that
    # test_feats.csv is written with one, otherwise a sample is lost.
    df = pd.read_csv(test_path)

    logger.debug("Reading test data")
    # First column holds the label; the remaining columns are the features.
    y_test = df.iloc[:, 0].to_numpy()
    features_df = df.drop(df.columns[0], axis=1)
    X_test = xgboost.DMatrix(features_df.values)

    logger.info("Performing predictions against test data")
    prediction_probabilities = model.predict(X_test)
    # Hard 0/1 labels obtained by rounding the predicted scores.
    predictions = np.round(prediction_probabilities)

    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions)
    conf_matrix = confusion_matrix(y_test, predictions)
    # The ROC curve is computed from the continuous scores, not the labels.
    fpr, tpr, _ = roc_curve(y_test, prediction_probabilities)

    logger.debug("Accuracy: {}".format(accuracy))
    logger.debug("Precision: {}".format(precision))
    logger.debug("Recall: {}".format(recall))
    logger.debug("Confusion matrix: {}".format(conf_matrix))

    # NOTE(review): the key "receiver_operating_charastic_curve" looks like a
    # misspelling of "...characteristic..."; left unchanged because consumers
    # of the report are not visible here — confirm before renaming.
    report_dict = {
        "binary_classification_metrics": {
            "accuracy": {"value": accuracy, "standard_deviation": "NaN"},
            "precision": {"value": precision, "standard_deviation": "NaN"},
            "recall": {"value": recall, "standard_deviation": "NaN"},
            "confusion_matrix": {
                "0": {"0": int(conf_matrix[0][0]), "1": int(conf_matrix[0][1])},
                "1": {"0": int(conf_matrix[1][0]), "1": int(conf_matrix[1][1])},
            },
            "receiver_operating_charastic_curve": {
                "false_positive_rates": fpr.tolist(),
                "true_positive_rates": tpr.tolist(),
            },
        }
    }

    print(report_dict)
    output_dir = "/opt/ml/processing/evaluation"
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    evaluation_path = f'{output_dir}/evaluation.json'
    with open(evaluation_path, "w") as f:
        json.dump(report_dict, f)

In [None]:
%%writefile evaluate_extd.py
import json
import logging
import pathlib
import pickle
import tarfile
import numpy as np
import pandas as pd
import xgboost

from sklearn.metrics import(
    accuracy_score,
#     precision_score,
#     recall_score,
    confusion_matrix,
    roc_curve,
#     mine
    auc, 
    precision_recall_curve, 
    precision_score,
    average_precision_score,
    roc_auc_score,
    log_loss,
    f1_score,
    recall_score,
    roc_curve,
    make_scorer,
    balanced_accuracy_score,
    cohen_kappa_score,
    matthews_corrcoef,
    fbeta_score)

# Root logger: attach a stream handler and let INFO and above through.
logger = logging.getLogger()
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)

if __name__ == "__main__":
    # Extended evaluation entry point: unpack the trained model, score the
    # test set and write a JSON report with threshold-based metrics (computed
    # from rounded predictions) and ranking metrics (computed from the raw
    # predicted scores).
    model_path = "/opt/ml/processing/model/model.tar.gz"
    # Extract into the same directory the model file is opened from, so the
    # load does not depend on the working directory being the filesystem root
    # (the only case where ".." and "." coincide).
    model_dir = pathlib.Path(".")
    with tarfile.open(model_path) as tar:
        tar.extractall(path=model_dir)

    # Debug-level messages in this script are dropped while the root logger
    # stays at INFO (configured at module level).
    logger.debug("Loading xgboost model")
    # NOTE: pickle.load executes code embedded in the file; only load model
    # artefacts that come from a trusted training job.
    with open(model_dir / "xgboost-model", "rb") as model_file:
        model = pickle.load(model_file)

    logger.debug("Loading test input data")
    test_path = "/opt/ml/processing/test/test_feats.csv"
    # NOTE(review): read_csv treats the first row as a header — confirm that
    # test_feats.csv is written with one, otherwise a sample is lost.
    df = pd.read_csv(test_path)

    logger.debug("Reading test data")
    # First column holds the label; the remaining columns are the features.
    y_test = df.iloc[:, 0].to_numpy()
    features_df = df.drop(df.columns[0], axis=1)
    X_test = xgboost.DMatrix(features_df.values)

    logger.info("Performing predictions against test data")
    prediction_probabilities = model.predict(X_test)
    # Hard 0/1 labels obtained by rounding the predicted scores.
    predictions = np.round(prediction_probabilities)

    # --- Metrics defined on hard class labels -------------------------------
    accuracy = accuracy_score(y_test, predictions)
    conf_matrix = confusion_matrix(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    fbeta = fbeta_score(y_test, predictions, beta=0.5)
    informedness = balanced_accuracy_score(y_test, predictions, adjusted=True)
    cohen_kappa = cohen_kappa_score(y_test, predictions)
    matthews_coef = matthews_corrcoef(y_test, predictions)

    # --- Metrics defined on continuous scores -------------------------------
    # These sweep over decision thresholds, so they must be given the raw
    # scores; feeding them the already-rounded labels collapses every curve to
    # a single operating point and distorts the AUC / average precision.
    roc = roc_curve(y_test, prediction_probabilities)
    fpr, tpr, _ = roc
    auc_score = auc(fpr, tpr)
    roc_auc = roc_auc_score(y_test, prediction_probabilities)
    pr_curve = precision_recall_curve(y_test, prediction_probabilities)
    avg_precision = average_precision_score(y_test, prediction_probabilities)

    logger.debug("Accuracy: {}".format(accuracy))
    logger.debug("Confusion matrix: {}".format(conf_matrix))

    logger.debug("AUC: {}".format(auc_score))
    logger.debug("Precision Recall Curve: {}".format(pr_curve))
    logger.debug("Precision: {}".format(precision))
    logger.debug("Average Percision: {}".format(avg_precision))
    logger.debug("ROC AUC: {}".format(roc_auc))
    logger.debug("F1: {}".format(f1))
    logger.debug("Recall: {}".format(recall))
    logger.debug("ROC: {}".format(roc))
    logger.debug("informedness: {}".format(informedness))
    logger.debug("Cohen Kappa: {}".format(cohen_kappa))
    logger.debug("Mathews Correlation Coefficient: {}".format(matthews_coef))
    logger.debug("Fbeta: {}".format(fbeta))

    # NOTE(review): the keys "avg_percision", "mathews_coef" and
    # "receiver_operating_charastic_curve" look misspelled; left unchanged
    # because consumers of the report are not visible here — confirm before
    # renaming.
    report_dict = {
        "binary_classification_metrics": {
            "auc": {"value": auc_score, "standard_deviation": "NaN"},
            "precision": {"value": precision, "standard_deviation": "NaN"},
            "avg_percision": {"value": avg_precision, "standard_deviation": "NaN"},
            "roc_auc": {"value": roc_auc, "standard_deviation": "NaN"},
            "f1": {"value": f1, "standard_deviation": "NaN"},
            "recall": {"value": recall, "standard_deviation": "NaN"},
            "informedness": {"value": informedness, "standard_deviation": "NaN"},
            "cohen_kappa": {"value": cohen_kappa, "standard_deviation": "NaN"},
            "mathews_coef": {"value": matthews_coef, "standard_deviation": "NaN"},
            "fbeta": {"value": fbeta, "standard_deviation": "NaN"},
            "accuracy": {"value": accuracy, "standard_deviation": "NaN"},
            "confusion_matrix": {
                "0": {"0": int(conf_matrix[0][0]), "1": int(conf_matrix[0][1])},
                "1": {"0": int(conf_matrix[1][0]), "1": int(conf_matrix[1][1])},
            },
            "receiver_operating_charastic_curve": {
                "false_positive_rates": fpr.tolist(),
                "true_positive_rates": tpr.tolist(),
            },
        }
    }

    print(report_dict)
    output_dir = "/opt/ml/processing/evaluation"
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    evaluation_path = f'{output_dir}/evaluation.json'
    with open(evaluation_path, "w") as f:
        json.dump(report_dict, f)