In [None]:
import pandas as pd
import numpy as np

import boto3
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker import get_execution_role
from sagemaker.session import s3_input, Session, TrainingInput
from sagemaker.debugger import Rule, rule_configs

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score

from functools import partial
from skopt import gp_minimize, space
from xgboost import XGBClassifier

In [None]:
# SageMaker execution role for this notebook and the bucket holding the CSVs
role = get_execution_role()
bucket = "insurancedatatest"

# read the three input files from the S3 bucket into DataFrames in one pass
TRAINING_FILE, TESTING_FILE, SAMPLING_FILE = (
    pd.read_csv(f"s3://{bucket}/{key}")
    for key in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
def clean_colname(df):
    """Normalise the column labels of ``df`` in place and return them.

    Every label is lower-cased, and each space, dash and dot in it is
    replaced by an underscore. The frame's ``columns`` attribute is
    overwritten with the new labels, and that column index is returned.
    """
    # one translation table instead of three chained replace() calls
    separators_to_underscore = str.maketrans(" -.", "___")
    df.columns = [
        label.lower().translate(separators_to_underscore) for label in df.columns
    ]
    return df.columns


class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in a single call.

    Each selected column is replaced by the integer codes produced by a fresh
    ``LabelEncoder``. The encoders are created inside ``transform`` and are not
    stored, so every call to ``transform`` learns its own mapping from the
    data it is given.

    Args:
        columns: iterable of column names to encode, or ``None`` to encode
            every column of the frame passed to ``transform``.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        """No-op kept so the class can be chained scikit-learn style; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        ``X`` itself is not modified.
        """
        output = X.copy()
        # Snapshot the labels up front. This replaces DataFrame.iteritems(),
        # which was removed in pandas 2.0, and avoids iterating the frame
        # while assigning into it.
        target_columns = output.columns if self.columns is None else self.columns
        for col in list(target_columns):
            output[col] = LabelEncoder().fit_transform(output[col])
        return output

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y).transform(X)``."""
        return self.fit(X, y).transform(X)


if __name__ == "__main__":
    # Index both frames by "id" and attach the sample-submission columns to the
    # test rows, then stack train and test. The indexed frames get their own
    # names so the raw frames loaded earlier are not overwritten; re-running
    # this cell therefore no longer fails with a KeyError on "id".
    test_indexed = TESTING_FILE.set_index("id").join(SAMPLING_FILE.set_index("id"))
    train_indexed = TRAINING_FILE.set_index("id")
    df = pd.concat([train_indexed, test_indexed])

    # clean_colname rewrites df.columns in place and returns the new labels
    df.columns = clean_colname(df)

    # initiate the label-encoder class and fit_transform the categorical columns
    df = MultiColumnLabelEncoder(
        columns=["gender", "vehicle_age", "vehicle_damage"]
    ).fit_transform(df)

    # only publish the cleaned data when no cell in the frame is null
    if not df.isnull().values.any():
        s3 = boto3.client('s3')
        print("Data is Clean, No Null Values Found")
        # index=False: the "id" index is not written to the CSV
        df.to_csv('clean_data.csv', index=False)
        with open('clean_data.csv', 'rb') as f:
            s3.upload_fileobj(f, bucket, 'clean_data.csv')
    else:
        print("Found Null Values")


In [None]:
"""
Dataset doesn't have equal distribution of targets values
use Stratified Cross-Validation for Imbalanced Classification, 
maintains the same class distribution in each subset
"""
df = pd.read_csv(f"s3://{bucket}/{'clean_data.csv'}") 
                 
targets = df["response"]
print(
    f"Imbalanced Classification, value for Response [1]: {targets.value_counts()[1] / targets.value_counts()[0]*100:0.2f}%"
)
features = df.drop("response", axis=1).values

kfold = StratifiedKFold(n_splits=5, shuffle=True)

# enumerate the splits
for train_ix, test_ix in kfold.split(features, targets):
    x_train, x_test = features[train_ix], features[test_ix]
    y_train, y_test = targets[train_ix], targets[test_ix]

    # makes sure its even for all k-folds
    train_0, train_1 = len(y_train[y_train == 0]), len(y_train[y_train == 1])
    test_0, test_1 = len(y_test[y_test == 0]), len(y_test[y_test == 1])
    # authenticate if all folds have target values evenly distributed 
    print(f"Train: 0={train_0}, 1={train_1}, Test: 0={test_0}, 1={test_1}")

In [None]:
def optimize(params, param_names, x, y):
    """Objective function for the hyper-parameter search.

    Builds an XGBClassifier from the proposed values, evaluates it with
    5-fold stratified cross-validation and returns the negated mean ROC AUC,
    so that a minimiser ends up maximising AUC.

    Args:
        params [list]: hyper-parameter values, in the same order as param_names
        param_names [list]: keyword names passed to XGBClassifier
        x [array]: feature values, indexable by arrays of row positions
        y [array]: binary target values aligned with x, indexable the same way
    Returns:
        [float]: negative mean ROC AUC over the 5 folds
    """
    # pair each proposed value with its keyword name
    params = dict(zip(param_names, params))

    # initiate XGBClassifier and K-fold (5)
    model = XGBClassifier(objective='binary:logistic', **params)
    kf = StratifiedKFold(n_splits=5)

    # collect one AUC per fold
    fold_auc = []
    for train_idx, test_idx in kf.split(X=x, y=y):
        model.fit(x[train_idx], y[train_idx])

        # ROC AUC ranks samples by score, so feed it the positive-class
        # probability rather than the thresholded 0/1 predictions
        positive_proba = model.predict_proba(x[test_idx])[:, 1]
        fold_auc.append(roc_auc_score(y[test_idx], positive_proba))

    # return the negative mean so minimising it maximises AUC
    return -np.mean(fold_auc)

# pull the target vector and the feature matrix out of df as numpy arrays
targets = df["response"].values
features = df.drop("response", axis=1).values

# define the range of input values the Bayesian optimisation may sample from
param_space = [
    space.Integer(4, 24, name="max_depth"),
    space.Integer(1, 9, name="gamma"),
    space.Integer(20, 150, name="reg_alpha"),
    space.Real(0.01, 1, prior="uniform", name="reg_lambda"),
    space.Integer(1, 10, name="min_child_weight"),
    space.Real(0.05, 0.30, prior="uniform", name="eta"),
    space.Real(0.5, 1, prior="uniform", name="colsample_bytree"),
    space.Real(0.6, 0.95, prior="uniform", name="base_score"),
]

# derive the names from the space itself so the order of names and
# dimensions can never drift apart
param_names = [dimension.name for dimension in param_space]

# define the loss function to minimize (the score returned is negative)
optimization_function = partial(
    optimize, param_names=param_names, x=features, y=targets
)

# initiate gp_minimize for Bayesian Optimization to select the best input values.
# The first n_initial_points evaluations are random; only the remaining
# n_calls - n_initial_points are proposed by the Gaussian-process surrogate,
# so the two must differ for any model-guided search to happen.
result = gp_minimize(
    optimization_function,
    dimensions=param_space,
    n_calls=10,
    n_initial_points=5,
    random_state=42,  # reproducible sequence of sampled points
    verbose=True,
)
print(dict(zip(param_names, result.x)))

In [None]:
# S3 inputs for the training job. TrainingInput is the SageMaker SDK v2
# replacement for the deprecated s3_input; content_type declares the data as CSV.
# NOTE(review): from here on TRAINING_FILE / TESTING_FILE are S3 input
# descriptors, not the DataFrames loaded at the top of the notebook.
TRAINING_FILE = TrainingInput(f"s3://{bucket}/clean_data.csv", content_type="text/csv")
TESTING_FILE = TrainingInput(f"s3://{bucket}/test.csv", content_type="text/csv")
region = 'us-east-1'

# built-in XGBoost container image for the chosen region and version
container = sagemaker.image_uris.retrieve(
    framework='xgboost', region=region, version='1.2-1'
)

xgb_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    volume_size=5,  # SDK v2 name for the former train_volume_size (GB)
    sagemaker_session=sagemaker.Session(),
    # attach the Debugger rule that produces the XGBoost training report
    rules=[Rule.sagemaker(rule_configs.create_xgboost_report())]
)

In [None]:
# Hyper-parameters for the built-in XGBoost training job.
# NOTE(review): the numeric values look like the output of an earlier tuning
# run — confirm they are still the ones intended.
xgb_model.set_hyperparameters(
        max_depth = 7,
        eta = 0.24528128897339257,
        gamma = 8.815330570605372,
        min_child_weight = 1,
        subsample = 0.7,
        objective = 'binary:logistic',
        alpha = 150,
        base_score = 0.5074025941621864,
        colsample_bytree = 0.7805503460343888,
        # num_round is a required hyper-parameter of the built-in algorithm;
        # 100 is a starting value — TODO tune
        num_round = 100
)

# The built-in XGBoost algorithm reads its data from channels named "train"
# and (optionally) "validation".
# NOTE(review): for CSV input the algorithm expects no header row and the
# label in the first column — confirm both files are written in that layout.
xgb_model.fit({'train': TRAINING_FILE, 'validation': TESTING_FILE})