# Import packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import log_loss, roc_auc_score, plot_roc_curve, plot_confusion_matrix, confusion_matrix, make_scorer
from sklearn.metrics import balanced_accuracy_score, dcg_score, roc_auc_score, average_precision_score, roc_curve

import xgboost as xgb

import seaborn as sns

from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline, make_pipeline

#from costcla.metrics import cost_loss

from features import update_dataset_features, text_to_binary, add_extra_features, encode
from eval_metrics import savings

pd.set_option("display.max_columns",500)
plt.style.use('ggplot')

In [None]:
data_path = r"./train.csv"
image_dir = r"./images/"

In [None]:
df = pd.read_csv(data_path, sep=";", encoding="utf-8-sig")

In [None]:
#df["claim_date_occured"] = pd.to_datetime(df["claim_date_occured"], format="%Y%m%d")
#min(df["claim_date_occured"].dt.year - df["policy_holder_year_birth"])

# Exploratory Data Analysis (EDA)

In [None]:
# check imbalance
df["fraud"].replace({"N":0, "Y":1}).sum() / len(df["fraud"]) * 100

In [None]:
pd.crosstab(df["fraud"], df["claim_vehicle_brand"], normalize=True)

In [None]:
# graph for claim_date_occurred
df["claim_date_occured"].plot(kind='hist', logy=True)
plt.ylabel('log(Frequency)')
plt.xlabel('Year')
plt.savefig(image_dir + 'claim_date_occurred.png')

In [None]:
# graph for claim_cause
df["claim_cause"].value_counts().plot(kind='bar')
plt.ylabel("Frequency")
plt.savefig(image_dir + 'claim_cause.png')

In [None]:
postal_bins = [999, 1299, 1499, 1999, 2999, 3499, 3999, 4999, 5999, 6599, 6999, 7999, 8999, 9999]
postal_label = ["brussel", "waals_brabant", "vlaams_brabant", 'Antwerpen', 'vlaams_brabant', 'limburg', 
                'luik', 'namen', 'henegouwen', 'luxemburg', 'henegouwen', 'w-vlaanderen', 'o-vlaanderen']
df["province"] = pd.cut(df["claim_postal_code"], postal_bins, labels=postal_label, ordered=False)

In [None]:
pd.crosstab(df["province"], df["fraud"], normalize=True).plot.bar()
plt.yscale('log')
plt.ylabel('log(Frequency)')
plt.savefig(image_dir + 'province.png')

In [None]:
# Quick univariate overview: for every column show a plot (when possible) and describe().
for column_name in df.columns:
    series = df[column_name]
    print(column_name)
    try:
        # first attempt: histogram of the raw values
        series.plot(kind="hist")
        plt.show()
        print(series.describe())
    except TypeError:
        # histogram not possible for this column: fall back to value counts
        counts = series.value_counts()
        if len(counts) >= 10:
            print('*******too many values to plot*******************')
        else:
            counts.plot(kind="bar")
            plt.show()
        print(series.describe())
    print('*************************************************************************')

In [None]:
corr[['fraud']][:int(np.round(len(corr)/2))]

In [None]:
# Pairwise correlations of the columns of df, reduced to the single "fraud" column.
corr = df.corr()
corr=corr[['fraud']]
# Build the heatmap mask: start from zeros, flag the upper-triangle positions of the
# single-column frame, then additionally flag every entry with |correlation| < 0.1.
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
mask = ((mask + np.array(np.abs(corr) < 0.1)) > 0).astype(int)

with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(30, 30))
    # NOTE(review): this diverging palette is created but never used; the heatmap below is drawn with "Spectral".
    cmap=sns.diverging_palette(220, 20, as_cmap=True)
    ax = sns.heatmap(corr[["fraud"]], mask=mask, square=True, 
                     cmap="Spectral", annot=True)

# Data cleaning

In [None]:
df, ohe = update_dataset_features(df)

In [None]:
df.isna().sum()

# Train test split + prep

In [None]:
# claim_amount is currently dropped since poor performance
# Classification split: features are every column except the first one and claim_amount,
# the target is "fraud". NOTE(review): presumably the first column of df is "fraud" -- confirm.
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,1:].drop(columns=["claim_amount"]), df["fraud"], test_size=.2, random_state=96)
# Regression split: features are selected by name (everything except fraud and claim_amount),
# the target is claim_amount; test_size and random_state are the same as for the split above.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(df.drop(columns=["fraud", "claim_amount"]),
                                                                    df["claim_amount"], test_size=.2, random_state=96)

In [None]:
# Impute the remaining missing values with statistics computed on the classification
# training set only: mode for claim_language / claim_vehicle_type, mean for the others.
# A dedicated "missing" category could be an alternative, although it would be a small one.

# mode
train_lang_mode = X_train["claim_language"].mode()[0]
train_vtype_mode = X_train["claim_vehicle_type"].mode()[0]

# mean
train_premium_mean = X_train["policy_premium_100"].mean()
train_coverage_mean = X_train["policy_coverage_1000"].mean()
train_policy_holder_mean_age = X_train["policy_holder_age"].mean()

# column -> fill value, applied identically to both training frames
train_fill_values = {
    "claim_language": train_lang_mode,
    "claim_vehicle_type": train_vtype_mode,
    "policy_premium_100": train_premium_mean,
    "policy_coverage_1000": train_coverage_mean,
    "policy_holder_age": train_policy_holder_mean_age,
}

# Fill on the frame itself: `frame[col].fillna(..., inplace=True)` works on a column
# selection and is not guaranteed to write back into the frame (chained assignment).
for train_set in [X_train, X_train_reg]:
    train_set.fillna(value=train_fill_values, inplace=True)

In [None]:
# Impute the remaining missing values of the test sets with the mode / mean that were
# computed on the training set (never with statistics of the test data itself).
# A dedicated "missing" category could be an alternative, although it would be a small one.

# column -> fill value (all values originate from X_train)
test_fill_values = {
    # mode
    "claim_language": train_lang_mode,
    "claim_vehicle_type": train_vtype_mode,
    # mean
    "policy_premium_100": train_premium_mean,
    "policy_coverage_1000": train_coverage_mean,
    "policy_holder_age": train_policy_holder_mean_age,
}

# Every column is filled on *each* test frame. Filling on the frame (instead of on a
# column selection with inplace=True) guarantees that the values are written back.
for test_set in [X_test, X_test_reg]:
    test_set.fillna(value=test_fill_values, inplace=True)

# Defining eval metrics

In [None]:
def lift_score(y, y_prob, k=100):
    """Return the lift of the ``k`` highest-scored observations.

    Lift is the number of positives among the ``k`` observations with the highest
    predicted score, divided by the number of positives expected in ``k``
    observations picked at random from the same sample (``k * mean(y)``).

    The baseline is derived from ``y`` itself, so the result does not depend on
    module-level data or on a single random draw (which may contain no positive
    at all and would make the ratio infinite).

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        True binary labels (1 = positive, 0 = negative). Any array-like is
        accepted; the name of a pandas Series is irrelevant.
    y_prob : array-like of shape (n_samples,)
        Predicted score / probability of the positive class.
    k : int, default=100
        Number of top-ranked observations to inspect; capped at ``n_samples``.

    Returns
    -------
    float
        The lift, or ``nan`` when it is undefined (empty input, ``k <= 0`` or
        no positive label in ``y``).

    Raises
    ------
    ValueError
        If ``y`` and ``y_prob`` do not have the same number of elements.
    """
    y_true = np.asarray(y).ravel()
    scores = np.asarray(y_prob).ravel()
    if y_true.shape[0] != scores.shape[0]:
        raise ValueError(
            f"y and y_prob must have the same length, got {y_true.shape[0]} and {scores.shape[0]}"
        )

    top_k = min(k, y_true.shape[0])
    if top_k <= 0:
        return np.nan

    # stable sort of the negated scores = descending ranking with deterministic ties
    ranked = np.argsort(-scores, kind="stable")[:top_k]
    hits = y_true[ranked].sum()

    # positives expected in a random selection of the same size
    expected_hits = top_k * y_true.mean()
    if expected_hits == 0:
        return np.nan
    return float(hits / expected_hits)

In [None]:
def rr_at_100(y, y_prob):
    """Ratio of frauds in the 100 top-scored rows to frauds in a fixed random sample.

    The scores ``y_prob`` are indexed by ``y``; after ``reset_index`` the labels are
    read from the column named "fraud". The denominator is the number of positives in
    100 rows sampled (seed 96) from the module-level ``y_train``.
    """
    ranked = pd.DataFrame(y_prob, y).reset_index().sort_values(by=0, ascending=False)
    hits_in_top_100 = ranked.head(100)["fraud"].sum()
    baseline_hits = y_train.sample(n=100, random_state=96).sum()
    return hits_in_top_100 / baseline_hits

In [None]:
cost_mat_train[:,1]

In [None]:
# set costs
# The false-negative cost of a claim is its claim amount. The rows are selected by index
# *label* (.loc): X_train.index / X_test.index hold labels of df, and those only coincide
# with row positions as long as df carries an untouched default RangeIndex.
cost_FN_train = df.loc[X_train.index, "claim_amount"]
cost_FN_test = df.loc[X_test.index, "claim_amount"]
c_FP = 5
c_TP = 5
c_TN = 0
# One row per observation, columns in the order: FP cost, FN cost, TP cost, TN cost.
cost_mat_train = np.array([c_FP * np.ones(X_train.shape[0]), cost_FN_train, 
                         c_TP * np.ones(X_train.shape[0]), 
                         c_TN * np.ones(X_train.shape[0])]).T

cost_mat_test = np.array([c_FP * np.ones(X_test.shape[0]), cost_FN_test, 
                         c_TP * np.ones(X_test.shape[0]), 
                         c_TN * np.ones(X_test.shape[0])]).T

In [None]:
# make different custom scorers
# Average-precision scorer evaluated on predicted probabilities (needs_proba=True);
# the extra keyword average='weighted' is forwarded to average_precision_score.
pr_score = make_scorer(average_precision_score, greater_is_better=True, 
                       needs_proba=True, average='weighted')
# Scorer wrapping lift_score, also fed with predicted probabilities; higher is better.
lift_scorer = make_scorer(lift_score, greater_is_better=True, needs_proba=True)

In [None]:
def model_evaluation(label, model, save_path=image_dir,
                     X_train=X_train, X_test=X_test, 
                     y_train=y_train, y_test=y_test, 
                     cost_mat_train=cost_mat_train, 
                     cost_mat_test=cost_mat_test):
    """Evaluate a fitted binary classifier on a train and a test set.

    Computes ROC curve / AUC, savings, average precision and lift for both sets,
    draws both ROC curves in one figure and stores it as
    ``save_path + label + '_roc.png'``.

    The default arguments are bound to the module-level objects that exist when
    this function is defined.

    Parameters
    ----------
    label : str
        Model name; used in the legend and in the file name of the ROC plot.
    model : estimator
        Fitted classifier exposing ``predict_proba``.
    save_path : str
        Directory prefix (including the trailing separator) the plot is written to.
    X_train, X_test : array-like
        Feature matrices of the train and test set.
    y_train, y_test : array-like
        Matching binary targets.
    cost_mat_train, cost_mat_test : array-like
        Cost matrices handed to ``savings``.

    Returns
    -------
    dict
        Keys ``fpr_*``, ``tpr_*``, ``auc_*``, ``savings_*``, ``pr_*`` and
        ``lift_*`` with ``*`` in ``{train, test}``.
    """
    out = {}
    # auc
    yproba_train = model.predict_proba(X_train)[:,1]
    out["fpr_train"], out["tpr_train"], _ = roc_curve(y_train,  yproba_train)
    out["auc_train"] = roc_auc_score(y_train, yproba_train)
    yproba_test = model.predict_proba(X_test)[:,1]
    out["fpr_test"], out["tpr_test"], _ = roc_curve(y_test,  yproba_test)
    out["auc_test"] = roc_auc_score(y_test, yproba_test)

    # model savings
    try:
        out["savings_train"], out["savings_test"] = savings(X_train, y_train, 
                                                        X_test, y_test,
                                                        cost_mat_train, cost_mat_test,
                                                        model)
    except Exception:
        # Best effort: when the savings cannot be computed with the given train cost
        # matrix, report NaN for train and recompute the test savings with an all-zero
        # placeholder for the train costs. Only regular errors are caught here, so
        # KeyboardInterrupt / SystemExit still propagate.
        out["savings_train"] = np.nan
        _, out["savings_test"] = savings(X_train, y_train, 
                                         X_test, y_test,
                                         np.zeros_like(X_train), cost_mat_test,
                                         model)
    # average_precision
    out["pr_train"] = pr_score(model, X_train, y_train)
    out["pr_test"] = pr_score(model, X_test, y_test)

    # lift
    out["lift_train"] = lift_scorer(model, X_train, y_train)
    out["lift_test"] = lift_scorer(model, X_test, y_test)

    # plot auc and save
    fig = plt.figure(figsize=(8,6))
    for split in ("train", "test"):
        auc = np.round(out[f"auc_{split}"],2)
        plt.plot(out[f"fpr_{split}"], out[f"tpr_{split}"],
                 label=f"{label} - {split}, AUC={auc}")
    plt.xlabel("False Positive Rate", fontsize=15)
    plt.ylabel("True Positive Rate", fontsize=15)
    plt.legend(prop={'size':13}, loc='lower right')
    # honour the save_path argument instead of the module-level image directory
    fig.savefig(save_path + label + '_roc.png')
    plt.show()
    # release the figure so repeated evaluations do not accumulate open figures
    plt.close(fig)
    return out

In [None]:
xgb_df = {'subsample': 0.8,
 'scale_pos_weight': 43.767857142857146,
 'reg_lambda': 3,
 'reg_alpha': 0.1,
 'n_estimators': 200,
 'min_child_weight': 20,
 'max_depth': 6,
 'learning_rate': 0.05,
 'gamma': 18,
 'colsample_bytree': 1.0,
 'base_score': 0.45999999999999996}

In [None]:
xgb_df = pd.DataFrame(xgb_df, index=[0])
print(xgb_df.to_latex(index=False))


In [None]:
def create_latex_overview(final_out):
    df = pd.DataFrame()
    for model in final_out.keys():
        final_out[model]['model'] = model
        var_eval = ['model', 'savings_train', 'savings_test', 'pr_train', 'pr_test', 'lift_train', 'lift_test']
        df = df.append(pd.DataFrame({key: final_out[model][key] for key in final_out[model].keys() if key in var_eval}, index=[0]))
    print(df.to_latex(index=False))

In [None]:
final_out = {}

# Predicting claim amount - DO NOT USE

## Linear regression model

In [None]:
lm = LinearRegression().fit(X_train_reg, y_train_reg)

In [None]:
print("Training set R squared: {}".format(lm.score(X_train_reg, y_train_reg)))
print("Testing set R squared: {}".format(lm.score(X_test_reg, y_test_reg)))

In [None]:
print('intercept:', lm.intercept_)
print('coef:', lm.coef_)

## Random Forest Regressor

In [None]:
rf = RandomForestRegressor(n_estimators=500, max_depth=5, n_jobs=2, max_features="sqrt", 
                           random_state=96, warm_start=True, bootstrap=True)
model2 = rf.fit(X_train_reg, y_train_reg)

In [None]:
print("Training set R squared: {}".format(model2.score(X_train_reg, y_train_reg)))
print("Testing set R squared: {}".format(model2.score(X_test_reg, y_test_reg)))

# Base Logistic Regression model - off the shelf

In [None]:
scaler = StandardScaler()
scaler = scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
clf = LogisticRegression(max_iter=500)
clf.fit(X_train_scaled, y_train)

In [None]:
# we need to predict claim_amount on the testset to evaluate true performance
#X_test.drop(columns=["claim_amount"], inplace=True)
#X_test["claim_amount"] = model2.predict(X_test)

In [None]:
X_test_scaled = scaler.transform(X_test)

In [None]:
model_name = 'LogisticRegression'
final_out[model_name] = model_evaluation(model_name, clf, 
                        X_train=X_train_scaled, X_test=X_test_scaled)

In [None]:
plot_confusion_matrix(clf, X_test_scaled, y_test)
plt.grid(False)
plt.show()

# Logistic Regression model - SMOTE

In [None]:
scaler = StandardScaler()
scaler = scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
# Oversample the minority class with SMOTE, then undersample the majority class.
# Both resamplers are seeded so that the resampled training set, and every score
# derived from it, is reproducible between runs.
over = SMOTE(sampling_strategy=0.2, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

In [None]:
# transform the dataset
X_train_scaled_resampled, y_train_resampled = pipeline.fit_resample(X_train_scaled, y_train)

In [None]:
# Class balance report: first for the original training target (before SMOTE and
# undersampling), then for the resampled one (after SMOTE and undersampling).
for stage, labels in enumerate((y_train, y_train_resampled)):
    if stage == 1:
        print("*********************************************************")
    pos_length = labels.sum()
    neg_length = len(labels) - pos_length
    print(f"Majority class (0): {neg_length}")
    print(f"Minority class (1): {pos_length}")
    print(f"ratio: {np.round(pos_length/neg_length,5)*100}%")

In [None]:
clf_resampled = LogisticRegression(max_iter=500)
clf_resampled.fit(X_train_scaled_resampled, y_train_resampled)

In [None]:
X_test_scaled = scaler.transform(X_test)
model_name = 'LogisticRegression - SMOTE'
# this will throw an error due to the fact that the cost matrix for train is 0
# and we divide by 0
final_out[model_name] = model_evaluation(model_name, clf_resampled, 
                                         X_train=X_train_scaled_resampled,
                                         y_train=y_train_resampled,
                                         X_test=X_test_scaled)

In [None]:
plot_confusion_matrix(clf_resampled, X_test_scaled, y_test)
plt.grid(False)
plt.show()

# Random Forest - SMOTE (not in report)

In [None]:
# Oversample the minority class with SMOTE, then undersample the majority class.
# Both resamplers are seeded so that the resampled training set, and every score
# derived from it, is reproducible between runs.
over = SMOTE(sampling_strategy=0.4, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.6, random_state=42)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

In [None]:
# transform the dataset
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train, y_train)

In [None]:
# Class balance report: first for the original training target (before SMOTE and
# undersampling), then for the resampled one (after SMOTE and undersampling).
for stage, labels in enumerate((y_train, y_train_resampled)):
    if stage == 1:
        print("*********************************************************")
    pos_length = labels.sum()
    neg_length = len(labels) - pos_length
    print(f"Majority class (0): {neg_length}")
    print(f"Minority class (1): {pos_length}")
    print(f"ratio: {np.round(pos_length/neg_length,5)*100}%")

In [None]:
rf = RandomForestClassifier(n_estimators=1000, max_depth=20, class_weight='balanced_subsample', random_state=9, n_jobs=-1)
rf.fit(X_train_resampled, y_train_resampled)

In [None]:
plot_roc_curve(rf, X_test, y_test)

In [None]:
plot_confusion_matrix(rf, X_test, y_test)
plt.grid(False)
plt.show()

# Balanced random forest

See [here](https://imbalanced-learn.org/dev/references/generated/imblearn.ensemble.BalancedRandomForestClassifier.html#imblearn.ensemble.BalancedRandomForestClassifier)

In [None]:
# SMOTE oversampling only; the resampler is seeded so that the resampled training
# set, and every score derived from it, is reproducible between runs.
over = SMOTE(sampling_strategy=0.4, random_state=42)
steps = [('o', over)]
pipeline = Pipeline(steps=steps)

In [None]:
# transform the dataset
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train, y_train)

In [None]:
# Class balance report: first for the original training target (before resampling),
# then for the resampled one.
for stage, labels in enumerate((y_train, y_train_resampled)):
    if stage == 1:
        print("*********************************************************")
    pos_length = labels.sum()
    neg_length = len(labels) - pos_length
    print(f"Majority class (0): {neg_length}")
    print(f"Minority class (1): {pos_length}")
    print(f"ratio: {np.round(pos_length/neg_length,5)*100}%")

In [None]:
# 0.60 savings
bclf = BalancedRandomForestClassifier(n_estimators=10000, max_depth=5, random_state=9, 
                                      n_jobs=-1, class_weight="balanced_subsample")

In [None]:
bclf.fit(X_train_resampled, y_train_resampled)

In [None]:
model_name = 'BalancedRandomForest'
# this will throw an error due to the fact that the cost matrix for train is 0
# and we divide by 0
final_out[model_name] = model_evaluation(model_name, bclf,
                                         X_train=X_train_resampled,
                                         X_test=X_test,
                                         y_train=y_train_resampled)

In [None]:
plot_confusion_matrix(bclf, X_test, y_test)
plt.grid(False)
plt.show()

# XGB

See [here](https://xgboost.readthedocs.io/en/latest/parameter.html) for information on all parameters. <br />
See [here](https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cross_validation.py) and [here](https://github.com/tqchen/xgboost/tree/master/demo) for examples of a custom objective function.

In [None]:
neg_length = len(y_train) - y_train.sum()
pos_length = y_train.sum()
ratio = neg_length/pos_length

In [None]:
# See https://xgboost.readthedocs.io/en/latest/tutorials/param_tuning.html
# If you care only about the overall performance metric (AUC) of your prediction:
xgb_clf = xgb.XGBClassifier(n_estimators = 500, objective="binary:logistic", base_score=.99, eval_metric="logloss", seed=9, 
                            scale_pos_weight=1)

In [None]:
# If you care about predicting the right probability:
# ^---- this is what we should do (IMO) --> 0.69 savings
xgb_clf = xgb.XGBClassifier(n_estimators = 100, objective="binary:logistic", eval_metric="auc", seed=9,
                            scale_pos_weight=ratio/5, reg_alpha=0.1, reg_lambda=3)

In [None]:
# Cast the last nine columns to float. The columns are replaced by name: a whole-column
# assignment through .iloc may write the values into the existing columns and keep their
# old dtype, in which case the cast would silently have no effect.
tail_columns = X_train.columns[-9:]
X_train[tail_columns] = X_train[tail_columns].astype(float)

In [None]:
# create a weighted claim_amount: fraudulent claims (fraud == 1) count 1.3 times their
# amount, all other claims keep their amount. Computed column-wise instead of with one
# df.loc lookup per row.
df["claim_amount_weighted"] = df["claim_amount"] * np.where(df["fraud"] == 1, 1.3, 1.0)

In [None]:
# calculate cost based on weighted claim amount or unweighted
# Rows are selected by index label (.loc): X_train.index holds labels of df, which only
# coincide with row positions while df carries an untouched default RangeIndex.
cost = df.loc[X_train.index, "claim_amount"]

In [None]:
# XGB meant to be trained with per-sample weights.
# Per-sample weights are an argument of fit(X, y, sample_weight=...), not of the
# constructor: passed to the constructor they are only forwarded as an unknown booster
# parameter and never applied to the training rows, so they are not passed here.
xgb_clf = xgb.XGBClassifier(n_estimators = 100, objective="binary:logistic", eval_metric='auc', min_child_weight=1, seed=9)

In [None]:
# Train with one weight per row (the claim amount of that row); per-sample weights
# only take effect when handed to fit().
# NOTE(review): assumes `cost` has no missing or negative values -- confirm.
xgb_clf.fit(np.asarray(X_train), y_train, sample_weight=np.asarray(cost))

In [None]:
# set grid params
# Candidate values per XGBClassifier hyper-parameter for the randomized search.
# scale_pos_weight: the negative/positive ratio divided by 1..10, plus 1.
# base_score and gamma are generated with np.arange, so their entries are NumPy
# scalars rather than plain Python numbers.
params = {
        'n_estimators': [100, 150, 200, 250, 300, 400, 500, 600],
        'learning_rate': [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1],
        'min_child_weight': [1, 3, 5, 7, 10, 20, 30],
        'reg_lambda': [0.5, 1, 1.5, 2, 3, 4, 5],
        'subsample': [0.4, 0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
        'scale_pos_weight': [ratio/i for i in np.arange(1,11)] + [1],
        'reg_alpha': [0.001, 0.01, 0.05, 0.1, 0.15, 0.2, 0.3],
        'base_score': list(np.arange(0.1,1,0.09)),
        'gamma': list(np.arange(0,22,3))
        }

In [None]:
xgb_clf2 = xgb.XGBClassifier(objective="binary:logistic", seed=9)

In [None]:
# make grid search pipeline
over = SMOTE(sampling_strategy=0.4, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.6, random_state=42)
pipeline = make_pipeline(over, under, xgb_clf2)

In [None]:
# rename params
new_params = {'xgbclassifier__' + key: params[key] for key in params}

In [None]:
# apply randomized grid search CV
# The fold shuffling is seeded so that repeated runs evaluate on the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=49)
# Every entry must be a scorer name or a callable with the scorer signature
# (estimator, X, y). `savings` is not included: elsewhere in this file it is called with
# separate train/test sets, two cost matrices and the model, so it cannot act as a
# cross-validation scorer.
search_scoring = {'lift': lift_scorer, 'auc': 'roc_auc', 'pr': pr_score}
random_search = RandomizedSearchCV(pipeline, param_distributions=new_params, n_iter=600,
                                   scoring=search_scoring, n_jobs=-1, cv=cv,
                                   verbose=3, random_state=49, refit="lift")

In [None]:
# fit random search cv
xgb_search = random_search.fit(np.asarray(X_train), y_train)

In [None]:
# print best params
xgb_search.best_params_

In [None]:
# print results of cv
pd.DataFrame(xgb_search.cv_results_)

In [None]:
# save results in json
import json


def _json_default(value):
    """Turn NumPy scalars into plain Python numbers; reject anything else."""
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Part of the search space is generated with np.arange, so the best parameters can
# contain NumPy integers, which the json module cannot serialize on its own.
with open('output_pr.json', 'w') as f:
    json.dump(xgb_search.best_params_, f, default=_json_default)

In [None]:
# XGB classifier with fixed hyper-parameters; the values are the same as the entries of
# the xgb_df parameter dictionary defined earlier in this file.
# NOTE(review): unlike the other XGBClassifier instances in this file, no seed is set here.
xgb_clf = xgb.XGBClassifier(objective="binary:logistic", subsample= 0.8,
                            scale_pos_weight=43.767857142857146,
                            reg_lambda=3,
                            reg_alpha=0.1,
                            n_estimators=200,
                            min_child_weight=20,
                            max_depth=6,
                            learning_rate=0.05,
                            gamma=18,
                            colsample_bytree=1.0,
                            base_score=0.45999999999999996)

In [None]:
xgb_clf.fit(np.asarray(X_train_resampled), y_train_resampled)

In [None]:
plot_roc_curve(xgb_clf, np.asarray(X_train), y_train)
plt.show()

In [None]:
plot_confusion_matrix(xgb_clf, np.asarray(X_train), y_train)
plt.grid(False)
plt.show()

In [None]:
# Cast the last nine columns to float. The columns are replaced by name: a whole-column
# assignment through .iloc may write the values into the existing columns and keep their
# old dtype, in which case the cast would silently have no effect.
tail_columns = X_test.columns[-9:]
X_test[tail_columns] = X_test[tail_columns].astype(float)

In [None]:
plot_roc_curve(xgb_search, np.asarray(X_test), y_test)

In [None]:
plot_confusion_matrix(xgb_search, np.asarray(X_test), y_test)
plt.grid(False)
plt.show()

In [None]:
model_name = 'XGB'
# this will throw an error due to the fact that the cost matrix for train is 0
# and we divide by 0
final_out[model_name] = model_evaluation(model_name, xgb_clf,
                                         X_train=np.asarray(X_train_resampled),
                                         X_test=np.asarray(X_test),
                                         y_train=y_train_resampled)

# Overview


In [None]:
create_latex_overview(final_out)

# Submission

In [None]:
submit_path = r"./test.csv"

In [None]:
submit_set = pd.read_csv(submit_path, sep=";", encoding="utf-8-sig")

In [None]:
sorted(submit_set["claim_date_registered"])

In [None]:
# convert binary text variables into binary: {"Y":1, "N":0}
for i in ["claim_liable", "claim_police", "driver_injured"]:
    text_to_binary(i, "Y", "N", submit_set)
# {"P":1, "N":0}
text_to_binary("claim_alcohol", "P", "N", submit_set)
# {"car":1, "van":0}
text_to_binary("claim_vehicle_type", "car", "van", submit_set)
# {"M":1, "F":0}
text_to_binary("policy_holder_form", "M", "F", submit_set)
# {"B":1, "N":0}
text_to_binary("policy_holder_country", "B", "N", submit_set)
# make claim_lang binary (currently 1:Dutch, 2:Fr) -> 0: Dutch and 1: French
submit_set["claim_language"] = submit_set["claim_language"] - 1 

# add buckets for vehicle power
# NOTE(review): the five quantile bins are computed on the submission data itself, so
# their edges presumably differ from the vpower_buckets the encoder `ohe` was fitted
# on -- confirm that the resulting categories match.
submit_set["vpower_buckets"] = pd.qcut(submit_set["claim_vehicle_power"], 5)

# add provinces based on postal code
# Same bin edges and labels as used for the training data in the EDA section; labels may
# repeat (ordered=False) because some provinces cover two postal-code ranges.
postal_bins = [999, 1299, 1499, 1999, 2999, 3499, 3999, 4999, 5999, 6599, 6999, 7999, 8999, 9999]
postal_label = ["brussel", "waals_brabant", "vlaams_brabant", 'Antwerpen', 'vlaams_brabant', 'limburg', 
                'luik', 'namen', 'henegouwen', 'luxemburg', 'henegouwen', 'w-vlaanderen', 'o-vlaanderen']
submit_set["province"] = pd.cut(submit_set["claim_postal_code"], postal_bins, labels=postal_label, ordered=False)

# add feature that describes if policy holders postal code is same as claim postal code
submit_set["diff_postal_code"] = (submit_set["policy_holder_postal_code"] == submit_set["claim_postal_code"]).astype(float)

# get dummies for cat vars
submit_set = encode(ohe, submit_set, ['claim_cause', 'vpower_buckets', 'province'])
#df = encode_ph_postal_code(phpc_ohe, df)

# format date
YYYYMMDD_date_columns = ["claim_date_registered",
                         "claim_date_occured"]
for i in YYYYMMDD_date_columns:
    submit_set[i] = pd.to_datetime(submit_set[i], format="%Y%m%d")

# remove extreme value
submit_set["claim_vehicle_date_inuse"].replace(to_replace=270505.0, value= np.nan, inplace=True)

YYYYMM_columns = ["claim_vehicle_date_inuse", 
                  "policy_date_start",
                  "policy_date_next_expiry",
                  "policy_date_last_renewed"]
for i in YYYYMM_columns:
    submit_set[i] = pd.to_datetime(submit_set[i], format="%Y%m")

In [None]:
# Add the extra features just like we did for the training set
submit_set = add_extra_features(submit_set)

In [None]:
# Hide the claim_id column as index so that it's not used as covariate for the prediction, but we can recover
# it later as we need claim_id in the output .csv file
submit_set = submit_set.set_index('claim_id')
#df = df[X_train.drop(columns=["claim_amount"]).columns]
submit_set = submit_set[X_train.columns]


In [None]:
submit_set.isna().sum()

In [None]:
submit_set.to_csv('submit.csv')

In [None]:
import urllib.request
import json
import os
import ssl

def allowSelfSignedHttps(allowed):
    """Make unverified HTTPS contexts the process-wide default when requested.

    The switch only happens when ``allowed`` is truthy, the environment variable
    ``PYTHONHTTPSVERIFY`` is unset or empty, and the ``ssl`` module provides
    ``_create_unverified_context``. In every other case nothing is changed.
    """
    if not allowed:
        return
    if os.environ.get('PYTHONHTTPSVERIFY', ''):
        return
    unverified_factory = getattr(ssl, '_create_unverified_context', None)
    if not unverified_factory:
        return
    # from now on default HTTPS contexts skip the server certificate verification
    ssl._create_default_https_context = unverified_factory

allowSelfSignedHttps(True) # this line is needed if you use self-signed certificate in your scoring service.

data = {
    "data":
    [
        {
            'Column2': "example_value",
            'claim_liable': "0",
            'claim_num_injured': "0",
            'claim_num_third_parties': "0",
            'claim_num_vehicles': "0",
            'claim_police': "0",
            'claim_language': "0",
            'claim_vehicle_type': "0",
            'policy_holder_form': "0",
            'policy_holder_country': "0",
            'policy_num_changes': "0",
            'policy_num_claims': "0",
            'policy_premium_100': "0",
            'policy_coverage_1000': "0",
            'diff_postal_code': "0",
            'claim_cause_animal': "0",
            'claim_cause_fire': "0",
            'claim_cause_other': "0",
            'claim_cause_theft': "0",
            'claim_cause_traffic accident': "0",
            'claim_cause_vandalism': "0",
            'claim_cause_weather': "0",
            'claim_cause_windows': "0",
            'vpower_buckets_(-0.001, 55.0]': "0",
            'vpower_buckets_(55.0, 66.0]': "0",
            'vpower_buckets_(66.0, 81.0]': "0",
            'vpower_buckets_(81.0, 100.0]': "0",
            'vpower_buckets_(100.0, 426.0]': "0",
            'vpower_buckets_nan': "0",
            'province_Antwerpen': "0",
            'province_brussel': "0",
            'province_henegouwen': "0",
            'province_limburg': "0",
            'province_luik': "0",
            'province_luxemburg': "0",
            'province_namen': "0",
            'province_o-vlaanderen': "0",
            'province_vlaams_brabant': "0",
            'province_w-vlaanderen': "0",
            'province_waals_brabant': "1",
            'claim_vehicle_id_count': "0",
            'policy_holder_id_count': "0",
            'driver_id_count': "0",
            'driver_vehicle_id_count': "0",
            'third_party_1_id_count': "0",
            'third_party_1_vehicle_id_count': "0",
            'blacklisted_expert_id': "false",
            'policy_holder_age': "0",
            'pct1': "0",
            'pct2': "0",
            'pct3': "0",
            'pct4': "0",
            'pct5': "0",
            'pct6': "0",
            'pct7': "0",
            'pct8': "0",
            'pct9': "0",
        },
    ],
}

import urllib.error  # imported explicitly: the except clause below relies on it

# JSON payload of the scoring request, UTF-8 encoded
body = json.dumps(data).encode("utf-8")

# NOTE(review): the endpoint is plain http, so the Bearer token travels unencrypted.
url = 'http://be62a776-8b23-41b0-a6d2-0f61a6e64323.francecentral.azurecontainer.io/score'
api_key = '' # Replace this with the API key for the web service
headers = {'Content-Type':'application/json', 'Authorization':('Bearer '+ api_key)}

req = urllib.request.Request(url, body, headers)

try:
    # the context manager closes the HTTP response even when reading fails
    with urllib.request.urlopen(req) as response:
        result = response.read()
    print(result)
except urllib.error.HTTPError as error:
    print("The request failed with status code: " + str(error.code))

    # Print the headers - they include the request ID and the timestamp, which are useful for debugging the failure
    print(error.info())
    error_body = error.read().decode("utf8", 'ignore')
    try:
        print(json.loads(error_body))
    except ValueError:
        # the service answered with a non-JSON error body: show it unparsed
        print(error_body)

In [None]:
# Impute the remaining missing values of the submission set with the mode / mean that
# were computed on the training set.
# A dedicated "missing" category could be an alternative, although it would be a small one.

# column -> fill value (all values originate from X_train)
submit_fill_values = {
    # mode
    "claim_language": train_lang_mode,
    "claim_vehicle_type": train_vtype_mode,
    # mean
    "policy_premium_100": train_premium_mean,
    "policy_coverage_1000": train_coverage_mean,
    "policy_holder_age": train_policy_holder_mean_age,
}

# Fill on the frame itself: `frame[col].fillna(..., inplace=True)` works on a column
# selection and is not guaranteed to write back into the frame (chained assignment).
submit_set.fillna(value=submit_fill_values, inplace=True)

In [None]:
assert submit_set.isna().sum().sum() == 0

In [None]:
# While we don't have a model yet to predict claim_amount, set it to 0.0
# TODO: Replace this with the predicted response variable of the regression model on the submission set.
#df['claim_amount'] = model2.predict(df)

In [None]:
submit_scaled = scaler.transform(submit_set)

In [None]:
submit_not_scaled = submit_set.copy()

In [None]:
# final submission set initialization
submission = submit_set.reset_index()[['claim_id']]

In [None]:
# for logistic regression
submission["prediction"] = clf.predict_proba(submit_scaled)[:,1]

In [None]:
# for logistic regression with SMOTE
submission["prediction"] = clf_resampled.predict_proba(submit_scaled)[:,1]

In [None]:
# for rf with SMOTE
submission["prediction"] = rf.predict_proba(submit_not_scaled)[:,1]

In [None]:
# for balanced random forest
submission["prediction"] = bclf.predict_proba(submit_not_scaled)[:,1]

In [None]:
# for xgb
submit_not_scaled.iloc[:,-9:] = submit_not_scaled.iloc[:,-9:].astype(float)
submission["prediction"] = xgb_clf.predict_proba(submit_not_scaled)[:,1]

In [None]:
# for cost-sensitive xgb
submit_not_scaled.iloc[:,-9:] = submit_not_scaled.iloc[:,-9:].astype(float)
submission["prediction"] = xgb_clf.predict_proba(np.asarray(submit_not_scaled))[:,1]

In [None]:
# for cost-sensitive ann
# NOTE(review): within this file `clf` is the LogisticRegression fitted above and no ANN
# is defined; this cell presumably expects `clf` to be a neural-network model created
# elsewhere (predict is called with verbose=1) -- confirm before running.
submission["prediction"] = clf.predict(np.asarray(submit_not_scaled).astype(np.float32), verbose=1)

In [None]:
# for randomsearch xgb
submit_not_scaled.iloc[:,-9:] = submit_not_scaled.iloc[:,-9:].astype(float)
submission["PROB"] = xgb_search.predict_proba(np.asarray(submit_not_scaled))[:,1]

In [None]:
submission.columns = ["ID", "PROB"]

In [None]:
submission.head()

In [None]:
submission.to_csv("submission_V0.53.csv", sep=',', index=False)