# Import packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import log_loss, roc_auc_score, plot_roc_curve, plot_confusion_matrix, confusion_matrix, make_scorer
from sklearn.metrics import balanced_accuracy_score, dcg_score, roc_auc_score, average_precision_score

from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline, make_pipeline

#from costcla.metrics import cost_loss

from features import update_dataset_features, text_to_binary, add_extra_features, encode
from eval_metrics import *

pd.set_option("display.max_columns",500)
plt.style.use('ggplot')

In [None]:
data_path = r"./train.csv"

In [None]:
df = pd.read_csv(data_path, sep=";", encoding="utf-8-sig")

In [None]:
#df["claim_date_occured"] = pd.to_datetime(df["claim_date_occured"], format="%Y%m%d")
#min(df["claim_date_occured"].dt.year - df["policy_holder_year_birth"])

# Exploratory Data Analysis (EDA)

In [None]:
pd.crosstab(df["fraud"], df["claim_vehicle_brand"], normalize=True)

In [None]:
# Quick univariate overview: histogram for columns that can be plotted as one,
# otherwise a bar chart of the value counts when there are fewer than 10 levels.
for column in df.columns:
    print(column)
    series = df[column]
    try:
        series.plot(kind="hist")
        plt.show()
        print(series.describe())
    except TypeError:
        # the histogram attempt failed with a TypeError; fall back to counts
        level_counts = series.value_counts()
        if len(level_counts) >= 10:
            print('*******too many values to plot*******************')
        else:
            level_counts.plot(kind="bar")
            plt.show()
        print(series.describe())
    print('*************************************************************************')

# Data cleaning

In [None]:
df, ohe = update_dataset_features(df)

In [None]:
df.isna().sum()

In [None]:
df.to_csv('df_cleaned.csv')

# Train test split + prep

In [None]:
# claim_amount is currently dropped since poor performance
# Classification split: features are all columns after the first one, minus
# claim_amount; the target is df["fraud"].
# NOTE(review): presumably the first column is "fraud" itself -- confirm against
# the column order produced by update_dataset_features.
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,1:].drop(columns=["claim_amount"]), df["fraud"], test_size=.2, random_state=96)
# Regression split: same features selected by name, target is claim_amount.
# It uses the same test_size and random_state as the classification split above.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(df.drop(columns=["fraud", "claim_amount"]),
                                                                    df["claim_amount"], test_size=.2, random_state=96)

In [None]:
# Impute the remaining missing values on the training sets with the mode
# (binary categoricals) or the mean (numeric columns) of the training split.
# A separate "missing" category could also make sense here, although it would
# be a small category.

# mode
train_lang_mode = X_train["claim_language"].mode()[0]
train_vtype_mode = X_train["claim_vehicle_type"].mode()[0]

# mean
train_premium_mean = X_train["policy_premium_100"].mean()
train_coverage_mean = X_train["policy_coverage_1000"].mean()
train_policy_holder_mean_age = X_train["policy_holder_age"].mean()

# column -> value used to replace NaN in that column
train_fill_values = {
    "claim_language": train_lang_mode,
    "claim_vehicle_type": train_vtype_mode,
    "policy_premium_100": train_premium_mean,
    "policy_coverage_1000": train_coverage_mean,
    "policy_holder_age": train_policy_holder_mean_age,
}

for train_set in [X_train, X_train_reg]:
    # Fill on the frame itself: `frame[col].fillna(..., inplace=True)` operates
    # on an intermediate Series and is not guaranteed to update the frame.
    train_set.fillna(value=train_fill_values, inplace=True)

In [None]:
# Impute the remaining missing values on the test sets, using the mode / mean
# that were computed on the training split (never statistics of the test data).
# A separate "missing" category could also make sense here, although it would
# be a small category.

# column -> training-set value used to replace NaN in that column
test_fill_values = {
    # mode
    "claim_language": train_lang_mode,
    "claim_vehicle_type": train_vtype_mode,
    # mean
    "policy_premium_100": train_premium_mean,
    "policy_coverage_1000": train_coverage_mean,
    "policy_holder_age": train_policy_holder_mean_age,
}

for test_set in [X_test, X_test_reg]:
    # Every column is filled on the frame being iterated. Previously
    # policy_holder_age was only ever filled on X_test, which left NaNs in
    # X_test_reg.
    test_set.fillna(value=test_fill_values, inplace=True)

# Predicting claim amount - DO NOT USE

## Linear regression model

In [None]:
lm = LinearRegression().fit(X_train_reg, y_train_reg)

In [None]:
print("Training set R squared: {}".format(lm.score(X_train_reg, y_train_reg)))
print("Testing set R squared: {}".format(lm.score(X_test_reg, y_test_reg)))

In [None]:
print('intercept:', lm.intercept_)
print('coef:', lm.coef_)

## Random Forest Regressor

In [None]:
rf = RandomForestRegressor(n_estimators=500, max_depth=5, n_jobs=2, max_features="sqrt", 
                           random_state=96, warm_start=True, bootstrap=True)
model2 = rf.fit(X_train_reg, y_train_reg)

In [None]:
print("Training set R squared: {}".format(model2.score(X_train_reg, y_train_reg)))
print("Testing set R squared: {}".format(model2.score(X_test_reg, y_test_reg)))

# Base Logistic Regression model - off the shelf

In [None]:
scaler = StandardScaler()
scaler = scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
clf = LogisticRegression(max_iter=500)
clf.fit(X_train_scaled, y_train)

In [None]:
# we need to predict claim_amount on the testset to evaluate true performance
#X_test.drop(columns=["claim_amount"], inplace=True)
#X_test["claim_amount"] = model2.predict(X_test)

In [None]:
X_test_scaled = scaler.transform(X_test)
plot_roc_curve(clf, X_test_scaled, y_test)

In [None]:
plot_confusion_matrix(clf, X_test_scaled, y_test)
plt.grid(False)
plt.show()

# Logistic Regression model - SMOTE

In [None]:
scaler = StandardScaler()
scaler = scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
# SMOTE oversampling followed by random undersampling of the training data.
# Both samplers are seeded so the resampled set is the same on every run.
over = SMOTE(sampling_strategy=0.2, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

In [None]:
# transform the dataset
X_train_scaled_resampled, y_train_resampled = pipeline.fit_resample(X_train_scaled, y_train)

In [None]:
# Report the class balance of the original labels first, then of the
# resampled labels.
for is_resampled, labels in enumerate((y_train, y_train_resampled)):
    if is_resampled:
        # separator between the "before" and "after" reports
        print("*********************************************************")
    pos_length = labels.sum()
    neg_length = len(labels) - pos_length
    print(f"Majority class (0): {neg_length}")
    print(f"Minority class (1): {pos_length}")
    print(f"ratio: {np.round(pos_length/neg_length,5)*100}%")

In [None]:
clf_resampled = LogisticRegression(max_iter=500)
clf_resampled.fit(X_train_scaled_resampled, y_train_resampled)

In [None]:
X_test_scaled = scaler.transform(X_test)
plot_roc_curve(clf_resampled, X_test_scaled, y_test)

In [None]:
plot_confusion_matrix(clf_resampled, X_test_scaled, y_test)
plt.grid(False)
plt.show()

# Random Forest - SMOTE

In [None]:
# SMOTE oversampling followed by random undersampling of the training data.
# Both samplers are seeded so the resampled set is the same on every run.
over = SMOTE(sampling_strategy=0.4, random_state=42)
under = RandomUnderSampler(sampling_strategy=1, random_state=42)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

In [None]:
# transform the dataset
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train, y_train)

In [None]:
# Report the class balance of the original labels first, then of the
# resampled labels.
for is_resampled, labels in enumerate((y_train, y_train_resampled)):
    if is_resampled:
        # separator between the "before" and "after" reports
        print("*********************************************************")
    pos_length = labels.sum()
    neg_length = len(labels) - pos_length
    print(f"Majority class (0): {neg_length}")
    print(f"Minority class (1): {pos_length}")
    print(f"ratio: {np.round(pos_length/neg_length,5)*100}%")

In [None]:
rf = RandomForestClassifier(n_estimators=1000, max_depth=20, class_weight='balanced_subsample', random_state=9, n_jobs=-1)
rf.fit(X_train_resampled, y_train_resampled)

In [None]:
plot_roc_curve(rf, X_test, y_test)

In [None]:
plot_confusion_matrix(rf, X_test, y_test)
plt.grid(False)
plt.show()

# Balanced random forest

See [here](https://imbalanced-learn.org/dev/references/generated/imblearn.ensemble.BalancedRandomForestClassifier.html#imblearn.ensemble.BalancedRandomForestClassifier)

In [None]:
# SMOTE oversampling only (no undersampling step in this pipeline).
# Seeded so the resampled set is the same on every run.
over = SMOTE(sampling_strategy=0.4, random_state=42)
steps = [('o', over)]
pipeline = Pipeline(steps=steps)

In [None]:
# transform the dataset
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train, y_train)

In [None]:
# Report the class balance of the original labels first, then of the
# SMOTE-resampled labels.
for is_resampled, labels in enumerate((y_train, y_train_resampled)):
    if is_resampled:
        # separator between the "before" and "after" reports
        print("*********************************************************")
    pos_length = labels.sum()
    neg_length = len(labels) - pos_length
    print(f"Majority class (0): {neg_length}")
    print(f"Minority class (1): {pos_length}")
    print(f"ratio: {np.round(pos_length/neg_length,5)*100}%")

In [None]:
# 0.60 savings
bclf = BalancedRandomForestClassifier(n_estimators=10000, max_depth=5, random_state=9, 
                                      n_jobs=-1, class_weight="balanced_subsample")

In [None]:
bclf.fit(X_train_resampled, y_train_resampled)

In [None]:
plot_roc_curve(bclf, X_train_resampled, y_train_resampled)

In [None]:
plot_roc_curve(bclf, X_test, y_test)

In [None]:
plot_confusion_matrix(bclf, X_test, y_test)
plt.grid(False)
plt.show()

In [None]:
# Claim amount of every train / test row.
# X_train.index and X_test.index hold index *labels*, so rows are selected with
# .loc; .iloc would interpret them as positions and pick the wrong rows
# whenever df's index is not exactly 0..n-1.
cost_train = df.loc[X_train.index, "claim_amount"]
cost_test = df.loc[X_test.index, "claim_amount"]

In [None]:
def savings(X_train, y_train, X_test, y_test, cost_mat_train, cost_mat_test, model):
    """Print the relative cost savings of ``model`` on the train and test data.

    For each split the cost of the model's predictions (``costs``) is compared
    with ``max_costs`` for the same labels and cost matrix, and the ratio
    ``(max_cost - cost) / max_cost`` is printed rounded to three decimals.
    Nothing is returned.
    """
    splits = (
        (X_train, y_train, cost_mat_train),
        (X_test, y_test, cost_mat_test),
    )
    # Evaluate both splits completely before anything is printed.
    totals = [
        (costs(labels, model.predict(features), cost_mat), max_costs(labels, cost_mat))
        for features, labels, cost_mat in splits
    ]
    # max_costs is the reference cost without the model (everything labelled
    # negative); costs is what is still paid when the model is used. The ratio
    # is the share of the reference cost that the model saves.
    captions = ('train savings: ', 'test savings: ')
    for caption, (model_cost, reference_cost) in zip(captions, totals):
        print(caption, np.round((reference_cost - model_cost) / reference_cost, 3))

In [None]:
# savings train
cost_train = eval_metrics.costs(y_train, bclf.predict(X_train), cost_mat_train)
max_cost_train = eval_metrics.max_costs(y_train, cost_mat_train)
cost_test = eval_metrics.costs(y_test, bclf.predict(X_test), cost_mat_test)
max_cost_test = eval_metrics.max_costs(y_test, cost_mat_test)

In [None]:
# max_cost is if we would label everything as negative (i.e. if we don't use our model)
# costs calculates how much costs our model makes (i.e. we still make costs but less since we catch some frauds)
# this ratio then shows the savings of our model vs. not using our model at all
print('train savings: ', np.round((max_cost_train - cost_train) / max_cost_train, 3))
print('test savings: ', np.round((max_cost_test - cost_test) / max_cost_test, 3))

In [None]:
savings(X_train, y_train, X_test, y_test, cost_mat_train, cost_mat_test, bclf)

# XGB

See the [XGBoost parameter reference](https://xgboost.readthedocs.io/en/latest/parameter.html) for information on all parameters. <br />
See [this cross-validation demo](https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cross_validation.py) and [the demo directory](https://github.com/tqchen/xgboost/tree/master/demo) for examples of a custom objective function.

In [None]:
import xgboost as xgb

In [None]:
def xgb_cost(y_true, y_pred):
    """Return the relative cost savings of ``y_pred`` for the rows in ``y_true``.

    Parameters
    ----------
    y_true : pandas object
        True labels. Its index labels are used to look up each row's
        ``claim_amount`` in the module-level ``df``.
    y_pred : array-like
        Predicted labels, passed on to ``costs``.

    Returns
    -------
    ``(max_cost - cost) / max_cost`` where ``cost`` comes from ``costs`` and
    ``max_cost`` from ``max_costs`` for the same cost matrix.
    """
    n_rows = y_true.shape[0]
    # Select by index label (.loc): y_true.index holds labels of df, and .iloc
    # would read them as positions.
    cost_FN = df.loc[y_true.index, "claim_amount"]
    c_FP = 50
    c_TP = 50
    c_TN = 0
    # one row per sample, columns ordered [FP, FN, TP, TN]
    cost_mat = np.array([c_FP * np.ones(n_rows), cost_FN,
                         c_TP * np.ones(n_rows),
                         c_TN * np.ones(n_rows)]).T
    model_cost = costs(y_true, y_pred, cost_mat)
    max_cost = max_costs(y_true, cost_mat)
    # named differently from the notebook's savings() function to avoid shadowing it
    savings_ratio = (max_cost - model_cost) / max_cost
    return savings_ratio

In [None]:
def lift_score(y, y_prob, top_k=100):
    """Score how strongly ``y_prob`` concentrates positives at the top of the ranking.

    Parameters
    ----------
    y : array-like
        Binary labels (1 = positive). May be a list, ndarray or pandas Series
        with any name; only the values are used, in positional order.
    y_prob : array-like
        One score per row. If a 2-D array is given, its first column is used.
    top_k : int, default 100
        Number of highest-scored rows to inspect.

    Returns
    -------
    float
        Number of positives among the ``top_k`` highest-scored rows divided by
        the overall positive rate of ``y``. Returns 0.0 when ``y`` is empty or
        contains no positives, instead of dividing by zero.
    """
    labels = np.asarray(y).ravel()
    if labels.size == 0:
        return 0.0
    scores = np.asarray(y_prob).reshape(labels.size, -1)[:, 0]
    avg_fraud_rate = labels.sum() / labels.size
    if avg_fraud_rate == 0:
        return 0.0
    # stable sort keeps the original row order among tied scores
    top_rows = np.argsort(-scores, kind="stable")[:top_k]
    top_ranked_TP = labels[top_rows].sum()
    return top_ranked_TP / avg_fraud_rate

In [None]:
neg_length = len(y_train) - y_train.sum()
pos_length = y_train.sum()
ratio = neg_length/pos_length

In [None]:
# See https://xgboost.readthedocs.io/en/latest/tutorials/param_tuning.html
# If you care only about the overall performance metric (AUC) of your prediction:
xgb_clf = xgb.XGBClassifier(n_estimators = 500, objective="binary:logistic", base_score=.99, eval_metric="logloss", seed=9, 
                            scale_pos_weight=1)

In [None]:
# If you care about predicting the right probability:
# ^---- this is what we should do (IMO) --> 0.69 savings
xgb_clf = xgb.XGBClassifier(n_estimators = 100, objective="binary:logistic", eval_metric="auc", seed=9,
                            scale_pos_weight=ratio/5, reg_alpha=0.1, reg_lambda=3)

In [None]:
X_train.iloc[:,-9:] = X_train.iloc[:,-9:].astype(float)

In [None]:
# create a weighted claim_amount: rows with fraud == 1 count 1.3x, all other
# rows keep their claim_amount unchanged (computed column-wise instead of one
# df.loc lookup per row)
df["claim_amount_weighted"] = df["claim_amount"] * np.where(df["fraud"] == 1, 1.3, 1.0)

In [None]:
# calculate cost based on weighted claim amount or unweighted
# (selected by index label with .loc; X_train.index holds labels, not positions)
cost = df.loc[X_train.index, "claim_amount"]

In [None]:
# sample-weighted XGB
xgb_clf = xgb.XGBClassifier(n_estimators = 100, objective="binary:logistic", eval_metric='auc', min_child_weight=1, seed=9,
                            sample_weight=cost)

In [None]:
xgb_clf.fit(np.asarray(X_train), y_train)

In [None]:
# set grid params
params = {
        'n_estimators': [100, 150, 200, 250, 300, 400, 500, 600],
        'learning_rate': [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1],
        'min_child_weight': [1, 3, 5, 7, 10, 20, 30],
        'reg_lambda': [0.5, 1, 1.5, 2, 3, 4, 5],
        'subsample': [0.4, 0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
        'scale_pos_weight': [ratio/i for i in np.arange(1,11)] + [1],
        'reg_alpha': [0.001, 0.01, 0.05, 0.1, 0.15, 0.2, 0.3],
        'base_score': list(np.arange(0.1,1,0.09)),
        'gamma': list(np.arange(0,22,3))
        }

In [None]:
xgb_clf2 = xgb.XGBClassifier(objective="binary:logistic", eval_metric='aucpr', seed=9)

In [None]:
# make grid search pipeline
over = SMOTE(sampling_strategy=0.4, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.6, random_state=42)
pipeline = make_pipeline(over, under, xgb_clf2)

In [None]:
# rename params
new_params = {'xgbclassifier__' + key: params[key] for key in params}

In [None]:
# make different custom scorers
# Only lift_scorer is passed to RandomizedSearchCV below; the others are
# alternatives kept for experimentation.
score = make_scorer(balanced_accuracy_score, greater_is_better=True)
dcg_scorer = make_scorer(dcg_score, needs_proba=True)
# NOTE(review): sample_weight=cost has one entry per X_train row; presumably
# this will not match the size of a single CV fold -- confirm before using
# roc_scorer inside a cross-validated search.
roc_scorer = make_scorer(roc_auc_score, greater_is_better=True,
                             needs_threshold=True, average='weighted', sample_weight=cost)
pr_score = make_scorer(average_precision_score, greater_is_better=True, 
                       needs_proba=True, average='weighted')
lift_scorer = make_scorer(lift_score, greater_is_better=True, needs_proba=True)

In [None]:
# apply randomized grid search CV
# The fold shuffling is seeded as well, so that the folds -- and therefore the
# scores of each sampled parameter set -- are reproducible between runs.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=49)
random_search = RandomizedSearchCV(pipeline, param_distributions=new_params, n_iter=600,
                                   scoring=lift_scorer, n_jobs=-1, cv=cv, verbose=3, random_state=49)

In [None]:
# fit random search cv
xgb_search = random_search.fit(np.asarray(X_train), y_train)

In [None]:
# print best params
xgb_search.best_params_

In [None]:
# print results of cv
pd.DataFrame(xgb_search.cv_results_)

In [None]:
# save results in json
import json


def _to_builtin(value):
    """Return the built-in Python equivalent of a numpy scalar, else ``value`` unchanged."""
    return value.item() if isinstance(value, np.generic) else value


# Part of the search grid is built with np.arange, so best_params_ can contain
# numpy integers, which json cannot serialise directly.
with open('output_pr.json', 'w') as f:
    json.dump({name: _to_builtin(value) for name, value in xgb_search.best_params_.items()}, f)

In [None]:
# XGB with hand-copied hyper-parameters.
# sample_weight is not passed here: it is a fit() argument rather than a
# constructor argument, and `cost` has one entry per X_train row, so it would
# not line up with the resampled training data this model is fitted on.
xgb_clf = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric='aucpr',
    subsample=0.8,
    scale_pos_weight=29.17857142857143,
    reg_lambda=1.5,
    reg_alpha=0.2,
    n_estimators=600,
    min_child_weight=20,
    max_depth=9,
    learning_rate=0.01,
    gamma=20,
    colsample_bytree=0.8,
    base_score=0.64,
)

In [None]:
xgb_clf.fit(np.asarray(X_train_resampled), y_train_resampled)

In [None]:
plot_roc_curve(xgb_clf, np.asarray(X_train), y_train)
plt.show()

In [None]:
plot_confusion_matrix(xgb_clf, np.asarray(X_train), y_train)
plt.grid(False)
plt.show()

In [None]:
X_test.iloc[:,-9:] = X_test.iloc[:,-9:].astype(float)

In [None]:
plot_roc_curve(xgb_search, np.asarray(X_test), y_test)

In [None]:
plot_confusion_matrix(xgb_search, np.asarray(X_test), y_test)
plt.grid(False)
plt.show()

In [None]:
savings(np.asarray(X_train), y_train, np.asarray(X_test), y_test, cost_mat_train, cost_mat_test, xgb_search)

# Set costs

In [None]:
# Cost of a false negative = the claim amount of that row.
# Rows are selected by index label (.loc); X_train.index / X_test.index hold
# labels, which .iloc would read as positions.
cost_FN_train = df.loc[X_train.index, "claim_amount"]
cost_FN_test = df.loc[X_test.index, "claim_amount"]
c_FP = 5
c_TP = 5
c_TN = 0
# Cost matrices: one row per sample, columns ordered [FP, FN, TP, TN].
cost_mat_train = np.array([c_FP * np.ones(X_train.shape[0]), cost_FN_train,
                           c_TP * np.ones(X_train.shape[0]),
                           c_TN * np.ones(X_train.shape[0])]).T

cost_mat_test = np.array([c_FP * np.ones(X_test.shape[0]), cost_FN_test,
                          c_TP * np.ones(X_test.shape[0]),
                          c_TN * np.ones(X_test.shape[0])]).T

# Cost-sensitive NN

as seen in this [medium post](https://towardsdatascience.com/fraud-detection-with-cost-sensitive-machine-learning-24b8760d35d9)

In [None]:
import keras.backend as K

In [None]:
def create_y_input(y_train, c_FN):
    """Pack each label and its false-negative cost into one ``"<label>.<cost>"`` string.

    The label is written as an integer and the cost is truncated to an integer
    and zero-padded to five digits, e.g. label 1 with cost 123.7 becomes
    ``"1.00123"``. Both inputs are re-indexed positionally before combining.

    Parameters
    ----------
    y_train : array-like
        Labels, one per row.
    c_FN : array-like
        False-negative cost per row; after truncation it must lie in 0..99999.

    Returns
    -------
    pandas.Series of str

    Raises
    ------
    ValueError
        If a truncated cost is negative or needs more than five digits. Such
        values do not fit the fixed-width encoding and would otherwise be
        decoded as a different cost.
    """
    n_digits = 5
    label_str = pd.Series(y_train).reset_index(drop=True).map(lambda v: str(int(v)))
    whole_costs = pd.Series(c_FN).reset_index(drop=True).map(int)
    out_of_range = (whole_costs < 0) | (whole_costs >= 10 ** n_digits)
    if out_of_range.any():
        raise ValueError(
            f"c_FN must be within 0..{10 ** n_digits - 1} to fit the "
            f"{n_digits}-digit encoding; got {int(out_of_range.sum())} value(s) outside it"
        )
    cost_str = whole_costs.map(lambda c: str(c).zfill(n_digits))
    return label_str + '.' + cost_str

def custom_loss(c_FP, c_TP, c_TN):
    """Build a cost-weighted log loss for Keras.

    The returned function expects ``y_input`` in the packed form produced by
    ``create_y_input``: the integer part is the label and the fractional part
    times 1e5 is that row's false-negative cost. ``c_FP``, ``c_TP`` and
    ``c_TN`` are the costs applied to the other three outcomes.

    Returns
    -------
    callable
        ``loss_function(y_input, y_pred)`` returning the negated mean of the
        cost-weighted log terms over the last axis.
    """
    def loss_function(y_input, y_pred):
        # Recover the label from the integer part of the packed value.
        # Rounding would turn a non-fraud row whose encoded cost fraction is
        # above 0.5 into a positive, so compare against 1.0 instead.
        y_true = K.cast(K.greater_equal(y_input, 1.0), K.dtype(y_input))
        c_FN = (y_input - y_true) * 1e5
        eps = 0.0001
        # keep predictions away from 0 and 1 so that the logs stay finite
        y_pred = K.minimum(1.0 - eps, K.maximum(0.0 + eps, y_pred))
        cost = y_true * (K.log(y_pred) * c_FN + K.log(1 - y_pred) * c_TP)
        cost += (1 - y_true) * (K.log(1 - y_pred) * c_FP + K.log(y_pred) * c_TN)
        return - K.mean(cost, axis=-1)
    return loss_function

In [None]:
def loss(y_true, y_pred, c_FP, c_TP, c_TN, c_FN):
    """Return the cost-weighted log term for one prediction (NumPy version).

    For ``y_true == 1`` the result is ``c_FN*log(y_pred) + c_TP*log(1-y_pred)``;
    for ``y_true == 0`` it is ``c_FP*log(1-y_pred) + c_TN*log(y_pred)``.
    The value is not negated.
    """
    log_pos = np.log(y_pred)
    log_neg = np.log(1 - y_pred)
    positive_term = c_FN * log_pos + c_TP * log_neg
    negative_term = c_FP * log_neg + c_TN * log_pos
    return y_true * positive_term + (1 - y_true) * negative_term

In [None]:
y_true = 0
y_pred = 0.89
c_FP = 200
c_TP = 5
c_TN = 0
c_FN = 50000
loss(y_true, y_pred, c_FP, c_TP, c_TN, c_FN)

In [None]:
y_input = create_y_input(y_train, cost).apply(float)

In [None]:
c_FP = 80
c_TP = 0
c_TN = 0
c_FN = cost

In [None]:
X_train_array=np.asarray(X_train).astype(np.float32)
y_train_array=np.asarray(y_train).astype(np.float32)

In [None]:
X_test_array=np.asarray(X_test).astype(np.float32)
y_test_array=np.asarray(y_test).astype(np.float32)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
def ann(indput_dim, dropout=0.2):
    """Build (without compiling) a small feed-forward network with a sigmoid output.

    Layers: Dense(50, relu) -> Dropout -> Dense(25, relu) -> Dropout ->
    Dense(15, relu) -> Dense(1, sigmoid).

    indput_dim: number of input features of the first layer
    dropout: rate used by both Dropout layers
    """
    model = Sequential()
    model.add(Dense(units=50, input_dim=indput_dim, activation='relu'))
    model.add(Dropout(dropout))
    model.add(Dense(units=25, activation='relu'))
    model.add(Dropout(dropout))
    model.add(Dense(15, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    return model
clf = ann(indput_dim=X_train_array.shape[1], dropout=0.2)
clf.compile(optimizer='adam', loss=custom_loss(c_FP, c_TP,
            c_TN))
clf.fit(X_train_array, y_input, batch_size=128, epochs=100, verbose=1)
#clf.predict(X_test, verbose=1)

In [None]:
y_pred_train_ann = clf.predict(np.asarray(X_train).astype(np.float32), verbose=1)
confusion_matrix(np.asarray(y_train), np.round(y_pred_train_ann))

In [None]:
y_pred_test_ann = clf.predict(np.asarray(X_test).astype(np.float32), verbose=1)
confusion_matrix(np.asarray(y_test), np.round(y_pred_test_ann))

In [None]:
cost_mat_train = np.array([c_FP * np.ones(X_train.shape[0]), c_FN, 
                         c_TP * np.ones(X_train.shape[0]), 
                         c_TN * np.ones(X_train.shape[0])]).T

In [None]:
c_FP = 80
c_TP = 0
c_TN = 0
cost_mat_train = np.array([c_FP * np.ones(X_train.shape[0]), c_FN, 
                         c_TP * np.ones(X_train.shape[0]), 
                         c_TN * np.ones(X_train.shape[0])]).T

cost_mat_test = np.array([c_FP * np.ones(X_test.shape[0]), cost_test, 
                         c_TP * np.ones(X_test.shape[0]), 
                         c_TN * np.ones(X_test.shape[0])]).T

In [None]:
import eval_metrics

In [None]:
eval_metrics.evaluate('ANN Cost Sensitive', y_train_array, y_test_array, y_pred_train_ann, 
                          y_pred_test_ann, cost_mat_train, cost_mat_test)

# Cost-sensitive Logistic regression
Doesn't work very well due to bugs in Costcla (not maintained very well) and incompatibilities

In [None]:
from costcla.metrics import *
from costcla.models import CostSensitiveLogisticRegression, CostSensitiveRandomForestClassifier
from costcla.sampling import smote, cost_sampling

In [None]:
# transform the dataset
X_train_resampled, y_train_resampled, cost_mat_resampled = smote(X_train.to_numpy(), y_train.to_numpy(), cost_mat=cost_matrix_train)

In [None]:
# transform the dataset
X_train_resampled, y_train_resampled, cost_mat_resampled = cost_sampling(X_train.to_numpy(), y_train.to_numpy(), cost_mat=cost_matrix_train, 
                                                                         method='OverSampling')

In [None]:
X_train_resampled.shape

In [None]:
# Report the class balance of the original labels first, then of the
# resampled labels.
for is_resampled, labels in enumerate((y_train, y_train_resampled)):
    if is_resampled:
        # separator between the "before" and "after" reports
        print("*********************************************************")
    pos_length = labels.sum()
    neg_length = len(labels) - pos_length
    print(f"Majority class (0): {neg_length}")
    print(f"Minority class (1): {pos_length}")
    print(f"ratio: {np.round(pos_length/neg_length,5)*100}%")

We set administrative cost (for TP and FP) = 5 as in [Bahnsen et al.](https://ieeexplore.ieee.org/document/6784638)

In [None]:
def calculate_cost(augmented, investigation_fee):
    """
    Build the per-row cost matrix with columns [FP, FN, TP, TN].

    augmented: train or test set that still contains the claim_amount column
    investigation_fee: flat fee charged for every investigated claim; used as
        the cost of both true and false positives

    The false-negative cost is the row's claim_amount and the true-negative
    cost is 0. Returns an array with one row per row of ``augmented``.
    """
    n_rows = len(augmented)
    flat_fee = np.full(n_rows, investigation_fee)
    no_cost = np.full(n_rows, 0)
    claim_amounts = augmented["claim_amount"]
    return np.array([flat_fee, claim_amounts, flat_fee, no_cost]).T

In [None]:
# design cost matrix
investigation_fee = 5.0
# Recover the full rows (including claim_amount) of the train / test samples.
# Selected by index label with .loc, because X_train.index / X_test.index hold
# labels of df rather than positions.
train_augmented = df.loc[X_train.index].copy()
test_augmented = df.loc[X_test.index].copy()

cost_matrix_train = calculate_cost(train_augmented, investigation_fee)
cost_matrix_test = calculate_cost(test_augmented, investigation_fee)

In [None]:
# train cost sensitive classifier
cs_clf = CostSensitiveLogisticRegression()
cs_clf.fit(X_train_scaled, y_train, cost_matrix_train)

In [None]:
# brf
savings_score(y_test, bclf.predict(X_test), cost_matrix_test)

In [None]:
print(cost_loss(y_test, clf.predict(X_test_scaled), cost_matrix_test))
print(savings_score(y_test, clf.predict(X_test_scaled), cost_matrix_test))

In [None]:
cost_matrix_train

In [None]:
print(cs_clf.predict_proba(X_test_scaled)[:,1])
print(cs_clf.predict(X_test_scaled))

In [None]:
print(cost_loss(y_test, cs_clf.predict(X_test_scaled), cost_matrix_test))
print(savings_score(y_test, cs_clf.predict(X_test_scaled), cost_matrix_test))

In [None]:
binary_classification_metrics(y_test, cs_clf.predict(X_test_scaled), cs_clf.predict_proba(X_test_scaled)[:,1])

# Cost-sensitive RF
Doesn't work very well due to bugs in Costcla and incompatibilities

In [None]:
X_train_scaled_resampled.shape

In [None]:
y_train_resampled.shape

In [None]:
cost_matrix_train.shape

In [None]:
# train cost sensitive classifier
cs_rf = CostSensitiveRandomForestClassifier(n_estimators=10000, max_features="sqrt", n_jobs=-1, combination="stacking_proba")
cs_rf.fit(X_train_scaled, y_train, cost_matrix_train)

In [None]:
print(cost_loss(y_test, cs_rf.predict(X_test_scaled), cost_matrix_test))
print(savings_score(y_test, cs_rf.predict(X_test_scaled), cost_matrix_test))

In [None]:
print(savings_score(y_train, cs_rf.predict(X_train_scaled), cost_matrix_train))

# Cost-sensitive sampling
Not actually used, just to try out some things

In [None]:
avg_fraud_cost = cost.iloc[np.where(y_train == 1)].mean()
investigation_cost = 20

In [None]:
c_FP = 5
c_TP = 5
c_TN = 0
c_FN = avg_fraud_cost

In [None]:
(c_FP - c_TN) / ((c_FP - c_TN) + (c_FN - c_TP))

In [None]:
avg_fraud_cost/investigation_cost

In [None]:
def clf_rule(predicted_prob):
    """Turn class probabilities into 0/1 decisions by comparing expected costs.

    predicted_prob: 2-D array, column 0 and column 1 holding the probability of
        class 0 and class 1 respectively

    A row is flagged (1) when column 1 times ``avg_fraud_cost`` exceeds
    ``investigation_cost`` weighted by both columns; otherwise it is 0.
    """
    p_negative = predicted_prob[:,0]
    p_positive = predicted_prob[:,1]
    # expected cost of not investigating
    FN_cost = p_negative*0 + p_positive*avg_fraud_cost
    # expected cost of investigating
    FP_cost = p_negative*investigation_cost + p_positive*investigation_cost
    flagged = FN_cost > FP_cost
    return flagged.astype(int)

In [None]:
clf.predict_proba(X_train)

In [None]:
confusion_matrix(y_test, clf_rule(clf.predict_proba(X_test)))

# Submission

In [None]:
submit_path = r"./test.csv"

In [None]:
submit_set = pd.read_csv(submit_path, sep=";", encoding="utf-8-sig")

In [None]:
# convert binary text variables into binary: {"Y":1, "N":0}
for i in ["claim_liable", "claim_police", "driver_injured"]:
    text_to_binary(i, "Y", "N", submit_set)
# {"P":1, "N":0}
text_to_binary("claim_alcohol", "P", "N", submit_set)
# {"car":1, "van":0}
text_to_binary("claim_vehicle_type", "car", "van", submit_set)
# {"M":1, "F":0}
text_to_binary("policy_holder_form", "M", "F", submit_set)
# {"B":1, "N":0}
text_to_binary("policy_holder_country", "B", "N", submit_set)
# make claim_lang binary (currently 1:Dutch, 2:Fr) -> 0: Dutch and 1: French
submit_set["claim_language"] = submit_set["claim_language"] - 1 

# add buckets for vehicle power
# NOTE(review): pd.qcut derives the 5 bucket edges from submit_set itself, so
# they presumably differ from the buckets `ohe` saw on the training data --
# confirm that encode() below still recognises these categories.
submit_set["vpower_buckets"] = pd.qcut(submit_set["claim_vehicle_power"], 5)

# add provinces based on postal code
postal_bins = [999, 1299, 1499, 1999, 2999, 3499, 3999, 4999, 5999, 6599, 6999, 7999, 8999, 9999]
postal_label = ["brussel", "waals_brabant", "vlaams_brabant", 'Antwerpen', 'vlaams_brabant', 'limburg', 
                'luik', 'namen', 'henegouwen', 'luxemburg', 'henegouwen', 'w-vlaanderen', 'o-vlaanderen']
submit_set["province"] = pd.cut(submit_set["claim_postal_code"], postal_bins, labels=postal_label, ordered=False)

# add feature that describes if policy holders postal code is same as claim postal code
submit_set["diff_postal_code"] = (submit_set["policy_holder_postal_code"] == submit_set["claim_postal_code"]).astype(float)

# get dummies for cat vars
submit_set = encode(ohe, submit_set, ['claim_cause', 'vpower_buckets', 'province'])
#df = encode_ph_postal_code(phpc_ohe, df)

# format date
YYYYMMDD_date_columns = ["claim_date_registered",
                         "claim_date_occured"]
for i in YYYYMMDD_date_columns:
    submit_set[i] = pd.to_datetime(submit_set[i], format="%Y%m%d")

# remove extreme value
submit_set["claim_vehicle_date_inuse"].replace(to_replace=270505.0, value= np.nan, inplace=True)

YYYYMM_columns = ["claim_vehicle_date_inuse", 
                  "policy_date_start",
                  "policy_date_next_expiry",
                  "policy_date_last_renewed"]
for i in YYYYMM_columns:
    submit_set[i] = pd.to_datetime(submit_set[i], format="%Y%m")

In [None]:
# Add the extra features just like we did for the training set
submit_set = add_extra_features(submit_set)

In [None]:
# Hide the claim_id column as index so that it's not used as covariate for the prediction, but we can recover
# it later as we need claim_id in the output .csv file
submit_set = submit_set.set_index('claim_id')
#df = df[X_train.drop(columns=["claim_amount"]).columns]
submit_set = submit_set[X_train.columns]


In [None]:
submit_set.isna().sum()

In [None]:
submit_set.to_csv('submit.csv')

In [None]:
import urllib.request
import json
import os
import ssl

def allowSelfSignedHttps(allowed):
    """Switch off HTTPS certificate verification for this process when ``allowed``.

    Nothing is changed when ``allowed`` is falsy, when the PYTHONHTTPSVERIFY
    environment variable is set to a non-empty value, or when the ssl module
    has no ``_create_unverified_context``.
    """
    # bypass the server certificate verification on client side
    if not allowed:
        return
    if os.environ.get('PYTHONHTTPSVERIFY', ''):
        return
    if not getattr(ssl, '_create_unverified_context', None):
        return
    ssl._create_default_https_context = ssl._create_unverified_context

allowSelfSignedHttps(True) # this line is needed if you use self-signed certificate in your scoring service.

data = {
    "data":
    [
        {
            'Column2': "example_value",
            'claim_liable': "0",
            'claim_num_injured': "0",
            'claim_num_third_parties': "0",
            'claim_num_vehicles': "0",
            'claim_police': "0",
            'claim_language': "0",
            'claim_vehicle_type': "0",
            'policy_holder_form': "0",
            'policy_holder_country': "0",
            'policy_num_changes': "0",
            'policy_num_claims': "0",
            'policy_premium_100': "0",
            'policy_coverage_1000': "0",
            'diff_postal_code': "0",
            'claim_cause_animal': "0",
            'claim_cause_fire': "0",
            'claim_cause_other': "0",
            'claim_cause_theft': "0",
            'claim_cause_traffic accident': "0",
            'claim_cause_vandalism': "0",
            'claim_cause_weather': "0",
            'claim_cause_windows': "0",
            'vpower_buckets_(-0.001, 55.0]': "0",
            'vpower_buckets_(55.0, 66.0]': "0",
            'vpower_buckets_(66.0, 81.0]': "0",
            'vpower_buckets_(81.0, 100.0]': "0",
            'vpower_buckets_(100.0, 426.0]': "0",
            'vpower_buckets_nan': "0",
            'province_Antwerpen': "0",
            'province_brussel': "0",
            'province_henegouwen': "0",
            'province_limburg': "0",
            'province_luik': "0",
            'province_luxemburg': "0",
            'province_namen': "0",
            'province_o-vlaanderen': "0",
            'province_vlaams_brabant': "0",
            'province_w-vlaanderen': "0",
            'province_waals_brabant': "1",
            'claim_vehicle_id_count': "0",
            'policy_holder_id_count': "0",
            'driver_id_count': "0",
            'driver_vehicle_id_count': "0",
            'third_party_1_id_count': "0",
            'third_party_1_vehicle_id_count': "0",
            'blacklisted_expert_id': "false",
            'policy_holder_age': "0",
            'pct1': "0",
            'pct2': "0",
            'pct3': "0",
            'pct4': "0",
            'pct5': "0",
            'pct6': "0",
            'pct7': "0",
            'pct8': "0",
            'pct9': "0",
        },
    ],
}

body = str.encode(json.dumps(data))

url = 'http://be62a776-8b23-41b0-a6d2-0f61a6e64323.francecentral.azurecontainer.io/score'
api_key = '' # Replace this with the API key for the web service
headers = {'Content-Type':'application/json', 'Authorization':('Bearer '+ api_key)}

req = urllib.request.Request(url, body, headers)

try:
    # the context manager closes the HTTP response even if reading it fails
    with urllib.request.urlopen(req) as response:
        result = response.read()
    print(result)
except urllib.error.HTTPError as error:
    print("The request failed with status code: " + str(error.code))

    # Print the headers - they include the request ID and the timestamp, which are useful for debugging the failure
    print(error.info())
    error_body = error.read().decode("utf8", 'ignore')
    try:
        print(json.loads(error_body))
    except ValueError:
        # the error body is not valid JSON; show it verbatim instead of
        # raising a second exception from inside the handler
        print(error_body)

In [None]:
# Impute the remaining missing values of the submission set with the mode /
# mean computed on the training split.
# A separate "missing" category could also make sense here, although it would
# be a small category.

# column -> training-set value used to replace NaN in that column
submit_fill_values = {
    # mode
    "claim_language": train_lang_mode,
    "claim_vehicle_type": train_vtype_mode,
    # mean
    "policy_premium_100": train_premium_mean,
    "policy_coverage_1000": train_coverage_mean,
    "policy_holder_age": train_policy_holder_mean_age,
}

# Fill on the frame itself: `frame[col].fillna(..., inplace=True)` operates on
# an intermediate Series and is not guaranteed to update the frame.
submit_set.fillna(value=submit_fill_values, inplace=True)

In [None]:
assert submit_set.isna().sum().sum() == 0

In [None]:
# While we don't have a model yet to predict claim_amount, set it to 0.0
# TODO: Replace this with the predicted response variable of the regression model on the submission set.
#df['claim_amount'] = model2.predict(df)

In [None]:
submit_scaled = scaler.transform(submit_set)

In [None]:
submit_not_scaled = submit_set.copy()

In [None]:
# final submission set initialization
submission = submit_set.reset_index()[['claim_id']]

In [None]:
# for logistic regression
submission["prediction"] = clf.predict_proba(submit_scaled)[:,1]

In [None]:
# for logistic regression with SMOTE
submission["prediction"] = clf_resampled.predict_proba(submit_scaled)[:,1]

In [None]:
# for rf with SMOTE
submission["prediction"] = rf.predict_proba(submit_not_scaled)[:,1]

In [None]:
# for balanced random forest
submission["prediction"] = bclf.predict_proba(submit_not_scaled)[:,1]

In [None]:
# for xgb
submit_not_scaled.iloc[:,-9:] = submit_not_scaled.iloc[:,-9:].astype(float)
submission["prediction"] = xgb_clf.predict_proba(submit_not_scaled)[:,1]

In [None]:
# for cost-sensitive xgb
submit_not_scaled.iloc[:,-9:] = submit_not_scaled.iloc[:,-9:].astype(float)
submission["prediction"] = xgb_clf.predict_proba(np.asarray(submit_not_scaled))[:,1]

In [None]:
# for cost-sensitive ann
submission["prediction"] = clf.predict(np.asarray(submit_not_scaled).astype(np.float32), verbose=1)

In [None]:
# for randomsearch xgb
submit_not_scaled.iloc[:,-9:] = submit_not_scaled.iloc[:,-9:].astype(float)
# Written to the same "prediction" column as the other model cells, so the
# frame keeps exactly two columns for the ["ID", "PROB"] rename that follows,
# whichever model cells were run before this one.
submission["prediction"] = xgb_search.predict_proba(np.asarray(submit_not_scaled))[:,1]

In [None]:
submission.columns = ["ID", "PROB"]

In [None]:
submission.head()

In [None]:
submission.to_csv("submission_V0.53.csv", sep=',', index=False)