# TO DO
* feature engineering
* cost threshold changing

# Information tab

* For more info on SMOTE see [here](https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, roc_auc_score, plot_roc_curve, plot_confusion_matrix

from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

#import lightgbm as xgb

from features import update_dataset_features, text_to_binary, add_extra_features, encode_claim_cause

# Notebook-wide display defaults: ggplot style sheet for every matplotlib
# figure, and up to 500 columns shown when a DataFrame is printed.
plt.style.use("ggplot")
pd.set_option("display.max_columns", 500)

In [None]:
# Location of the training data, relative to the notebook's working directory.
data_path = r"./train.csv"

In [None]:
# The file is semicolon-separated; the "utf-8-sig" codec strips a leading
# byte-order mark if the file has one.
df = pd.read_csv(data_path, sep=";", encoding="utf-8-sig")

In [None]:
#df["claim_date_occured"] = pd.to_datetime(df["claim_date_occured"], format="%Y%m%d")
#min(df["claim_date_occured"].dt.year - df["policy_holder_year_birth"])

# Exploratory Data Analysis (EDA)

In [None]:
# Joint distribution of the fraud label and the vehicle brand. normalize=True
# divides every cell by the grand total, so the whole table sums to 1.
pd.crosstab(df["fraud"], df["claim_vehicle_brand"], normalize=True)

In [None]:
# Univariate overview of the raw training frame: for every column, draw a
# plot where that is feasible and print its summary statistics.
for column in df.columns:
    print(column)
    series = df[column]
    try:
        # Histogram first; columns that cannot be drawn this way surface as a
        # TypeError and are handled as categorical below.
        series.plot(kind="hist")
        plt.show()
    except TypeError:
        counts = series.value_counts()
        if len(counts) >= 10:
            # A bar chart with this many distinct values would be unreadable.
            print('*******too many values to plot*******************')
        else:
            counts.plot(kind="bar")
            plt.show()
    # Summary statistics are printed for every column, plotted or not.
    print(series.describe())
    print('*************************************************************************')

In [None]:
# read data in cell below to speed up and skip this step
# update_dataset_features is imported from the project-local `features`
# module. It returns the prepared frame plus a second object that is passed to
# encode_claim_cause when the submission set is processed further down
# (presumably a fitted encoder for the claim cause — confirm in features.py).
df, claim_cause_ohe = update_dataset_features(df)

In [None]:
# Number of missing values left in each column after feature preparation.
df.isna().sum()

# Train test split + prep

In [None]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# target so that the minority (fraud) class keeps the same share in both
# parts; with a heavily imbalanced target an unstratified split can leave the
# hold-out set with too few positives for stable ROC / confusion-matrix
# estimates.
# NOTE(review): df.iloc[:, 1:] assumes "fraud" is the first column of the
# prepared frame — confirm against update_dataset_features.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, 1:],
    df["fraud"],
    test_size=.2,
    random_state=96,
    stratify=df["fraud"],
)

In [None]:
# impute remaining missing values with mode or mean on train set
# here it could potentially make sense to include a third category (i.e. missing), although this would be a small cat
#
# The fill values are computed on the training split only and kept in
# notebook-level names, because the test split and the submission set are
# imputed with the same values further down.
#
# Each column is assigned back explicitly. The previous form,
# X_train[col].fillna(value, inplace=True), is chained in-place assignment:
# it operates on a temporary Series, raises a FutureWarning on pandas 2.2 and
# silently leaves X_train unchanged once Copy-on-Write is active (pandas 3).

# mode
train_lang_mode = X_train["claim_language"].mode()[0]
train_vtype_mode = X_train["claim_vehicle_type"].mode()[0]
X_train["claim_language"] = X_train["claim_language"].fillna(train_lang_mode)
X_train["claim_vehicle_type"] = X_train["claim_vehicle_type"].fillna(train_vtype_mode)

# mean
train_premium_mean = X_train["policy_premium_100"].mean()
train_coverage_mean = X_train["policy_coverage_1000"].mean()
#train_policy_holder_mean_age = X_train["policy_holder_age"].mean()
X_train["policy_premium_100"] = X_train["policy_premium_100"].fillna(train_premium_mean)
X_train["policy_coverage_1000"] = X_train["policy_coverage_1000"].fillna(train_coverage_mean)
# X_train["policy_holder_age"] = X_train["policy_holder_age"].fillna(train_policy_holder_mean_age)

In [None]:
# impute remaining missing values with mode or mean from train set on test set
# here it could potentially make sense to include a third category (i.e. missing), although this would be a small cat
#
# Only statistics computed on the training split are used, so nothing from
# the hold-out split leaks into the imputation.
#
# Each column is assigned back explicitly. X_test[col].fillna(value,
# inplace=True) is chained in-place assignment: it works on a temporary
# Series, warns on pandas 2.2 and is a silent no-op under Copy-on-Write
# (pandas 3), which would leave the NaNs in place.

# mode
X_test["claim_language"] = X_test["claim_language"].fillna(train_lang_mode)
X_test["claim_vehicle_type"] = X_test["claim_vehicle_type"].fillna(train_vtype_mode)

# mean
X_test["policy_premium_100"] = X_test["policy_premium_100"].fillna(train_premium_mean)
X_test["policy_coverage_1000"] = X_test["policy_coverage_1000"].fillna(train_coverage_mean)
# X_test["policy_holder_age"] = X_test["policy_holder_age"].fillna(train_policy_holder_mean_age)

# Base Logistic Regression model - off the shelf

In [None]:
# Standardise the features. The scaler is fitted on the training split only;
# the same fitted object transforms the hold-out split further down.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
# Baseline: logistic regression with the library's default settings, trained
# on the standardised training split without any resampling.
clf = LogisticRegression()
clf.fit(X_train_scaled, y_train)

In [None]:
# Transform the hold-out split with the train-fitted scaler and draw the ROC
# curve of the baseline model on it.
X_test_scaled = scaler.transform(X_test)
plot_roc_curve(clf, X_test_scaled, y_test)

In [None]:
# Confusion matrix of the baseline model on the hold-out split. The grid is
# switched off, presumably because the ggplot style would otherwise draw grid
# lines across the matrix cells.
plot_confusion_matrix(clf, X_test_scaled, y_test)
plt.grid(False)
plt.show()

# Logistic Regression model - SMOTE

In [None]:
# Fit a fresh scaler on the training split for this section. It sees the same
# data as the scaler of the baseline section, so the section can be run
# without running the baseline cells first.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
# Two-stage resampling: SMOTE first oversamples the minority class up to a
# minority/majority ratio of 0.05, then random undersampling of the majority
# class raises that ratio to 0.3.
# Both samplers are seeded so that re-running the notebook yields the same
# resampled training set; the train/test split and the forests are already
# seeded, and unseeded samplers would make the metrics drift between runs.
over = SMOTE(sampling_strategy=0.05, random_state=96)
under = RandomUnderSampler(sampling_strategy=0.3, random_state=96)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

In [None]:
# transform the dataset
X_train_scaled_resampled, y_train_resampled = pipeline.fit_resample(X_train_scaled, y_train)

In [None]:
# before SMOTE and undersampling
neg_length = len(y_train) - y_train.sum()
pos_length = y_train.sum()
print(f"Majority class (0): {neg_length}")
print(f"Minority class (1): {pos_length}")
print(f"ratio: {np.round(pos_length/neg_length,5)*100}%")
print("*********************************************************")
# after SMOTE and undersampling
neg_length = len(y_train_resampled) - y_train_resampled.sum()
pos_length = y_train_resampled.sum()
print(f"Majority class (0): {neg_length}")
print(f"Minority class (1): {pos_length}")
print(f"ratio: {np.round(pos_length/neg_length,5)*100}%")

In [None]:
# Logistic regression trained on the resampled, standardised training data.
# max_iter=500 sets the cap on solver iterations explicitly.
clf_resampled = LogisticRegression(max_iter=500)
clf_resampled.fit(X_train_scaled_resampled, y_train_resampled)

In [None]:
# Evaluation uses the untouched hold-out split (scaled, never resampled).
X_test_scaled = scaler.transform(X_test)
plot_roc_curve(clf_resampled, X_test_scaled, y_test)

In [None]:
# Confusion matrix of the resampled model on the scaled hold-out split.
plot_confusion_matrix(clf_resampled, X_test_scaled, y_test)
plt.grid(False)
plt.show()

# Random Forest - SMOTE

In [None]:
# Resampling for the random forest: SMOTE oversamples the minority class up
# to a minority/majority ratio of 0.1, then random undersampling of the
# majority class raises that ratio to 0.3.
# Both samplers are seeded so that re-running the notebook yields the same
# resampled training set; the train/test split and the forest are already
# seeded, and unseeded samplers would make the metrics drift between runs.
over = SMOTE(sampling_strategy=0.1, random_state=96)
under = RandomUnderSampler(sampling_strategy=0.3, random_state=96)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

In [None]:
# transform the dataset
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train, y_train)

In [None]:
# before SMOTE and undersampling
neg_length = len(y_train) - y_train.sum()
pos_length = y_train.sum()
print(f"Majority class (0): {neg_length}")
print(f"Minority class (1): {pos_length}")
print(f"ratio: {np.round(pos_length/neg_length,5)*100}%")
print("*********************************************************")
# after SMOTE and undersampling
neg_length = len(y_train_resampled) - y_train_resampled.sum()
pos_length = y_train_resampled.sum()
print(f"Majority class (0): {neg_length}")
print(f"Minority class (1): {pos_length}")
print(f"ratio: {np.round(pos_length/neg_length,5)*100}%")

In [None]:
# Random forest (1000 trees, depth capped at 5, fixed seed) trained on the
# resampled, unscaled training data.
rf = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=9, n_jobs=2)
rf.fit(X_train_resampled, y_train_resampled) # X_train_resampled comes from the fit_resample cell above

In [None]:
# Importance of each feature according to the fitted forest, in the column
# order of the training data.
rf.feature_importances_

In [None]:
# ROC curve of the random forest on the hold-out split.
# The forest was fitted on the *unscaled* resampled training data, so it has
# to be scored on the unscaled X_test as well, exactly as the confusion-matrix
# cell for this model does. Passing the standardised X_test_scaled would put
# every feature on a different scale than the one the trees' split thresholds
# were learned on and produce a meaningless curve.
plot_roc_curve(rf, X_test, y_test)

In [None]:
# TODO: check variable importance (this cell is still an empty placeholder)

In [None]:
# Confusion matrix of the random forest on the unscaled hold-out split.
plot_confusion_matrix(rf, X_test, y_test)
plt.grid(False)
plt.show()

# Balanced random forest

See [here](https://imbalanced-learn.org/dev/references/generated/imblearn.ensemble.BalancedRandomForestClassifier.html#imblearn.ensemble.BalancedRandomForestClassifier)

In [None]:
# Only SMOTE is applied in this section (minority/majority ratio raised to
# 0.1); no undersampling step is added to the pipeline.
# The sampler is seeded so that re-running the notebook yields the same
# resampled training set; the train/test split and the forest are already
# seeded, and an unseeded sampler would make the metrics drift between runs.
over = SMOTE(sampling_strategy=0.1, random_state=96)
steps = [('o', over)]
pipeline = Pipeline(steps=steps)

In [None]:
# transform the dataset
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train, y_train)

In [None]:
# before SMOTE and undersampling
neg_length = len(y_train) - y_train.sum()
pos_length = y_train.sum()
print(f"Majority class (0): {neg_length}")
print(f"Minority class (1): {pos_length}")
print(f"ratio: {np.round(pos_length/neg_length,5)*100}%")
print("*********************************************************")
# after SMOTE and undersampling
neg_length = len(y_train_resampled) - y_train_resampled.sum()
pos_length = y_train_resampled.sum()
print(f"Majority class (0): {neg_length}")
print(f"Minority class (1): {pos_length}")
print(f"ratio: {np.round(pos_length/neg_length,5)*100}%")

In [None]:
# Balanced random forest from imbalanced-learn (1000 trees, depth capped at
# 6, fixed seed).
bclf = BalancedRandomForestClassifier(n_estimators=1000, max_depth=6, random_state=9, n_jobs=2)

In [None]:
# Trained on the SMOTE-oversampled, unscaled training data.
bclf.fit(X_train_resampled, y_train_resampled)

In [None]:
# ROC curve on the unscaled hold-out split, matching the unscaled training data.
plot_roc_curve(bclf, X_test, y_test)

In [None]:
# Confusion matrix of the balanced random forest on the unscaled hold-out split.
plot_confusion_matrix(bclf, X_test, y_test)
plt.grid(False)
plt.show()

# Submission

In [None]:
# Location of the set to predict on, relative to the working directory.
submit_path = r"./test.csv"

In [None]:
# Read with the same separator and encoding as the training file.
submit_set = pd.read_csv(submit_path, sep=";", encoding="utf-8-sig")

In [None]:
# Preprocess the submission set step by step.
# NOTE(review): these steps presumably mirror what update_dataset_features
# did for the training data — confirm against features.py.
# NOTE: `df` is rebound here, so the training frame is no longer reachable
# under that name from this cell on.
df = submit_set
# convert binary text variables into binary: {"Y":1, "N":0}
# (the return value of text_to_binary is not used, so the helper is assumed
# to modify df in place — confirm in features.py)
for i in ["claim_liable", "claim_police", "driver_injured"]:
    text_to_binary(i, "Y", "N", df)
# {"P":1, "N":0}
text_to_binary("claim_alcohol", "P", "N", df)
# {"car":1, "van":0}
text_to_binary("claim_vehicle_type", "car", "van", df)
# {"M":1, "F":0}
text_to_binary("policy_holder_form", "M", "F", df)
# {"B":1, "N":0}
text_to_binary("policy_holder_country", "B", "N", df)
# make claim_lang binary (currently 1:Dutch, 2:Fr) -> 0: Dutch and 1: French
df["claim_language"] = df["claim_language"] - 1

# get dummies for cat vars
df = encode_claim_cause(claim_cause_ohe, df)
#df = encode_ph_postal_code(phpc_ohe, df)

# format date
YYYYMMDD_date_columns = ["claim_date_registered",
                         "claim_date_occured"]
for i in YYYYMMDD_date_columns:
    df[i] = pd.to_datetime(df[i], format="%Y%m%d")

# remove extreme value
# The column is assigned back explicitly: df[col].replace(..., inplace=True)
# is chained in-place assignment, which warns on pandas 2.2 and is a silent
# no-op under Copy-on-Write (pandas 3). The sentinel 270505.0 would then
# reach the "%Y%m" parse below.
df["claim_vehicle_date_inuse"] = df["claim_vehicle_date_inuse"].replace(
    to_replace=270505.0, value=np.nan
)

YYYYMM_columns = ["claim_vehicle_date_inuse",
                  "policy_date_start",
                  "policy_date_next_expiry",
                  "policy_date_last_renewed"]
for i in YYYYMM_columns:
    df[i] = pd.to_datetime(df[i], format="%Y%m")

In [None]:
# Add the extra features just like we did for the training set
df = add_extra_features(df)

In [None]:
# Hide the claim_id column as index so that it's not used as covariate for the prediction, but we can recover
# it later as we need claim_id in the output .csv file
df = df.set_index('claim_id')
# Keep exactly the training feature columns, in the training column order, so
# the frame matches what the scaler and the models were fitted on.
df = df[X_train.columns]

In [None]:
# Number of missing values per column before imputation.
df.isna().sum()

In [None]:
# impute remaining missing values of the submission set with the mode / mean
# computed on the train set
# here it could potentially make sense to include a third category (i.e. missing), although this would be a small cat
#
# Each column is assigned back explicitly. df[col].fillna(value,
# inplace=True) is chained in-place assignment: it works on a temporary
# Series, warns on pandas 2.2 and is a silent no-op under Copy-on-Write
# (pandas 3), which would leave NaNs in the submission features.

# mode
df["claim_language"] = df["claim_language"].fillna(train_lang_mode)
df["claim_vehicle_type"] = df["claim_vehicle_type"].fillna(train_vtype_mode)

# mean
df["policy_premium_100"] = df["policy_premium_100"].fillna(train_premium_mean)
df["policy_coverage_1000"] = df["policy_coverage_1000"].fillna(train_coverage_mean)
# df["policy_holder_age"] = df["policy_holder_age"].fillna(train_policy_holder_mean_age)

In [None]:
# Sanity check: no missing value may be left before predicting.
assert df.isna().sum().sum() == 0

In [None]:
# Standardised copy of the features for the logistic regression models; uses
# whichever scaler was fitted last in the sections above.
submit_scaled = scaler.transform(df)

In [None]:
# Unscaled copy of the features for the forest models, which were trained on
# unscaled data.
submit_not_scaled = df.copy()

In [None]:
# final submission set initialization
# reset_index turns the claim_id index back into a column; only that column
# is kept as the starting point of the output frame.
submission = df.reset_index()[['claim_id']]

In [None]:
# The next four cells are alternatives: each one overwrites the same
# "prediction" column with the positive-class probability of one model, so
# the exported file contains the output of whichever cell was run last.

# for logistic regression
submission["prediction"] = clf.predict_proba(submit_scaled)[:,1]

In [None]:
# for logistic regression with SMOTE
submission["prediction"] = clf_resampled.predict_proba(submit_scaled)[:,1]

In [None]:
# for rf with SMOTE
submission["prediction"] = rf.predict_proba(submit_not_scaled)[:,1]

In [None]:
# for balanced random forest
submission["prediction"] = bclf.predict_proba(submit_not_scaled)[:,1]

In [None]:
# Inspect the column names before renaming them.
submission.columns

In [None]:
# Rename the two columns (claim_id, prediction) to the headers used in the
# exported file.
submission.columns = ["ID", "PROB"]

In [None]:
# Write the comma-separated submission file without the row index.
submission.to_csv("submission_V0.12.csv", sep=',', index=False)