In [1]:
# Installing Packages
# !pip install statsmodels==0.13.5
#!pip install imblearn==0.0

# Multicollinearity
import statsmodels

# SMOTE
import imblearn

# Linear Algebra
import numpy as np

# Data processing, CSV File I/O
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# Label Encoder
from sklearn.preprocessing import LabelEncoder

# Train Test 
from sklearn.model_selection import train_test_split

### REPLACE WITH YOUR OWN MODEL ####
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# StandardScaler
from sklearn.preprocessing import StandardScaler

# GridSearch
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

# Performance Metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

# Bagging
from sklearn.ensemble import BaggingClassifier

# Adaboost
from sklearn.ensemble import AdaBoostClassifier

# Reading the CSV

In [2]:
# Read from CSV file
# NOTE(review): the path is relative and the recorded run of this cell ended in
# FileNotFoundError, so the CSV has to be placed in the working directory first.
df = pd.read_csv('insurance_claims.csv')
# Preview five randomly chosen rows (no seed is set, so the rows vary per run).
df.sample(5)

FileNotFoundError: ignored

# Exploratory Data Analysis

### Find Missing Values
A quick exploration with the pandas `.isnull()` function shows no empty variables. However, when we ran `df.sample(5)` we noticed that some rows contain the placeholder `?` in some of their columns. We identified 178 records in `collision_type`, 360 records in `property_damage` and 343 records in `police_report_available` that contain `?`.

In [None]:
# Null count per column. Printed explicitly because a bare expression that is not
# the last statement of a cell produces no output.
print(df.isnull().sum())

# Some missing entries are stored as the literal string "?" rather than NaN, so
# the null check above cannot see them; collect every column holding that marker.
result = [col for col in df.columns if (df[col] == "?").any()]

print("Columns that have ? in their records:")
print(pd.Series(result),"\n")

# Count the placeholder per affected column; each count is taken on the very
# column it reports.
for col in ["collision_type", "property_damage", "police_report_available"]:
    print(f"No. of '?' in {col}:", (df[col] == "?").sum())

### Plot Histogram
Plot a histogram to see all the numerical data in the dataset.

In [None]:
# Histogram grid of the frame's numerical columns on a 25x25-inch figure.
df.hist(figsize=(25,25))


### Identify number of unique values in each column
We will be dropping the columns that have a high number of distinct values and/or do not fit our business use case: `policy_number`, `policy_bind_date`, `insured_zip`, `incident_date`, `incident_location` and `_c39`.

In [None]:
# Distinct-value count for every column, kept in column order.
num_unique = [df[name].nunique() for name in df.columns]

# Label each count with its column name for readable output.
print(pd.Series(num_unique, index = df.columns))

# Feature Selection / Handling Categorical Variables

| Features Dropped | Reason |
| ----------- | ----------- |
| `_c39`, `incident_date` | Doesn't provide useful information |
| `policy_number`, `policy_bind_date`, `insured_zip`, `incident_location` | Too many distinct values in each columns (1000, 951, 995, 1000) respectively |
| `injury_claim`, `property_claim`, `vehicle_claim` | High correlation (> 0.81). All 3 combined is the same as `total_claim_amount` |

In [None]:
# Columns identified for removal in the feature-selection table.
drop_columns = ["_c39", "incident_date", "policy_number", "policy_bind_date", "insured_zip", "incident_location"]

# Rebind instead of mutating in place, and tolerate columns that are already
# gone, so the cell can be re-run without raising KeyError.
df = df.drop(columns=drop_columns, errors="ignore")

# Show a preview rather than the whole frame.
df.head()


In [None]:
# Names of all text (object-dtype) columns currently in the frame.
df_col = df.select_dtypes(include=['object']).columns.tolist()

# One encoder instance, re-fitted on each column in turn.
labelencoder = LabelEncoder()

# Store the integer codes next to the source column as "<column>_encode".
for col in df_col:
    df[f"{col}_encode"] = labelencoder.fit_transform(df[col])


In [None]:
# Remove the original text columns listed in `df_col`, leaving their `_encode`
# counterparts. Rebinding with errors="ignore" keeps the cell re-runnable after
# the columns have already been removed.
df = df.drop(columns=df_col, errors="ignore")
df.shape

In [None]:
# Drop the three component claim columns: they are highly correlated (> 0.81)
# and together equal the total claim amount.
# Rebinding with errors="ignore" keeps the cell re-runnable after the columns
# have already been removed.
df = df.drop(columns=['injury_claim', 'property_claim', 'vehicle_claim'], errors="ignore")
df.shape

### Visualising the Features
We will then take a look at all the columns as well as the columns we have encoded.

In [None]:
# Display the frame as it stands after the encoding and column-drop steps.
df

### Correlation with Target Variable: `fraud_reported_encode`
We will then plot a correlation heatmap between all the features and see how strongly each one correlates with the target feature `fraud_reported_encode`. We select the features whose absolute correlation is `>= 0.05`: the model should not be affected by whether a feature is negatively or positively correlated with the target, so long as the magnitude of the correlation meets our threshold of `0.05`.


In [None]:
# Pairwise correlation matrix of the frame's columns. The heatmap rendering
# below is currently disabled; uncomment both plotting lines to draw it.
# plt.figure(figsize=(25,25))
corr=df.corr()
# sns.heatmap(data=corr,annot=True,fmt='.2g',linewidth=1)

In [None]:
# Absolute correlation of every column with the target, strongest first.
# abs() is applied before sorting so the ranking reflects strength, not sign.
fraud_corr_val = df.corr()["fraud_reported_encode"].abs().sort_values(ascending=False)

# Show the columns, with their values, that meet the |r| >= 0.05 threshold.
# The target itself is included with a correlation of 1.
fraud_corr_val[fraud_corr_val.between(0.05, 1.0)]

### Check for Multicollinearity

If the VIF of a variable is greater than 10, multicollinearity is likely present between the features and we should consider dropping that variable. Thankfully, none of our variables have a VIF value greater than 10.


In [None]:
# Predictor columns fed to the model.
features_selected = [
    "incident_severity_encode",
    "incident_state_encode",
    "incident_type_encode",
    "number_of_vehicles_involved",
    "umbrella_limit",
    "collision_type_encode",
    "total_claim_amount",
]

# Target (label-encoded fraud flag) and feature matrix.
y = df["fraud_reported_encode"]
X = df[features_selected]

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of each selected feature, computed against the
# other selected features.
# NOTE(review): no constant column is added to the design matrix; confirm that
# VIFs without an intercept are what is intended here.
df2 = df[features_selected]
vif = pd.DataFrame({
    "features": features_selected,
    "vif_Factor": [
        variance_inflation_factor(df2.values, col_idx)
        for col_idx in range(len(features_selected))
    ],
})
vif

# Define Functions
For convenience, we define several functions that run the model, perform bagging and boosting, and return the resulting performance metrics. The shared helper takes the `y_test` and `y_pred` arguments and reports the metrics.
`print metrics`

| Function | Description |
| ----------- | ----------- |
| `print_metrics` | Prints the Confusion Matrix, Accuracy, Precision, Recall and F1 Score. Will be used in `base_model`, `model_w_bagging`, `model_w_boosting` functions subsequently. |
| `base_model` | Runs the base model as it is |
| `model_w_bagging` | Runs the model with bagging |
| `model_w_boosting` | Runs the model with boosting |

In [None]:
def print_metrics(header,y_test,y_pred):
    """Print a classification report for one set of predictions and return its key figures.

    Parameters
    ----------
    header : str
        Title printed between two rules of ``#`` characters.
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels for the same records.

    Returns
    -------
    list
        ``[TN, FP, FN, TP, accuracy, f1, precision, recall]`` where the first
        four entries are the cells of the confusion matrix read row by row and
        the four scores are rounded to 4 decimal places.
    """
    divider = 50*"-"

    print(50*"#")
    print(header)
    print(50*"#")
    print("Number of records: ", len(y_pred))
    print(divider)

    # Confusion Matrix
    conf_mat = confusion_matrix(y_test,y_pred)
    print("Confusion matrix:")
    print(conf_mat)

    # The built-in round() works for NumPy scalars and plain Python floats alike,
    # whereas the `.round()` method exists only on NumPy scalars.
    # zero_division=0 gives the same 0.0 score as the default when a model
    # predicts no positive case, but without an undefined-metric warning.
    acc_scr = round(accuracy_score(y_test, y_pred), 4)
    f1_scr = round(f1_score(y_test, y_pred, zero_division=0), 4)
    prec_scr = round(precision_score(y_test, y_pred, zero_division=0), 4)
    rec_scr = round(recall_score(y_test, y_pred, zero_division=0), 4)

    # Accuracy
    print(divider)
    print("Accuracy Score: ",acc_scr)

    # F1-score
    print(divider)
    print("F1 Score:",f1_scr)

    # Precision
    print(divider)
    print("Precision Score:", prec_scr)

    # Recall
    print(divider)
    print("Recall Score:",rec_scr)

    # Classification report
    print(divider)
    print("Classification Report:")
    print(classification_report(y_test,y_pred, zero_division=0))
    print("\n")

    # True Negative, False Positive, False Negative, True Positive
    return [conf_mat[0][0],conf_mat[0][1],conf_mat[1][0],conf_mat[1][1],acc_scr,f1_scr,prec_scr,rec_scr]

In [None]:
def base_model(model, X_train, X_test, y_train, y_test):
    """Fit `model` on the training split and report metrics for both splits.

    The estimator is fitted in place, then used to predict the training and the
    testing features. Each set of predictions is passed to `print_metrics`.

    Returns
    -------
    tuple
        ``(training, testing)``: the lists returned by `print_metrics` for the
        training and the testing split respectively.
    """
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    banner = 50*"="

    print(banner, "Training Set", banner)
    training = print_metrics("Base Model:", y_train, train_pred)

    print(banner, "Testing Set", banner)
    testing = print_metrics("Base Model:", y_test, test_pred)

    return training, testing

In [None]:
def model_w_bagging(model, X_train, X_test, y_train, y_test):
    """Fit a 50-member bagging ensemble of `model` and report metrics for both splits.

    Parameters
    ----------
    model : estimator
        Base learner wrapped by the `BaggingClassifier`.
    X_train, X_test : feature matrices for the training and testing splits.
    y_train, y_test : labels for the training and testing splits.

    Returns
    -------
    tuple
        ``(training, testing)``: the lists returned by `print_metrics` for the
        training and the testing split respectively.
    """
    # The base learner is passed positionally. Its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
    # removed in 1.4, but it is the first positional parameter under either name.
    bag = BaggingClassifier(model, n_estimators=50, random_state=42)
    bag.fit(X_train, y_train)
    y_predTrain = bag.predict(X_train)
    y_pred = bag.predict(X_test)

    print(50*"=", "Training Set", 50*"=")
    training = print_metrics("Model with Bagging:", y_train, y_predTrain)

    print(50*"=", "Testing Set", 50*"=")
    testing = print_metrics("Model with Bagging", y_test, y_pred)
    return(training, testing)

In [None]:
def model_w_boosting(model, X_train, X_test, y_train, y_test, learning_rate, n_estimator):
    """Fit an AdaBoost ensemble of `model` and report metrics for both splits.

    Parameters
    ----------
    model : estimator
        Base learner boosted by the `AdaBoostClassifier`.
    X_train, X_test : feature matrices for the training and testing splits.
    y_train, y_test : labels for the training and testing splits.
    learning_rate : float
        Passed to `AdaBoostClassifier` as ``learning_rate``.
    n_estimator : int
        Passed to `AdaBoostClassifier` as ``n_estimators``.

    Returns
    -------
    tuple
        ``(training, testing)``: the lists returned by `print_metrics` for the
        training and the testing split respectively.
    """
    # The base learner is passed positionally. Its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
    # removed in 1.4, but it is the first positional parameter under either name.
    ada = AdaBoostClassifier(model, n_estimators=n_estimator, learning_rate=learning_rate, random_state=42)
    ada.fit(X_train, y_train)
    y_predTrain = ada.predict(X_train)
    y_pred = ada.predict(X_test)

    print(50*"=", "Training Set", 50*"=")
    training = print_metrics("Model with Boosting:", y_train, y_predTrain)

    print(50*"=", "Testing Set", 50*"=")
    testing = print_metrics("Model with Boosting",y_test,y_pred)
    return(training, testing)

# Model

### Split into Training and Testing sets

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on y, so the class ratio may differ
# between the training and testing sets — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Scaling Continuous Variables
As the features `umbrella_limit` and `total_claim_amount` are both continuous variables containing huge range of values that may influence the model, we have to first scale these variables and pass it back to the `X_train` and `X_test` before using it in our logistic regression model.

Also, the scaling should only be performed after the data has been split into `training` and `testing`. As the test set is assuming the role of "freshly unseen data", it should not be accessible at the training stage. Therefore, we will be using `.fit_transform` for the training set and `.transform` for the testing set.

https://datascience.stackexchange.com/questions/54908/data-normalization-before-or-after-train-test-split

https://datascience.stackexchange.com/questions/12321/whats-the-difference-between-fit-and-fit-transform-in-scikit-learn-models

In [None]:
# Continuous features that need standardising before logistic regression.
numerical_cols = ["umbrella_limit", "total_claim_amount"]
scaler = StandardScaler()

# Take explicit copies first: the split outputs are derived from another frame,
# and assigning columns into them directly triggers SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Learn the mean and variance from the training split only, then apply the same
# transformation to the testing split so that no test information leaks in.
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

### Logistic Regression
We will then initialise our Logistic Regression model and fit our training and testing data in to the model. The function `base_model` will automatically train and test the model, before outputting the results.

In [None]:
### TO-DO: Replace with your model ###
# Estimator instance passed to the base, bagging and boosting helpers below;
# created with the library's default settings.
model = LogisticRegression()

In [None]:
# Fit the plain estimator; `base` holds its (training, testing) metric lists,
# which are reused in the summary table.
base = base_model(model, X_train, X_test, y_train, y_test)

### Logistic Regression with Bagging
We will now perform a <i>Logistic Regression with Bagging</i>. Judging from the metrics, we can tell that this model slightly underperforms compared to <i>Logistic Regression</i> alone. With all the metrics performing worse, and considering that the cost of False Negatives is high for a Fraud Detection model, the base model would be a better choice.

In [None]:
# Same splits, with the estimator wrapped in a bagging ensemble; `bagging`
# holds the (training, testing) metric lists for the summary table.
bagging = model_w_bagging(model, X_train, X_test, y_train, y_test)

### Logistic Regression with Adaboost
We will now perform a <i>Logistic Regression with the Adaboost</i> ensemble method. Looking at the metrics alone, we were unable to come up with any conclusive results. Looking at the Classification report, we can see that the model is predicting no fraud cases in our `testing` data. However, among the 200 cases, only 145 of them are not fraud (True Negative), but 55 of them are actually fraud (False Negative).

In [None]:
# AdaBoost run with learning_rate=0.1 and 50 estimators; `boosting` holds the
# (training, testing) metric lists for the summary table.
boosting = model_w_boosting(model, X_train, X_test, y_train, y_test, 0.1, 50)

### AdaBoost with GridSearch Hyperparameter Tuning
To address the above issue, our team explored the idea of using GridSearch to perform hyperparameter tuning. Based on the results returned by GridSearch, the `learning_rate` and `n_estimators` that give the best F1 score are `2` and `50` respectively. With these values in mind, we ran the logistic regression model with boosting again using these parameter values. We can then see that the model returns the best F1 score. Overall, this model (the model with boosting and tuned parameters) is a better model than the other models.

In [None]:
# The base learner is passed positionally. Its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, but it is the first positional parameter under either name.
ada = AdaBoostClassifier(model, random_state=42)

# Candidate hyperparameters. The learning rates are rounded so the grid holds
# exact two-decimal values instead of floating-point drift such as
# 0.15000000000000002.
grid = {
    'n_estimators': list(range(50, 160, 10)),
    'learning_rate': np.round(np.arange(0.1, 2.1, 0.05), 2).tolist(),
}

# define the evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# define the grid search procedure, scoring = F1
grid_search = GridSearchCV(estimator=ada, param_grid=grid, n_jobs=-1, cv=cv, scoring='f1')

# execute the grid search
grid_result = grid_search.fit(X_train, y_train)

# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# summarize all scores that were evaluated
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# AdaBoost run with learning_rate=2 and 50 estimators, the values the text above
# reports as the grid-search optimum.
# NOTE(review): these are hard-coded rather than read from
# `grid_result.best_params_`; re-check them whenever the grid search is re-run.
boosting_tuned= model_w_boosting(model, X_train, X_test, y_train, y_test, 2, 50)

# Performing SMOTE Oversampling Technique
As our dataset is unbalanced, we will be performing SMOTE oversampling technique to increase the number of cases in our dataset in a more balanced way.

https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/

### Split into Training and Testing sets

Equally distributed classes in our `testing` set would not make sense, as they would not replicate a real-life fraud detection situation, so we will only perform the SMOTE technique on our `training` set. The `testing` set is used purely for testing the performance of our model.

https://www.kaggle.com/questions-and-answers/206597

In [None]:
# Rebuild the split from X and y with the same arguments as the first split, so
# the SMOTE experiment starts from a fresh training/testing pair.
# NOTE(review): presumably this also restores unscaled features, since the
# earlier scaling wrote into X_train/X_test rather than X — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

# Both samplers are seeded, like the train/test split and the ensembles, so the
# synthetic samples and the scores derived from them are reproducible.

# Alternative kept for experimentation: oversample the minority class until it
# has 40 percent of the number of examples of the majority class.
over_4060 = SMOTE(sampling_strategy=0.4, random_state=42)

# Default strategy: oversample the minority class until it has as many examples
# as the majority class (a 50/50 class split).
over_5050 = SMOTE(random_state=42)

# !!! Should not be undersampling because of insufficient data points
# under = RandomUnderSampler(sampling_strategy=0.4)
# steps = [('o', over), ('u', under)]
# pipeline = Pipeline(steps=steps)

# NOTE(review): SMOTE interpolates numerically between neighbours, which is
# questionable for the label-encoded categorical features; SMOTENC may be a
# better fit — confirm.
# Resample the training split only; the testing split keeps its class balance.
X_train, y_train = over_5050.fit_resample(X_train, y_train)


### Scaling Continuous Variables 

In [None]:
# Continuous features that need standardising before logistic regression.
numerical_cols = ["umbrella_limit", "total_claim_amount"]
scaler = StandardScaler()

# Take explicit copies first: the testing split is derived from another frame,
# and assigning columns into it directly triggers SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the resampled training split only, then apply the same
# transformation to the untouched testing split.
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

### Model Evaluation

### Our Analysis of SMOTE
Overall, with the SMOTE technique, the base model and our model with bagging perform the same.

However, the model with boosting performs the worst, where the F1 score is the lowest among the three. We then perform the same analysis on our model with boosting and tuned hyperparameters, and realised that it performs slightly better as compared to just the model with boosting alone. Overall, with SMOTE, both our base model and base model with bagging performs the best with an equal F1 score of `0.5854`.

Comparing between the use of SMOTE and without SMOTE, one conclusion we can draw here is that the tuning of hyperparameters without SMOTE may be a better model to use since we do not artificially create data to train the model, since the use of SMOTE may create potential issues of overfitting.


In [None]:
# Plain estimator on the SMOTE-resampled, re-scaled training split;
# `base_SMOTE` holds the (training, testing) metric lists for the summary table.
base_SMOTE = base_model(model, X_train, X_test, y_train, y_test)

In [None]:
# Bagging ensemble on the SMOTE-resampled, re-scaled training split.
bagging_SMOTE = model_w_bagging(model, X_train, X_test, y_train, y_test)

In [None]:
# AdaBoost (learning_rate=0.1, 50 estimators) on the SMOTE-resampled,
# re-scaled training split.
boosting_SMOTE = model_w_boosting(model, X_train, X_test, y_train, y_test, 0.1, 50)

In [None]:
# AdaBoost with learning_rate=2 and 50 estimators on the SMOTE-resampled split.
# NOTE(review): these values were tuned by the grid search on the non-SMOTE
# training data; no separate search is run here — confirm that is intended.
boosting_tuned_SMOTE = model_w_boosting(model, X_train, X_test, y_train, y_test, 2, 50)

# Overall results
To make comparisons across the different combinations of model, ensemble method and hyperparameter tuning, the results are compiled below into dataframes for easy reference. The first table contains the combinations without SMOTE, and the second table those with SMOTE.

In [None]:
# One row per (model variant, split); each helper returned a (train, test) pair
# of metric lists, unpacked here in base, bagging, boosting, tuned order.
result = pd.DataFrame(
    [*base, *bagging, *boosting, *boosting_tuned],
    columns=["True Negative", "False Positive", "False Negative", "True Positive", "Accuracy", "F1 Score", "Precision", "Recall"],
    index=[
        "Base Model (Train)",
        "Base Model (Test)",
        "Base Model with Bagging (Train)",
        "Base Model with Bagging (Test)",
        "Base Model with Boosting (Train)",
        "Base Model with Boosting (Test)",
        "Base Model with Tuned Boosting (Train)",
        "Base Model with Tuned Boosting (Test)",
    ],
)
result

In [None]:
# One row per (model variant, split) for the SMOTE runs; each helper returned a
# (train, test) pair of metric lists, unpacked here in run order.
result_SMOTE = pd.DataFrame(
    [*base_SMOTE, *bagging_SMOTE, *boosting_SMOTE, *boosting_tuned_SMOTE],
    columns=["True Negative", "False Positive", "False Negative", "True Positive", "Accuracy", "F1 Score", "Precision", "Recall"],
    index=[
        "SMOTE Base Model (Train)",
        "SMOTE Base Model (Test)",
        "SMOTE Base Model with Bagging (Train)",
        "SMOTE Base Model with Bagging (Test)",
        "SMOTE Base Model with Boosting (Train)",
        "SMOTE Base Model with Boosting (Test)",
        "SMOTE Base Model with Tuned Boosting (Train)",
        "SMOTE Base Model with Tuned Boosting (Test)",
    ],
)
result_SMOTE

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=9a8406e7-cda2-4ac1-b6a0-7bca60b93dc5' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>