In [3]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_iris
import xgboost as xgb  # Import XGBoost

In [4]:
# Import the Iris dataset as Pandas DataFrame
data = pd.read_csv('D:\\project\\MBA-Final-Project\\notebook\\data\\insuranceFraud_Dataset.csv')

In [5]:
# Show Top 5 Records
print(data.head())

   months_as_customer  age  policy_number policy_bind_date policy_state  \
0                 328   48         521585       10/17/2014           OH   
1                 228   42         342868        6/27/2006           IN   
2                 134   29         687698         9/6/2000           OH   
3                 256   41         227811        5/25/1990           IL   
4                 228   44         367455         6/6/2014           IL   

  policy_csl  policy_deductable  policy_annual_premium  umbrella_limit  \
0    250/500               1000                1406.91               0   
1    250/500               2000                1197.22         5000000   
2    100/300               2000                1413.14         5000000   
3    250/500               2000                1415.74         6000000   
4   500/1000               1000                1583.91         6000000   

   insured_zip  ... witnesses police_report_available total_claim_amount  \
0       466132  ...         

In [27]:
# Preparing X and Y variables
x = data.drop(columns=['fraud_reported'], axis=1)
y = data['fraud_reported'].map({'N': 0, 'Y': 1}) # Convert string values to numeric values
y = y.astype('int') # Convert numeric values to integers

In [28]:
# Create Column Transformer with 2 types of transformers
num_features = x.select_dtypes(exclude="object").columns
cat_features = x.select_dtypes(include="object").columns

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)


In [29]:
x = preprocessor.fit_transform(x)

In [30]:
x.shape

(1000, 2174)

In [31]:
# Separate dataset into train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [32]:
# Create an Evaluate Function for classification
def evaluate_model_classification(true, predicted):
    accuracy = accuracy_score(true, predicted)
    report = classification_report(true, predicted)
    return accuracy, report


In [33]:
# Define classification models
classification_models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest Classifier": RandomForestClassifier(),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "SVM Classifier": SVC(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "XGBoost Classifier": xgb.XGBClassifier(),
}

In [34]:
# Model Training and Evaluation
model_list_classification = []
accuracy_list = []

In [35]:
for model_name, model in classification_models.items():
    model.fit(x_train, y_train)  # Train model




In [36]:
# Make predictions
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

In [37]:
# Evaluate Train and Test dataset
model_train_accuracy, model_train_report = evaluate_model_classification(y_train, y_train_pred)
model_test_accuracy, model_test_report = evaluate_model_classification(y_test, y_test_pred)
    

In [38]:
print(model_name)
model_list_classification.append(model_name)

XGBoost Classifier


In [39]:
print('Model performance for Training set')
print("- Accuracy: {:.4f}".format(model_train_accuracy))
print("- Classification Report:\n", model_train_report)
    
print('----------------------------------')
    
print('Model performance for Test set')
print("- Accuracy: {:.4f}".format(model_test_accuracy))
print("- Classification Report:\n", model_test_report)
accuracy_list.append(model_test_accuracy)
    
print('='*35)
print('\n')

Model performance for Training set
- Accuracy: 1.0000
- Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       608
           1       1.00      1.00      1.00       192

    accuracy                           1.00       800
   macro avg       1.00      1.00      1.00       800
weighted avg       1.00      1.00      1.00       800

----------------------------------
Model performance for Test set
- Accuracy: 0.7650
- Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.88      0.84       145
           1       0.60      0.45      0.52        55

    accuracy                           0.77       200
   macro avg       0.70      0.67      0.68       200
weighted avg       0.75      0.77      0.75       200



