#FRAUD DETECTION DATASET

In [49]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

In [50]:
#Read training data file
trainfile = '/content/drive/MyDrive/Insurance Fraud - TRAIN-3000.csv'
trainData = pd.read_csv(trainfile)

#Read test data file
testfile = '/content/drive/MyDrive/Insurance Fraud -TEST-12900.csv'
testData = pd.read_csv(testfile)

#EXPLORATORY DATA ANALYSIS & PRE-PROCESSING

---



In [51]:
print(trainData.shape)
print(testData.shape)

(2999, 32)
(12918, 32)


In [52]:
print(trainData.isnull().sum())

MONTH                   0
WEEKOFMONTH             0
DAYOFWEEK               0
MAKE                    0
ACCIDENTAREA            0
DAYOFWEEKCLAIMED        0
MONTHCLAIMED            0
WEEKOFMONTHCLAIMED      0
SEX                     0
MARITALSTATUS           0
AGE                     0
FAULT                   0
POLICYTYPE              0
VEHICLECATEGORY         0
VEHICLEPRICE            0
REPNUMBER               0
DEDUCTIBLE              0
DRIVERRATING            0
DAYS_POLICY_ACCIDENT    0
DAYS_POLICY_CLAIM       0
PASTNUMBEROFCLAIMS      0
AGEOFVEHICLE            0
AGEOFPOLICYHOLDER       0
POLICEREPORTFILED       0
WITNESSPRESENT          0
AGENTTYPE               0
NUMBEROFSUPPLIMENTS     0
ADDRESSCHANGE_CLAIM     0
NUMBEROFCARS            0
YEAR                    0
BASEPOLICY              0
FRAUDFOUND              0
dtype: int64


In [53]:
categorical_columns = trainData.select_dtypes(include=['object', 'category']).columns
TrainCols = list(categorical_columns.values)
TrainCols.remove("FRAUDFOUND")
print(TrainCols)

['MONTH', 'DAYOFWEEK', 'MAKE', 'ACCIDENTAREA', 'DAYOFWEEKCLAIMED', 'MONTHCLAIMED', 'SEX', 'MARITALSTATUS', 'FAULT', 'POLICYTYPE', 'VEHICLECATEGORY', 'VEHICLEPRICE', 'DAYS_POLICY_ACCIDENT', 'DAYS_POLICY_CLAIM', 'PASTNUMBEROFCLAIMS', 'AGEOFVEHICLE', 'AGEOFPOLICYHOLDER', 'POLICEREPORTFILED', 'WITNESSPRESENT', 'AGENTTYPE', 'NUMBEROFSUPPLIMENTS', 'ADDRESSCHANGE_CLAIM', 'NUMBEROFCARS', 'BASEPOLICY']


In [54]:
trainData.head()

Unnamed: 0,MONTH,WEEKOFMONTH,DAYOFWEEK,MAKE,ACCIDENTAREA,DAYOFWEEKCLAIMED,MONTHCLAIMED,WEEKOFMONTHCLAIMED,SEX,MARITALSTATUS,...,AGEOFPOLICYHOLDER,POLICEREPORTFILED,WITNESSPRESENT,AGENTTYPE,NUMBEROFSUPPLIMENTS,ADDRESSCHANGE_CLAIM,NUMBEROFCARS,YEAR,BASEPOLICY,FRAUDFOUND
0,Jul,3,Sunday,Honda,Rural,Wednesday,Jan,4,Male,Married,...,26_to_30,No,No,External,3_to_5,no_change,1-vehicle,1994,Collision,Yes
1,Nov,5,Monday,Mazda,Urban,Wednesday,Dec,1,Male,Single,...,over_65,No,No,External,none,no_change,1-vehicle,1994,All_Perils,Yes
2,Jan,1,Monday,Pontiac,Urban,Wednesday,Jan,1,Male,Married,...,41_to_50,No,No,External,none,under_6_months,1-vehicle,1994,All_Perils,Yes
3,Dec,1,Monday,Toyota,Rural,Tuesday,May,3,Male,Married,...,36_to_40,No,No,External,more_than_5,under_6_months,2-vehicles,1994,All_Perils,Yes
4,Dec,5,Wednesday,Pontiac,Urban,Wednesday,Jan,1,Male,Single,...,36_to_40,No,No,External,more_than_5,no_change,1-vehicle,1994,Collision,Yes


In [55]:

combinedData = pd.concat([trainData,testData], keys=[0,1])
combinedData["FRAUDFOUND"]=combinedData["FRAUDFOUND"].map({"Yes":1,"No":0})

combinedData = pd.get_dummies(combinedData,columns=TrainCols)

trainData = combinedData.xs(0)
testData = combinedData.xs(1)


In [56]:
trainData.head()

Unnamed: 0,WEEKOFMONTH,WEEKOFMONTHCLAIMED,AGE,REPNUMBER,DEDUCTIBLE,DRIVERRATING,YEAR,FRAUDFOUND,MONTH_Apr,MONTH_Aug,...,ADDRESSCHANGE_CLAIM_no_change,ADDRESSCHANGE_CLAIM_under_6_months,NUMBEROFCARS_1-vehicle,NUMBEROFCARS_2-vehicles,NUMBEROFCARS_3_to_4,NUMBEROFCARS_5_to_8,NUMBEROFCARS_more_than_8,BASEPOLICY_All_Perils,BASEPOLICY_Collision,BASEPOLICY_Liability
0,3,4,21,4,400,4,1994,1,0,0,...,1,0,1,0,0,0,0,0,1,0
1,5,1,68,9,400,3,1994,1,0,0,...,1,0,1,0,0,0,0,1,0,0
2,1,1,50,8,400,2,1994,1,0,0,...,0,1,1,0,0,0,0,1,0,0
3,1,3,39,1,400,3,1994,1,0,0,...,0,1,0,1,0,0,0,1,0,0
4,5,1,43,1,400,4,1994,1,0,0,...,1,0,1,0,0,0,0,0,1,0


In [57]:
testData.head()

Unnamed: 0,WEEKOFMONTH,WEEKOFMONTHCLAIMED,AGE,REPNUMBER,DEDUCTIBLE,DRIVERRATING,YEAR,FRAUDFOUND,MONTH_Apr,MONTH_Aug,...,ADDRESSCHANGE_CLAIM_no_change,ADDRESSCHANGE_CLAIM_under_6_months,NUMBEROFCARS_1-vehicle,NUMBEROFCARS_2-vehicles,NUMBEROFCARS_3_to_4,NUMBEROFCARS_5_to_8,NUMBEROFCARS_more_than_8,BASEPOLICY_All_Perils,BASEPOLICY_Collision,BASEPOLICY_Liability
0,3,4,21,4,400,4,1994,1,0,0,...,1,0,1,0,0,0,0,0,1,0
1,5,1,68,9,400,3,1994,1,0,0,...,1,0,1,0,0,0,0,1,0,0
2,1,1,50,8,400,2,1994,1,0,0,...,0,1,1,0,0,0,0,1,0,0
3,1,3,39,1,400,3,1994,1,0,0,...,0,1,0,1,0,0,0,1,0,0
4,5,1,43,1,400,4,1994,1,0,0,...,1,0,1,0,0,0,0,0,1,0


In [58]:
Xtrain = trainData.drop("FRAUDFOUND", axis=1)
Ytrain = trainData["FRAUDFOUND"]
Xtest = testData.drop("FRAUDFOUND", axis=1)
Ytest = testData["FRAUDFOUND"]
print(Ytrain.value_counts())
print(Ytest.value_counts())


0    2600
1     399
Name: FRAUDFOUND, dtype: int64
0    12420
1      498
Name: FRAUDFOUND, dtype: int64


#BASIC DECISION TREE

In [59]:
dt = DecisionTreeClassifier(random_state = 7)
dt.fit(Xtrain, Ytrain)
dt_pred = dt.predict(Xtest)
#Model Accuracy
print("Accuracy:", accuracy_score(Ytest, dt_pred))
print("Precision:", precision_score(Ytest, dt_pred))
print("Recall:", recall_score(Ytest, dt_pred))
print("F1 Score:", f1_score(Ytest, dt_pred))
print("ROC-AUC Score:", roc_auc_score(Ytest, dt_pred))
print("Confusion Matrix:\n", confusion_matrix(Ytest, dt_pred))

Accuracy: 0.8792382721783558
Precision: 0.22908163265306122
Recall: 0.9016064257028112
F1 Score: 0.3653376729048006
ROC-AUC Score: 0.8899739052829676
Confusion Matrix:
 [[10909  1511]
 [   49   449]]


In [60]:
parameters={'min_samples_split' : range(100,200,20),
            'max_depth': range(20,30,2),
            'criterion':['gini','entropy'],
            #'max_leaf_nodes' : range(5,30,5),
            'max_features' : range(20,30,2)
            }

# RANDOM SEARCH FOR DECISION TREE

In [61]:
#Hyperparameter tuning done for decision tree classifier
clf_random = RandomizedSearchCV(dt,parameters,n_iter=25, cv = 5, scoring = "roc_auc", verbose = 1)
clf_random.fit(Xtrain, Ytrain)
random_parm = clf_random.best_params_
best_score = clf_random.best_score_
print(f"Best Parameters: ",random_parm)
print(f"Best Score: ",best_score)
print('\n')


#Using the parameters obtained from HyperParameterTuning in the DecisionTreeClassifier
dt = DecisionTreeClassifier(**random_parm, random_state = 7, class_weight = "balanced")
dt.fit(Xtrain, Ytrain)
rs_pred = dt.predict(Xtest)

#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("Accuracy:", accuracy_score(Ytest, rs_pred))
print("Precision:", precision_score(Ytest, rs_pred))
print("Recall:", recall_score(Ytest, rs_pred))
print("F1 Score:", f1_score(Ytest, rs_pred))
print("ROC-AUC Score:", roc_auc_score(Ytest, rs_pred))
print("Confusion Matrix:\n", confusion_matrix(Ytest, rs_pred))
print('\n')

#get cross-validation report
dt_cv_score = cross_val_score(dt, Xtrain, Ytrain, cv=10, scoring="roc_auc")
print("=== All ROC-AUC Scores ===")
print(dt_cv_score)
print('\n')
print("Mean ROC-AUC Score - Decision Tree: ",dt_cv_score.mean())


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best Parameters:  {'min_samples_split': 160, 'max_features': 22, 'max_depth': 28, 'criterion': 'gini'}
Best Score:  0.8065431475170399


Accuracy: 0.6745626257934665
Precision: 0.09576788830715532
Recall: 0.8815261044176707
F1 Score: 0.17276662731208187
ROC-AUC Score: 0.7738950972974021
Confusion Matrix:
 [[8275 4145]
 [  59  439]]


=== All ROC-AUC Scores ===
[0.88980769 0.92456731 0.86798077 0.83649038 0.81706731 0.66216346
 0.62100962 0.74413462 0.76932692 0.81706114]


Mean ROC-AUC Score - Decision Tree:  0.7949609220907298


# GRID SEARCH FOR DECISION TREE


In [62]:
#Hyperparameter tuning done for decision tree classifier
clf_grid = GridSearchCV(dt,parameters, cv = 5, scoring = "roc_auc", verbose = 1, n_jobs = -1)
clf_grid.fit(Xtrain, Ytrain)
grid_parm = clf_grid.best_params_
best_score = clf_grid.best_score_
print(f"Best Parameters: ",grid_parm)
print(f"Best Score: ",best_score)
print('\n')

#Using the parameters obtained from HyperParameterTuning in the DecisionTreeClassifier
dt = DecisionTreeClassifier(**grid_parm, random_state = 7, class_weight = "balanced")
dt.fit(Xtrain, Ytrain)
gs_pred = dt.predict(Xtest)

#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("Accuracy:", accuracy_score(Ytest, gs_pred))
print("Precision:", precision_score(Ytest, gs_pred))
print("Recall:", recall_score(Ytest, gs_pred))
print("F1 Score:", f1_score(Ytest, gs_pred))
print("ROC-AUC Score:", roc_auc_score(Ytest, gs_pred))
print("Confusion Matrix:\n", confusion_matrix(Ytest, gs_pred))
print('\n')

#get cross-validation report
dt_cv_score = cross_val_score(dt, Xtrain, Ytrain, cv=10, scoring="roc_auc")
print("=== All ROC-AUC Scores ===")
print(dt_cv_score)
print('\n')
print("Mean ROC-AUC Score - Decision Tree: ",dt_cv_score.mean())


Fitting 5 folds for each of 250 candidates, totalling 1250 fits
Best Parameters:  {'criterion': 'entropy', 'max_depth': 20, 'max_features': 22, 'min_samples_split': 140}
Best Score:  0.831219936708861


Accuracy: 0.6506425143211023
Precision: 0.08955223880597014
Recall: 0.8795180722891566
F1 Score: 0.16255334941547595
ROC-AUC Score: 0.7604917253555283
Confusion Matrix:
 [[7967 4453]
 [  60  438]]


=== All ROC-AUC Scores ===
[0.84120192 0.92740385 0.81538462 0.87701923 0.82653846 0.71475962
 0.73870192 0.78086538 0.72350962 0.78890533]


Mean ROC-AUC Score - Decision Tree:  0.8034289940828401


#BASIC RANDOM FOREST

In [64]:
rf = RandomForestClassifier(random_state = 10,n_jobs = -1)
rf.fit(Xtrain, Ytrain)
rf_pred = dt.predict(Xtest)
#Model Accuracy
print("Accuracy:", accuracy_score(Ytest, rf_pred))
print("Precision:", precision_score(Ytest, rf_pred))
print("Recall:", recall_score(Ytest, rf_pred))
print("F1 Score:", f1_score(Ytest, rf_pred))
print("ROC-AUC Score:", roc_auc_score(Ytest, rf_pred))
print("Confusion Matrix:\n", confusion_matrix(Ytest, rf_pred))

Accuracy: 0.6506425143211023
Precision: 0.08955223880597014
Recall: 0.8795180722891566
F1 Score: 0.16255334941547595
ROC-AUC Score: 0.7604917253555283
Confusion Matrix:
 [[7967 4453]
 [  60  438]]


In [65]:
rf_parameters={'min_samples_split' : range(100,250,10),
            'max_depth': range(5,30,5),
            'criterion':['gini','entropy'],
            'max_leaf_nodes' : range(5,30,5),
           # 'n_estimators' : [25,50, 75, 100]
            }

#RANDOM SEARCH FOR RANDOM FOREST

In [66]:
#Hyperparameter tuning done for random forest classifier
clf_random_rf = RandomizedSearchCV(rf,rf_parameters,n_iter=25, cv = 5, scoring = "roc_auc", verbose = 1)
clf_random_rf.fit(Xtrain, Ytrain)
random_rf_parm = clf_random_rf.best_params_
best_score = clf_random_rf.best_score_
print(f"Best Parameters: ",random_rf_parm)
print(f"Best Score: ",best_score)
print('\n')


#Using the parameters obtained from HyperParameterTuning in the RandomForestClassifier
rfrs = RandomForestClassifier(**random_rf_parm, random_state = 7, class_weight = "balanced", n_jobs = -1 )
rfrs.fit(Xtrain, Ytrain)
rs_pred = rfrs.predict(Xtest)

#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("Accuracy:", accuracy_score(Ytest, rs_pred))
print("Precision:", precision_score(Ytest, rs_pred))
print("Recall:", recall_score(Ytest, rs_pred))
print("F1 Score:", f1_score(Ytest, rs_pred))
print("ROC-AUC Score:", roc_auc_score(Ytest, rs_pred))
print("Confusion Matrix:\n", confusion_matrix(Ytest, rs_pred))
print('\n')

#get cross-validation report
rf_cv_score = cross_val_score(rfrs, Xtrain, Ytrain, cv=10, scoring="roc_auc")
print("=== All ROC-AUC Scores ===")
print(rf_cv_score)
print('\n')
print("Mean ROC-AUC Score - Random Forest: ",rf_cv_score.mean())


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best Parameters:  {'min_samples_split': 150, 'max_leaf_nodes': 25, 'max_depth': 5, 'criterion': 'gini'}
Best Score:  0.838565664556962


Accuracy: 0.5959126799814213
Precision: 0.0838914346140289
Recall: 0.9558232931726908
F1 Score: 0.15424497731691508
ROC-AUC Score: 0.7686523873270862
Confusion Matrix:
 [[7222 5198]
 [  22  476]]


=== All ROC-AUC Scores ===
[0.89442308 0.92653846 0.92932692 0.90192308 0.8225     0.75346154
 0.71019231 0.78221154 0.80490385 0.81025641]


Mean ROC-AUC Score - Random Forest:  0.8335737179487179


#GRID SEARCH FOR RANDOM FOREST

In [67]:
#Hyperparameter tuning done for random forest classifier
clf_grid_rf = GridSearchCV(rf,rf_parameters, cv = 5, scoring = "roc_auc", verbose = 1, n_jobs = -1)
clf_grid_rf.fit(Xtrain, Ytrain)
grid_rf_parm = clf_grid_rf.best_params_
best_score = clf_grid_rf.best_score_
print(f"Best Parameters: ",grid_rf_parm)
print(f"Best Score: ",best_score)
print('\n')

#Using the parameters obtained from HyperParameterTuning in the RandomForestClassifier
rf = RandomForestClassifier(**grid_parm, random_state = 7, class_weight = "balanced",n_jobs=-1)
rf.fit(Xtrain, Ytrain)
gs_pred = rf.predict(Xtest)

#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("Accuracy:", accuracy_score(Ytest, gs_pred))
print("Precision:", precision_score(Ytest, gs_pred))
print("Recall:", recall_score(Ytest, gs_pred))
print("F1 Score:", f1_score(Ytest, gs_pred))
print("ROC-AUC Score:", roc_auc_score(Ytest, gs_pred))
print("Confusion Matrix:\n", confusion_matrix(Ytest, gs_pred))
print('\n')

#get cross-validation report
rf_cv_score = cross_val_score(rf, Xtrain, Ytrain, cv=10, scoring="roc_auc")
print("=== All Balanced_Accuracy Scores ===")
print(rf_cv_score)
print('\n')
print("Mean ROC-AUC Score - Random Forest: ",rf_cv_score.mean())


Fitting 5 folds for each of 750 candidates, totalling 3750 fits
Best Parameters:  {'criterion': 'entropy', 'max_depth': 10, 'max_leaf_nodes': 10, 'min_samples_split': 100}
Best Score:  0.8232557205452775


Accuracy: 0.6821489394643133
Precision: 0.10246804759806082
Recall: 0.9337349397590361
F1 Score: 0.1846703733121525
ROC-AUC Score: 0.8028980656927226
Confusion Matrix:
 [[8347 4073]
 [  33  465]]


=== All Balanced_Accuracy Scores ===
[0.89846154 0.93990385 0.92923077 0.91163462 0.83423077 0.75355769
 0.7125     0.73490385 0.79336538 0.83372781]


Mean ROC-AUC Score  Random Forest:  0.8341516272189349
