In [17]:
import pandas as pd
from sklearn.preprocessing import label_binarize
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

import warnings
warnings.filterwarnings("ignore")
#Please ignore the warnings with version change

from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive



Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [18]:
trainfile = r'/gdrive/My Drive/CIS508/Assignment-2/Insurance Fraud - TRAIN-3000(1).csv'
trainData = pd.read_csv(trainfile) #creates a dataframe
testfile = r'/gdrive/My Drive/CIS508/Assignment-2/Insurance Fraud -TEST-12900(1).csv'
testData = pd.read_csv(testfile) #creates a dataframe

#print sizes (shape) of datasets
print(trainData.shape)
print(testData.shape)

print(trainData.head())
print(testData.head())

(2999, 32)
(12918, 32)
  MONTH  WEEKOFMONTH  DAYOFWEEK  ...  YEAR  BASEPOLICY FRAUDFOUND
0   Jul            3     Sunday  ...  1994   Collision        Yes
1   Nov            5     Monday  ...  1994  All_Perils        Yes
2   Jan            1     Monday  ...  1994  All_Perils        Yes
3   Dec            1     Monday  ...  1994  All_Perils        Yes
4   Dec            5  Wednesday  ...  1994   Collision        Yes

[5 rows x 32 columns]
  MONTH  WEEKOFMONTH  DAYOFWEEK  ...  YEAR  BASEPOLICY FRAUDFOUND
0   Jul            3     Sunday  ...  1994   Collision        Yes
1   Nov            5     Monday  ...  1994  All_Perils        Yes
2   Jan            1     Monday  ...  1994  All_Perils        Yes
3   Dec            1     Monday  ...  1994  All_Perils        Yes
4   Dec            5  Wednesday  ...  1994   Collision        Yes

[5 rows x 32 columns]


In [19]:
#Copy Train and Test data excluding target
trainData_Copy = trainData.iloc[:, :-1].copy()
testData_Copy = testData.iloc[:, :-1].copy()

print(trainData_Copy.head())
print(testData_Copy.head())

  MONTH  WEEKOFMONTH  DAYOFWEEK  ... NUMBEROFCARS  YEAR  BASEPOLICY
0   Jul            3     Sunday  ...    1-vehicle  1994   Collision
1   Nov            5     Monday  ...    1-vehicle  1994  All_Perils
2   Jan            1     Monday  ...    1-vehicle  1994  All_Perils
3   Dec            1     Monday  ...   2-vehicles  1994  All_Perils
4   Dec            5  Wednesday  ...    1-vehicle  1994   Collision

[5 rows x 31 columns]
  MONTH  WEEKOFMONTH  DAYOFWEEK  ... NUMBEROFCARS  YEAR  BASEPOLICY
0   Jul            3     Sunday  ...    1-vehicle  1994   Collision
1   Nov            5     Monday  ...    1-vehicle  1994  All_Perils
2   Jan            1     Monday  ...    1-vehicle  1994  All_Perils
3   Dec            1     Monday  ...   2-vehicles  1994  All_Perils
4   Dec            5  Wednesday  ...    1-vehicle  1994   Collision

[5 rows x 31 columns]


In [20]:
#List of Categorical Features
categoricalFeatures = []
for col in trainData_Copy.columns:
  if col in testData_Copy.columns:
    categoricalFeatures.append(col)
#Removing age from categorical features as it does not add value
categoricalFeatures.remove("AGE")
print (categoricalFeatures)

['MONTH', 'WEEKOFMONTH', 'DAYOFWEEK', 'MAKE', 'ACCIDENTAREA', 'DAYOFWEEKCLAIMED', 'MONTHCLAIMED', 'WEEKOFMONTHCLAIMED', 'SEX', 'MARITALSTATUS', 'FAULT', 'POLICYTYPE', 'VEHICLECATEGORY', 'VEHICLEPRICE', 'REPNUMBER', 'DEDUCTIBLE', 'DRIVERRATING', 'DAYS_POLICY_ACCIDENT', 'DAYS_POLICY_CLAIM', 'PASTNUMBEROFCLAIMS', 'AGEOFVEHICLE', 'AGEOFPOLICYHOLDER', 'POLICEREPORTFILED', 'WITNESSPRESENT', 'AGENTTYPE', 'NUMBEROFSUPPLIMENTS', 'ADDRESSCHANGE_CLAIM', 'NUMBEROFCARS', 'YEAR', 'BASEPOLICY']


In [21]:
#Combine Train and test for one Hot Encoding
combined_Data = pd.concat([trainData_Copy,testData_Copy], keys=[0,1])

#Do one Hot encoding for categorical features
combined_Data = pd.get_dummies(combined_Data,columns=categoricalFeatures)

#Separate Train data and test data
X_train = combined_Data.xs(0)
X_test = combined_Data.xs(1)

print('----------------------------------------------------------------')
print(X_train.head())
print(X_test.head())

----------------------------------------------------------------
   AGE  MONTH_Apr  ...  BASEPOLICY_Collision  BASEPOLICY_Liability
0   21          0  ...                     1                     0
1   68          0  ...                     0                     0
2   50          0  ...                     0                     0
3   39          0  ...                     0                     0
4   43          0  ...                     1                     0

[5 rows x 175 columns]
   AGE  MONTH_Apr  ...  BASEPOLICY_Collision  BASEPOLICY_Liability
0   21          0  ...                     1                     0
1   68          0  ...                     0                     0
2   50          0  ...                     0                     0
3   39          0  ...                     0                     0
4   43          0  ...                     1                     0

[5 rows x 175 columns]


In [22]:
#Extracting target column for training and test
y_train = trainData.iloc[:,-1]
print(y_train.head())
y_test = testData.iloc[:,-1]
print(y_test.head())




0    Yes
1    Yes
2    Yes
3    Yes
4    Yes
Name: FRAUDFOUND, dtype: object
0    Yes
1    Yes
2    Yes
3    Yes
4    Yes
Name: FRAUDFOUND, dtype: object


In [23]:
#CONSTRUCT DEFAULT DECISION TREE AND OBTAIN RESPECTIVE ACCURACY 
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf_predict=clf.predict(X_test)
print("accuracy Score (testing) for Decision Tree:{0:6f}".format(clf.score(X_test,y_test)))
print("Confusion Matrix for Decision Tree")
print(confusion_matrix(y_test,clf_predict))
print("=== Classification Report ===")
print(classification_report(y_test,clf_predict))



accuracy Score (testing) for Decision Tree:0.890386
Confusion Matrix for Decision Tree
[[11062  1358]
 [   58   440]]
=== Classification Report ===
              precision    recall  f1-score   support

          No       0.99      0.89      0.94     12420
         Yes       0.24      0.88      0.38       498

    accuracy                           0.89     12918
   macro avg       0.62      0.89      0.66     12918
weighted avg       0.97      0.89      0.92     12918



In [24]:
#Hyperparameter tuning done for decision tree classifier
#RANDOM SEARCH--------------------------------------------
print("RandomizedSearchCV-Decision tree")
parameters={'min_samples_leaf' : range(10,200,10),
            'max_depth': range(5,40,5),
            'criterion':['gini','entropy']
            }
clf_random = RandomizedSearchCV(clf,parameters,n_iter=20,cv=10)
clf_random.fit(X_train, y_train)
grid_parm=clf_random.best_params_
print(grid_parm)

#GRID SEARCH----------------------------------------
print("GridSearchCV-Decision tree")
clf_grid = GridSearchCV(clf,parameters)
clf_grid.fit(X_train, y_train)
grid_parm1=clf_grid.best_params_
print(grid_parm1)



RandomizedSearchCV-Decision tree
{'min_samples_leaf': 90, 'max_depth': 30, 'criterion': 'entropy'}
GridSearchCV-Decision tree
{'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 90}


In [25]:
#Using the parameters obtained from HyperParameterTuning in the DecisionTreeClassifier 
clf = DecisionTreeClassifier(**grid_parm)
clfr = DecisionTreeClassifier(**grid_parm1)

clf.fit(X_train,y_train)
clf_predict = clf.predict(X_test)
clfr.fit(X_train,y_train)
clfr_predict = clfr.predict(X_test)


#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("accuracy Score (testing) after hypertuning randomized search for Decision Tree:{0:6f}".format(clf.score(X_test,y_test)))
print("accuracy Score (testing) after hypertuning grid search for Decision Tree:{0:6f}".format(clfr.score(X_test,y_test)))
print("Confusion Matrix after Random hypertuning for Decision Tree (Testing)")
print(confusion_matrix(y_test,clf_predict))
print("=== Classification Report for Random hypertesting (Testing)  ===")
print(classification_report(y_test,clf_predict))
print("Confusion Matrix after Grid hypertuning for Decision Tree (Testing)")
print(confusion_matrix(y_test,clfr_predict))
print("=== Classification Report for Grid hypertesting (Testing)  ===")
print(classification_report(y_test,clfr_predict))

clf_cv_score = cross_val_score(clf, X_train, y_train, cv=15, scoring="balanced_accuracy")
print(clf_cv_score)
print('\n')




accuracy Score (testing) after hypertuning randomized search for Decision Tree:0.882877
accuracy Score (testing) after hypertuning grid search for Decision Tree:0.882877
Confusion Matrix after Random hypertuning for Decision Tree (Testing)
[[11270  1150]
 [  363   135]]
=== Classification Report for Random hypertesting (Testing)  ===
              precision    recall  f1-score   support

          No       0.97      0.91      0.94     12420
         Yes       0.11      0.27      0.15       498

    accuracy                           0.88     12918
   macro avg       0.54      0.59      0.54     12918
weighted avg       0.94      0.88      0.91     12918

Confusion Matrix after Grid hypertuning for Decision Tree (Testing)
[[11270  1150]
 [  363   135]]
=== Classification Report for Grid hypertesting (Testing)  ===
              precision    recall  f1-score   support

          No       0.97      0.91      0.94     12420
         Yes       0.11      0.27      0.15       498

    accurac

In [34]:
#Normal randomforest
rand_parameters={'min_samples_leaf' : range(10,200,10),
                 'max_depth': range(1,20,2),
                 'max_features':range(10,100,10),
                 'n_estimators':[20,30,40]}
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_predict=rfc.predict(X_test)
print("accuracy Score (testing) for Default Random Forest:{0:6f}".format(rfc.score(X_test,y_test)))
print("Confusion Matrix for Default Random Forest (testing):")
print(confusion_matrix(y_test,rfc_predict))
print("=== Classification Report for Default Random Forest(Testing)  ===")
print(classification_report(y_test,rfc_predict))





accuracy Score (testing) for Default Random Forest:0.957966
Confusion Matrix for Default Random Forest (testing):
[[11988   432]
 [  111   387]]
=== Classification Report for Default Random Forest(Testing)  ===
              precision    recall  f1-score   support

          No       0.99      0.97      0.98     12420
         Yes       0.47      0.78      0.59       498

    accuracy                           0.96     12918
   macro avg       0.73      0.87      0.78     12918
weighted avg       0.97      0.96      0.96     12918



In [35]:
#RANDOMIZED SEARCH for Random Forest----------------------------------------
rfc_random = RandomizedSearchCV(rfc,rand_parameters,n_iter=20,cv=10)
rfc_random.fit(X_train, y_train)
grid_parm_rfc=rfc_random.best_params_
print(grid_parm_rfc)
rfc= RandomForestClassifier(**grid_parm_rfc)
rfc.fit(X_train,y_train)
rfc_predict = rfc.predict(X_test)
print("accuracy Score (testing) after hypertuning for Random Forest:{0:6f}".format(rfc.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for Random Forest:")
print(confusion_matrix(y_test,rfc_predict))
print("=== Classification Report ===")
print(classification_report(y_test,rfc_predict))
rfc_cv_score = cross_val_score(rfc, X_train, y_train)
print(rfc_cv_score)
print('\n')


{'n_estimators': 40, 'min_samples_leaf': 40, 'max_features': 30, 'max_depth': 13}
accuracy Score (testing) after hypertuning for Random Forest:0.937761
Confusion Matrix after hypertuning for Random Forest:
[[12012   408]
 [  396   102]]
=== Classification Report ===
              precision    recall  f1-score   support

          No       0.97      0.97      0.97     12420
         Yes       0.20      0.20      0.20       498

    accuracy                           0.94     12918
   macro avg       0.58      0.59      0.58     12918
weighted avg       0.94      0.94      0.94     12918

[0.867      0.878      0.87387387]




In [36]:
#GRID SEARCH for RANDOM FOREST----------------------------------------
rfc_grid = GridSearchCV(rfc,rand_parameters)
rfc_grid.fit(X_train, y_train)
grid_parm_rfc=rfc_grid.best_params_
print(grid_parm_rfc)
rfc= RandomForestClassifier(**grid_parm_rfc)
rfc.fit(X_train,y_train)
rfc_predict = rfc.predict(X_test)
print("accuracy Score (testing) after Grid hypertuning for Random Forest:{0:6f}".format(rfc.score(X_test,y_test)))
print("Confusion Matrix after Grid hypertuning for Random Forest (Testing):")
print(confusion_matrix(y_test,rfc_predict))
print("=== Classification Report after Grid hypertuning for Random Forest (Testing) ===")
print(classification_report(y_test,rfc_predict))
rfc_cv_score = cross_val_score(rfc, X_train, y_train)
print(rfc_cv_score)
print('\n')

{'max_depth': 5, 'max_features': 90, 'min_samples_leaf': 30, 'n_estimators': 40}
accuracy Score (testing) after Grid hypertuning for Random Forest:0.925066
Confusion Matrix after Grid hypertuning for Random Forest (Testing):
[[11824   596]
 [  372   126]]
=== Classification Report after Grid hypertuning for Random Forest (Testing) ===
              precision    recall  f1-score   support

          No       0.97      0.95      0.96     12420
         Yes       0.17      0.25      0.21       498

    accuracy                           0.93     12918
   macro avg       0.57      0.60      0.58     12918
weighted avg       0.94      0.93      0.93     12918

[0.866      0.886      0.87687688]


