In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 

from ydata_profiling import ProfileReport 

import warnings
warnings.filterwarnings('ignore') 

plt.style.use('ggplot')

df = pd.read_csv('insurance_claims.csv')

In [3]:
df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported,_c39
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,...,YES,71610,6510,13020,52080,Saab,92x,2004,Y,
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,...,?,5070,780,780,3510,Mercedes,E400,2007,Y,
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,...,NO,34650,7700,3850,23100,Dodge,RAM,2007,N,
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,...,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y,
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,...,NO,6500,1300,650,4550,Accura,RSX,2009,N,


In [82]:
# run profilereport() to for eda
ProfileReport(wdf)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



# Correlation
* age is highly overall correlated w/months_as_customer  
* total_claim_amount is highly overall correlated w/:
    * injury_claim  
    * property_claim  
    * vehicle_claim  
    * incident_type  
    * collision_type  
* number_of_vehicles_involved is highly overall correlated w/incident_type  
* fraud_reported is highly overall correlated with incident_severity 

In [5]:
# check for missing values
df.isna().sum()

months_as_customer                0
age                               0
policy_number                     0
policy_bind_date                  0
policy_state                      0
policy_csl                        0
policy_deductable                 0
policy_annual_premium             0
umbrella_limit                    0
insured_zip                       0
insured_sex                       0
insured_education_level           0
insured_occupation                0
insured_hobbies                   0
insured_relationship              0
capital-gains                     0
capital-loss                      0
incident_date                     0
incident_type                     0
collision_type                    0
incident_severity                 0
authorities_contacted             0
incident_state                    0
incident_city                     0
incident_location                 0
incident_hour_of_the_day          0
number_of_vehicles_involved       0
property_damage             

In [6]:
# list dataframe columns to pick and choose what i move forward with
df.columns

Index(['months_as_customer', 'age', 'policy_number', 'policy_bind_date',
       'policy_state', 'policy_csl', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'insured_hobbies',
       'insured_relationship', 'capital-gains', 'capital-loss',
       'incident_date', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_state', 'incident_city',
       'incident_location', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
       'witnesses', 'police_report_available', 'total_claim_amount',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
       'auto_model', 'auto_year', 'fraud_reported', '_c39'],
      dtype='object')

In [11]:
# create my working dataframe
wdf = df[[
    'age', 'policy_deductable', 'policy_annual_premium', 'umbrella_limit', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'capital-gains', 'capital-loss',
    'authorities_contacted', 'incident_hour_of_the_day', 'number_of_vehicles_involved', 
    'witnesses', 'police_report_available', 'total_claim_amount', 'fraud_reported'    
]]

In [12]:
wdf.head()

Unnamed: 0,age,policy_deductable,policy_annual_premium,umbrella_limit,insured_sex,insured_education_level,insured_occupation,capital-gains,capital-loss,authorities_contacted,incident_hour_of_the_day,number_of_vehicles_involved,witnesses,police_report_available,total_claim_amount,fraud_reported
0,48,1000,1406.91,0,MALE,MD,craft-repair,53300,0,Police,5,1,2,YES,71610,Y
1,42,2000,1197.22,5000000,MALE,MD,machine-op-inspct,0,0,Police,8,1,0,,5070,Y
2,29,2000,1413.14,5000000,FEMALE,PhD,sales,35100,0,Police,7,3,3,NO,34650,N
3,41,2000,1415.74,6000000,FEMALE,PhD,armed-forces,48900,-62400,Police,5,1,2,NO,63400,Y
4,44,1000,1583.91,6000000,MALE,Associate,sales,66000,-46000,,20,1,1,NO,6500,N


In [9]:
# get rid of question marks
df.replace('?', np.nan, inplace=True)

In [10]:
wdf.describe()

Unnamed: 0,age,policy_deductable,policy_annual_premium,umbrella_limit,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,witnesses,total_claim_amount
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,38.948,1136.0,1256.40615,1101000.0,25126.1,-26793.7,11.644,1.839,1.487,52761.94
std,9.140287,611.864673,244.167395,2297407.0,27872.187708,28104.096686,6.951373,1.01888,1.111335,26401.53319
min,19.0,500.0,433.33,-1000000.0,0.0,-111100.0,0.0,1.0,0.0,100.0
25%,32.0,500.0,1089.6075,0.0,0.0,-51500.0,6.0,1.0,1.0,41812.5
50%,38.0,1000.0,1257.2,0.0,0.0,-23250.0,12.0,1.0,1.0,58055.0
75%,44.0,2000.0,1415.695,0.0,51025.0,0.0,17.0,3.0,2.0,70592.5
max,64.0,2000.0,2047.59,10000000.0,100500.0,0.0,23.0,4.0,3.0,114920.0


In [27]:
# create X and y
X = wdf.drop('fraud_reported', axis=1) 
y = wdf['fraud_reported']

In [28]:
# create variable for categorical data
cat_df = X.select_dtypes(include=['object'])

In [29]:
# print unique values for each feature
for col in cat_df.columns:
    print(f"{col}: \n{cat_df[col].unique()}\n")

insured_sex: 
['MALE' 'FEMALE']

insured_education_level: 
['MD' 'PhD' 'Associate' 'Masters' 'High School' 'College' 'JD']

insured_occupation: 
['craft-repair' 'machine-op-inspct' 'sales' 'armed-forces' 'tech-support'
 'prof-specialty' 'other-service' 'priv-house-serv' 'exec-managerial'
 'protective-serv' 'transport-moving' 'handlers-cleaners' 'adm-clerical'
 'farming-fishing']

authorities_contacted: 
['Police' 'None' 'Fire' 'Other' 'Ambulance']

police_report_available: 
['YES' nan 'NO']



In [30]:
# convert the categorical data 
cat_df = pd.get_dummies(cat_df, drop_first=True)

In [31]:
# create variable for numerical data
num_df = X.select_dtypes(include=['int64'])

In [32]:
# concatenate cat_df and num_df for X
X = pd.concat([num_df, cat_df], axis=1)

In [33]:
X.head()

Unnamed: 0,age,policy_deductable,umbrella_limit,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,witnesses,total_claim_amount,insured_sex_MALE,...,insured_occupation_prof-specialty,insured_occupation_protective-serv,insured_occupation_sales,insured_occupation_tech-support,insured_occupation_transport-moving,authorities_contacted_Fire,authorities_contacted_None,authorities_contacted_Other,authorities_contacted_Police,police_report_available_YES
0,48,1000,0,53300,0,5,1,2,71610,1,...,0,0,0,0,0,0,0,0,1,1
1,42,2000,5000000,0,0,8,1,0,5070,1,...,0,0,0,0,0,0,0,0,1,0
2,29,2000,5000000,35100,0,7,3,3,34650,0,...,0,0,1,0,0,0,0,0,1,0
3,41,2000,6000000,48900,-62400,5,1,2,63400,0,...,0,0,0,0,0,0,0,0,1,0
4,44,1000,6000000,66000,-46000,20,1,1,6500,1,...,0,0,1,0,0,0,1,0,0,0


In [34]:
# train/test/split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [35]:
X_train.head()

Unnamed: 0,age,policy_deductable,umbrella_limit,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,witnesses,total_claim_amount,insured_sex_MALE,...,insured_occupation_prof-specialty,insured_occupation_protective-serv,insured_occupation_sales,insured_occupation_tech-support,insured_occupation_transport-moving,authorities_contacted_Fire,authorities_contacted_None,authorities_contacted_Other,authorities_contacted_Police,police_report_available_YES
749,38,2000,0,0,-45100,5,1,1,90530,0,...,0,0,0,1,0,0,0,0,0,0
834,32,2000,0,43100,-31900,21,1,1,5600,1,...,0,0,0,0,0,0,0,0,1,1
904,27,500,0,56700,-49300,22,3,1,75690,0,...,0,0,0,1,0,0,0,1,0,1
893,33,500,0,0,0,8,1,0,8970,1,...,0,0,0,0,1,0,1,0,0,0
716,28,500,0,0,0,3,1,2,49900,1,...,1,0,0,0,0,0,0,1,0,1


In [36]:
X_train.columns

Index(['age', 'policy_deductable', 'umbrella_limit', 'capital-gains',
       'capital-loss', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'witnesses', 'total_claim_amount',
       'insured_sex_MALE', 'insured_education_level_College',
       'insured_education_level_High School', 'insured_education_level_JD',
       'insured_education_level_MD', 'insured_education_level_Masters',
       'insured_education_level_PhD', 'insured_occupation_armed-forces',
       'insured_occupation_craft-repair', 'insured_occupation_exec-managerial',
       'insured_occupation_farming-fishing',
       'insured_occupation_handlers-cleaners',
       'insured_occupation_machine-op-inspct',
       'insured_occupation_other-service',
       'insured_occupation_priv-house-serv',
       'insured_occupation_prof-specialty',
       'insured_occupation_protective-serv', 'insured_occupation_sales',
       'insured_occupation_tech-support',
       'insured_occupation_transport-moving', 'authoritie

In [37]:
num_df = X_train[['age', 'policy_deductable', 'umbrella_limit', 'capital-gains',
       'capital-loss', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'witnesses', 'total_claim_amount']]

In [38]:
from sklearn.preprocessing import StandardScaler 
scaler = StandardScaler()  
scaled_data = scaler.fit_transform(num_df)

In [39]:
scaled_num_df = pd.DataFrame(data=scaled_data, columns=num_df.columns, index=X_train.index)  
scaled_num_df.head()

Unnamed: 0,age,policy_deductable,umbrella_limit,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,witnesses,total_claim_amount
749,-0.108587,1.418652,-0.479324,-0.888354,-0.655485,-0.973523,-0.854167,-0.422652,1.453522
834,-0.756866,1.418652,-0.479324,0.649358,-0.184144,1.336225,-0.854167,-0.422652,-1.83367
904,-1.297099,-1.041439,-0.479324,1.134575,-0.805456,1.480584,1.098214,-0.422652,0.879144
893,-0.64882,-1.041439,-0.479324,-0.888354,0.954929,-0.540445,-0.854167,-1.321913,-1.703235
716,-1.189053,-1.041439,-0.479324,-0.888354,0.954929,-1.262241,-0.854167,0.476608,-0.119051


In [40]:
X_train.drop(columns=scaled_num_df.columns, inplace=True)

In [41]:
X_train = pd.concat([scaled_num_df, X_train], axis=1)

In [42]:
X_train.head()

Unnamed: 0,age,policy_deductable,umbrella_limit,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,witnesses,total_claim_amount,insured_sex_MALE,...,insured_occupation_prof-specialty,insured_occupation_protective-serv,insured_occupation_sales,insured_occupation_tech-support,insured_occupation_transport-moving,authorities_contacted_Fire,authorities_contacted_None,authorities_contacted_Other,authorities_contacted_Police,police_report_available_YES
749,-0.108587,1.418652,-0.479324,-0.888354,-0.655485,-0.973523,-0.854167,-0.422652,1.453522,0,...,0,0,0,1,0,0,0,0,0,0
834,-0.756866,1.418652,-0.479324,0.649358,-0.184144,1.336225,-0.854167,-0.422652,-1.83367,1,...,0,0,0,0,0,0,0,0,1,1
904,-1.297099,-1.041439,-0.479324,1.134575,-0.805456,1.480584,1.098214,-0.422652,0.879144,0,...,0,0,0,1,0,0,0,1,0,1
893,-0.64882,-1.041439,-0.479324,-0.888354,0.954929,-0.540445,-0.854167,-1.321913,-1.703235,1,...,0,0,0,0,1,0,1,0,0,0
716,-1.189053,-1.041439,-0.479324,-0.888354,0.954929,-1.262241,-0.854167,0.476608,-0.119051,1,...,1,0,0,0,0,0,0,1,0,1


# Models

**Support Vector Classifier**

In [43]:
# import svc
from sklearn.svm import SVC 
svc = SVC() 
svc.fit(X_train, y_train)  

y_pred = svc.predict(X_test)

In [44]:
# import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report  

svc_train_acc = accuracy_score(y_train, svc.predict(X_train)) 
svc_test_acc = accuracy_score(y_test, y_pred)   

print(f"Training accuracy of SVC: {svc_train_acc}")
print(f"Test accuracy of SVC: {svc_test_acc}")  

print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))

Training accuracy of SVC: 0.765
Test accuracy of SVC: 0.755
[[151   0]
 [ 49   0]]
              precision    recall  f1-score   support

           N       0.76      1.00      0.86       151
           Y       0.00      0.00      0.00        49

    accuracy                           0.76       200
   macro avg       0.38      0.50      0.43       200
weighted avg       0.57      0.76      0.65       200



**KNN**

In [45]:
from sklearn.neighbors import KNeighborsClassifier  
knn = KNeighborsClassifier() 
knn.fit(X_train, y_train)  

y_pred = knn.predict(X_test)

In [46]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 

knn_train_acc = accuracy_score(y_train, knn.predict(X_train))  
knn_test_acc = accuracy_score(y_test, y_pred) 

print(f"Training accuracy of KNN: {knn_train_acc}")
print(f"Testing accuracy of KNN: {knn_test_acc}")  

print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))

Training accuracy of KNN: 0.7775
Testing accuracy of KNN: 0.51
[[81 70]
 [28 21]]
              precision    recall  f1-score   support

           N       0.74      0.54      0.62       151
           Y       0.23      0.43      0.30        49

    accuracy                           0.51       200
   macro avg       0.49      0.48      0.46       200
weighted avg       0.62      0.51      0.54       200



**Decision Tree Classifier**

In [47]:
from sklearn.tree import DecisionTreeClassifier 
dtc = DecisionTreeClassifier()  
dtc.fit(X_train, y_train)  

y_pred = dtc.predict(X_test)

In [48]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

dtc_train_acc = accuracy_score(y_train, dtc.predict(X_train)) 
dtc_test_acc = accuracy_score(y_test, y_pred) 
print(f"Training Accuracy of DTC: {dtc_train_acc}")
print(f"Test Accuracy of DTC: {dtc_test_acc}")  

print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))

Training Accuracy of DTC: 1.0
Test Accuracy of DTC: 0.615
[[114  37]
 [ 40   9]]
              precision    recall  f1-score   support

           N       0.74      0.75      0.75       151
           Y       0.20      0.18      0.19        49

    accuracy                           0.61       200
   macro avg       0.47      0.47      0.47       200
weighted avg       0.61      0.61      0.61       200



In [50]:
# tuning
from sklearn.model_selection import GridSearchCV

grid_params = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [3, 5, 7, 10],
    'min_samples_split' : range(2, 10, 1),  
    'min_samples_leaf' : range(2, 10, 1)         
}

grid_search = GridSearchCV(dtc, grid_params, cv=5, n_jobs=-1, verbose=1)  
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 512 candidates, totalling 2560 fits


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [3, 5, 7, 10],
                         'min_samples_leaf': range(2, 10),
                         'min_samples_split': range(2, 10)},
             verbose=1)

In [51]:
print(grid_search.best_params_)  
print(grid_search.best_score_)

{'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 8, 'min_samples_split': 2}
0.75375


In [52]:
dtc = grid_search.best_estimator_  
y_pred = dtc.predict(X_test)

In [53]:
# run metrics again to see if scores improve
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

dtc_train_acc = accuracy_score(y_train, dtc.predict(X_train)) 
dtc_test_acc = accuracy_score(y_test, y_pred)  
print(f"Training Accuracy of DTC: {dtc_train_acc}")  
print(f"Test Accuracy of DTC: {dtc_test_acc}") 

print(confusion_matrix(y_test, y_pred)) 
print(classification_report(y_test, y_pred))

Training Accuracy of DTC: 0.76125
Test Accuracy of DTC: 0.745
[[148   3]
 [ 48   1]]
              precision    recall  f1-score   support

           N       0.76      0.98      0.85       151
           Y       0.25      0.02      0.04        49

    accuracy                           0.74       200
   macro avg       0.50      0.50      0.45       200
weighted avg       0.63      0.74      0.65       200



**Random Forest Classifier**  

In [55]:
from sklearn.ensemble import RandomForestClassifier 
rfc = RandomForestClassifier(criterion='entropy', max_depth=10, max_features='sqrt', 
                            min_samples_leaf=1, min_samples_split=3, n_estimators=140) 
rfc.fit(X_train, y_train) 

y_pred = rfc.predict(X_test)

In [56]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

rfc_train_acc = accuracy_score(y_train, rfc.predict(X_train))  
rfc_test_acc = accuracy_score(y_test, y_pred) 
print(f"Training Accuracy of RFC: {rfc_train_acc}") 
print(f"Test Accuracy of RFC: {rfc_test_acc}")

print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))

Training Accuracy of RFC: 0.8475
Test Accuracy of RFC: 0.63
[[116  35]
 [ 39  10]]
              precision    recall  f1-score   support

           N       0.75      0.77      0.76       151
           Y       0.22      0.20      0.21        49

    accuracy                           0.63       200
   macro avg       0.49      0.49      0.49       200
weighted avg       0.62      0.63      0.62       200



**Ada Boost Classifier**

In [61]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(base_estimator = dtc)

parameters = {
    'n_estimators' : [50, 70, 90, 120, 180, 200],
    'learning_rate' : [0.001, 0.01, 0.1, 1, 10],
    'algorithm' : ['SAMME', 'SAMME.R']
}

grid_search = GridSearchCV(ada, parameters, n_jobs = -1, cv = 5, verbose = 1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


GridSearchCV(cv=5,
             estimator=AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy',
                                                                                max_depth=3,
                                                                                min_samples_leaf=8)),
             n_jobs=-1,
             param_grid={'algorithm': ['SAMME', 'SAMME.R'],
                         'learning_rate': [0.001, 0.01, 0.1, 1, 10],
                         'n_estimators': [50, 70, 90, 120, 180, 200]},
             verbose=1)

In [62]:
print(grid_search.best_params_)
print(grid_search.best_score_)

{'algorithm': 'SAMME', 'learning_rate': 0.001, 'n_estimators': 50}
0.75375


In [63]:
ada = grid_search.best_estimator_

y_pred = ada.predict(X_test)

In [64]:
ada_train_acc = accuracy_score(y_train, ada.predict(X_train))
ada_test_acc = accuracy_score(y_test, y_pred)

print(f"Training accuracy of Ada Boost is : {ada_train_acc}")
print(f"Test accuracy of Ada Boost is : {ada_test_acc}")

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Training accuracy of Ada Boost is : 0.76125
Test accuracy of Ada Boost is : 0.745
[[148   3]
 [ 48   1]]
              precision    recall  f1-score   support

           N       0.76      0.98      0.85       151
           Y       0.25      0.02      0.04        49

    accuracy                           0.74       200
   macro avg       0.50      0.50      0.45       200
weighted avg       0.63      0.74      0.65       200



**Gradient Boost**

In [65]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

# accuracy score, confusion matrix and classification report of gradient boosting classifier

gb_acc = accuracy_score(y_test, gb.predict(X_test))

print(f"Training Accuracy of Gradient Boosting Classifier is {accuracy_score(y_train, gb.predict(X_train))}")
print(f"Test Accuracy of Gradient Boosting Classifier is {gb_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, gb.predict(X_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, gb.predict(X_test))}")

Training Accuracy of Gradient Boosting Classifier is 0.85875
Test Accuracy of Gradient Boosting Classifier is 0.485 

Confusion Matrix :- 
[[76 75]
 [28 21]]

Classification Report :- 
               precision    recall  f1-score   support

           N       0.73      0.50      0.60       151
           Y       0.22      0.43      0.29        49

    accuracy                           0.48       200
   macro avg       0.47      0.47      0.44       200
weighted avg       0.61      0.48      0.52       200



**Stochastic Gradient Boosting**

In [66]:
sgb = GradientBoostingClassifier(subsample = 0.90, max_features = 0.70)
sgb.fit(X_train, y_train)

# accuracy score, confusion matrix and classification report of stochastic gradient boosting classifier

sgb_acc = accuracy_score(y_test, sgb.predict(X_test))

print(f"Training Accuracy of Stochastic Gradient Boosting is {accuracy_score(y_train, sgb.predict(X_train))}")
print(f"Test Accuracy of Stochastic Gradient Boosting is {sgb_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, sgb.predict(X_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, sgb.predict(X_test))}")

Training Accuracy of Stochastic Gradient Boosting is 0.8575
Test Accuracy of Stochastic Gradient Boosting is 0.34 

Confusion Matrix :- 
[[ 28 123]
 [  9  40]]

Classification Report :- 
               precision    recall  f1-score   support

           N       0.76      0.19      0.30       151
           Y       0.25      0.82      0.38        49

    accuracy                           0.34       200
   macro avg       0.50      0.50      0.34       200
weighted avg       0.63      0.34      0.32       200



**CatBoost Classifier**

In [None]:
pip install catboost


In [70]:
import catboost


In [71]:
from catboost import CatBoostClassifier

cat = CatBoostClassifier(iterations=10)
cat.fit(X_train, y_train)

Learning rate set to 0.5
0:	learn: 0.5591416	total: 147ms	remaining: 1.32s
1:	learn: 0.5227568	total: 152ms	remaining: 610ms
2:	learn: 0.4928075	total: 158ms	remaining: 368ms
3:	learn: 0.4765196	total: 163ms	remaining: 244ms
4:	learn: 0.4493175	total: 167ms	remaining: 167ms
5:	learn: 0.4311018	total: 173ms	remaining: 115ms
6:	learn: 0.4091438	total: 178ms	remaining: 76.2ms
7:	learn: 0.3885297	total: 183ms	remaining: 45.8ms
8:	learn: 0.3772844	total: 188ms	remaining: 20.9ms
9:	learn: 0.3713038	total: 193ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x23c88430a90>

In [72]:
cat_acc = accuracy_score(y_test, cat.predict(X_test))

print(f"Training Accuracy of Cat Boost Classifier is {accuracy_score(y_train, cat.predict(X_train))}")
print(f"Test Accuracy of Cat Boost Classifier is {cat_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, cat.predict(X_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, cat.predict(X_test))}")

Training Accuracy of Cat Boost Classifier is 0.83625
Test Accuracy of Cat Boost Classifier is 0.725 

Confusion Matrix :- 
[[136  15]
 [ 40   9]]

Classification Report :- 
               precision    recall  f1-score   support

           N       0.77      0.90      0.83       151
           Y       0.38      0.18      0.25        49

    accuracy                           0.73       200
   macro avg       0.57      0.54      0.54       200
weighted avg       0.68      0.72      0.69       200



**Extra Trees Classifier**

In [73]:
from sklearn.ensemble import ExtraTreesClassifier

etc = ExtraTreesClassifier()
etc.fit(X_train, y_train)

# accuracy score, confusion matrix and classification report of extra trees classifier

etc_acc = accuracy_score(y_test, etc.predict(X_test))

print(f"Training Accuracy of Extra Trees Classifier is {accuracy_score(y_train, etc.predict(X_train))}")
print(f"Test Accuracy of Extra Trees Classifier is {etc_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, etc.predict(X_test))}\n")
print(f"Classification Report :- \n {classification_report(y_test, etc.predict(X_test))}")

Training Accuracy of Extra Trees Classifier is 1.0
Test Accuracy of Extra Trees Classifier is 0.725 

Confusion Matrix :- 
[[142   9]
 [ 46   3]]

Classification Report :- 
               precision    recall  f1-score   support

           N       0.76      0.94      0.84       151
           Y       0.25      0.06      0.10        49

    accuracy                           0.73       200
   macro avg       0.50      0.50      0.47       200
weighted avg       0.63      0.72      0.66       200



**LGBM Classifier**

In [74]:
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(learning_rate = 1)
lgbm.fit(X_train, y_train)

# accuracy score, confusion matrix and classification report of lgbm classifier

lgbm_acc = accuracy_score(y_test, lgbm.predict(X_test))

print(f"Training Accuracy of LGBM Classifier is {accuracy_score(y_train, lgbm.predict(X_train))}")
print(f"Test Accuracy of LGBM Classifier is {lgbm_acc} \n")

print(f"{confusion_matrix(y_test, lgbm.predict(X_test))}\n")
print(classification_report(y_test, lgbm.predict(X_test)))

Training Accuracy of LGBM Classifier is 1.0
Test Accuracy of LGBM Classifier is 0.455 

[[62 89]
 [20 29]]

              precision    recall  f1-score   support

           N       0.76      0.41      0.53       151
           Y       0.25      0.59      0.35        49

    accuracy                           0.46       200
   macro avg       0.50      0.50      0.44       200
weighted avg       0.63      0.46      0.49       200



**Voting Classifier**

In [76]:
from sklearn.ensemble import VotingClassifier

classifiers = [('Support Vector Classifier', svc), ('KNN', knn),  ('Decision Tree', dtc), ('Random Forest', rfc),
               ('Ada Boost', ada), ('XGboost', xgb), ('Gradient Boosting Classifier', gb), ('SGB', sgb),
               ('Cat Boost', cat), ('Extra Trees Classifier', etc), ('LGBM', lgbm)]

vc = VotingClassifier(estimators = classifiers)
vc.fit(X_train, y_train)

y_pred = vc.predict(X_test)

Learning rate set to 0.5
0:	learn: 0.5591416	total: 5.71ms	remaining: 51.4ms
1:	learn: 0.5227568	total: 11ms	remaining: 43.9ms
2:	learn: 0.4928075	total: 15.6ms	remaining: 36.3ms
3:	learn: 0.4765196	total: 20.6ms	remaining: 30.8ms
4:	learn: 0.4493175	total: 25.6ms	remaining: 25.6ms
5:	learn: 0.4311018	total: 31ms	remaining: 20.7ms
6:	learn: 0.4091438	total: 36.2ms	remaining: 15.5ms
7:	learn: 0.3885297	total: 41.1ms	remaining: 10.3ms
8:	learn: 0.3772844	total: 45.4ms	remaining: 5.05ms
9:	learn: 0.3713038	total: 50.4ms	remaining: 0us


In [77]:
vc_train_acc = accuracy_score(y_train, vc.predict(X_train))
vc_test_acc = accuracy_score(y_test, y_pred)

print(f"Training accuracy of Voting Classifier is : {vc_train_acc}")
print(f"Test accuracy of Voting Classifier is : {vc_test_acc}")

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Training accuracy of Voting Classifier is : 0.84375
Test accuracy of Voting Classifier is : 0.665
[[128  23]
 [ 44   5]]
              precision    recall  f1-score   support

           N       0.74      0.85      0.79       151
           Y       0.18      0.10      0.13        49

    accuracy                           0.67       200
   macro avg       0.46      0.47      0.46       200
weighted avg       0.61      0.67      0.63       200



In [80]:
models = pd.DataFrame({
    'Model' : ['SVC', 'KNN', 'Decision Tree', 'Random Forest','Ada Boost', 'Gradient Boost', 'SGB', 'Cat Boost', 'Extra Trees', 'LGBM', 'Voting Classifier'],
    'Score' : [svc_test_acc, knn_test_acc, dtc_test_acc, rfc_test_acc, ada_test_acc, gb_acc, sgb_acc, cat_acc, etc_acc, lgbm_acc, vc_test_acc]
})


models.sort_values(by = 'Score', ascending = False)

Unnamed: 0,Model,Score
0,SVC,0.755
2,Decision Tree,0.745
4,Ada Boost,0.745
7,Cat Boost,0.725
8,Extra Trees,0.725
10,Voting Classifier,0.665
3,Random Forest,0.63
1,KNN,0.51
5,Gradient Boost,0.485
9,LGBM,0.455


In [81]:
px.bar(data_frame = models, x = 'Score', y = 'Model', color = 'Score', template = 'plotly_dark', 
       title = 'Models Comparison')