# Car Insurance Claim Prediction

In [None]:
# Predict whether the policyholder will file a claim in the next 6 months or not.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Silence every warning for the rest of the session so cell outputs stay compact.
# NOTE(review): this blanket filter also hides any deprecation or data-quality
# warnings raised by the libraries used below — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the training set and the test set from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# EDA

In [4]:
# Preview the first rows of the training data
train.head()

Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,...,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,is_claim
0,ID00001,0.515874,0.05,0.644231,C1,4990,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
1,ID00002,0.672619,0.02,0.375,C2,27003,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
2,ID00003,0.84111,0.02,0.384615,C3,4076,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
3,ID00004,0.900277,0.11,0.432692,C4,21622,1,C1,M2,Petrol,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,2,0
4,ID00005,0.596403,0.11,0.634615,C5,34738,2,A,M3,Petrol,...,No,Yes,Yes,Yes,No,Yes,Yes,Yes,2,0


In [5]:
# Preview the first rows of the test data
test.head()

Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,...,is_rear_window_defogger,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating
0,ID58593,0.341732,0.0,0.586538,C3,4076,1,A,M1,CNG,...,No,No,No,No,Yes,No,No,No,Yes,0
1,ID58594,0.307241,0.13,0.442308,C8,8794,1,B2,M6,Petrol,...,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,2
2,ID58595,0.327924,0.12,0.451923,C8,8794,2,A,M3,Petrol,...,No,No,Yes,Yes,Yes,No,Yes,Yes,Yes,2
3,ID58596,0.782654,0.01,0.461538,C5,34738,1,A,M1,CNG,...,No,No,No,No,Yes,No,No,No,Yes,0
4,ID58597,1.233404,0.02,0.634615,C5,34738,1,A,M1,CNG,...,No,No,No,No,Yes,No,No,No,Yes,0


In [10]:
# (rows, columns) of the training set
train.shape

(58592, 44)

In [11]:
# (rows, columns) of the test set
test.shape

(39063, 43)

In [7]:
# Count missing values per column in the training set
train.isna().sum()

policy_id                           0
policy_tenure                       0
age_of_car                          0
age_of_policyholder                 0
area_cluster                        0
population_density                  0
make                                0
segment                             0
model                               0
fuel_type                           0
max_torque                          0
max_power                           0
engine_type                         0
airbags                             0
is_esc                              0
is_adjustable_steering              0
is_tpms                             0
is_parking_sensors                  0
is_parking_camera                   0
rear_brakes_type                    0
displacement                        0
cylinder                            0
transmission_type                   0
gear_box                            0
steering_type                       0
turning_radius                      0
length      

In [8]:
# Count missing values per column in the test set
test.isna().sum()

policy_id                           0
policy_tenure                       0
age_of_car                          0
age_of_policyholder                 0
area_cluster                        0
population_density                  0
make                                0
segment                             0
model                               0
fuel_type                           0
max_torque                          0
max_power                           0
engine_type                         0
airbags                             0
is_esc                              0
is_adjustable_steering              0
is_tpms                             0
is_parking_sensors                  0
is_parking_camera                   0
rear_brakes_type                    0
displacement                        0
cylinder                            0
transmission_type                   0
gear_box                            0
steering_type                       0
turning_radius                      0
length      

In [12]:
# Class balance of the target, expressed as a percentage of all training rows.
# The dataset is imbalanced: the positive (claim) class is a small minority.
train['is_claim'].value_counts().div(len(train)).mul(100)

0    93.603222
1     6.396778
Name: is_claim, dtype: float64

# Feature engineering

In [13]:
# Separate the target (y) from the input features (X).
# The policy identifier, the target itself and a handful of columns that are
# not used in this analysis are removed from the inputs.
y = train['is_claim']
X = train.drop(columns=[
    'policy_id',
    'is_claim',
    'area_cluster',
    'make',
    'model',
    'fuel_type',
    'airbags',
    'cylinder',
])

In [14]:
# Keep the policy ids for the submission file, then remove from the test set
# the same columns that were removed from the training inputs.
# Note: `test` is reassigned here, so this cell cannot be re-run on its own result.
policy_id = test['policy_id']
test = test.drop(columns=[
    'policy_id',
    'area_cluster',
    'make',
    'model',
    'fuel_type',
    'airbags',
    'cylinder',
])

In [17]:
# One-hot encode the categorical input columns; drop_first=True leaves out the
# first level of each encoded column. The last line displays the new shape.
X=pd.get_dummies(X,drop_first=True)
X.shape

(58592, 64)

In [18]:
# One-hot encode the test set the same way as the training inputs, then align
# it to the (already encoded) training columns in X.
# Encoding the two frames independently only produces matching columns when
# every categorical level occurs in both; reindexing guarantees identical
# column names and order. Levels absent from the test set become all-zero
# columns, and levels never seen in training are dropped.
test = pd.get_dummies(test, drop_first=True).reindex(columns=X.columns, fill_value=0)
test.shape

(39063, 64)

In [21]:
## RandomOverSampler to handle imbalanced data

from imblearn.over_sampling import RandomOverSampler

In [22]:
# Random over-sampler used to rebalance the target classes.
# sampling_strategy=0.8 requests a minority:majority ratio of 0.8 after
# resampling; random_state fixes the random draw.
# NOTE(review): `os` is also the name of the standard-library `os` module and
# would shadow it if that module were imported — consider renaming this
# variable together with the cell that calls it.
os =  RandomOverSampler(random_state=42, sampling_strategy=0.8)

In [24]:
# Over-sample the encoded inputs and the target, then display the new shapes.
# NOTE(review): resampling is applied to the full training data, i.e. before
# the train/hold-out split performed further down.
X_res, y_res = os.fit_resample(X, y)
X_res.shape,y_res.shape

((98719, 64), (98719,))

In [25]:
# Standardise the features: learn the scaling from the resampled training
# inputs, then apply that same scaling to the training inputs and the test set.
# Both names are rebound to the scaled arrays, so this cell must run only once.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_res)
X_res = scaler.transform(X_res)
test = scaler.transform(test)

In [26]:
# Compare the per-class row counts before and after over-sampling.
from collections import Counter

original_counts = Counter(y)
resampled_counts = Counter(y_res)
print('Original dataset shape {}'.format(original_counts))
print('Resampled dataset shape {}'.format(resampled_counts))

Original dataset shape Counter({0: 54844, 1: 3748})
Resampled dataset shape Counter({0: 54844, 1: 43875})


# Dataset splitting

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
# Hold out 20% of the over-sampled, scaled data for evaluation (fixed seed).
# NOTE(review): the split happens after random over-sampling, so copies of the
# same minority row can presumably end up in both the training and the hold-out
# part, which would make the hold-out score optimistic — consider splitting
# before resampling and scaling.
X_train,X_test,y_train,y_test=train_test_split(X_res,y_res,test_size = 0.2, random_state = 1)

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from catboost import CatBoostClassifier
from xgboost import XGBRFClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
import lightgbm

from sklearn.metrics import f1_score

In [33]:
# Random forest classifier with fixed, hand-picked hyper-parameters.
# No hyper-parameter search is run in this cell; random_state makes the fit repeatable.
clf=RandomForestClassifier(n_estimators=1000,
                         criterion='gini',
                         max_depth=12,
                         max_features='log2',
                         min_samples_leaf=1,
                         min_samples_split=5,
                         random_state=1)

In [34]:
# Fit the random forest on the training split.
clf.fit(X_train, y_train)

# F1 score on the training split.
# The variable names are kept unchanged for any other cell that reads them,
# but the values they hold are F1 scores, not accuracies.
train_pred = clf.predict(X_train)
train_accuracy = f1_score(y_train, train_pred)

# F1 score on the hold-out split.
test_pred = clf.predict(X_test)
test_accuracy = f1_score(y_test, test_pred)

# Report the numbers under the metric that was actually computed (f1_score),
# scaled to a percentage.
print('F1 score for Training set is')
print(100 * train_accuracy)
print('----------------------------------')
print('F1 score for Testing set is')
print(100 * test_accuracy)

Accuracy for Training set is
77.4588734320278
----------------------------------
Accuracy for Testing set is
74.41494149414942


In [38]:
# Predict the claim label for every row of the encoded, scaled test set
y_pred=clf.predict(test)

In [39]:
# Build the submission table: one row per test policy with its predicted label.
submission = pd.DataFrame({'policy_id': policy_id, 'is_claim': y_pred})

# Write it to disk; index=False is the documented way to leave the row index
# out of the CSV file.
submission.to_csv('submission3.csv', index=False)

In [40]:
# (rows, columns) of the submission table
submission.shape

(39063, 2)

In [41]:
# Display the submission table (pandas truncates the rendering of long frames)
submission

Unnamed: 0,policy_id,is_claim
0,ID58593,1
1,ID58594,0
2,ID58595,0
3,ID58596,0
4,ID58597,0
...,...,...
39058,ID97651,0
39059,ID97652,1
39060,ID97653,0
39061,ID97654,0
