In [1]:
#Importing the necessary packages
#Importing packages for data manipulation
import numpy as np
import pandas as pd

#Importing packages for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

Loading the dataset for analysis

In [2]:
#Load the dataset into a dataframe
#Dataset is stored locally in the system
df_train = pd.read_csv(r"C:\Users\skkav\Documents\Data Analytics\Python\Binary Classification of Insurance Cross Selling\Dataset\train.csv")

#Displaying the first five rows of the dataset
df_train.head(5)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1
2,2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0
3,3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0
4,4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0


Data Exploration - Initial Exploratory Data Analysis(EDA) and Data Cleaning

To build an impactful model that meets the requirements and facilitates decision-making, one has to have a complete idea about the data they are dealing with. A useful machine learning model can be built only when the data that is fed into the model is useful as well without any avoidable errors and data mismatch. Due to the above reasons, EDA and Data Cleaning is considered as the main step to build any machine learning model.

In [3]:
#Gathering basic information about the data
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11504798 entries, 0 to 11504797
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   Gender                object 
 2   Age                   int64  
 3   Driving_License       int64  
 4   Region_Code           float64
 5   Previously_Insured    int64  
 6   Vehicle_Age           object 
 7   Vehicle_Damage        object 
 8   Annual_Premium        float64
 9   Policy_Sales_Channel  float64
 10  Vintage               int64  
 11  Response              int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 1.0+ GB


In [4]:
#Gathering descriptive statistics about the data
df_train.describe()

Unnamed: 0,id,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response
count,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0
mean,5752398.0,38.38356,0.998022,26.41869,0.4629966,30461.37,112.4254,163.8977,0.1229973
std,3321149.0,14.99346,0.0444312,12.99159,0.4986289,16454.75,54.03571,79.97953,0.3284341
min,0.0,20.0,0.0,0.0,0.0,2630.0,1.0,10.0,0.0
25%,2876199.0,24.0,1.0,15.0,0.0,25277.0,29.0,99.0,0.0
50%,5752398.0,36.0,1.0,28.0,0.0,31824.0,151.0,166.0,0.0
75%,8628598.0,49.0,1.0,35.0,1.0,39451.0,152.0,232.0,0.0
max,11504800.0,85.0,1.0,52.0,1.0,540165.0,163.0,299.0,1.0


In [5]:
#The numbers above are displayed in scientific notation. Converting them into a more readable format
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(df_train.describe())

                id          Age  Driving_License  Region_Code  \
count 11504798.000 11504798.000     11504798.000 11504798.000   
mean   5752398.500       38.384            0.998       26.419   
std    3321149.255       14.993            0.044       12.992   
min          0.000       20.000            0.000        0.000   
25%    2876199.250       24.000            1.000       15.000   
50%    5752398.500       36.000            1.000       28.000   
75%    8628597.750       49.000            1.000       35.000   
max   11504797.000       85.000            1.000       52.000   

       Previously_Insured  Annual_Premium  Policy_Sales_Channel      Vintage  \
count        11504798.000    11504798.000          11504798.000 11504798.000   
mean                0.463       30461.370               112.425      163.898   
std                 0.499       16454.745                54.036       79.980   
min                 0.000        2630.000                 1.000       10.000   
25%           

In [6]:
#Obtaining the list of all columns
df_train.columns

Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Response'],
      dtype='object')

In [7]:
#Dropping the unnecessary columns
df_train.drop(columns='Vehicle_Age', inplace=True)

In [8]:
#Displaying the columns after dropping Vehicle_Age column
df_train.columns

Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Response'],
      dtype='object')

In [9]:
#Converting categorical column into numeric column
df_train['Vehicle_Damage'] = df_train['Vehicle_Damage'].map({'Yes': 1, 'No': 0})

df_train['Gender'] = df_train['Gender'].map({'Female': 1, 'Male': 0})

In [10]:
#Displaying the dataset after converting the values
df_train.head(5)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,0,0,21,1,35.0,0,1,65101.0,124.0,187,0
1,1,0,43,1,28.0,0,1,58911.0,26.0,288,1
2,2,1,25,1,14.0,1,0,38043.0,152.0,254,0
3,3,1,35,1,1.0,0,1,2630.0,156.0,76,0
4,4,1,36,1,15.0,1,0,31951.0,152.0,294,0


In [11]:
#Checking for missing values in the dataset
df_train.isna().sum()

#We can observe that there are no missing values

id                      0
Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
Response                0
dtype: int64

In [12]:
#Checking for duplicate values in the dataset
df_train.duplicated().sum()

#We can observe that there are no duplicate values

np.int64(0)

In [13]:
#Finding the unique values in the target column and the count of their occurence
df_train['Response'].value_counts()

Response
0    10089739
1     1415059
Name: count, dtype: int64

In [18]:
#Applying the pre-processing steps on the test dataset
df_test = pd.read_csv(r"C:\Users\skkav\Documents\Data Analytics\Python\Binary Classification of Insurance Cross Selling\Dataset\test.csv")

df_test.head(5)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,11504798,Female,20,1,47.0,0,< 1 Year,No,2630.0,160.0,228
1,11504799,Male,47,1,28.0,0,1-2 Year,Yes,37483.0,124.0,123
2,11504800,Male,47,1,43.0,0,1-2 Year,Yes,2630.0,26.0,271
3,11504801,Female,22,1,47.0,1,< 1 Year,No,24502.0,152.0,115
4,11504802,Male,51,1,19.0,0,1-2 Year,No,34115.0,124.0,148


In [19]:
#Dropping the unnecessary columns
df_test.drop(columns='Vehicle_Age', inplace=True)

#Converting categorical column into numeric column
df_test['Vehicle_Damage'] = df_test['Vehicle_Damage'].map({'Yes': 1, 'No': 0})

df_test['Gender'] = df_test['Gender'].map({'Female': 1, 'Male': 0})

In [20]:
#Displaying the test data set after pre-processing
df_test.head(5)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,11504798,1,20,1,47.0,0,0,2630.0,160.0,228
1,11504799,0,47,1,28.0,0,1,37483.0,124.0,123
2,11504800,0,47,1,43.0,0,1,2630.0,26.0,271
3,11504801,1,22,1,47.0,1,0,24502.0,152.0,115
4,11504802,0,51,1,19.0,0,0,34115.0,124.0,148


We'll build  2 models:
1. Decision Tree Classifier
2. XGBoost Classifier

In [21]:
#Importing the necessary packages to build a decision tree classifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report

In [22]:
#Prepare features and labels from the training set
#train_df = df_train
#test_df = df_test

X = df_train.drop(columns=['Response'])
y = df_train['Response']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
decision_tree_clf = DecisionTreeClassifier(random_state=42)
decision_tree_clf.fit(X_train, y_train)

decision_y_pred_valid = decision_tree_clf.predict(X_valid)


In [23]:
#Displaying the accuracy
decision_tree_accuracy = accuracy_score(y_valid, decision_y_pred_valid)
print(f'Validation accuracy of decision tree model is: {decision_tree_accuracy}')

Validation accuracy of decision tree model is: 0.8300544120714832


In [25]:
#Running prediction on test data
decision_y_pred = decision_tree_clf.predict(df_test)

In [26]:
df_submission = df_test[['id']]
df_submission.loc[:, 'Response'] = decision_y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_submission.loc[:, 'Response'] = decision_y_pred


In [27]:
df_submission.to_csv('decision_tree_submission.csv', index=False)
print("Submission file created")

Submission file created
