In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [3]:
df_train_raw = pd.read_csv('datasets/train.csv')

In [5]:
df_train_raw.columns

Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Response'],
      dtype='object')

In [6]:
df_train_raw.shape

(11504798, 12)

In [7]:
df_train_raw.pivot_table(index = 'Driving_License', columns = 'Response', values = 'id', aggfunc = 'count')

Response,0,1
Driving_License,Unnamed: 1_level_1,Unnamed: 2_level_1
0,21502,1255
1,10068237,1413804


In [8]:
df_train_raw.pivot_table(index = 'Previously_Insured', columns = 'Response', values = 'id', aggfunc = 'count')

Response,0,1
Previously_Insured,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4766457,1411659
1,5323282,3400


In [9]:
df_train_raw.pivot_table(index = 'Vehicle_Age', columns = 'Response', values = 'id', aggfunc = 'count')

Response,0,1
Vehicle_Age,Unnamed: 1_level_1,Unnamed: 2_level_1
1-2 Year,4919406,1063272
< 1 Year,4835296,208849
> 2 Years,335037,142938


In [10]:
df_train_raw.pivot_table(index = 'Vehicle_Damage', columns = 'Response', values = 'id', aggfunc = 'count')

Response,0,1
Vehicle_Damage,Unnamed: 1_level_1,Unnamed: 2_level_1
No,5697548,24021
Yes,4392191,1391038


In [16]:
df_train = df_train_raw.sample(frac = 0.02, random_state = 42)

In [17]:
df_train.shape

(230096, 12)

In [18]:
from sklearn.model_selection import train_test_split

# Split the dataframe into features (X) and target variable (y)
X = df_train.drop('Response', axis=1)
y = df_train['Response']

X['Gender1'] = np.where(X['Gender'] == 'Male', 1, 0)
X['Vehicle_Damage1'] = np.where(X['Vehicle_Damage'] == 'Yes', 1, 0)

# Split the data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Select the categorical variables for building the decision tree
categorical_vars = ['Vehicle_Damage1', 'Previously_Insured', 'Driving_License']

X_categorical = X_train[categorical_vars]

# Initialize the decision tree classifier
dt_classifier = DecisionTreeClassifier()

# Fit the decision tree classifier on the categorical variables
dt_classifier.fit(X_categorical, y_train)

# Predict the response variable for the testing set
y_pred = dt_classifier.predict(X_test[categorical_vars])


# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      1.00      0.93     60532
           1       0.00      0.00      0.00      8497

    accuracy                           0.88     69029
   macro avg       0.44      0.50      0.47     69029
weighted avg       0.77      0.88      0.82     69029



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [20]:
y_pred1 = pd.DataFrame(y_pred)

In [21]:
y_pred1.shape

(69029, 1)

In [22]:
y_pred1.value_counts()

0
0    69029
Name: count, dtype: int64

In [23]:
df_train['Response'].value_counts()

Response
0    201714
1     28382
Name: count, dtype: int64

In [24]:
df_train.pivot_table(index = ['Vehicle_Damage', 'Vehicle_Age', 'Previously_Insured'], columns = 'Response', values = 'id', aggfunc = 'count')

Unnamed: 0_level_0,Unnamed: 1_level_0,Response,0,1
Vehicle_Damage,Vehicle_Age,Previously_Insured,Unnamed: 3_level_1,Unnamed: 4_level_1
No,1-2 Year,0,6717.0,264.0
No,1-2 Year,1,34956.0,22.0
No,< 1 Year,0,6058.0,187.0
No,< 1 Year,1,65871.0,6.0
No,> 2 Years,0,6.0,2.0
No,> 2 Years,1,3.0,
Yes,1-2 Year,0,53274.0,20946.0
Yes,1-2 Year,1,3297.0,31.0
Yes,< 1 Year,0,22543.0,4003.0
Yes,< 1 Year,1,2314.0,16.0
