In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [105]:
df_train_raw1 = pd.read_csv('datasets/train.csv')
df_test = pd.read_csv('datasets/test.csv')

In [107]:
df_train_raw1['rowct'] = 1
summary_region = df_train_raw1.groupby('Region_Code').sum('Response').sort_values('Response', ascending=True)
summary_region['ResponseRate'] = summary_region['Response'] / summary_region['rowct']
summary_region[['ResponseRate', 'Response', 'rowct']].sort_values('ResponseRate', ascending=False).head(3)

Unnamed: 0_level_0,ResponseRate,Response,rowct
Region_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
28.0,0.191811,661951,3451062
38.0,0.19042,11537,60587
51.0,0.187766,353,1880


In [115]:
df_train_raw = df_train_raw1.merge(summary_region[['ResponseRate']], on='Region_Code', how='left')
df_train_raw['ResponseRate'] = df_train_raw['ResponseRate'].fillna(0)
df_train_raw['AgeGroup'] = np.where((df_train_raw['Age'] >= 25) & (df_train_raw['Age'] <= 72) , 1, 0)

df_test = df_test.merge(summary_region[['ResponseRate']], on='Region_Code', how='left')
df_test['ResponseRate'] = df_test['ResponseRate'].fillna(0)
df_test['AgeGroup'] = np.where((df_test['Age'] >= 25 ) & (df_test['Age'] <= 72 ) , 1, 0)

In [116]:
df_train_raw.columns

Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Response', 'rowct', 'ResponseRate',
       'AgeGroup'],
      dtype='object')

In [117]:
df_train_0 = df_train_raw[df_train_raw['Response'] == 0].sample(frac = 0.4, random_state = 42)
df_train_1 = df_train_raw[df_train_raw['Response'] == 1]
df_train = pd.concat([df_train_0, df_train_1], axis = 0)

In [118]:
df_train_raw.pivot_table( columns = 'Vehicle_Age', values = 'Response', aggfunc = 'mean')

Vehicle_Age,1-2 Year,< 1 Year,> 2 Years
Response,0.177725,0.041404,0.299049


In [119]:
df_train.value_counts('Response')

Response
0    4035896
1    1415059
Name: count, dtype: int64

In [136]:
from sklearn.model_selection import train_test_split

# Split the dataframe into features (X) and target variable (y)
X = df_train.drop('Response', axis=1)
y = df_train['Response']

X['Gender1'] = np.where(X['Gender'] == 'Male', 1, 0)
X['Vehicle_Damage1'] = np.where(X['Vehicle_Damage'] == 'Yes', 1, 0)
X['Vehicle_Age_over2'] = np.where(X['Vehicle_Age'] == '> 2 Years', 1, 0)



df_test['Gender1'] = np.where(df_test['Gender'] == 'Male', 1, 0)
df_test['Vehicle_Damage1'] = np.where(df_test['Vehicle_Damage'] == 'Yes', 1, 0)
df_test['Vehicle_Age_over2'] = np.where(df_test['Vehicle_Age'] == '> 2 Years', 1, 0)


# Select the categorical variables for building the decision tree
categorical_vars = ['Vehicle_Damage1', 'Driving_License', 'Previously_Insured']
numeric_vars = ['Annual_Premium', 'Age']
varused = categorical_vars + numeric_vars

# Split the data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [137]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

X_categorical = X_train[varused]

# Initialize the decision tree classifier
dt_classifier = DecisionTreeClassifier()

# Fit the decision tree classifier on the categorical variables
dt_classifier.fit(X_categorical, y_train)

# Predict the response variable for the testing set
y_pred = dt_classifier.predict(X_test[varused])


# Print the classification report
print('categorical_vars: ', categorical_vars)
print('numeric_vars: ', numeric_vars)
print(classification_report(y_test, y_pred))

categorical_vars:  ['Vehicle_Damage1', 'Driving_License', 'Previously_Insured']
numeric_vars:  ['Annual_Premium', 'Age']
              precision    recall  f1-score   support

           0       0.83      0.87      0.85   1211034
           1       0.57      0.47      0.52    424253

    accuracy                           0.77   1635287
   macro avg       0.70      0.67      0.68   1635287
weighted avg       0.76      0.77      0.76   1635287



Model 4:

categorical_vars = ['Vehicle_Damage1', 'Driving_License', 'Vehicle_Age_over2', 'Previously_Insured', 'AgeGroup']
numeric_vars = ['Annual_Premium', 'ResponseRate']
varused = categorical_vars + numeric_vars

              precision    recall  f1-score   support

           0       0.83      0.87      0.85   1211034
           1       0.58      0.51      0.54    424253

    accuracy                           0.78   1635287
   macro avg       0.71      0.69      0.70   1635287
weighted avg       0.77      0.78      0.77   1635287


Model 5:
categorical_vars:  ['Vehicle_Damage1', 'Driving_License', 'Previously_Insured', 'AgeGroup']
numeric_vars:  ['Annual_Premium', 'ResponseRate']
              precision    recall  f1-score   support

           0       0.84      0.87      0.85   1211034
           1       0.58      0.51      0.54    424253

    accuracy                           0.78   1635287
   macro avg       0.71      0.69      0.70   1635287
weighted avg       0.77      0.78      0.77   1635287



In [138]:
y_pred1 = pd.DataFrame(y_pred)

In [139]:
y_pred1.shape

(1635287, 1)

In [140]:
y_pred1.value_counts()

0
0    1283313
1     351974
Name: count, dtype: int64

In [141]:

# Use the decision tree model to predict the test dataset
X_test_categorical = df_test[varused]
y_pred_test = dt_classifier.predict(X_test_categorical)

# Create a dataframe with the predicted values
df_test_pred = pd.DataFrame({'id': df_test['id'], 'Response': y_pred_test})

# Print the predicted dataframe
df_test_pred.to_csv('datasets/test_pred.csv', index=False)

In [130]:
df_test_pred.value_counts('Response')

Response
0    6322409
1    1347457
Name: count, dtype: int64

In [131]:
df_test_pred.head()

Unnamed: 0,id,Response
0,11504798,0
1,11504799,1
2,11504800,0
3,11504801,0
4,11504802,0


In [132]:
X[varused].corr()

Unnamed: 0,Vehicle_Damage1,Driving_License,Vehicle_Age_over2,Previously_Insured,AgeGroup,Annual_Premium,ResponseRate
Vehicle_Damage1,1.0,-0.0103,0.197589,-0.851235,0.236563,0.025158,0.230117
Driving_License,-0.0103,1.0,-0.004954,0.008888,1.3e-05,-0.006892,-0.010886
Vehicle_Age_over2,0.197589,-0.004954,1.0,-0.184602,0.104359,0.070232,0.14403
Previously_Insured,-0.851235,0.008888,-0.184602,1.0,-0.237012,-0.004382,-0.214908
AgeGroup,0.236563,1.3e-05,0.104359,-0.237012,1.0,-0.009994,0.269317
Annual_Premium,0.025158,-0.006892,0.070232,-0.004382,-0.009994,1.0,0.326881
ResponseRate,0.230117,-0.010886,0.14403,-0.214908,0.269317,0.326881,1.0


In [133]:
df_train_raw.pivot_table(index= 'Age', values = 'Response', aggfunc = 'mean')

Unnamed: 0_level_0,Response
Age,Unnamed: 1_level_1
20,0.022438
21,0.031871
22,0.032953
23,0.033138
24,0.031142
...,...
81,0.058824
82,0.011765
83,0.049383
84,0.000000


Look into continuous variables vs response

In [45]:
df_train_raw[['Response', 'Age', 'Annual_Premium', 'Vintage']].corr()

Unnamed: 0,Response,Age,Annual_Premium,Vintage
Response,1.0,0.122134,0.032261,-0.015177
Age,0.122134,1.0,0.056327,-0.013293
Annual_Premium,0.032261,0.056327,1.0,0.003284
Vintage,-0.015177,-0.013293,0.003284,1.0


In [78]:
df_train_raw['AgeGroup'] = pd.cut(df_train_raw['Age'], bins = [0, 25, 73, 100], labels = ['0-24', '25-72', '73-100'])
df_train_raw.value_counts('AgeGroup')



AgeGroup
25-72     7783596
0-24      3538611
73-100     182591
Name: count, dtype: int64

Look into categorical variables vs response

In [79]:
summary_region = df_train_raw.pivot_table(index = 'AgeGroup', values = 'Response', aggfunc = 'mean')
summary_region.sort_values('Response', ascending = False)
#df_train_raw['Region_Code'].value_counts().sort_values(ascending = False)


  summary_region = df_train_raw.pivot_table(index = 'AgeGroup', values = 'Response', aggfunc = 'mean')


Unnamed: 0_level_0,Response
AgeGroup,Unnamed: 1_level_1
25-72,0.166255
73-100,0.050189
0-24,0.031603


In [82]:
df_train_raw.columns

Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Response'],
      dtype='object')