In [8]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [9]:
# Load the labelled training data; the relative path is resolved against the
# notebook's working directory.
train_csv = 'datasets/train.csv'
df_train_raw = pd.read_csv(train_csv)

In [10]:
# Display the column labels of the raw training frame.
df_train_raw.columns

Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Response'],
      dtype='object')

In [11]:
# Rebalance the classes: keep a seeded 20% random sample of the Response == 0
# rows and every Response == 1 row, then stack the two pieces row-wise.
# NOTE(review): the hold-out split further down is drawn from this rebalanced
# frame, so its class ratio is not that of the raw data.
is_negative = df_train_raw['Response'] == 0
is_positive = df_train_raw['Response'] == 1

df_train_0 = df_train_raw.loc[is_negative].sample(frac=0.2, random_state=42)
df_train_1 = df_train_raw.loc[is_positive]

df_train = pd.concat([df_train_0, df_train_1], axis=0)

In [12]:
# Row count per Response value in the downsampled training frame.
df_train.value_counts('Response')

Response
0    2017948
1    1415059
Name: count, dtype: int64

In [19]:
# Build the feature matrix / target vector and carve out a hold-out set.
# (`train_test_split` is imported in the notebook's import cell.)

# Separate the predictors (X) from the target (y).
X = df_train.drop(columns='Response')
y = df_train['Response']

# Derive 0/1 indicator columns from the text-valued columns first, so that
# every name in the feature list below already exists in X.
# Gender1 is derived here but is not part of `varused`.
X['Gender1'] = np.where(X['Gender'] == 'Male', 1, 0)
X['Vehicle_Damage1'] = np.where(X['Vehicle_Damage'] == 'Yes', 1, 0)

# Columns fed to the decision tree: three 0/1 flags plus two numeric columns.
categorical_vars = ['Vehicle_Damage1', 'Previously_Insured', 'Driving_License']
numeric_vars = ['Age', 'Annual_Premium']

varused = categorical_vars + numeric_vars

# Fail early with a readable message if a listed feature is absent, instead of
# hitting a KeyError later when the columns are selected for fitting.
assert set(varused) <= set(X.columns), (
    f"feature columns missing from X: {sorted(set(varused) - set(X.columns))}"
)

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [20]:
# Fit a decision tree on the selected feature columns and evaluate it on the
# hold-out split. (`DecisionTreeClassifier` and `classification_report` are
# imported in the notebook's import cell.)

# Training matrix restricted to the columns in `varused`. Despite the variable
# name, this includes the numeric columns as well as the 0/1 flags.
X_categorical = X_train[varused]

# Seed the classifier: scikit-learn permutes the candidate features at each
# split, so without a fixed random_state ties between equally good splits may
# be resolved differently from one run to the next.
dt_classifier = DecisionTreeClassifier(random_state=42)

# Fit on the training portion only.
dt_classifier.fit(X_categorical, y_train)

# Predict the response for the hold-out rows, using the same feature columns.
y_pred = dt_classifier.predict(X_test[varused])

# Per-class precision / recall / F1 on the hold-out set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.76      0.78    605180
           1       0.68      0.72      0.70    424723

    accuracy                           0.75   1029903
   macro avg       0.74      0.74      0.74   1029903
weighted avg       0.75      0.75      0.75   1029903



In [21]:
# Wrap the hold-out predictions in a single-column DataFrame for inspection.
y_pred1 = pd.DataFrame(data=y_pred)

In [22]:
# Dimensions (rows, columns) of the prediction frame.
y_pred1.shape

(1029903, 1)

In [25]:
# Number of hold-out predictions per predicted class.
y_pred1.value_counts()

0
0    580344
1    449559
Name: count, dtype: int64

In [None]:
# Read the unlabelled test data and add the same 0/1 indicator columns that
# were derived for the training features (Gender1, Vehicle_Damage1).
df_test = pd.read_csv('datasets/test.csv')
df_test = df_test.assign(
    Gender1=lambda frame: np.where(frame['Gender'] == 'Male', 1, 0),
    Vehicle_Damage1=lambda frame: np.where(frame['Vehicle_Damage'] == 'Yes', 1, 0),
)

# Show the resulting column labels, including the two derived columns.
df_test.columns

KeyError: "['Vehicle_Damage1'] not in index"

In [29]:
# List the column labels currently present in df_test.
df_test.columns

Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage'],
      dtype='object')

In [None]:

# Score the unlabelled test data with the fitted tree, selecting the same
# feature columns that were used for training.
X_test_categorical = df_test.loc[:, varused]
y_pred_test = dt_classifier.predict(X_test_categorical)

# Pair each test id with its predicted Response.
df_test_pred = pd.DataFrame(
    {
        'id': df_test['id'],
        'Response': y_pred_test,
    }
)

# Write the predictions to disk as CSV, without the index column.
df_test_pred.to_csv('datasets/test_pred.csv', index=False)