<a href="https://colab.research.google.com/github/jmatt724/MachineLearningFinalProject/blob/main/FinalProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Car Insurance Dataset
### https://www.kaggle.com/datasets/ifteshanajnin/carinsuranceclaimprediction-classification

### Goal is to predict whether a policy holder will file a claim in the next six months

In [None]:
dataset = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/car_insurance.csv")

In [None]:
dataset.head()

Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,...,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,is_claim
0,ID00001,0.515874,0.05,0.644231,C1,4990,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
1,ID00002,0.672619,0.02,0.375,C2,27003,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
2,ID00003,0.84111,0.02,0.384615,C3,4076,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
3,ID00004,0.900277,0.11,0.432692,C4,21622,1,C1,M2,Petrol,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,2,0
4,ID00005,0.596403,0.11,0.634615,C5,34738,2,A,M3,Petrol,...,No,Yes,Yes,Yes,No,Yes,Yes,Yes,2,0


In [None]:
dataset.columns

Index(['policy_id', 'policy_tenure', 'age_of_car', 'age_of_policyholder',
       'area_cluster', 'population_density', 'make', 'segment', 'model',
       'fuel_type', 'max_torque', 'max_power', 'engine_type', 'airbags',
       'is_esc', 'is_adjustable_steering', 'is_tpms', 'is_parking_sensors',
       'is_parking_camera', 'rear_brakes_type', 'displacement', 'cylinder',
       'transmission_type', 'gear_box', 'steering_type', 'turning_radius',
       'length', 'width', 'height', 'gross_weight', 'is_front_fog_lights',
       'is_rear_window_wiper', 'is_rear_window_washer',
       'is_rear_window_defogger', 'is_brake_assist', 'is_power_door_locks',
       'is_central_locking', 'is_power_steering',
       'is_driver_seat_height_adjustable', 'is_day_night_rear_view_mirror',
       'is_ecw', 'is_speed_alert', 'ncap_rating', 'is_claim'],
      dtype='object')

In [None]:
dataset.dtypes

policy_id                            object
policy_tenure                       float64
age_of_car                          float64
age_of_policyholder                 float64
area_cluster                         object
population_density                    int64
make                                  int64
segment                              object
model                                object
fuel_type                            object
max_torque                           object
max_power                            object
engine_type                          object
airbags                               int64
is_esc                               object
is_adjustable_steering               object
is_tpms                              object
is_parking_sensors                   object
is_parking_camera                    object
rear_brakes_type                     object
displacement                          int64
cylinder                              int64
transmission_type               

In [None]:
dataset = dataset.drop(["policy_id","max_torque", "max_power", "engine_type"], axis = 1)

## Encode and get dummies for categorical variables

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
dataset['area_cluster'] = le.fit_transform(dataset['area_cluster'])
dataset['segment'] = le.fit_transform(dataset['segment'])
dataset['model'] = le.fit_transform(dataset['model'])
dataset['fuel_type'] = le.fit_transform(dataset['fuel_type'])
dataset['is_esc'] = le.fit_transform(dataset['is_esc'])
dataset['is_adjustable_steering'] = le.fit_transform(dataset['is_adjustable_steering'])
dataset['is_tpms'] = le.fit_transform(dataset['is_tpms'])
dataset['is_parking_camera'] = le.fit_transform(dataset['is_parking_camera'])
dataset['is_parking_sensors'] = le.fit_transform(dataset['is_parking_sensors'])
dataset['rear_brakes_type'] = le.fit_transform(dataset['rear_brakes_type'])
dataset['transmission_type'] = le.fit_transform(dataset['transmission_type'])
dataset['steering_type'] = le.fit_transform(dataset['steering_type'])

In [None]:
dummy_cols = []
for col in dataset.columns:
  if(dataset[col].dtype == np.object_):
    dummy_cols.append(col)
dataset = pd.get_dummies(dataset, columns=dummy_cols, drop_first = True)

In [None]:
X = dataset.drop("is_claim", axis = 1)
y = dataset["is_claim"]

## Scale the data for improved accuracy

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

dataset = scaler.fit_transform(dataset)

## Dataset is unbalanced

In [None]:
print(dataset['is_claim'].value_counts())

0    54844
1     3748
Name: is_claim, dtype: int64


### solving the problem over unbalanced data in 'is_claim'. RandomOverSampler generates new samples for the minority classes.

In [None]:
from imblearn.over_sampling import RandomOverSampler

rus = RandomOverSampler(random_state = 42)

X, y = rus.fit_resample(X,y)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

rand_for = RandomForestClassifier(max_depth = 15, random_state = 42)
rand_for.fit(X_train, y_train)

y_pred = rand_for.predict(X_test)
rf_acc = accuracy_score(y_test, y_pred)
print("Accuracy: ", rf_acc)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy:  0.8634454942262003
[[13609  4490]
 [  453 17646]]
              precision    recall  f1-score   support

           0       0.97      0.75      0.85     18099
           1       0.80      0.97      0.88     18099

    accuracy                           0.86     36198
   macro avg       0.88      0.86      0.86     36198
weighted avg       0.88      0.86      0.86     36198



## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
log_acc = accuracy_score(y_test, y_pred)
print("Accuracy: ", log_acc)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy:  0.5123211227139621
[[ 7334 10765]
 [ 6888 11211]]
              precision    recall  f1-score   support

           0       0.52      0.41      0.45     18099
           1       0.51      0.62      0.56     18099

    accuracy                           0.51     36198
   macro avg       0.51      0.51      0.51     36198
weighted avg       0.51      0.51      0.51     36198



## Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

nbClass = MultinomialNB(alpha = 1.0, class_prior =None, fit_prior= True)
nbClass.fit(X_train, y_train)
y_pred = nbClass.predict(X_test)
NB_acc = accuracy_score(y_test, y_pred)
print("Accuracy: ", NB_acc)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy:  0.5072655947842423
[[ 7096 11003]
 [ 6833 11266]]
              precision    recall  f1-score   support

           0       0.51      0.39      0.44     18099
           1       0.51      0.62      0.56     18099

    accuracy                           0.51     36198
   macro avg       0.51      0.51      0.50     36198
weighted avg       0.51      0.51      0.50     36198



## K-Nearest Neighbor

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knnClass = KNeighborsClassifier(n_neighbors=5, metric = 'minkowski', p=2)
knnClass.fit(X_train, y_train)
y_pred = knnClass.predict(X_test)
KNN_acc = accuracy_score(y_test, y_pred)
print("Accuracy: ", KNN_acc)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy:  0.8763743853251561
[[13682  4417]
 [   58 18041]]
              precision    recall  f1-score   support

           0       1.00      0.76      0.86     18099
           1       0.80      1.00      0.89     18099

    accuracy                           0.88     36198
   macro avg       0.90      0.88      0.87     36198
weighted avg       0.90      0.88      0.87     36198



## Decision Trees

In [None]:
from sklearn import tree

tree = tree.DecisionTreeClassifier()
tree.fit(X_train, y_train)
tree_acc = accuracy_score(y_test, y_pred)
y_pred = tree.predict(X_test)
print("Accuracy: ", tree_acc)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy:  0.8763743853251561
[[16476  1623]
 [    0 18099]]
              precision    recall  f1-score   support

           0       1.00      0.91      0.95     18099
           1       0.92      1.00      0.96     18099

    accuracy                           0.96     36198
   macro avg       0.96      0.96      0.96     36198
weighted avg       0.96      0.96      0.96     36198



## ANN Classification with GridSearch

In [None]:
print("Train X: ", X_train.shape)
print("Test X: ", X_test.shape)
print("Train y: ", y_train.shape)
print("Test y: ", y_test.shape)

Train X:  (73490, 39)
Test X:  (36198, 39)
Train y:  (73490,)
Test y:  (36198,)


In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

def build_clf(unit):
  ann = Sequential()
  ann.add(Dense(unit, activation = 'relu', input_shape=(39,)))
  ann.add(Dense(unit, activation = 'relu'))
  ann.add(Dense(1, activation = 'sigmoid'))
  ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
  return ann

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier

model = KerasClassifier(build_fn = build_clf)

  model = KerasClassifier(build_fn = build_clf)


In [None]:
from sklearn.model_selection import GridSearchCV

params = {
    'batch_size' : [8,16,32],
    'nb_epoch' : [30, 50],
    'unit' : [6, 12, 18, 24, 30]
}

In [None]:
grid_search = GridSearchCV(estimator = model, param_grid = params)
grid_search = grid_search.fit(X_train, y_train)



In [None]:
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_

print("Best Parameters ", best_parameters)
print("Best Accuracy ", best_accuracy)

Best Parameters  {'batch_size': 8, 'nb_epoch': 30, 'unit': 18}
Best Accuracy  0.510572874546051


In [None]:
y_pred = grid_search.predict(X_test)
y_pred = y_pred > 0.5



In [None]:
from sklearn.metrics import classification_report, confusion_matrix

confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.96      0.66     18099
           1       0.53      0.04      0.08     18099

    accuracy                           0.50     36198
   macro avg       0.52      0.50      0.37     36198
weighted avg       0.52      0.50      0.37     36198



In [None]:
from prettytable import PrettyTable

x = PrettyTable()
x.field_names = ['Model Name', 'Accuracy']
x.add_row(["Random Forest", rf_acc])
x.add_row(["Logisitc", log_acc])
x.add_row(["Naive Bayes", NB_acc])
x.add_row(["KNN", KNN_acc])
x.add_row(["DT", tree_acc])
x.add_row(["ANN", best_accuracy])
print(x)

+---------------+--------------------+
|   Model Name  |      Accuracy      |
+---------------+--------------------+
| Random Forest | 0.8634454942262003 |
|    Logisitc   | 0.5123211227139621 |
|  Naive Bayes  | 0.5072655947842423 |
|      KNN      | 0.8763743853251561 |
|       DT      | 0.8763743853251561 |
|      ANN      | 0.510572874546051  |
+---------------+--------------------+
