## **Library**

In [62]:
import time 
import pickle

import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report 

## **Load Data**

In [63]:
data = pd.read_csv('Prediction Insurance.csv')
data.head(1)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28,0,> 2 Years,Yes,40454,26,217,1


## Exploring Dataset 

In [64]:
data.shape

(381109, 12)

In [65]:
data.groupby(['Previously_Insured','Response']).agg({'id':'count'}).unstack()

Unnamed: 0_level_0,id,id
Response,0,1
Previously_Insured,Unnamed: 1_level_2,Unnamed: 2_level_2
0,159929,46552
1,174470,158


In [66]:
data.groupby(['Gender','Response']).agg({'id':'count'}).unstack()

Unnamed: 0_level_0,id,id
Response,0,1
Gender,Unnamed: 1_level_2,Unnamed: 2_level_2
Female,156835,18185
Male,177564,28525


In [67]:
data.groupby(['Driving_License','Response']).agg({'id':'count'}).unstack()

Unnamed: 0_level_0,id,id
Response,0,1
Driving_License,Unnamed: 1_level_2,Unnamed: 2_level_2
0,771,41
1,333628,46669


### Selecting Relevant Features for the analysis and model training.

In [68]:
data = data[['Gender','Driving_License','Previously_Insured','Response']]
data.head(5)

Unnamed: 0,Gender,Driving_License,Previously_Insured,Response
0,Male,1,0,1
1,Male,1,0,0
2,Male,1,0,1
3,Male,1,1,0
4,Female,1,1,0


### Preprocessing

In [69]:
#preprocessing
data['Gender'] = data['Gender'].map({'Male':0, 'Female':1})
data.head(5)

Unnamed: 0,Gender,Driving_License,Previously_Insured,Response
0,0,1,0,1
1,0,1,0,0
2,0,1,0,1
3,0,1,1,0
4,1,1,1,0


In [70]:
data.head(5)

Unnamed: 0,Gender,Driving_License,Previously_Insured,Response
0,0,1,0,1
1,0,1,0,0
2,0,1,0,1
3,0,1,1,0
4,1,1,1,0


### Data Splitting and Modeling

#### K-Nearest Neighbors (KNN)

In [71]:
#data modelling
x = data.drop('Response', axis=1) #inputan
y = data['Response'] #outputan

#proses belajar dan cara belajar
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=0)
start = time.time()
model = KNeighborsClassifier(n_neighbors=3)

#hasil belajar
model.fit(x_train, y_train)
stop = time.time()
print(f"Training Time {stop-start} Sekon.....")

Training Time 0.07907581329345703 Sekon.....


In [72]:
#model export
with open('modelKNN.pkl','wb') as file:
    pickle.dump(model, file)

In [73]:
#model evaluation
y_predict = model.predict(x_test)
print(classification_report(y_predict, y_test))

              precision    recall  f1-score   support

           0       1.00      0.88      0.93     76222
           1       0.00      0.00      0.00         0

    accuracy                           0.88     76222
   macro avg       0.50      0.44      0.47     76222
weighted avg       1.00      0.88      0.93     76222



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### KNN is a simple and effective algorithm that assigns a class to a data point based on the majority class of its nearest neighbors.
#### Evaluation: The performance metrics (like precision, recall, F1-score) are printed using classification_report.

#### Random Forest model

In [76]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import time
import pickle

# Data Preparation
x = data.drop('Response', axis=1)  # Features
y = data['Response']  # Target variable

# Train-test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Initialize and train the model
start = time.time()
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(x_train, y_train)
stop = time.time()
print(f"Training Time: {stop - start:.2f} seconds")

# Save the model
with open('modelRandomForest.pkl', 'wb') as file:
    pickle.dump(model, file)
print("Random Forest model saved successfully!")

# Evaluate the model
y_predict = model.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_predict))
print("Model Evaluation:")
print(classification_report(y_test, y_predict))


Training Time: 2.76 seconds
Random Forest model saved successfully!
Accuracy: 0.8750623179659416
Model Evaluation:
              precision    recall  f1-score   support

           0       0.88      1.00      0.93     66699
           1       0.00      0.00      0.00      9523

    accuracy                           0.88     76222
   macro avg       0.44      0.50      0.47     76222
weighted avg       0.77      0.88      0.82     76222



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Random Forest is an ensemble learning method that combines multiple decision trees to improve classification performance.
#### Parameters: n_estimators=100: Number of decision trees in the forest, random_state=42: Ensures reproducibility.

In [79]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Resample the data
smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(x, y)

# Split the resampled data
x_train, x_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=0.2, random_state=42)

# Train the Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(x_train, y_train)

# Evaluate the model
y_predict = model.predict(x_test)
print("Model Evaluation after SMOTE:\n")
print(classification_report(y_test, y_predict))


Model Evaluation after SMOTE:

              precision    recall  f1-score   support

           0       0.99      0.53      0.69     66775
           1       0.68      1.00      0.81     66985

    accuracy                           0.76    133760
   macro avg       0.83      0.76      0.75    133760
weighted avg       0.83      0.76      0.75    133760



#### SMOTE (Synthetic Minority Oversampling Technique) balances the dataset by creating synthetic samples for the minority class.
#### Performance is re-evaluated after resampling to observe improvements in metrics like recall and F1-score for the minority class.

#### XGBoost Classifier

In [77]:
# XGBoost Classifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import time
import pickle

# Data Preparation
x = data.drop('Response', axis=1)  # Features
y = data['Response']  # Target variable

# Train-test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = XGBClassifier(
    n_estimators=100,  # Number of trees
    max_depth=5,       # Maximum depth of each tree
    learning_rate=0.1, # Step size for weight updates
    scale_pos_weight=1, # Handles class imbalance by weighting classes differently
    objective='binary:logistic', # For binary classification
    use_label_encoder=False,
    random_state=42
)

start = time.time()
model.fit(x_train, y_train)
stop = time.time()
print(f"Training Time: {stop - start:.2f} seconds")

# Save the model
with open('modelXGBoost.pkl', 'wb') as file:
    pickle.dump(model, file)
print("XGBoost model saved successfully!")

# Evaluate the model
y_predict = model.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_predict))
print("Model Evaluation:")
print(classification_report(y_test, y_predict))


Parameters: { "use_label_encoder" } are not used.



Training Time: 0.24 seconds
XGBoost model saved successfully!
Accuracy: 0.8750623179659416
Model Evaluation:
              precision    recall  f1-score   support

           0       0.88      1.00      0.93     66699
           1       0.00      0.00      0.00      9523

    accuracy                           0.88     76222
   macro avg       0.44      0.50      0.47     76222
weighted avg       0.77      0.88      0.82     76222



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### XGBoost (Extreme Gradient Boosting) is a powerful and efficient boosting algorithm for classification tasks.
