In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
sns.set(style='whitegrid')

import warnings
warnings.filterwarnings("ignore")

In [6]:
df = pd.read_csv("data.csv")
df.drop(columns="id", inplace=True)
df.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.compose import ColumnTransformer

train_target=df['Response']
train=df.drop(['Response'], axis = 1)
x_train,x_test,y_train,y_test = train_test_split(train,train_target, random_state = 0)

# mapping 'Gender' col to (0, 1) values
x_train['Gender'] = x_train['Gender'].map( {'Female': 0, 'Male': 1} ).astype(int)


x_train=pd.get_dummies(x_train,drop_first=True)
x_train['Vehicle_Age_< 1 Year'] = x_train['Vehicle_Age_< 1 Year'].astype('int')
x_train['Vehicle_Age_> 2 Years'] = x_train['Vehicle_Age_> 2 Years'].astype('int')
x_train['Vehicle_Damage_Yes'] = x_train['Vehicle_Damage_Yes'].astype('int')


num_feat = ['Age','Vintage']
"""ss = StandardScaler()
x_train[num_feat] = ss.fit_transform(x_train[num_feat])


mm = MinMaxScaler()
x_train[['Annual_Premium']] = mm.fit_transform(x_train[['Annual_Premium']])
x_train.head(2)"""


numeric_transformer = StandardScaler()
min_max_scaler = MinMaxScaler()
preprocessor = ColumnTransformer(
                transformers=[
                    ("StandardScaler", numeric_transformer, num_feat),
                    ("MinMaxScaler", min_max_scaler, ['Annual_Premium'])
                ],
                remainder='passthrough'  # Leaves other columns as they are
            )

final_pipeline = Pipeline(steps=[("Preprocessor", preprocessor)])
input_feature_train_arr = preprocessor.fit_transform(x_train)


x_train.head(2)

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Vehicle_Age_< 1 Year,Vehicle_Age_> 2 Years,Vehicle_Damage_Yes
216376,0,49,1,46.0,0,16588.0,26.0,149,0,0,1
154701,1,56,1,41.0,1,2630.0,12.0,143,0,0,1


In [8]:
df.head(2)

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

my_params = {'n_estimators': [300], 
             'min_samples_split': [7], 
             'min_samples_leaf': [6],
             'max_depth': [10],
             'criterion': ['entropy']}

clf = RandomForestClassifier()
model = RandomizedSearchCV(estimator = clf, param_distributions = my_params, n_iter = 10, 
                               cv = 4, verbose= 1, random_state= 101, n_jobs = -1)
# for above line of code, feel free to make changes if can be made better

model.fit(x_train,y_train)

Fitting 4 folds for each of 1 candidates, totalling 4 fits


0,1,2
,estimator,RandomForestClassifier()
,param_distributions,"{'criterion': ['entropy'], 'max_depth': [10], 'min_samples_leaf': [6], 'min_samples_split': [7], ...}"
,n_iter,10
,scoring,
,n_jobs,-1
,refit,True
,cv,4
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,101

0,1,2
,n_estimators,300
,criterion,'entropy'
,max_depth,10
,min_samples_split,7
,min_samples_leaf,6
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
# 1️⃣ Copy test set to avoid modifying the original
x_test_copy = x_test.copy()

# 2️⃣ Map 'Gender' column
x_test_copy['Gender'] = x_test_copy['Gender'].map({'Female': 0, 'Male': 1}).astype(int)

# 3️⃣ Apply one-hot encoding with same columns as train
x_test_copy = pd.get_dummies(x_test_copy, drop_first=True)

# 4️⃣ Add missing columns (in case some dummy categories were absent in test)
missing_cols = set(x_train.columns) - set(x_test_copy.columns)
for c in missing_cols:
    x_test_copy[c] = 0

# 5️⃣ Ensure same column order as train
x_test_copy = x_test_copy[x_train.columns]

# 6️⃣ Convert categorical dummies to int
x_test_copy['Vehicle_Age_< 1 Year'] = x_test_copy['Vehicle_Age_< 1 Year'].astype('int')
x_test_copy['Vehicle_Age_> 2 Years'] = x_test_copy['Vehicle_Age_> 2 Years'].astype('int')
x_test_copy['Vehicle_Damage_Yes'] = x_test_copy['Vehicle_Damage_Yes'].astype('int')

# 7️⃣ Apply same StandardScaler and MinMaxScaler (use transform, NOT fit_transform)
x_test_copy[num_feat] = ss.transform(x_test_copy[num_feat])
x_test_copy[['Annual_Premium']] = mm.transform(x_test_copy[['Annual_Premium']])

x_test_copy.head(2)

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Vehicle_Age_< 1 Year,Vehicle_Age_> 2 Years,Vehicle_Damage_Yes
220145,0,-0.955308,1,8.0,1,0.067184,152.0,1.347143,1,0,0
194704,0,0.141098,1,28.0,0,0.041484,124.0,0.021023,0,0,1


In [15]:
y_pred = model.predict(x_test_copy)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.877453347047587

Classification Report:
               precision    recall  f1-score   support

           0       0.88      1.00      0.93     83603
           1       0.00      0.00      0.00     11675

    accuracy                           0.88     95278
   macro avg       0.44      0.50      0.47     95278
weighted avg       0.77      0.88      0.82     95278


Confusion Matrix:
 [[83602     1]
 [11675     0]]


In [16]:
import joblib

joblib.dump(ss, 'scaler_standard.pkl')
joblib.dump(mm, 'scaler_minmax.pkl')
joblib.dump(model, 'rf_model.pkl')

['rf_model.pkl']

In [None]:
import pandas as pd
import joblib

# Load saved objects
ss = joblib.load('scaler_standard.pkl')
mm = joblib.load('scaler_minmax.pkl')
model = joblib.load('rf_model.pkl')


def preprocess_input(user_input: dict):
    df = pd.DataFrame([user_input])

    # Map Gender
    df['Gender'] = df['Gender'].map({'Female': 0, 'Male': 1}).astype(int)

    # Manual encoding for Vehicle_Age and Vehicle_Damage
    df['Vehicle_Age_< 1 Year'] = (df['Vehicle_Age'] == '< 1 Year').astype(int)
    df['Vehicle_Age_> 2 Years'] = (df['Vehicle_Age'] == '> 2 Years').astype(int)
    df['Vehicle_Damage_Yes'] = (df['Vehicle_Damage'] == 'Yes').astype(int)

    # Drop original string columns
    df.drop(['Vehicle_Age', 'Vehicle_Damage'], axis=1, inplace=True)

    # One-hot encode other categorical columns (if any)
    df = pd.get_dummies(df, drop_first=True)

    # Add missing columns
    expected_cols = model.best_estimator_.feature_names_in_
    for col in expected_cols:
        if col not in df.columns:
            df[col] = 0

    # Reorder
    df = df[expected_cols]

    display(df)

    # Scale numeric columns
    df[['Age', 'Vintage']] = ss.transform(df[['Age', 'Vintage']])
    df[['Annual_Premium']] = mm.transform(df[['Annual_Premium']])

    return df

def predict_insurance_response(user_input: dict):
    """
    Predicts insurance response (1 or 0) from raw user input.
    """
    processed = preprocess_input(user_input)
    prediction = model.predict(processed)
    return int(prediction[0])


In [30]:
sample_input = {
    "Gender": "Male",
    "Age": 76,
    "Driving_License": 1,
    "Region_Code": 3.0,
    "Previously_Insured": 0,
    "Vehicle_Age": "1-2 Years",
    "Vehicle_Damage": "No",
    "Annual_Premium": 33536.0,
    "Policy_Sales_Channel": 26.0,
    "Vintage": 183
}

result = predict_insurance_response(sample_input)
print("Predicted Response:", result)


Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Vehicle_Age_< 1 Year,Vehicle_Age_> 2 Years,Vehicle_Damage_Yes
0,1,76,1,3.0,0,33536.0,26.0,183,0,0,0


Predicted Response: 0


In [None]:
Male	76	1	3.0	0	1-2 Year	No	33536.0	26.0	183