<a href="https://colab.research.google.com/github/karim-mammadov/Kaggle-Datasets-MyMLProject/blob/main/Insurance_Datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Fetch the dataset archive with the Kaggle CLI (the recorded output shows it saved under /content).
!kaggle datasets download samialyasin/insurance-data-personal-auto-line-of-business

Dataset URL: https://www.kaggle.com/datasets/samialyasin/insurance-data-personal-auto-line-of-business
License(s): MIT
Downloading insurance-data-personal-auto-line-of-business.zip to /content
  0% 0.00/154k [00:00<?, ?B/s]
100% 154k/154k [00:00<00:00, 67.9MB/s]


In [None]:
# Extract the archive next to where the loader expects the CSV (/content).
# -o overwrites without prompting, so re-running this cell does not stall on
# an "already exists" question; -d pins the output directory instead of
# relying on the current working directory.
!unzip -o /content/insurance-data-personal-auto-line-of-business.zip -d /content

Archive:  /content/insurance-data-personal-auto-line-of-business.zip
  inflating: synthetic_insurance_data.csv  


# Data Cleaning and Data Visualization

In [None]:
# Lift the column cap so rendered DataFrames are not limited in width.
pd.options.display.max_columns = None

In [None]:
# Load the extracted CSV into the working frame and preview it.
DATA_PATH = '/content/synthetic_insurance_data.csv'
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,Age,Is_Senior,Marital_Status,Married_Premium_Discount,Prior_Insurance,Prior_Insurance_Premium_Adjustment,Claims_Frequency,Claims_Severity,Claims_Adjustment,Policy_Type,...,Time_Since_First_Contact,Conversion_Status,Website_Visits,Inquiries,Quotes_Requested,Time_to_Conversion,Credit_Score,Premium_Adjustment_Credit,Region,Premium_Adjustment_Region
0,47,0,Married,86,1-5 years,50,0,Low,0,Full Coverage,...,10,0,5,1,2,99,704,-50,Suburban,50
1,37,0,Married,86,1-5 years,50,0,Low,0,Full Coverage,...,22,0,5,1,2,99,726,-50,Urban,100
2,49,0,Married,86,1-5 years,50,1,Low,50,Full Coverage,...,28,0,4,4,1,99,772,-50,Urban,100
3,62,1,Married,86,>5 years,0,1,Low,50,Full Coverage,...,4,1,6,2,2,2,809,-50,Urban,100
4,36,0,Single,0,>5 years,0,2,Low,100,Full Coverage,...,14,1,8,4,2,10,662,50,Suburban,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,59,1,Single,0,1-5 years,50,0,Low,0,Full Coverage,...,6,1,4,3,2,9,783,-50,Urban,100
9996,18,0,Married,86,1-5 years,50,0,Medium,0,Full Coverage,...,3,1,6,1,3,6,667,50,Urban,100
9997,29,0,Married,86,<1 year,100,0,Low,0,Full Coverage,...,29,1,3,4,3,3,637,50,Urban,100
9998,47,0,Single,0,<1 year,100,0,Medium,0,Liability-Only,...,8,1,2,4,1,13,676,50,Suburban,50


In [None]:
# Class balance of the target variable.
ax = sns.countplot(data=df, x='Conversion_Status')
ax.set_title('Conversion Status Distribution')
ax.set_xlabel('Conversion Status')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Histogram of customer age with a KDE overlay.
age_values = df['Age']
ax = sns.histplot(age_values, bins=30, kde=True)
ax.set_title('Age Distribution')
plt.show()

In [None]:
# Conversion counts split by marital status.
ax = sns.countplot(data=df, x='Marital_Status', hue='Conversion_Status')
ax.set_title('Marital Status vs Conversion Status')
plt.show()

In [None]:
# Conversion counts split by policy type; tick labels are tilted for legibility.
ax = sns.countplot(data=df, x='Policy_Type', hue='Conversion_Status')
ax.set_title('Policy Type vs Conversion Status')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Side-by-side histograms of the two claims columns.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

claims_panels = (
    ('Claims_Frequency', 'Claims Frequency Distribution'),
    ('Claims_Severity', 'Claims Severity Distribution'),
)
for panel_ax, (column, panel_title) in zip(axes, claims_panels):
    sns.histplot(df[column], bins=20, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.show()

In [None]:
# Distinct regional premium adjustments present in the data.
pd.unique(df['Premium_Adjustment_Region'])

array([ 50, 100,   0])

In [None]:
# Per-column count of missing values.
df.isnull().sum()

Unnamed: 0,0
Age,0
Is_Senior,0
Marital_Status,0
Married_Premium_Discount,0
Prior_Insurance,0
Prior_Insurance_Premium_Adjustment,0
Claims_Frequency,0
Claims_Severity,0
Claims_Adjustment,0
Policy_Type,0


In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated(keep='first').sum()

0

In [None]:
# List the column labels of the loaded frame.
df.columns

Index(['Age', 'Is_Senior', 'Marital_Status', 'Married_Premium_Discount',
       'Prior_Insurance', 'Prior_Insurance_Premium_Adjustment',
       'Claims_Frequency', 'Claims_Severity', 'Claims_Adjustment',
       'Policy_Type', 'Policy_Adjustment', 'Premium_Amount',
       'Safe_Driver_Discount', 'Multi_Policy_Discount', 'Bundling_Discount',
       'Total_Discounts', 'Source_of_Lead', 'Time_Since_First_Contact',
       'Conversion_Status', 'Website_Visits', 'Inquiries', 'Quotes_Requested',
       'Time_to_Conversion', 'Credit_Score', 'Premium_Adjustment_Credit',
       'Region', 'Premium_Adjustment_Region'],
      dtype='object')

In [None]:
# Show the dtype of each column (numeric vs. object) ahead of preprocessing.
df.dtypes

Unnamed: 0,0
Age,int64
Is_Senior,int64
Marital_Status,object
Married_Premium_Discount,int64
Prior_Insurance,object
Prior_Insurance_Premium_Adjustment,int64
Claims_Frequency,int64
Claims_Severity,object
Claims_Adjustment,int64
Policy_Type,object


In [None]:
# How many distinct credit-based premium adjustments exist (missing values excluded).
df['Premium_Adjustment_Credit'].nunique(dropna=True)

2

In [None]:
# Storage dtype of the credit-based premium adjustment column.
df['Premium_Adjustment_Credit'].dtype

dtype('int64')

In [None]:
# Storage dtype of the prior-insurance column.
df['Prior_Insurance'].dtype

dtype('O')

In [None]:
# Distinct prior-insurance categories, in order of first appearance.
pd.unique(df['Prior_Insurance'])

array(['1-5 years', '>5 years', '<1 year'], dtype=object)

# Build a Model

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Separate the target from the predictors BEFORE splitting: X and y must
# already exist when train_test_split runs, otherwise a fresh-kernel
# "Run All" fails with a NameError.
# NOTE(review): the recorded score output is 1.0, and in the data preview every
# unconverted row has Time_to_Conversion == 99 -- this column looks like
# target leakage; confirm and consider excluding it from X.
X = df.drop(columns='Conversion_Status')
y = df['Conversion_Status'].copy()

# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=13
)

In [None]:
# Partition the training columns by dtype: numeric ones are scaled,
# everything else is treated as categorical.
num_features = X_train.select_dtypes(include='number').columns
cat_features = X_train.select_dtypes(exclude='number').columns

In [None]:
# Numeric branch: fill gaps with the median, then standardise.
# (The scaler step keeps its existing name 'encode'.)
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('encode', StandardScaler()),
])

# Categorical branch: fill gaps with a 'missing' label, then one-hot encode.
# Categories unseen during fit are ignored at transform time.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encode', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group through its branch; any column in neither list
# is passed through unchanged.
transformer = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features),
    ],
    remainder='passthrough',
)

# Baseline classifier.
estimator = LogisticRegression(penalty='l2', solver='lbfgs', random_state=13)

# Preprocessing and classifier chained into a single estimator.
full_pipeline = Pipeline(steps=[
    ('preprocessing', transformer),
    ('estimator', estimator),
])

In [None]:
# Fit preprocessing and the logistic model on the training split.
full_pipeline.fit(X=X_train, y=y_train)

In [None]:
# Accuracy of the logistic pipeline on the training and held-out splits.
train_accuracy = full_pipeline.score(X_train, y_train)
test_accuracy = full_pipeline.score(X_test, y_test)
train_accuracy, test_accuracy

1.0

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Base learners whose predictions feed the meta-learner.
estimators = [
    ('rf', RandomForestClassifier(random_state=13)),
    ('svm', SVC(probability=True, random_state=13)),
    ('dt', DecisionTreeClassifier(random_state=13)),
]

# Meta-learner that combines the base learners' outputs.
final_estimator = LogisticRegression(penalty='l2', solver='lbfgs', random_state=13)

stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=final_estimator,
)



In [None]:
# Same preprocessing as the baseline, followed by the stacking ensemble.
# NOTE(review): this reuses the `transformer` object already placed in
# full_pipeline, so the two pipelines presumably share its fitted state --
# confirm that is intended.
pipeline = Pipeline([
    ('preprocessor', transformer),
    ('stacking', stacking_model),
])

In [None]:
# Fit preprocessing and the stacking ensemble on the training split.
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Accuracy of the stacking pipeline on the training and held-out splits.
# Both scores come from `pipeline` (the stacking model), so the pair is
# comparable with the baseline scores reported for `full_pipeline`.
pipeline.score(X_train, y_train), pipeline.score(X_test, y_test)

1.0