<a href="https://colab.research.google.com/github/junaidkhan035/Insurance-Project/blob/main/Insurance_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


In [None]:
# Load the raw insurance dataset into `df`.
# NOTE(review): the path sits under /content/drive, which looks like a Colab
# Google Drive mount; no mount call is visible in this notebook, so presumably
# Drive has to be mounted before this cell runs — confirm.
df = pd.read_csv('/content/drive/MyDrive/insurance.csv')

In [None]:
df.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
0,67,119.8,1.56,2.92,False,Jaipur,retired,High
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium
4,69,62.2,1.6,3.94,True,Indore,retired,High


In [None]:
df.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
32,47,113.7,1.9,50.0,False,Jalandhar,private_job,Medium
71,38,54.1,1.81,20.25,False,Chandigarh,unemployed,Low
56,24,101.9,1.55,2.86,True,Kolkata,student,Medium
35,59,59.3,1.69,43.28,True,Chandigarh,private_job,Medium
33,73,67.5,1.76,1.46,False,Mumbai,retired,Medium


In [None]:
df['occupation'].unique()

array(['retired', 'freelancer', 'student', 'government_job',
       'business_owner', 'unemployed', 'private_job'], dtype=object)

In [None]:
# Work on a copy so the engineered columns added in later cells do not alter `df`.
df_feat = df.copy()

In [None]:
# Feature 1: BMI
# Body-mass index: weight divided by the square of height.
# NOTE(review): assumes weight is in kg and height in metres — confirm against the dataset.
df_feat['bmi'] = df_feat['weight'].div(df_feat['height'].pow(2))

In [None]:
# Feature 2: Age Group
def age_group(age):
  """Bucket a numeric age into a coarse life-stage label.

  Returns 'young' for ages below 25, 'adult' for ages below 45,
  'middle_aged' for ages below 60, and 'senior' for everything else.
  """
  # Exclusive upper bounds in ascending order; the first bracket that fits wins.
  brackets = ((25, 'young'), (45, 'adult'), (60, 'middle_aged'))
  for upper_bound, label in brackets:
    if age < upper_bound:
      return label
  return 'senior'

In [None]:
df_feat.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi
0,67,119.8,1.56,2.92,False,Jaipur,retired,High,49.227482
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low,30.189017
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low,21.118382
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium,45.5359
4,69,62.2,1.6,3.94,True,Indore,retired,High,24.296875


In [None]:
# Label every row with its age bracket (element-wise over the 'age' column).
df_feat['age_group'] = df_feat['age'].map(age_group)

In [None]:
df_feat.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group
0,67,119.8,1.56,2.92,False,Jaipur,retired,High,49.227482,senior
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low,30.189017,adult
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low,21.118382,adult
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium,45.5359,young
4,69,62.2,1.6,3.94,True,Indore,retired,High,24.296875,senior


In [None]:
# Feature 3: Lifestyle Risk
def lifestyle_risk(row):
  """Grade a record's lifestyle risk from its 'smoker' and 'bmi' entries.

  Two risk factors are considered: being a smoker and having a BMI above 30.
  Both present -> 'high', exactly one -> 'medium', none -> 'low'.
  """
  smokes = bool(row['smoker'])
  high_bmi = bool(row['bmi'] > 30)
  # Booleans add up as 0/1, so the sum is the number of risk factors present.
  risk_factor_count = smokes + high_bmi
  return ('low', 'medium', 'high')[risk_factor_count]

In [None]:
# Row-wise apply: lifestyle_risk reads both 'smoker' and 'bmi' from each record.
df_feat['lifestyle_risk'] = df_feat.apply(lifestyle_risk, axis='columns')

In [None]:
df_feat.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk
0,67,119.8,1.56,2.92,False,Jaipur,retired,High,49.227482,senior,medium
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low,30.189017,adult,medium
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low,21.118382,adult,low
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium,45.5359,young,high
4,69,62.2,1.6,3.94,True,Indore,retired,High,24.296875,senior,medium


In [None]:
# City names grouped by tier. city_tier() maps membership in these lists to
# tier 1 or 2 and treats every other city as tier 3.
tier_1_cities = [
    "Mumbai", "Delhi", "Bangalore", "Chennai",
    "Kolkata", "Hyderabad", "Pune",
]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi",
    "Visakhapatnam", "Coimbatore", "Bhopal", "Nagpur", "Vadodara", "Surat",
    "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi", "Agra",
    "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana",
    "Nashik", "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum",
    "Salem", "Vijayawada", "Tiruchirappalli", "Bhavnagar", "Gwalior", "Dhanbad",
    "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal", "Kolhapur",
    "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri",
]

In [None]:
# Feature 4: City Tier
def city_tier(city):
  """Return the tier number (1, 2 or 3) for a city name.

  A city listed in `tier_1_cities` is tier 1, one listed in `tier_2_cities`
  is tier 2, and any other value falls back to tier 3. Matching is an exact
  membership test against those lists.
  """
  if city in tier_1_cities:
    return 1
  return 2 if city in tier_2_cities else 3

In [None]:
# Translate each city name into its tier number (element-wise over 'city').
df_feat['city_tier'] = df_feat['city'].map(city_tier)

In [None]:
df_feat.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk,city_tier
0,67,119.8,1.56,2.92,False,Jaipur,retired,High,49.227482,senior,medium,2
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low,30.189017,adult,medium,1
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low,21.118382,adult,low,2
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium,45.5359,young,high,1
4,69,62.2,1.6,3.94,True,Indore,retired,High,24.296875,senior,medium,2


In [None]:
# Preview five random rows of the engineered features next to the target.
# The explicit column selection already leaves out the raw inputs (age, weight,
# height, smoker, city), so a separate drop() beforehand is unnecessary.
# No random_state is passed, so the sampled rows differ between runs.
df_feat[[
    'income_lpa', 'occupation', 'bmi', 'age_group',
    'lifestyle_risk', 'city_tier', 'insurance_premium_category',
]].sample(5)

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
78,14.74,freelancer,27.932798,middle_aged,medium,2,High
40,40.19,unemployed,24.349609,adult,medium,1,Medium
24,18.6,private_job,19.669038,middle_aged,low,2,Medium
91,28.467885,government_job,38.675103,adult,medium,1,Low
85,34.66,private_job,14.857209,adult,low,1,Low


In [None]:
# Select features and target
# Model inputs: the engineered columns plus income and occupation, in this order.
x = df_feat.loc[:, ['bmi', 'age_group', 'lifestyle_risk', 'city_tier', 'income_lpa', 'occupation']]
# Prediction target: the premium category label.
y = df_feat.loc[:, 'insurance_premium_category']

In [None]:
x

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,49.227482,senior,medium,2,2.92000,retired
1,30.189017,adult,medium,1,34.28000,freelancer
2,21.118382,adult,low,2,36.64000,freelancer
3,45.535900,young,high,1,3.34000,student
4,24.296875,senior,medium,2,3.94000,retired
...,...,...,...,...,...,...
95,21.420747,adult,low,2,19.64000,business_owner
96,47.984483,adult,medium,1,34.01000,private_job
97,18.765432,middle_aged,low,1,44.86000,freelancer
98,30.521676,adult,medium,1,28.30000,business_owner


In [None]:
y

Unnamed: 0,insurance_premium_category
0,High
1,Low
2,Low
3,Medium
4,High
...,...
95,Low
96,Low
97,Low
98,Low


In [None]:
# Define categorical and numeric features
# Columns routed to the one-hot encoder. city_tier holds the integers 1/2/3 but
# is listed here, so it is encoded as a category rather than used as a number.
categorical_features = [
    'age_group',
    'lifestyle_risk',
    'occupation',
    'city_tier',
]
# Columns handed to the model without any transformation.
numeric_features = [
    'bmi',
    'income_lpa',
]

In [None]:
# Create column transformer for OHE
# NOTE: `preprocessor` is assigned again in the next cell; keep the two
# definitions in sync (or delete one) so a top-to-bottom run stays unambiguous.
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore' encodes a category that was absent from the
        # training split as all zeros instead of raising at predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        # Numeric columns are forwarded unchanged.
        ('num', 'passthrough', numeric_features)
    ]
)

In [None]:
# Create column transformer for OHE
# NOTE: this assignment rebinds the `preprocessor` built in the previous cell
# and is the object the pipeline cell below picks up.
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore" encodes a category that was absent from the
        # training split as all zeros instead of raising at predict time.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        # Numeric columns are forwarded unchanged.
        ("num", "passthrough", numeric_features)
    ]
)

In [None]:
# Create a pipeline with preprocessing and random forest classifier
# Chaining the two steps means the encoder and the model are fitted and
# applied together through a single object.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # Fixed seed so the forest is built the same way on every run.
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [None]:
# Split data and train model
# Hold out 20% of the rows for evaluation; the seed fixes which rows those are.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)
# Fit the encoder and the forest on the training portion only.
pipeline.fit(x_train, y_train)

In [None]:
# Predict and evaluate
# Score the held-out rows; the cell displays the fraction predicted correctly.
y_pred = pipeline.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.85

In [None]:
x_test.sample(5)

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
78,27.932798,middle_aged,medium,2,14.74,freelancer
81,31.866055,adult,high,2,22.19,freelancer
17,31.176471,senior,medium,1,2.23,retired
92,18.319942,adult,medium,2,30.0,government_job
82,17.874812,adult,low,1,12.96,unemployed


In [None]:
# Save the trained pipeline using pickle
# `pickle` is imported in the notebook's import cell together with the other
# modules, so every dependency is visible in one place.
# The whole fitted pipeline (preprocessing + classifier) is written as one object.
# NOTE: only unpickle this file from a trusted source.
pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as f:
    pickle.dump(pipeline, f)