In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dataframe-for-toy-fastapi/insurance.csv


In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score

In [3]:
# Load insurance.csv from the read-only Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/dataframe-for-toy-fastapi/insurance.csv")

In [5]:
# Peek at five randomly chosen rows (no random_state is passed, so the rows differ per run).
df.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
49,23,106.6,1.58,2.29,False,Kota,student,Medium
62,34,72.8,1.83,35.67,False,Chennai,business_owner,Low
25,59,60.2,1.55,30.0,False,Mysore,government_job,Low
4,69,62.2,1.6,3.94,True,Indore,retired,High
90,59,54.0,1.6,21.07,False,Mumbai,business_owner,Low


In [6]:
# Work on a copy so the feature columns added below do not modify the loaded frame `df`.
df_feat = df.copy() 

In [7]:
# Feature 1: BMI = weight / height^2
# (the sample rows suggest weight in kg and height in metres — TODO confirm units).
df_feat["bmi"] = df_feat["weight"].div(df_feat["height"].pow(2))


In [8]:
# Feature 2: age group
def age_group(age):
    """Bucket an age into a coarse life-stage label.

    Returns "young" for ages below 25, "adult" for ages below 45,
    "middle_aged" for ages below 60, and "senior" for every other value.
    """
    # Upper bounds are exclusive and checked in ascending order.
    for upper_bound, label in ((25, "young"), (45, "adult"), (60, "middle_aged")):
        if age < upper_bound:
            return label
    return "senior"
        

In [9]:
# Derive the age_group column by calling age_group() on each value of the age column.
df_feat["age_group"] = df_feat["age"].apply(age_group)


In [12]:
# Feature 3: lifestyle risk
def lifestyle_risk(row):
    """Classify a record's lifestyle risk from its smoker flag and BMI.

    `row` must support lookup by the keys "smoker" and "bmi".

    Returns "high" for a smoker whose BMI is above 30, "medium" for any other
    smoker or for a non-smoker whose BMI is above 27, and "low" otherwise.
    """
    is_smoker = row["smoker"]
    bmi = row["bmi"]
    if is_smoker:
        # A smoker is never "low"; the BMI threshold only separates high from medium.
        return "high" if bmi > 30 else "medium"
    return "medium" if bmi > 27 else "low"

In [13]:
# axis=1 passes each row to lifestyle_risk(), which reads that row's "smoker" and "bmi" values.
df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk,axis=1)

In [14]:
# City names that city_tier() maps to tier 1.
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
# City names that city_tier() maps to tier 2; any city in neither list is treated as tier 3.
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]

In [15]:
# Feature 4: city tier

def city_tier(city):
    """Map a city name to its tier number.

    Returns 1 when the name appears in `tier_1_cities`, 2 when it appears in
    `tier_2_cities`, and 3 for any other value. Matching is an exact
    (case-sensitive) membership test against those module-level lists.
    """
    if city in tier_1_cities:
        return 1
    return 2 if city in tier_2_cities else 3
        

In [18]:
# Derive the city_tier column (1, 2 or 3) by calling city_tier() on each city name.
df_feat["city_tier"] = df_feat["city"].apply(city_tier)

In [19]:
# Spot-check two random rows to confirm the four engineered columns were added.
df_feat.sample(2)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk,city_tier
8,73,58.0,1.58,1.78,False,Chandigarh,retired,Medium,23.233456,senior,low,2
93,23,79.4,1.85,1.28,False,Indore,student,Low,23.199416,young,low,2


In [20]:
# Preview only: drop() returns a new DataFrame and the result is not assigned,
# so df_feat itself still contains these five columns afterwards.
df_feat.drop(columns=['age','weight','height','smoker','city'])

Unnamed: 0,income_lpa,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk,city_tier
0,2.92000,retired,High,49.227482,senior,medium,2
1,34.28000,freelancer,Low,30.189017,adult,medium,1
2,36.64000,freelancer,Low,21.118382,adult,low,2
3,3.34000,student,Medium,45.535900,young,high,1
4,3.94000,retired,High,24.296875,senior,medium,2
...,...,...,...,...,...,...,...
95,19.64000,business_owner,Low,21.420747,adult,low,2
96,34.01000,private_job,Low,47.984483,adult,medium,1
97,44.86000,freelancer,Low,18.765432,middle_aged,low,1
98,28.30000,business_owner,Low,30.521676,adult,medium,1


In [21]:
# Model inputs: the four engineered features plus income and occupation.
feature_columns = [
    "bmi",
    "age_group",
    "lifestyle_risk",
    "city_tier",
    "income_lpa",
    "occupation",
]
x = df_feat[feature_columns]

# Target: the premium category label the classifier will predict.
y = df_feat["insurance_premium_category"]

In [22]:
# Display the feature frame selected above.
x


Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,49.227482,senior,medium,2,2.92000,retired
1,30.189017,adult,medium,1,34.28000,freelancer
2,21.118382,adult,low,2,36.64000,freelancer
3,45.535900,young,high,1,3.34000,student
4,24.296875,senior,medium,2,3.94000,retired
...,...,...,...,...,...,...
95,21.420747,adult,low,2,19.64000,business_owner
96,47.984483,adult,medium,1,34.01000,private_job
97,18.765432,middle_aged,low,1,44.86000,freelancer
98,30.521676,adult,medium,1,28.30000,business_owner


In [25]:
# Display the target series.
y

0       High
1        Low
2        Low
3     Medium
4       High
       ...  
95       Low
96       Low
97       Low
98       Low
99       Low
Name: insurance_premium_category, Length: 100, dtype: object

In [26]:
# Columns routed to the one-hot encoder. city_tier holds the integers 1/2/3 but is
# listed here, so it is encoded as a category rather than used as a number.
categorical_features = ["age_group","lifestyle_risk","occupation","city_tier"]
# Columns passed to the model unchanged.
numeric_feature = ["bmi","income_lpa"]

In [27]:
# Column-wise preprocessing: one-hot encode the categorical columns and pass the
# numeric columns through unchanged.
# handle_unknown="ignore" makes the encoder output all zeros for a category it did
# not see during fit instead of raising at transform/predict time. The training
# split is small, so it may not contain every occupation or age group that shows
# up later at prediction time.
preprocessor = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ("num", "passthrough", numeric_feature)
])

In [28]:
# Chain preprocessing and the model so that fit/predict accept the raw feature frame.
classifier = RandomForestClassifier(random_state=42)  # fixed seed for a repeatable forest
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", classifier),
    ]
)

In [29]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so the split is repeatable.
X_train , X_test , y_train , y_test = train_test_split(x,y,test_size=0.2,random_state =1)
# Fit the preprocessing steps and the classifier on the training portion only.
pipeline.fit(X_train,y_train)

In [30]:
# Predict on the held-out rows and report the fraction of labels predicted correctly.
y_pred = pipeline.predict(X_test)
accuracy_score(y_test,y_pred)

0.9

In [31]:
# Show three random held-out rows (no random_state is passed, so the rows differ per run).
X_test.sample(3)

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
84,28.801497,senior,medium,2,0.62,retired
10,22.949982,adult,medium,1,32.78,business_owner
82,17.874812,adult,low,1,12.96,unemployed


In [40]:
import pickle
# Save the trained pipeline (preprocessing + classifier) using pickle.
# NOTE: only unpickle this file from a trusted source — pickle.load can execute arbitrary code.
pickle_model_path = "model.pkl"
# The relative path is resolved against the current working directory.
with open(pickle_model_path,"wb") as f:
    pickle.dump(pipeline,f)