In [258]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

In [259]:
DATA_PATH = os.path.join(os.path.expanduser("~"), "ayush", "hackathon", "Zonals", 
                        "VidyaSetu-hackathon-utilities", "Supervised learning", "Data", "insurance.csv")
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


# EDA

In [260]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [261]:
df.shape

(1338, 7)

In [262]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [263]:
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [264]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# Data Cleaning and preprocessing

In [265]:
#remove duplicates

df.drop_duplicates(inplace=True)
df.shape

(1337, 7)

In [266]:
# total null values

df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [267]:
df.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [268]:
#value counts

print(df['sex'].value_counts())
print(df['smoker'].value_counts())
print(df['region'].value_counts())

sex
male      675
female    662
Name: count, dtype: int64
smoker
no     1063
yes     274
Name: count, dtype: int64
region
southeast    364
southwest    325
northwest    324
northeast    324
Name: count, dtype: int64


In [269]:
# label encoding

df['sex'] = df['sex'].map({"male" : 0, "female" : 1})
df['smoker'] = df['smoker'].map({"yes" : 1, "no" : 0})

#renaming of column

df.rename(columns={
    "sex" : "is_female",
    "smoker" : "is_smoker"
},inplace=True)
df

Unnamed: 0,age,is_female,bmi,children,is_smoker,region,charges
0,19,1,27.900,0,1,southwest,16884.92400
1,18,0,33.770,1,0,southeast,1725.55230
2,28,0,33.000,3,0,southeast,4449.46200
3,33,0,22.705,0,0,northwest,21984.47061
4,32,0,28.880,0,0,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,0,30.970,3,0,northwest,10600.54830
1334,18,1,31.920,0,0,northeast,2205.98080
1335,18,1,36.850,0,0,southeast,1629.83350
1336,21,1,25.800,0,0,southwest,2007.94500


In [270]:
# generating multiple columns for region

df = pd.get_dummies(df,columns=['region'])


In [271]:
#converting all the columns in 'int' 
df = df.astype(int)

In [272]:
df

Unnamed: 0,age,is_female,bmi,children,is_smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,1,27,0,1,16884,0,0,0,1
1,18,0,33,1,0,1725,0,0,1,0
2,28,0,33,3,0,4449,0,0,1,0
3,33,0,22,0,0,21984,0,1,0,0
4,32,0,28,0,0,3866,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,0,30,3,0,10600,0,1,0,0
1334,18,1,31,0,0,2205,1,0,0,0
1335,18,1,36,0,0,1629,0,0,1,0
1336,21,1,25,0,0,2007,0,0,0,1


In [273]:
df['bmi_category'] = pd.cut(
    df['bmi'],
    bins=[0,18.5,24.9,29.9,float('inf')],
    labels=['Underweight','Normal','Overweight','Obese']
)

df

Unnamed: 0,age,is_female,bmi,children,is_smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest,bmi_category
0,19,1,27,0,1,16884,0,0,0,1,Overweight
1,18,0,33,1,0,1725,0,0,1,0,Obese
2,28,0,33,3,0,4449,0,0,1,0,Obese
3,33,0,22,0,0,21984,0,1,0,0,Normal
4,32,0,28,0,0,3866,0,1,0,0,Overweight
...,...,...,...,...,...,...,...,...,...,...,...
1333,50,0,30,3,0,10600,0,1,0,0,Obese
1334,18,1,31,0,0,2205,1,0,0,0,Obese
1335,18,1,36,0,0,1629,0,0,1,0,Obese
1336,21,1,25,0,0,2007,0,0,0,1,Overweight


In [274]:
df = pd.get_dummies(df,columns=['bmi_category'])
df = df.astype(int)
df

Unnamed: 0,age,is_female,bmi,children,is_smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest,bmi_category_Underweight,bmi_category_Normal,bmi_category_Overweight,bmi_category_Obese
0,19,1,27,0,1,16884,0,0,0,1,0,0,1,0
1,18,0,33,1,0,1725,0,0,1,0,0,0,0,1
2,28,0,33,3,0,4449,0,0,1,0,0,0,0,1
3,33,0,22,0,0,21984,0,1,0,0,0,1,0,0
4,32,0,28,0,0,3866,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,0,30,3,0,10600,0,1,0,0,0,0,0,1
1334,18,1,31,0,0,2205,1,0,0,0,0,0,0,1
1335,18,1,36,0,0,1629,0,0,1,0,0,0,0,1
1336,21,1,25,0,0,2007,0,0,0,1,0,0,1,0


In [275]:
from sklearn.preprocessing import StandardScaler

cols = ['age','bmi','children']
scaler = StandardScaler()

df[cols] = scaler.fit_transform(df[cols])
df

Unnamed: 0,age,is_female,bmi,children,is_smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest,bmi_category_Underweight,bmi_category_Normal,bmi_category_Overweight,bmi_category_Obese
0,-1.440418,1,-0.517949,-0.909234,1,16884,0,0,0,1,0,0,1,0
1,-1.511647,0,0.462463,-0.079442,0,1725,0,0,1,0,0,0,0,1
2,-0.799350,0,0.462463,1.580143,0,4449,0,0,1,0,0,0,0,1
3,-0.443201,0,-1.334960,-0.909234,0,21984,0,1,0,0,0,1,0,0
4,-0.514431,0,-0.354547,-0.909234,0,3866,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,0.767704,0,-0.027743,1.580143,0,10600,0,1,0,0,0,0,0,1
1334,-1.511647,1,0.135659,-0.909234,0,2205,1,0,0,0,0,0,0,1
1335,-1.511647,1,0.952670,-0.909234,0,1629,0,0,1,0,0,0,0,1
1336,-1.297958,1,-0.844753,-0.909234,0,2007,0,0,0,1,0,0,1,0


In [276]:
from scipy.stats import pearsonr

selected_features = [
    'age','bmi','children','is_female','is_smoker','region_northwest',
    'region_northeast','region_southeast','region_southwest','bmi_category_Overweight',
    'bmi_category_Normal','bmi_category_Obese','bmi_category_Underweight'
]

correlation = {
    feature: pearsonr(df[feature],df['charges'])[0] 
    for feature in selected_features
}

corr_df = pd.DataFrame(list(correlation.items()),columns=['Feature','Pearson Correlation'])
corr_df.sort_values(by='Pearson Correlation',ascending=False)

Unnamed: 0,Feature,Pearson Correlation
4,is_smoker,0.787234
0,age,0.298309
11,bmi_category_Obese,0.200348
1,bmi,0.196236
7,region_southeast,0.073577
2,children,0.06739
6,region_northeast,0.005946
5,region_northwest,-0.038695
8,region_southwest,-0.043637
12,bmi_category_Underweight,-0.050599


In [277]:
cat_features = ['is_female', 'children', 'charges',
       'region_northeast', 'region_northwest', 'region_southeast',
       'region_southwest', 'bmi_category_Underweight', 'bmi_category_Normal',
       'bmi_category_Overweight', 'charges_bin']

In [278]:
from scipy.stats import chi2_contingency
import pandas as pd
alpha = 0.05
df['charges_bin'] = pd.qcut (df[ 'charges'], q=4, labels=False) 
chi2_results = {}

for col in cat_features:
    contingency = pd.crosstab (df[col], df['charges_bin']) 
    chi2_stat, p_val, _, _ = chi2_contingency (contingency)
    decision = 'Reject Null (Keep Feature)' if p_val< alpha else 'Accept Null (Drop Feature)' 
    chi2_results [col] = {
        'p_value': p_val,
        'chi2_statistic': chi2_stat,
        'Decision': decision
    }
chir2_df = pd.DataFrame(chi2_results).T
chir2_df = chir2_df.sort_values(by="p_value",ascending=True)
chir2_df

Unnamed: 0,p_value,chi2_statistic,Decision
charges_bin,0.0,4011.0,Reject Null (Keep Feature)
children,0.0,138.659313,Reject Null (Keep Feature)
region_southeast,0.001135,15.998167,Reject Null (Keep Feature)
is_female,0.01649,10.258784,Reject Null (Keep Feature)
charges,0.033723,4011.0,Reject Null (Keep Feature)
region_northeast,0.092122,6.438442,Accept Null (Drop Feature)
region_southwest,0.165191,5.091893,Accept Null (Drop Feature)
bmi_category_Overweight,0.235557,4.25149,Accept Null (Drop Feature)
bmi_category_Normal,0.29476,3.708088,Accept Null (Drop Feature)
bmi_category_Underweight,0.337471,3.37403,Accept Null (Drop Feature)


In [281]:
final_df = df[['charges','children','age','bmi','is_smoker','region_southeast','is_female','bmi_category_Obese']]
final_df

Unnamed: 0,charges,children,age,bmi,is_smoker,region_southeast,is_female,bmi_category_Obese
0,16884,-0.909234,-1.440418,-0.517949,1,0,1,0
1,1725,-0.079442,-1.511647,0.462463,0,1,0,1
2,4449,1.580143,-0.799350,0.462463,0,1,0,1
3,21984,-0.909234,-0.443201,-1.334960,0,0,0,0
4,3866,-0.909234,-0.514431,-0.354547,0,0,0,0
...,...,...,...,...,...,...,...,...
1333,10600,1.580143,0.767704,-0.027743,0,0,0,1
1334,2205,-0.909234,-1.511647,0.135659,0,0,1,1
1335,1629,-0.909234,-1.511647,0.952670,0,1,1,1
1336,2007,-0.909234,-1.297958,-0.844753,0,0,1,0


In [280]:
df

Unnamed: 0,age,is_female,bmi,children,is_smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest,bmi_category_Underweight,bmi_category_Normal,bmi_category_Overweight,bmi_category_Obese,charges_bin
0,-1.440418,1,-0.517949,-0.909234,1,16884,0,0,0,1,0,0,1,0,3
1,-1.511647,0,0.462463,-0.079442,0,1725,0,0,1,0,0,0,0,1,0
2,-0.799350,0,0.462463,1.580143,0,4449,0,0,1,0,0,0,0,1,0
3,-0.443201,0,-1.334960,-0.909234,0,21984,0,1,0,0,0,1,0,0,3
4,-0.514431,0,-0.354547,-0.909234,0,3866,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,0.767704,0,-0.027743,1.580143,0,10600,0,1,0,0,0,0,0,1,2
1334,-1.511647,1,0.135659,-0.909234,0,2205,1,0,0,0,0,0,0,1,0
1335,-1.511647,1,0.952670,-0.909234,0,1629,0,0,1,0,0,0,0,1,0
1336,-1.297958,1,-0.844753,-0.909234,0,2007,0,0,0,1,0,0,1,0,0
