# Import Libraries 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OneHotEncoder
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Import Dataset

In [2]:
# Relative path: the CSV is resolved against the notebook's working directory.
path = './insurance.csv'
in_df = pd.read_csv(path)
# info() prints column dtypes and non-null counts; head() is the cell's displayed value.
in_df.info()
in_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# Checking for duplicates, null values, and inconsistent data values

In [3]:
# Keep an untouched copy of the loaded frame before the cleaning cells reassign in_df.
in_df_copy = in_df.copy()

In [4]:
# Count rows that are exact repeats (all columns equal) of an earlier row.
in_df.duplicated().sum()

1

In [5]:
# Drop repeated rows, keeping the first occurrence of each. The surviving rows
# keep their original index labels (the index is not reset here).
in_df  = in_df.drop_duplicates()
# Re-check: the duplicate count should now be zero.
in_df.duplicated().sum()

0

In [6]:
# Count missing (NaN/None) values in each column.
in_df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [7]:
def check_inconsistent_values(df):
    """Print the distinct values found in every column of ``df``.

    Intended as a quick visual check for typos or inconsistent category
    labels (e.g. 'Male' vs 'male').

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    None
        Results are written to stdout, one line per column followed by a
        blank line.
    """
    # items() yields (label, Series) pairs column by column, so each column is
    # handled on its own even when two columns share a label; df[label] would
    # return a DataFrame in that case, which has no .unique().
    for col, series in df.items():
        print(f" Series -->{col} unique values are {series.unique()} \n")

In [8]:
# Print every column's distinct values to eyeball inconsistent or misspelled entries.
check_inconsistent_values(in_df)

 Series -->age unique values are [19 18 28 33 32 31 46 37 60 25 62 23 56 27 52 30 34 59 63 55 22 26 35 24
 41 38 36 21 48 40 58 53 43 64 20 61 44 57 29 45 54 49 47 51 42 50 39] 

 Series -->sex unique values are ['female' 'male'] 

 Series -->bmi unique values are [27.9   33.77  33.    22.705 28.88  25.74  33.44  27.74  29.83  25.84
 26.22  26.29  34.4   39.82  42.13  24.6   30.78  23.845 40.3   35.3
 36.005 32.4   34.1   31.92  28.025 27.72  23.085 32.775 17.385 36.3
 35.6   26.315 28.6   28.31  36.4   20.425 32.965 20.8   36.67  39.9
 26.6   36.63  21.78  30.8   37.05  37.3   38.665 34.77  24.53  35.2
 35.625 33.63  28.    34.43  28.69  36.955 31.825 31.68  22.88  37.335
 27.36  33.66  24.7   25.935 22.42  28.9   39.1   36.19  23.98  24.75
 28.5   28.1   32.01  27.4   34.01  29.59  35.53  39.805 26.885 38.285
 37.62  41.23  34.8   22.895 31.16  27.2   26.98  39.49  24.795 31.3
 38.28  19.95  19.3   31.6   25.46  30.115 29.92  27.5   28.4   30.875
 27.94  35.09  29.7   35.72  32.205 2

# Assign X and y and Train Test Split

In [9]:
# Separate the prediction target from the feature columns.
target = 'charges'
X = in_df.drop(target, axis=1)
y = in_df[target]
# Hold out a test set using the default split proportions; the seed is fixed
# so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Selecting all categorical Features
Make a column selector for the categorical columns and test it as a sanity check.

In [10]:
# Build a selector that returns the names of columns whose dtype is 'object'
# (the string-valued categorical features in this dataset).
cat_selector = make_column_selector(dtype_include='object')



In [11]:
# Sanity check: calling the selector on a frame returns the matching column names.
cat_selector(X_train)

['sex', 'smoker', 'region']

In [12]:
# Restrict each split to its categorical columns only.
train_cat_data = X_train.loc[:, cat_selector(X_train)]
test_cat_data = X_test.loc[:, cat_selector(X_test)]
train_cat_data.head(5)


Unnamed: 0,sex,smoker,region
763,male,no,northeast
1079,male,no,southeast
178,female,no,southwest
287,female,no,northwest
1290,female,no,northeast


# Instantiate & fit the One Hot Encoder on the nominal/ordinal features

In [13]:
# Instantiate the one-hot encoder.
# `sparse_output=False` makes transform() return a dense NumPy array. It replaces
# the `sparse=` argument, which was deprecated in scikit-learn 1.2 and removed in
# 1.4, so this cell requires scikit-learn >= 1.2.
# `handle_unknown='ignore'` encodes categories not seen during fit as all zeros
# instead of raising an error.
ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# Fit on the training data only so nothing is learned from the test split.
ohe_encoder.fit(train_cat_data)
# Transform both splits using the categories learned from the training data.
train_ohe = ohe_encoder.transform(train_cat_data)
test_ohe = ohe_encoder.transform(test_cat_data)
train_ohe





array([[0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 0., 0., 1.],
       ...,
       [0., 1., 1., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 1., 0.]])

# Make a DataFrame with the correct column names 

In [14]:
# Ask the fitted encoder for its output labels; passing the original column
# names makes each label read "<original column>_<category>".
ohe_column_names = ohe_encoder.get_feature_names_out(train_cat_data.columns)
# Wrap both encoded arrays in DataFrames that share those labels. No index is
# supplied, so each frame gets a default 0..n-1 index.
train_ohe, test_ohe = (
    pd.DataFrame(encoded, columns=ohe_column_names)
    for encoded in (train_ohe, test_ohe)
)
train_ohe.head(5)



Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


# Concatenate Numeric and Nominal Columns

In [15]:
# create a numeric selector
num_selector = make_column_selector(dtype_include='number')
# isolate the numeric columns; the index is reset to 0..n-1 so the rows line up
# with the one-hot frames, which were built from bare arrays and therefore
# carry a default 0..n-1 index
train_nums = X_train[num_selector(X_train)].reset_index(drop=True)
test_nums = X_test[num_selector(X_test)].reset_index(drop=True)
# pd.concat(axis=1) aligns on index labels and silently pads any mismatch with
# NaN, so confirm both halves share the same index before combining them
assert train_nums.index.equals(train_ohe.index), "train numeric / one-hot rows are misaligned"
assert test_nums.index.equals(test_ohe.index), "test numeric / one-hot rows are misaligned"
# re-combine the train and test sets on axis 1 (columns)
X_train_processed = pd.concat([train_nums, train_ohe], axis=1)
X_test_processed = pd.concat([test_nums, test_ohe], axis=1)
# NOTE(review): the processed frames use positional 0..n-1 labels, while
# y_train / y_test from the split cell still carry the original row labels.
# Rows correspond by position; avoid label-based joins between X and y unless
# y is reset the same way.
X_train_processed.head(10)


Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,27,26.03,0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1,63,33.66,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,46,28.9,2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,63,26.22,0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,38,19.95,2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
5,28,26.315,3,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
6,25,26.8,3,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
7,38,40.565,1,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
8,42,24.985,2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
9,24,25.8,0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
