In [2]:
# import pickle
import numpy as np
import pandas as pd

In [3]:
data = pd.read_pickle('D:\Projects\health-expense-estimator\data\df.pkl')

In [4]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1337 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1337 non-null   int64  
 1   sex       1337 non-null   object 
 2   bmi       1337 non-null   float64
 3   children  1337 non-null   object 
 4   smoker    1337 non-null   object 
 5   region    1337 non-null   object 
 6   expenses  1337 non-null   float64
dtypes: float64(2), int64(1), object(4)
memory usage: 83.6+ KB


# Feature Engineering

### 1. Missing Values

- Data had no missing values

### 2. Duplicate data

- There was one row of duplicate data, already removed during EDA

### 3. Outlier Treatment

In [6]:
print(data['bmi'].skew())
print(data['age'].skew())
print(data['expenses'].skew())
print(data['children'].skew())

0.28446275054239506
0.054780773126998195
1.5153909165486397
0.9374206440474123


- Above shows some outliers in bmi and expenses columns

In [7]:
# outlier treatment using IQR method

def treat_outlier(col):
    
    # defining IQR
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    print(f'Column Name: {col}')
    print(f'First quantile: {Q1}')
    print(f'Third quantile: {Q3}')
    
    # defining upper and lower limits
    upper = Q3 + IQR*1.5
    lower = Q1 - IQR*1.5
    print(f'Lower Bound: {round(lower, 2)}')
    print(f'Upper Bound: {round(upper, 2)}')
    print("="*50)

    # imputing outliers with upper and lower bounds
    data.loc[(data[col]>upper),col] = upper
    data.loc[(data[col]<lower),col] = lower

    return data[col]

In [8]:
for col in ['age', 'bmi', 'expenses']:

    treat_outlier(col)

Column Name: age
First quantile: 27.0
Third quantile: 51.0
Lower Bound: -9.0
Upper Bound: 87.0
Column Name: bmi
First quantile: 26.3
Third quantile: 34.7
Lower Bound: 13.7
Upper Bound: 47.3
Column Name: expenses
First quantile: 4746.34
Third quantile: 16657.72
Lower Bound: -13120.73
Upper Bound: 34524.79


### 4. Separating target variable from rest of the dataframe

In [9]:
X = data.iloc[:, :-1]
y = data['expenses']

In [10]:
print(X.columns)

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region'], dtype='object')


### 5. Data transformation

In [11]:
# importing required libraries
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

cat_features = X.select_dtypes(include='object').columns
num_features = X.select_dtypes(exclude='object').columns

num_transformer = StandardScaler()
ohe_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
    ('numeric_transformer', num_transformer, num_features),
    ('cat_transformer', ohe_transformer, cat_features)
    ]
)

In [12]:
X = preprocessor.fit_transform(X)

In [13]:
X

array([[-1.44041773, -0.45436319,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       [-1.51164747,  0.51967087,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.79935006,  0.38759846,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-1.51164747,  1.03145148,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [-1.29795825, -0.80105328,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 1.55123139, -0.25625457,  1.        , ...,  1.        ,
         0.        ,  0.        ]])

# Model building

### Train-Test Split

In [14]:
# importing libraries
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [16]:
X_train.shape, X_test.shape

((1069, 16), (268, 16))

### Model Training

In [17]:
# importing models

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# importing metrices

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score



In [18]:
# model evaluation function

def model_eval(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r_square = r2_score(true, predicted) 

    return mae, rmse, r_square

In [19]:
models = {
    'linear-regression' : LinearRegression(),
    'lasso' : Lasso(),
    'ridge' : Ridge(),
    'decision-tree' : DecisionTreeRegressor(),
    'random-forest' : RandomForestRegressor(),
    'knn' : KNeighborsRegressor(),
    'svm' : SVR(),
    'adaboost': AdaBoostRegressor(),
    'xgboost' : XGBRegressor(),
    'catboost' : CatBoostRegressor()
}

model_list = []
training_r_sqr = []
test_r_sqr = []

In [20]:
list(models)

['linear-regression',
 'lasso',
 'ridge',
 'decision-tree',
 'random-forest',
 'knn',
 'svm',
 'adaboost',
 'xgboost',
 'catboost']

In [21]:
list(models.values())

[LinearRegression(),
 Lasso(),
 Ridge(),
 DecisionTreeRegressor(),
 RandomForestRegressor(),
 KNeighborsRegressor(),
 SVR(),
 AdaBoostRegressor(),
 XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...),
 <catboost.core.CatBoostRegressor at 0x2059db59730>]

In [22]:
#training the model
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

# making predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

# evaluation
    train_mae, train_rmse, train_r_square = model_eval(y_train, y_train_pred)
    test_mae, test_rmse, test_r_square = model_eval(y_test, y_test_pred)

# printing all info
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    print('-'*50)

    print(f'Training mean absolute error:{train_mae}')
    print(f'Training RMSE:{train_rmse}')
    print(f'Training R Square error:{train_r_square}')
    print('-'*50)
    training_r_sqr.append(train_r_square)

    print(f'Test mean absolute error:{test_mae}')
    print(f'Test RMSE:{test_rmse}')
    print(f'Test R Square error:{test_r_square}')
    print('='*50)
    test_r_sqr.append(test_r_square)

linear-regression
--------------------------------------------------
Training mean absolute error:3487.3098993100602
Training RMSE:5246.32674438919
Training R Square error:0.734565915729743
--------------------------------------------------
Test mean absolute error:3084.985439262496
Test RMSE:4466.885764582852
Test R Square error:0.8031287822305123
lasso
--------------------------------------------------
Training mean absolute error:3487.8025046728208
Training RMSE:5246.3441720218025
Training R Square error:0.7345641522500315
--------------------------------------------------
Test mean absolute error:3086.000408618795
Test RMSE:4466.836346614352
Test R Square error:0.8031331382504933
ridge
--------------------------------------------------
Training mean absolute error:3492.066313262622
Training RMSE:5246.390851369955
Training R Square error:0.7345594287980182
--------------------------------------------------
Test mean absolute error:3092.917001283333
Test RMSE:4469.733628055543
Test R

### Results

In [23]:
result = pd.DataFrame(list(zip(model_list, training_r_sqr, test_r_sqr)), 
                      columns=['Model Name', 'Training R2_Score', 'Test R2_Score']).sort_values(by=['Test R2_Score'],ascending=False)
result


Unnamed: 0,Model Name,Training R2_Score,Test R2_Score
9,catboost,0.934883,0.860478
4,random-forest,0.965943,0.854169
7,adaboost,0.773653,0.814233
8,xgboost,0.992084,0.805715
1,lasso,0.734564,0.803133
0,linear-regression,0.734566,0.803129
2,ridge,0.734559,0.802878
5,knn,0.801719,0.772605
3,decision-tree,0.997642,0.699269
6,svm,-0.089929,-0.06479
