In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb


# Load the raw insurance data set from the project's relative data directory.
# A forward slash is used as the separator: the original "data\insurance.csv"
# relied on the invalid escape sequence "\i" (a SyntaxWarning on recent Python
# versions) and only resolved to the intended file on Windows.
insurance_df = pd.read_csv("data/insurance.csv")

In [2]:
# Preview the first rows of the raw frame to check the columns and value formats.
insurance_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# List the column labels; `charges` is the column used as the regression target below.
insurance_df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

### Feature Engineering

In [4]:
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [5]:
# Model inputs: every column of the raw frame except the `charges` target.
features_df = insurance_df.drop(columns=['charges'])
features_df

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.900,0,yes,southwest
1,18,male,33.770,1,no,southeast
2,28,male,33.000,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.880,0,no,northwest
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest
1334,18,female,31.920,0,no,northeast
1335,18,female,36.850,0,no,southeast
1336,21,female,25.800,0,no,southwest


In [6]:
# NOTE(review): this cell repeats the previous cell's statements verbatim and
# rebuilds the same frame from `insurance_df`; it appears redundant.
features_df= insurance_df.drop('charges', axis=1)
features_df

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.900,0,yes,southwest
1,18,male,33.770,1,no,southeast
2,28,male,33.000,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.880,0,no,northwest
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest
1334,18,female,31.920,0,no,northeast
1335,18,female,36.850,0,no,southeast
1336,21,female,25.800,0,no,southwest


In [7]:
# Columns are routed to a preprocessing branch according to their type.
numerical_features = ['age', 'bmi', 'children']
categorical_features = ['sex', 'smoker', 'region']

# Numeric branch: min-max scaling.
numeric_pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler())
])

# Categorical branch: one-hot encoding; `handle_unknown='ignore'` keeps
# transform from failing on a category that was not present during fit.
categorical_pipeline = Pipeline(steps=[
    ('one-hot', OneHotEncoder(handle_unknown='ignore'))
])

# Single preprocessing step that is reused by the modelling pipelines.
full_processor_insurance = ColumnTransformer(transformers=[
    ('number', numeric_pipeline, numerical_features),
    ('category', categorical_pipeline, categorical_features)
])

# Fit and transform in one pass. The original fitted, transformed once and
# discarded the result, then transformed a second time for the preview frame.
# This frame is only a preview of the encoded features: the modelling
# pipelines call fit again on their own training data.
insurance_set_mod = pd.DataFrame(
    full_processor_insurance.fit_transform(features_df),
    columns=full_processor_insurance.get_feature_names_out(),
)
insurance_set_mod

Unnamed: 0,number__age,number__bmi,number__children,category__sex_female,category__sex_male,category__smoker_no,category__smoker_yes,category__region_northeast,category__region_northwest,category__region_southeast,category__region_southwest
0,0.021739,0.321227,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.000000,0.479150,0.2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.217391,0.458434,0.6,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.326087,0.181464,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.304348,0.347592,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1333,0.695652,0.403820,0.6,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1334,0.000000,0.429379,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1335,0.000000,0.562012,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1336,0.065217,0.264730,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


### Modelling

In [8]:
from sklearn import linear_model 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.metrics import mean_squared_error, r2_score

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features_df,
    insurance_df['charges'],
    test_size=0.2,
    random_state=123,
)

### Multiple Linear Regression

In [9]:
# Multiple linear regression on top of the shared preprocessing step.
mlr_reg = Pipeline(steps=[
    ('preprocess', full_processor_insurance),
    ('model', linear_model.LinearRegression())
])

mlr_reg.fit(X_train, y_train)

y_pred_train = mlr_reg.predict(X_train)
y_pred_test = mlr_reg.predict(X_test)

# R^2 on the training and hold-out splits (kept under the notebook's "accuracy_*" names).
accuracy_MLR_train = r2_score(y_train, y_pred_train)
accuracy_MLR_test = r2_score(y_test, y_pred_test)

# RMSE computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form works on every version.
RMSE_MLR_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
RMSE_MLR_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

# Out-of-fold predictions over the full data set (5 folds), scored with one R^2.
y_pred_cv_MLR = cross_val_predict(mlr_reg, X=features_df, y=insurance_df['charges'], cv=5)
accuracy_cv_MLR = r2_score(insurance_df['charges'], y_pred_cv_MLR)

# Report the metrics stored above instead of recomputing each one.
print("Coefficients: \n", mlr_reg[-1].coef_)
print('RMSE for Training Data: %.2f' % RMSE_MLR_train)
print('Training Accuracy for Multiple Linear Regression Model: %.2f' % accuracy_MLR_train)
print('RMSE for Testing Data: %.2f' % RMSE_MLR_test)
print('Testing Accuracy for Multiple Linear Regression Model: %.2f' % accuracy_MLR_test)
print('Accuracy for 5-Fold Cross Predicted Multiple Linaer Regression Model: %.2f' % accuracy_cv_MLR)

Coefficients: 
 [ 11759.77708266  12737.26803537   2257.93643729     41.72084482
    -41.72084482 -11750.50550844  11750.50550844    619.7104357
    200.49030407   -413.56324257   -406.6374972 ]
RMSE for Training Data: 6166.40
Training Accuracy for Multiple Linear Regression Model: 0.74
RMSE for Testing Data: 5527.43
Testing Accuracy for Multiple Linear Regression Model: 0.80
Accuracy for 5-Fold Cross Predicted Multiple Linaer Regression Model: 0.75


In [10]:
# Inspect the out-of-fold predictions produced by cross_val_predict for the linear model.
y_pred_cv_MLR

array([25344.        ,  3840.        ,  7296.        , ...,
        4202.15481526,  1078.70907307, 37023.90505333])

### Polynomial Regression

In [11]:
# Degree-2 polynomial regression: the preprocessed features are expanded with
# squares and pairwise products before the linear fit.
# NOTE(review): the recorded coefficients are on the order of 1e16, which
# suggests an ill-conditioned design matrix (the one-hot columns and their
# products look collinear) - consider a regularised model; confirm.
lr2_reg = Pipeline(steps=[
    ('preprocess', full_processor_insurance),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('model', linear_model.LinearRegression())
])

lr2_reg.fit(X_train, y_train)

y_pred_train = lr2_reg.predict(X_train)
y_pred_test = lr2_reg.predict(X_test)

# R^2 on the training and hold-out splits (kept under the notebook's "accuracy_*" names).
accuracy_PR_train = r2_score(y_train, y_pred_train)
accuracy_PR_test = r2_score(y_test, y_pred_test)

# RMSE computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form works on every version.
RMSE_PR_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
RMSE_PR_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

# Out-of-fold predictions over the full data set (5 folds), scored with one R^2.
y_pred_cv_PR = cross_val_predict(lr2_reg, X=features_df, y=insurance_df['charges'], cv=5)
accuracy_cv_PR = r2_score(insurance_df['charges'], y_pred_cv_PR)

# Report the metrics stored above instead of recomputing each one.
print("Coefficients: \n", lr2_reg[-1].coef_)
print('RMSE for Training Data: %.2f' % RMSE_PR_train)
print('Training Accuracy for Polynomial Regression Model: %.2f' % accuracy_PR_train)
print('RMSE for Testing Data: %.2f' % RMSE_PR_test)
print('Testing Accuracy for Polynomial Regression Model: %.2f' % accuracy_PR_test)
print('Accuracy for 5-Fold Cross Predicted Polynomial Regression Model: %.2f' % accuracy_cv_PR)

Coefficients: 
 [-6.48588736e+16  2.27815079e+16 -1.40519336e+16 -6.78727928e+16
  3.05050109e+16  7.62833372e+16 -7.01677013e+15  9.57998602e+16
 -2.49927064e+16  4.83443675e+16 -1.05475075e+16  8.55200000e+03
 -5.80000000e+01 -1.11600000e+03  4.74031444e+16  4.74031444e+16
  1.18749734e+16  1.18749734e+16  5.58075581e+15  5.58075581e+15
  5.58075581e+15  5.58075581e+15 -7.24000000e+03  3.55250000e+02
 -1.09523058e+15 -1.09523058e+15 -1.82477511e+16 -1.82477511e+16
 -3.43852622e+15 -3.43852622e+15 -3.43852622e+15 -3.43852622e+15
 -2.71200000e+03  1.62422936e+16  1.62422936e+16 -3.06938127e+15
 -3.06938127e+15  8.79021267e+14  8.79021267e+14  8.79021267e+14
  8.79021267e+14  4.78774692e+16  0.00000000e+00 -4.47331322e+15
  2.19002656e+16 -2.73994023e+16  6.08240698e+15 -1.88481350e+16
  5.55379750e+15 -1.15811884e+16 -4.16146805e+16 -1.52411018e+16
 -2.91771812e+16  4.30462810e+15 -2.06259139e+16  3.77601862e+15
 -3.98833026e+16  0.00000000e+00 -4.72001191e+16  2.37060130e+15
 -3.01051

### Decision Tree Regression

In [12]:
# Depth-limited decision tree trained on the preprocessed features after a
# degree-2 polynomial expansion.
# NOTE(review): the polynomial step mirrors the polynomial-regression
# pipeline; confirm it is intended for the tree model as well.
dtr_reg = Pipeline([
    ('preprocess', full_processor_insurance),
    ('poly', PolynomialFeatures(include_bias=False, degree=2)),
    ('model', DecisionTreeRegressor(random_state=123, max_depth=5)),
])

dtr_reg.fit(X_train, y_train)

In [13]:
# Evaluate the fitted decision-tree pipeline on both splits.
y_pred_train = dtr_reg.predict(X_train)
y_pred_test = dtr_reg.predict(X_test)

# R^2 on the training and hold-out splits (kept under the notebook's "accuracy_*" names).
accuracy_DTR_train = r2_score(y_train, y_pred_train)
accuracy_DTR_test = r2_score(y_test, y_pred_test)

# RMSE computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form works on every version.
RMSE_DTR_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
RMSE_DTR_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

# Out-of-fold predictions over the full data set (5 folds), scored with one R^2.
y_pred_cv_DTR = cross_val_predict(dtr_reg, X=features_df, y=insurance_df['charges'], cv=5)
accuracy_cv_DTR = r2_score(insurance_df['charges'], y_pred_cv_DTR)

# Report the metrics stored above instead of recomputing each one.
print('RMSE for Training Data: %.2f' % RMSE_DTR_train)
print('Training Accuracy for Decision Tree Regression Model: %.2f' % accuracy_DTR_train)
print('RMSE for Testing Data: %.2f' % RMSE_DTR_test)
print('Testing Accuracy for Decision Tree Regression Model: %.2f' % accuracy_DTR_test)
print('Accuracy for 5-Fold Cross Predicted Decision Tree Regression Model: %.2f' % accuracy_cv_DTR)

RMSE for Training Data: 4272.74
Training Accuracy for Decision Tree Regression Model: 0.87
RMSE for Testing Data: 4185.08
Testing Accuracy for Decision Tree Regression Model: 0.89
Accuracy for 5-Fold Cross Predicted Decision Tree Regression Model: 0.83


### Random Forest Regression

In [14]:
# Random forest (depth capped at 7, fixed seed) on top of the shared preprocessing step.
rf_reg = Pipeline(steps=[
    ('preprocess', full_processor_insurance),
    ('model', RandomForestRegressor(max_depth=7, random_state=123))
])

rf_reg.fit(X_train, y_train)

y_pred_train = rf_reg.predict(X_train)
y_pred_test = rf_reg.predict(X_test)

# R^2 on the training and hold-out splits (kept under the notebook's "accuracy_*" names).
accuracy_RFR_train = r2_score(y_train, y_pred_train)
accuracy_RFR_test = r2_score(y_test, y_pred_test)

# RMSE computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form works on every version.
RMSE_RFR_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
RMSE_RFR_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

# Out-of-fold predictions over the full data set (5 folds), scored with one R^2.
y_pred_cv_RFR = cross_val_predict(rf_reg, X=features_df, y=insurance_df['charges'], cv=5)
accuracy_cv_RFR = r2_score(insurance_df['charges'], y_pred_cv_RFR)

# Report the metrics stored above instead of recomputing each one.
print('RMSE for Training Data: %.2f' % RMSE_RFR_train)
print('Training Accuracy for Radom Forest Regression Model: %.2f' % accuracy_RFR_train)
print('RMSE for Testing Data: %.2f' % RMSE_RFR_test)
print('Testing Accuracy for Radom Forest Regression Model: %.2f' % accuracy_RFR_test)
print('Accuracy for 5-Fold Cross Predicted Random Forest Regression Model: %.2f' % accuracy_cv_RFR)

RMSE for Training Data: 3293.09
Training Accuracy for Radom Forest Regression Model: 0.93
RMSE for Testing Data: 3961.50
Testing Accuracy for Radom Forest Regression Model: 0.90
Accuracy for 5-Fold Cross Predicted Random Forest Regression Model: 0.85


### XGBoost Regression

In [15]:
# Gradient-boosted trees (first hyper-parameter configuration) on top of the
# shared preprocessing step.
xgb_reg = Pipeline(steps=[
    ('preprocess', full_processor_insurance),
    ('model', xgb.XGBRegressor(learning_rate = 0.5, gamma = 0.1,
                       objective='reg:squarederror',
                       n_estimators=30, max_depth=10, max_leaves=15, random_state=0))
])

xgb_reg.fit(X_train, y_train)

y_pred_train = xgb_reg.predict(X_train)
y_pred_test = xgb_reg.predict(X_test)

# R^2 on the training and hold-out splits (kept under the notebook's "accuracy_*" names).
accuracy_XGBR_train = r2_score(y_train, y_pred_train)
accuracy_XGBR_test = r2_score(y_test, y_pred_test)

# RMSE computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form works on every version.
RMSE_XGBR_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
RMSE_XGBR_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
# The original cell stored the test RMSE under the misspelled name
# `RMSE_XBGR_test`; it is kept as an alias so code using it keeps working.
RMSE_XBGR_test = RMSE_XGBR_test

# Out-of-fold predictions over the full data set (5 folds), scored with one R^2.
y_pred_cv_XGBR = cross_val_predict(xgb_reg, X=features_df, y=insurance_df['charges'], cv=5)
accuracy_cv_XGBR = r2_score(insurance_df['charges'], y_pred_cv_XGBR)

# Report the metrics stored above instead of recomputing each one.
print('RMSE for Training Data: %.2f' % RMSE_XGBR_train)
print('Training Accuracy for Xgboost Regression Model: %.2f' % accuracy_XGBR_train)
print('RMSE for Testing Data: %.2f' % RMSE_XGBR_test)
print('Testing Accuracy for Xgboost Regression Model: %.2f' % accuracy_XGBR_test)
print('Accuracy for 5-Fold Cross Predicted Xgboost Regression Model: %.2f' % accuracy_cv_XGBR)

RMSE for Training Data: 3141.06
Training Accuracy for Xgboost Regression Model: 0.93
RMSE for Testing Data: 4125.32
Testing Accuracy for Xgboost Regression Model: 0.89
Accuracy for 5-Fold Cross Predicted Xgboost Regression Model: 0.84


In [16]:
# Gradient-boosted trees (second hyper-parameter configuration).
# NOTE(review): this cell rebinds `xgb_reg` and the XGBR metric variables set
# by the previous XGBoost cell, so only this configuration's results remain
# available afterwards.
# `random_state=0` is pinned for consistency with the first configuration.
xgb_reg = Pipeline(steps=[
    ('preprocess', full_processor_insurance),
    ('model', xgb.XGBRegressor(learning_rate = 0.2, gamma = 0.1,
                       objective='reg:squarederror',
                       n_estimators=52, max_depth=12, max_leaves=20, random_state=0))
])

xgb_reg.fit(X_train, y_train)

y_pred_train = xgb_reg.predict(X_train)
y_pred_test = xgb_reg.predict(X_test)

# R^2 on the training and hold-out splits (kept under the notebook's "accuracy_*" names).
accuracy_XGBR_train = r2_score(y_train, y_pred_train)
accuracy_XGBR_test = r2_score(y_test, y_pred_test)

# RMSE computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form works on every version.
RMSE_XGBR_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
RMSE_XGBR_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
# The original cell stored the test RMSE under the misspelled name
# `RMSE_XBGR_test`; it is kept as an alias so code using it keeps working.
RMSE_XBGR_test = RMSE_XGBR_test

# Out-of-fold predictions over the full data set (5 folds), scored with one R^2.
y_pred_cv_XGBR = cross_val_predict(xgb_reg, X=features_df, y=insurance_df['charges'], cv=5)
accuracy_cv_XGBR = r2_score(insurance_df['charges'], y_pred_cv_XGBR)

# Report the metrics stored above instead of recomputing each one.
print('RMSE for Training Data: %.2f' % RMSE_XGBR_train)
print('Training Accuracy for Xgboost Regression Model: %.2f' % accuracy_XGBR_train)
print('RMSE for Testing Data: %.2f' % RMSE_XGBR_test)
print('Testing Accuracy for Xgboost Regression Model: %.2f' % accuracy_XGBR_test)
print('Accuracy for 5-Fold Cross Predicted Xgboost Regression Model: %.2f' % accuracy_cv_XGBR)


RMSE for Training Data: 3281.28
Training Accuracy for Xgboost Regression Model: 0.93
RMSE for Testing Data: 3989.46
Testing Accuracy for Xgboost Regression Model: 0.90
Accuracy for 5-Fold Cross Predicted Xgboost Regression Model: 0.85


In [17]:

# Disabled hyper-parameter search for the XGBoost pipeline, kept for reference.
# The grid below has 7 * 7 * 7 * 7 = 2401 parameter combinations, each of which
# GridSearchCV would cross-validate on the training split, so enabling it makes
# a full notebook run considerably slower.
# NOTE(review): presumably the source of the hand-picked XGBoost settings used
# in the cells above - confirm, or remove this cell if it is no longer needed.
# reg_pipeline = Pipeline(steps=[
#     ('preprocess', full_processor_insurance),
#     ('model', xgb.XGBRegressor())
# ])

# params = {
#     'model__learning_rate': [0.1, 0.2, 0.25, 0.3, 0.35, 0.4, 0.5],
#     'model__gamma': [0.01, 0.1, 0.2, 0.25, 0.3, 0.4, 0.5],
#     'model__n_estimators': [15, 20, 30, 40, 45, 50, 55],
#     'model__max_depth': [6, 8, 10, 12, 14, 16, 18],

# }


# reg= GridSearchCV(estimator=reg_pipeline,
#                    param_grid=params,
#                    scoring='r2',
#                    verbose=1)

# reg.fit(X_train, y_train)

# print("Best parameters:", reg.best_params_)
# print("Highest r2 score: ", reg.best_score_)
# print("Best model: ", reg.best_estimator_)
