In [3]:
# importing libraries
import boto3
import pandas as pd; pd.set_option('display.max_columns', 50)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

# S3 location of the dataset: bucket name and the key of the csv file inside it
bucket_name = 'webster-data445-bucket'
file_key = 'insurance.csv'

# opening the bucket and requesting the object stored under the csv key
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()

# the 'Body' entry of the response is what gets handed to pandas below
file_content_stream = file_object.get('Body')

# parsing the csv into a DataFrame and previewing the first rows
insurance = pd.read_csv(file_content_stream)
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# changing labels to numbers:
#   sex    -> 0 for 'female', 1 for every other value
#   smoker -> 0 for 'no',     1 for every other value
# NOTE: this overwrites the text columns, so re-running the cell without reloading
# the data would turn every row into 1.
insurance['sex'] = np.where(insurance['sex'] == 'female', 0, 1)
insurance['smoker'] = np.where(insurance['smoker'] == 'no', 0, 1)

# extracting region dummies as 0/1 integers (pandas >= 2.0 returns booleans unless
# a dtype is given). Only the first three dummy columns are kept, so the remaining
# level (southwest in this data) is encoded as all zeros.
region_dummies = pd.get_dummies(insurance['region'], dtype = int).iloc[:, 0:3]

# appending dummies
insurance = pd.concat([insurance, region_dummies], axis = 1)
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,northeast,northwest,southeast
0,19,0,27.9,0,1,southwest,16884.924,0,0,0
1,18,1,33.77,1,0,southeast,1725.5523,0,0,1
2,28,1,33.0,3,0,southeast,4449.462,0,0,1
3,33,1,22.705,0,0,northwest,21984.47061,0,1,0
4,32,1,28.88,0,0,northwest,3866.8552,0,1,0


In [5]:
# Age-band indicators for non-smokers: each flag is 1 when the customer is a
# non-smoker (smoker == 0) whose age falls inside the band, and 0 otherwise.
# Every band is open on the left and closed on the right, so consecutive bands
# share a cut point without leaving a gap between them.
non_smoker = insurance['smoker'] == 0

# age <= 32.5
insurance['interaction_1'] = np.where(non_smoker & (insurance['age'] <= 32.5), 1, 0)

# 32.5 < age <= 44.5
insurance['interaction_2'] = np.where(non_smoker & (insurance['age'] > 32.5) &
                                      (insurance['age'] <= 44.5), 1, 0)

# 44.5 < age <= 51.5 (upper bound is inclusive so that an age of exactly 51.5
# is not left out of every band)
insurance['interaction_3'] = np.where(non_smoker & (insurance['age'] > 44.5) &
                                      (insurance['age'] <= 51.5), 1, 0)

# age > 51.5
insurance['interaction_4'] = np.where(non_smoker & (insurance['age'] > 51.5), 1, 0)

In [8]:
# defining input and target variable
X = insurance[['age', 'bmi', 'children', 'smoker', 'interaction_4']]
Y = insurance['charges']

# splitting the data: 80% for training, 20% held out for testing. The split is
# seeded so the same rows land in each set on every run, which keeps the MSE
# values reported further down reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [9]:
# defining parameter grid (3 * 2 * 2 * 3 = 36 candidate settings)
RF_param_grid = {'n_estimators': [100, 300, 500],
                 'min_samples_split': [10, 15],
                 'min_samples_leaf': [5, 7],
                 'max_depth' : [3, 5, 7]}

# performing grid search: 3-fold cross-validation on the training data, scored by
# negative MSE, with n_jobs = -1 to run candidates in parallel. The forest is
# seeded so that repeated runs build the same trees and select the same model.
RF_grid_search = GridSearchCV(RandomForestRegressor(random_state = 42), RF_param_grid, cv = 3,
                              scoring = 'neg_mean_squared_error', n_jobs = -1).fit(X_train, Y_train)

# extracting best model
RF_md = RF_grid_search.best_estimator_

# predicting on the test dataset
RF_test_pred = RF_md.predict(X_test)

# computing MSE on the test dataset
RF_test_mse = mean_squared_error(Y_test, RF_test_pred)
print('The mse of the RF model on the test dataset is', RF_test_mse)

The mse of the RF model on the test dataset is 14898575.067870175


In [None]:
# candidate hyper-parameters for XGBoost; single-element lists hold that setting
# fixed, so only max_depth, min_child_weight and gamma are actually searched
XGBoost_param_grid = {
    'n_estimators': [500],
    'max_depth': [3, 5, 7],
    'min_child_weight': [5, 7],
    'learning_rate': [0.01],
    'gamma': [0.3, 0.1],
    'subsample': [1],
    'colsample_bytree': [1],
}

# setting up the grid search (3 folds, negative MSE as the score) and fitting it
# on the training data
XGBoost_grid_search = GridSearchCV(
    estimator = XGBRegressor(),
    param_grid = XGBoost_param_grid,
    cv = 3,
    scoring = 'neg_mean_squared_error',
    n_jobs = -1,
)
XGBoost_grid_search = XGBoost_grid_search.fit(X_train, Y_train)

# keeping the best model found by the search
XGBoost_md = XGBoost_grid_search.best_estimator_

# predicting on the test dataset
XGBoost_test_pred = XGBoost_md.predict(X_test)

# computing and reporting MSE on the test dataset
XGBoost_test_mse = mean_squared_error(y_true = Y_test, y_pred = XGBoost_test_pred)
print('The mse of the XGBoost model on the test dataset is', XGBoost_test_mse)

In [None]:
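## Conclusion: XGBoost has a smaller test MSE than the random forest, so I will use the XGBoost model to predict charges.
## (The XGBoost cell above has no saved output; re-run it to confirm this comparison.)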