In [6]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import sklearn
from sklearn.linear_model import BayesianRidge

In [8]:
# Load the insurance dataset; the first line of the CSV supplies the column names.
insurance_data = pd.read_csv(
    "./insurance.csv",
    header=0,
    sep=",",
)
# Preview the first rows as the cell's rich output.
insurance_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [9]:
### Standardizing data
def standardize(column):
    """Return a standardized copy of ``column``.

    The result is ``(column - column.mean()) / column.std()``. A new object is
    returned; ``column`` itself (and any DataFrame it was taken from) is not
    modified.
    """
    return (column - column.mean()) / column.std()


# Each standardized Series is built from a fresh expression instead of in-place
# `-=` / `/=`, so insurance_data keeps its raw values and re-running this cell
# always starts from the same input.
# NOTE(review): the mean and std are computed over all rows, including the rows
# that are later held out for testing -- consider deriving them from the
# training rows only.
bmi = standardize(insurance_data.bmi)
age = standardize(insurance_data.age)
children = standardize(insurance_data.children)
charges = standardize(insurance_data.charges)

In [None]:
### Split Data into Train and Test
train_ratio = 0.9
# Number of leading rows that go to the training set; the rest form the test set.
train_index = int(train_ratio * insurance_data.shape[0])

# The split is positional: the rows are taken in file order, without shuffling.
train_rows = slice(None, train_index)
test_rows = slice(train_index, None)

bmi_train, bmi_test = bmi[train_rows], bmi[test_rows]
age_train, age_test = age[train_rows], age[test_rows]
children_train, children_test = children[train_rows], children[test_rows]
charges_train, charges_test = charges[train_rows], charges[test_rows]

pandas.core.series.Series

In [15]:
# create the training and testing sets
# Features are stored as a list of (bmi, age, children) tuples, one per row;
# targets are stored as a plain list of charges in the same row order.
train_columns = (bmi_train, age_train, children_train)
test_columns = (bmi_test, age_test, children_test)

train_data = list(zip(*train_columns))
test_data = list(zip(*test_columns))

train_results = [value for value in charges_train]
test_results = [value for value in charges_test]

In [16]:
# fit the Bayesian Ridge model
# The estimator class is referenced by its explicitly imported name: a bare
# `import sklearn` is not guaranteed to load the `linear_model` submodule, so
# the attribute chain `sklearn.linear_model` can fail with AttributeError.
clf = BayesianRidge()
# fit() returns the fitted estimator, which is shown as the cell output.
clf.fit(train_data, train_results)

In [None]:
# predict on the test data
# With return_std=True the call yields a pair that is unpacked into the
# predicted means and their standard deviations.
prediction = clf.predict(test_data, return_std=True)
predicted_mean, predicted_std = prediction

# Sanity check: exactly one prediction per held-out target.
assert len(test_results) == len(predicted_mean)

list

In [32]:
# Root-mean-square error between the predicted means and the true test targets.
# test_results is a plain Python list, so it is converted explicitly before the
# element-wise subtraction.
diff = predicted_mean - np.asarray(test_results)
M = len(predicted_mean)
# np.mean keeps the reduction vectorized; the builtin sum() would loop over the
# array one element at a time in Python.
RMSE = np.sqrt(np.mean(diff**2))

print(RMSE)

1.009334400886113


# Analysis

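Based on the RMSE, the predictions are decent. Since the charges are standardized (zero mean, unit standard deviation) prior to model fitting and prediction, the RMSE is measured in standard deviations of the charges, so an RMSE value of approximately 1 isn't great, but it isn't terrible either. As a reference point, a baseline that always predicts the mean charge would also score an RMSE of roughly 1 on this scale, which should be kept in mind when judging the result.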

Because only the predictive mean is used to compute the RMSE, the uncertainty of the predicted values is lost. This could be remedied by also computing the standard deviation of the differences between the actual values and the predicted values. This would keep a measure of the uncertainty while still allowing the predictive mean to be used for the RMSE.