In [1]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Fix NumPy's global (legacy) random generator so any np.random draws repeat across runs
np.random.seed(seed=1)

In [3]:
# Load the insurance table used for modelling and preview its first five rows
insurance = pd.read_csv(filepath_or_buffer="data/insurance.csv")
insurance.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# One-hot encode the text columns. drop_first=True omits the first level of each
# column, so the remaining indicators are measured against that omitted baseline.
categorical_columns = ['sex', 'smoker', 'region']
insurance_dummies = pd.get_dummies(
    insurance,
    columns=categorical_columns,
    drop_first=True,
)
insurance_dummies.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,0,1,0,0,1
1,18,33.77,1,1725.5523,1,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.88,0,3866.8552,1,0,1,0,0


In [5]:
# Target is the `charges` column; every other encoded column is a feature
y = insurance_dummies["charges"]
X = insurance_dummies.drop(columns=["charges"])

In [6]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the split repeats
from sklearn.model_selection import train_test_split

train_features, test_features, train_labels, test_labels = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each partition as a quick sanity check on the split
for label, part in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(label, part.shape)

Training Features Shape: (1070, 8)
Training Labels Shape: (1070,)
Testing Features Shape: (268, 8)
Testing Labels Shape: (268,)


In [7]:
# Random forest regressor is the model for this notebook
from sklearn.ensemble import RandomForestRegressor

# Build a forest of 1000 trees with a fixed random_state and fit it on the
# training split in one step (fit returns the estimator itself)
rf = RandomForestRegressor(
    n_estimators=1000,
    random_state=42,
).fit(train_features, train_labels)

# Score the fitted model on the same rows it was trained on
rf.score(train_features, train_labels)

0.9757898555278071

In [8]:
# Tools for rendering a single tree as an image
from sklearn.tree import export_graphviz
import pydot

# Column labels of the feature frame, used to label the split nodes
features_list = list(X.columns)

# Take one fitted tree (index 5) out of the forest as a representative example
tree = rf.estimators_[5]

# Dump that tree to a Graphviz dot file
dot_path = 'ins_tree.dot'
export_graphviz(
    tree,
    out_file=dot_path,
    feature_names=features_list,
    rounded=True,
    precision=1,
)

# Parse the dot file back in; the unpacking expects exactly one graph in it
[graph] = pydot.graph_from_dot_file(dot_path)

# Render the graph to a png file
graph.write_png('ins_tree.png')

In [9]:
import pickle

# save the model to disk
# The `with` block closes (and therefore flushes) the file as soon as the dump
# finishes, so the pickle is complete on disk before any later cell reopens it.
filename = 'insurance_random_forest.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of the encoded frame
insurance_dummies.describe()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265,0.505232,0.204783,0.2429,0.272048,0.2429
std,14.04996,6.098187,1.205493,12110.011237,0.50016,0.403694,0.428995,0.445181,0.428995
min,18.0,15.96,0.0,1121.8739,0.0,0.0,0.0,0.0,0.0
25%,27.0,26.29625,0.0,4740.28715,0.0,0.0,0.0,0.0,0.0
50%,39.0,30.4,1.0,9382.033,1.0,0.0,0.0,0.0,0.0
75%,51.0,34.69375,2.0,16639.912515,1.0,0.0,0.0,1.0,0.0
max,64.0,53.13,5.0,63770.42801,1.0,1.0,1.0,1.0,1.0


In [11]:
# Test case: one hand-made individual to run through the model
age, children = 29, 0

# Body measurements; BMI is weight (kg) divided by height (m) squared
Height, Weight = 1.52, 55
bmi = Weight / Height ** 2

# Indicator flags, named like the one-hot encoded columns (1 = applies, 0 = does not)
sex_male, smoker_yes = 0, 0
region_northwest, region_southeast, region_southwest = 0, 1, 0

In [19]:
# load the model from disk
# NOTE: pickle.load can execute arbitrary code embedded in the file. Only unpickle
# files you produced yourself (here: the file written by the save cell above).
with open('insurance_random_forest.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Score the reloaded model on the held-out test split
result = loaded_model.score(test_features, test_labels)
print(f'Model score: {result}')

# Build the single-row input as a named frame and order its columns to match the
# feature frame. A bare positional list silently depends on argument order; with
# names, a missing or misspelled feature raises a KeyError instead of producing
# a wrong prediction.
test_case = pd.DataFrame([{
    'age': age,
    'bmi': bmi,
    'children': children,
    'sex_male': sex_male,
    'smoker_yes': smoker_yes,
    'region_northwest': region_northwest,
    'region_southeast': region_southeast,
    'region_southwest': region_southwest,
}])[list(test_features.columns)]

prediction = loaded_model.predict(test_case)
print(f'Our prediction for your annual health insurance cost: ${round(prediction[0],2)}')

Model score: 0.8637498540336975
Our prediction for your annual health insurance cost: $5904.7
