In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("insurance_cleaned.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0,19,27.9,0,16884.92,1,0,0,1,0,0,0,1
1,1,18,33.77,1,1725.55,0,1,1,0,0,0,1,0
2,2,28,33.0,3,4449.46,0,1,1,0,0,0,1,0
3,3,33,22.705,0,21984.47,0,1,1,0,0,1,0,0
4,4,32,28.88,0,3866.86,0,1,1,0,0,1,0,0


In [3]:
df = df.drop(columns='Unnamed: 0')
df.head()

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.92,1,0,0,1,0,0,0,1
1,18,33.77,1,1725.55,0,1,1,0,0,0,1,0
2,28,33.0,3,4449.46,0,1,1,0,0,0,1,0
3,33,22.705,0,21984.47,0,1,1,0,0,1,0,0
4,32,28.88,0,3866.86,0,1,1,0,0,1,0,0


In [4]:
target = np.array(df.charges)
print(target)

[16884.92  1725.55  4449.46 ...  1629.83  2007.94 29141.36]


In [5]:
# Remove target an unnecessary columns
data = df.drop(['charges', 'sex_male', 'smoker_yes'], axis = 1)
data.head()

Unnamed: 0,age,bmi,children,sex_female,smoker_no,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,1,0,0,0,0,1
1,18,33.77,1,0,1,0,0,1,0
2,28,33.0,3,0,1,0,0,1,0
3,33,22.705,0,0,1,0,1,0,0
4,32,28.88,0,0,1,0,1,0,0


In [6]:
data_x = df.drop(['charges', 'sex_male', 'smoker_yes'], axis = 1)
data_x = list(data_x.columns.values)

In [7]:
# Convert to numpy array
data = np.array(data)

In [8]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [10]:
print('Training Data Shape:', X_train.shape)
print('Training Target Shape:', y_train.shape)
print('Testing Data Shape:', X_test.shape)
print('Testing Target Shape:', y_test.shape)

Training Data Shape: (1003, 9)
Training Target Shape: (1003,)
Testing Data Shape: (335, 9)
Testing Target Shape: (335,)


In [11]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

In [12]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

In [13]:
# Train the model on training data
rf.fit(X_train, y_train);

In [14]:
# Use the forest's predict method on the test data
predictions = rf.predict(X_test)

In [15]:
# Calculate the absolute errors
errors = abs(predictions - y_test)

In [16]:
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))

Mean Absolute Error: 2565.94


In [17]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test)

In [18]:
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 72.42 %.


In [19]:
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot

In [20]:
# Pull out one tree from the forest
tree = rf.estimators_[5]

In [21]:
# Export the image to a dot file
export_graphviz(tree, out_file = 'tree.dot', feature_names = data_x, rounded = True, precision = 1)

In [22]:
# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('tree.dot')

In [23]:
# Write graph to a png file
graph.write_png('tree.png')

In [24]:
# Limit depth of tree to 3 levels
rf_small = RandomForestRegressor(n_estimators=10, max_depth = 3)
rf_small.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [25]:
# Extract the small tree
tree_small = rf_small.estimators_[5]

In [26]:
# Save the tree as a png image
export_graphviz(tree_small, out_file = 'small_tree.dot', feature_names = data_x, rounded = True, precision = 1)

In [27]:
(graph, ) = pydot.graph_from_dot_file('small_tree.dot')

In [28]:
graph.write_png('small_tree.png')

In [29]:
# Get numerical feature importances
importances = list(rf.feature_importances_)

In [30]:
# List of tuples with variable and importance
data_importances = [(data, round(importance, 2)) for data, importance in zip(data_x, importances)]

In [31]:
# Sort the feature importances by most important first
data_importances = sorted(data_importances, key = lambda x: x[1], reverse = True)

In [32]:
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in data_importances]

Variable: smoker_no            Importance: 0.61
Variable: bmi                  Importance: 0.21
Variable: age                  Importance: 0.14
Variable: children             Importance: 0.02
Variable: sex_female           Importance: 0.01
Variable: region_northeast     Importance: 0.01
Variable: region_northwest     Importance: 0.0
Variable: region_southeast     Importance: 0.0
Variable: region_southwest     Importance: 0.0


[None, None, None, None, None, None, None, None, None]

In [33]:
# New random forest with only the two most important variables
rf_most_important = RandomForestRegressor(n_estimators= 1000, random_state=42)

In [34]:
# Extract the two most important features
important_indices = [data_x.index('smoker_no'), data_x.index('bmi'), data_x.index('age'), 
                     data_x.index('children'), data_x.index('sex_female'), data_x.index('region_northeast')]
train_important = X_train[:, important_indices]
test_important = X_test[:, important_indices]

In [35]:
# Train the random forest
rf_most_important.fit(train_important, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [36]:
# Make predictions and determine the error
predictions = rf_most_important.predict(test_important)

In [37]:
errors = abs(predictions - y_test)

In [38]:
# Display the performance metrics
print('Mean Absolute Error:', round(np.mean(errors), 2))

Mean Absolute Error: 2546.59


In [39]:
mapes = np.mean(100 * (errors / y_test))
accuracy_important = 100 - mapes

In [40]:
print('Accuracy:', round(accuracy_important, 2), '%.')

Accuracy: 73.9 %.


In [41]:
# People usually think age is the major factor in determining health insurance cost, but surprisingly it is third on the list.
# Whether or not a person smokes and those with unhealthy BMIs have more importance placed on them than age, with the smokers 
# variable having almost three times as much importance than the second most important factor