In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("insurance_cleaned.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0,19,27.9,0,16884.92,1,0,0,1,0,0,0,1
1,1,18,33.77,1,1725.55,0,1,1,0,0,0,1,0
2,2,28,33.0,3,4449.46,0,1,1,0,0,0,1,0
3,3,33,22.705,0,21984.47,0,1,1,0,0,1,0,0
4,4,32,28.88,0,3866.86,0,1,1,0,0,1,0,0


In [3]:
df = df.drop(columns='Unnamed: 0')
df.head()

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.92,1,0,0,1,0,0,0,1
1,18,33.77,1,1725.55,0,1,1,0,0,0,1,0
2,28,33.0,3,4449.46,0,1,1,0,0,0,1,0
3,33,22.705,0,21984.47,0,1,1,0,0,1,0,0
4,32,28.88,0,3866.86,0,1,1,0,0,1,0,0


In [4]:
target = np.array(df.charges)
print(target)

[16884.92  1725.55  4449.46 ...  1629.83  2007.94 29141.36]


In [5]:
# Remove target an unnecessary columns
data = df.drop(['charges', 'sex_male', 'smoker_yes'], axis = 1)
data.head()

Unnamed: 0,age,bmi,children,sex_female,smoker_no,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,1,0,0,0,0,1
1,18,33.77,1,0,1,0,0,1,0
2,28,33.0,3,0,1,0,0,1,0
3,33,22.705,0,0,1,0,1,0,0
4,32,28.88,0,0,1,0,1,0,0


In [6]:
df['charges'].max()

63770.43

In [7]:
data_x = df.drop(['charges', 'sex_male', 'smoker_yes'], axis = 1)
data_x = list(data_x.columns.values)

In [8]:
# Convert to numpy array
data = np.array(data)

In [9]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

In [10]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [11]:
print('Training Data Shape:', X_train.shape)
print('Training Target Shape:', y_train.shape)
print('Testing Data Shape:', X_test.shape)
print('Testing Target Shape:', y_test.shape)

Training Data Shape: (1003, 9)
Training Target Shape: (1003,)
Testing Data Shape: (335, 9)
Testing Target Shape: (335,)


In [12]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

In [13]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

In [14]:
# Train the model on training data
rf.fit(X_train, y_train);

In [15]:
# Use the forest's predict method on the test data
predictions = rf.predict(X_test)

In [16]:
# Calculate the absolute errors
errors = abs(predictions - y_test)

In [17]:
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))

Mean Absolute Error: 2565.94


In [18]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test)

In [19]:
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 72.42 %.


In [20]:
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot

In [21]:
# Pull out one tree from the forest
tree = rf.estimators_[5]

In [22]:
# Export the image to a dot file
export_graphviz(tree, out_file = 'tree.dot', feature_names = data_x, rounded = True, precision = 1)

In [23]:
# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('tree.dot')

In [24]:
# Write graph to a png file
graph.write_png('tree.png')

In [25]:
# Limit depth of tree to 3 levels
rf_small = RandomForestRegressor(n_estimators=10, max_depth = 3)
rf_small.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [26]:
# Extract the small tree
tree_small = rf_small.estimators_[5]

In [27]:
# Save the tree as a png image
export_graphviz(tree_small, out_file = 'small_tree.dot', feature_names = data_x, rounded = True, precision = 1)

In [28]:
(graph, ) = pydot.graph_from_dot_file('small_tree.dot')

In [29]:
graph.write_png('small_tree.png')

In [30]:
# Get numerical feature importances
importances = list(rf.feature_importances_)

In [31]:
# List of tuples with variable and importance
data_importances = [(data, round(importance, 2)) for data, importance in zip(data_x, importances)]

In [32]:
# Sort the feature importances by most important first
data_importances = sorted(data_importances, key = lambda x: x[1], reverse = True)

In [33]:
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in data_importances]

Variable: smoker_no            Importance: 0.61
Variable: bmi                  Importance: 0.21
Variable: age                  Importance: 0.14
Variable: children             Importance: 0.02
Variable: sex_female           Importance: 0.01
Variable: region_northeast     Importance: 0.01
Variable: region_northwest     Importance: 0.0
Variable: region_southeast     Importance: 0.0
Variable: region_southwest     Importance: 0.0


[None, None, None, None, None, None, None, None, None]

In [34]:
# New random forest with only the two most important variables
rf_most_important = RandomForestRegressor(n_estimators= 1000, random_state=42)

In [35]:
# Extract the two most important features
important_indices = [data_x.index('smoker_no'), data_x.index('bmi'), data_x.index('age'), 
                     data_x.index('children'), data_x.index('sex_female'), data_x.index('region_northeast')]
train_important = X_train[:, important_indices]
test_important = X_test[:, important_indices]

In [36]:
# Train the random forest
rf_most_important.fit(train_important, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [37]:
# Make predictions and determine the error
predictions = rf_most_important.predict(test_important)
predictions

array([11112.87975   ,  5308.90062   , 28219.14219   , 10246.07138   ,
       34396.03078   ,  8561.16492   ,  1803.83275   , 14134.06682   ,
        5844.54771   , 10654.43267   , 19496.49824   ,  6912.59832   ,
        5601.89234   , 46012.44193   , 48555.91116   , 45248.98848   ,
       10428.35623   , 42901.51313   , 10032.37545   , 24565.61938   ,
        6338.44286   , 10110.42796   ,  1965.07771   ,  2770.79666   ,
       11896.88588   , 11518.40566   , 13463.45059   ,  4887.34001   ,
       11881.12121   ,  3136.21513957,  7925.24772   , 11888.36244   ,
        2325.68074   ,  5575.70501   ,  3440.53456   , 10733.47779   ,
        3640.33997   ,  8680.46484   , 24029.42614   , 39805.8517    ,
       11519.11412   ,  3327.02144   , 12210.33833   , 14279.40747   ,
        5914.46431   , 14469.89896   , 15973.92779   ,  7618.9612    ,
       41950.10112   ,  6026.90124   , 14035.24418   ,  2375.81562357,
        6932.34119   ,  1759.74209   , 11119.15688   , 10832.70185   ,
      

In [38]:
errors = abs(predictions - y_test)

In [39]:
# Display the performance metrics
print('Mean Absolute Error:', round(np.mean(errors), 2))

Mean Absolute Error: 2546.59


In [40]:
mapes = np.mean(100 * (errors / y_test))
accuracy_important = 100 - mapes

In [41]:
print('Accuracy:', round(accuracy_important, 2), '%.')

Accuracy: 73.9 %.


In [42]:
# Calculate the classification report
# from sklearn.metrics import classification_report
# print(classification_report(y_test, predictions, target))

In [43]:
# Calculate classification report
#from sklearn.metrics import classification_report
#print(classification_report(y_test, predictions, target_names=["charges"]))

In [44]:
# Create the SVC Model
# from sklearn.svm import SVC

# model = SVC(kernel='linear')
# model

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [50]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 10, 100, 1000, 10000, 100000],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [52]:
# Fit the model using the grid search estimator. 
# This will take the SVC model and try each combination of parameters
grid.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] C=1, gamma=0.0001 ...............................................


ValueError: Unknown label type: 'continuous'

In [None]:
# People usually think age is the major factor in determining health insurance cost, but surprisingly it is third on the list.
# Whether or not a person smokes and those with unhealthy BMIs have more importance placed on them than age, with the smokers 
# variable having almost three times as much importance than the second most important factor

In [None]:
# Improve accuracy:
# https://www.analyticsvidhya.com/blog/2015/12/improve-machine-learning-results/
    # Ensemble learning - most common approach
    # Treat outliers
    # Normalization
    # Feature creation - deriving new variables to unleash hidden relationships in the dataset
    # Algorithm tuning 
    # Cross validation
    
# https://www.analyticsvidhya.com/blog/2015/06/tuning-random-forest-model/
    # oob_score

In [None]:
# TODO

# create two line graphs for three clusters in R code
# How do we deal with the region columns?