*Note: You are currently reading this using Google Colaboratory which is a cloud-hosted version of Jupyter Notebook. This is a document containing both text cells for documentation and runnable code cells. If you are unfamiliar with Jupyter Notebook, watch this 3-minute introduction before starting this challenge: https://www.youtube.com/watch?v=inN8seMm7UI*

---

In this challenge, you will predict healthcare costs using a regression algorithm.

You are given a dataset that contains information about different people including their healthcare costs. Use the data to predict healthcare costs based on new data.

The first two cells of this notebook import libraries and the data.

Make sure to convert categorical data to numbers. Use 80% of the data as the `train_dataset` and 20% of the data as the `test_dataset`.

`pop` off the "expenses" column from these datasets to create new datasets called `train_labels` and `test_labels`. Use these labels when training your model.

Create a model and train it with the `train_dataset`. Run the final cell in this notebook to check your model. The final cell will use the unseen `test_dataset` to check how well the model generalizes.

To pass the challenge, `model.evaluate` must return a Mean Absolute Error of under 3500. This means that, on average, the model's predicted healthcare costs are within $3500 of the true values.

The final cell will also predict expenses using the `test_dataset` and graph the results.

In [None]:
# Import libraries. You may or may not use all of these.
#!pip install -q git+https://github.com/tensorflow/docs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

from tensorflow.keras.layers.experimental import preprocessing

In [None]:
# Import data
# Alternative download route (unused): let Keras fetch and cache the file.
# dataset_path = keras.utils.get_file("insurance.csv", "https://cdn.freecodecamp.org/project-data/health-costs/insurance.csv")
# Notebook shell escape: fetch the CSV so that the read_csv call below can
# open it by its bare file name.
!wget "https://cdn.freecodecamp.org/project-data/health-costs/insurance.csv"
# Load the full table; `costs` is the frame every later cell works from.
costs = pd.read_csv('insurance.csv')
# Last expression of the cell, so the notebook displays the trailing rows.
costs.tail()

In [None]:
# Pandas dataframe to tensorflow.data dataset.
# Requires whole dataframe, with features to use and target.
# https://www.tensorflow.org/tutorials/structured_data/preprocessing_layers
def df_to_ds(df, target, shuffle=True, batch=32):
    """Turn a DataFrame into a batched ``tf.data.Dataset``.

    Args:
        df: DataFrame holding both the feature columns and the target column.
            It is copied first, so the caller's frame is left untouched.
        target: Name of the column to split off as the label.
        shuffle: When true, shuffle with a buffer as large as the frame.
        batch: Batch size; the same number is reused as the prefetch
            buffer size.

    Returns:
        A dataset yielding ``(features, labels)`` pairs, where ``features``
        is a dict keyed by column name.
    """
    frame = df.copy()
    target_values = frame.pop(target)
    columns = dict(frame)

    dataset = tf.data.Dataset.from_tensor_slices((columns, target_values))

    if shuffle:
        # Buffer covers every row, so the shuffle spans the whole frame.
        dataset = dataset.shuffle(buffer_size=len(frame))

    return dataset.batch(batch).prefetch(batch)

In [None]:
def get_normalization_layer(dataset, feature):
    """Return a ``Normalization`` layer adapted to one feature of `dataset`.

    Args:
        dataset: Dataset of ``(features, label)`` pairs where ``features``
            can be indexed by `feature`.
        feature: Key of the feature whose statistics are learned.

    Returns:
        The adapted normalization layer.
    """
    def select_feature(features, _label):
        # Drop the label and keep only the requested feature.
        return features[feature]

    layer = preprocessing.Normalization()
    layer.adapt(dataset.map(select_feature))
    return layer

In [None]:
def get_category_encoding_layer(dataset, feature, dtype, max_tokens=None):
    """Build a callable that category-encodes one feature.

    A lookup layer (``StringLookup`` when `dtype` is ``'string'``, otherwise
    ``IntegerLookup``) is adapted to the feature's values, and a
    ``CategoryEncoding`` layer is then adapted to the resulting indices.

    Args:
        dataset: Dataset of ``(features, label)`` pairs where ``features``
            can be indexed by `feature`.
        feature: Key of the feature to encode.
        dtype: ``'string'`` selects ``StringLookup``; any other value selects
            ``IntegerLookup``.
        max_tokens: Forwarded to the lookup layer as its size limit.

    Returns:
        A function applying the lookup and then the encoding to its input.
    """
    # NOTE(review): `max_values`, `vocab_size()` and
    # `CategoryEncoding(max_tokens=...)` are the spellings this file uses from
    # the experimental preprocessing module - confirm they match the
    # TensorFlow version in use.
    lookup = (
        preprocessing.StringLookup(max_tokens=max_tokens)
        if dtype == 'string'
        else preprocessing.IntegerLookup(max_values=max_tokens)
    )

    # Raw values of just this feature, labels dropped.
    raw_values = dataset.map(lambda features, _label: features[feature])

    # Learn the set of values and assign each a fixed integer index.
    lookup.adapt(raw_values)

    # Encoder sized to the vocabulary the lookup layer just learned.
    encoder = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size())

    # Adapt the encoder on the integer indices, not on the raw values.
    encoder.adapt(raw_values.map(lookup))

    def encode(values):
        # Closure keeps both layers alive so they can be wired into the
        # functional model later.
        return encoder(lookup(values))

    return encode

In [None]:
# Process costs.
# Columns:
# 'age':  integer (categorical?)
# 'sex':  categorical
# 'bmi':  float
# 'children':  integer
# 'smoker':  categorical
# 'region':  categorical
# 'expenses':  float; training target

# Split 70%/20%/10% training/validation/testing.
# pandas.DataFrame.sample():  Set random_state to reproduce partitioning.
# https://stackoverflow.com/a/38251213/12968623

# Shuffle every row once; the fixed seed keeps the partition reproducible.
shuffled = costs.sample(frac=1, random_state=42)
train_end = int(0.70 * len(costs))
val_end = int(0.90 * len(costs))

# Data frames.
# Positional slices give the same three partitions as
# np.split(shuffled, [train_end, val_end]) without routing a DataFrame
# through NumPy's splitter, which depends on the deprecated
# DataFrame.swapaxes.
train_d = shuffled.iloc[:train_end]
val_d = shuffled.iloc[train_end:val_end]
test_d = shuffled.iloc[val_end:]

# Features.
train_f = train_d.copy()
val_f = val_d.copy()
test_f = test_d.copy()

# Labels (targets).  pop() removes 'expenses' from the feature copies.
train_l = train_f.pop('expenses')
val_l = val_f.pop('expenses')
test_l = test_f.pop('expenses')

# Batched datasets built from the un-popped frames (df_to_ds splits off the
# target itself).  Only the training set is shuffled; validation and test
# keep row order.
train_ds = df_to_ds(train_d, 'expenses', shuffle=True, batch=256)
val_ds = df_to_ds(val_d, 'expenses', shuffle=False, batch=256)
test_ds = df_to_ds(test_d, 'expenses', shuffle=False, batch=256)

# Build model.

# Numeric features:  age, bmi, children.
numerics = ['age', 'bmi', 'children']
# numerics = ['bmi', 'children']
# numerics = ['bmi']
# Categorical features:  sex, smoker, region.
categoricals = ['sex', 'smoker', 'region']
# categoricals = ['sex', 'smoker']
# categoricals = ['smoker']
# Categorical features as integers:  age.  Maybe?
# categorical_integers = ['age']
categorical_integers = []

# One Keras Input per feature, plus the matching preprocessed tensor.
# Both lists are filled in the same order.
inputs = []
encodeds = []

# NOTE: the per-feature Input is named `feature_input`, not `input`, so the
# Python builtin input() is not shadowed for the rest of the notebook session.

# Numeric features:  normalized with statistics learned from train_ds.
for feature in numerics:
    feature_input = tf.keras.Input(shape=(1,), name=feature)
    normalization = get_normalization_layer(train_ds, feature)
    encoded = normalization(feature_input)
    inputs.append(feature_input)
    encodeds.append(encoded)

# Categorical features encoded as integers.
for feature in categorical_integers:
    feature_input = tf.keras.Input(shape=(1,), name=feature, dtype='int64')
    encoding = get_category_encoding_layer(train_ds,
                                           feature,
                                           dtype='int64',
                                           max_tokens=5)
    encoded = encoding(feature_input)
    inputs.append(feature_input)
    encodeds.append(encoded)

# Categorical features encoded as strings.
for feature in categoricals:
    feature_input = tf.keras.Input(shape=(1,), name=feature, dtype='string')
    encoding = get_category_encoding_layer(train_ds,
                                           feature,
                                           dtype='string',
                                           max_tokens=5)
    encoded = encoding(feature_input)
    inputs.append(feature_input)
    encodeds.append(encoded)

# Join every preprocessed feature into a single tensor.
encoded_layers = tf.keras.layers.concatenate(encodeds)

# Hidden stack as (units, dropout rate) pairs, applied top to bottom.
# A rate of None means the Dense layer is not followed by Dropout.
hidden_stack = [
    (4096, 0.6),
    (2048, 0.4),
    (1024, 0.4),
    (512, 0.4),
    (256, 0.4),
    (128, 0.4),
    (64, None),
]

x = encoded_layers
for units, rate in hidden_stack:
    x = tf.keras.layers.Dense(units, activation='relu')(x)
    if rate is not None:
        x = tf.keras.layers.Dropout(rate)(x)

# Single linear unit:  the predicted expenses.
output = tf.keras.layers.Dense(1)(x)
model = tf.keras.Model(inputs, output)

# Train directly on mean absolute error; MAE and MSE are also reported as
# metrics.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.MeanAbsoluteError(),
    metrics=['mae', 'mse'],
)

tf.keras.utils.plot_model(
    model=model,
    rankdir="LR",
    dpi=72,
    show_shapes=True,
)

epochs = 30
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
)

# Names the challenge's final cell expects.
test_dataset = test_f
test_labels = test_l

In [None]:
# RUN THIS CELL TO TEST YOUR MODEL. DO NOT MODIFY CONTENTS?
# Test model by checking how well the model generalizes using the test set.
# The template's call is kept below for reference.  This notebook evaluates
# on test_ds instead, the batched dataset that carries the feature dict and
# the labels together.
# loss, mae, mse = model.evaluate(test_dataset, test_labels, verbose=2)
# Three values come back:  the compiled loss plus the 'mae' and 'mse' metrics.
loss, mae, mse = model.evaluate(test_ds, verbose=2)

print("Testing set Mean Abs Error:  ${:9,.2f} expenses".format(mae))

# Challenge pass mark:  mean absolute error below 3500.
if mae < 3500:
  print("You passed the challenge. Great job!")
else:
  print("The Mean Absolute Error must be less than $3,500. Keep trying.")

# Plot predictions.
# test_ds was built with shuffle=False, so the predictions come out in the
# same row order as test_labels and can be plotted against them directly.
test_predictions = model.predict(test_ds).flatten()

# Equal aspect ratio, so the diagonal drawn below marks perfect predictions.
a = plt.axes(aspect='equal')
plt.scatter(test_labels, test_predictions)
plt.xlabel('True values (expenses)')
plt.ylabel('Predictions (expenses)')
# Same limits on both axes.
lims = [0, 50000]
plt.xlim(lims)
plt.ylim(lims)
# Reference line y = x; assigned to `_` so the cell does not echo the result.
_ = plt.plot(lims,lims)