In [None]:
# Install third-party packages into the kernel's environment; the imports themselves are in the next cell.
# NOTE(review): none of these installs is version-pinned, so the environment may differ between runs — consider pinning.
# NOTE(review): the tensorflow/docs package is not imported by any cell visible here — confirm it is still needed.
%pip install -q git+https://github.com/tensorflow/docs
%pip install scikit-learn
%pip install seaborn

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
from keras.callbacks import EarlyStopping

In [None]:
# Import data
!wget https://cdn.freecodecamp.org/project-data/health-costs/insurance.csv
dataset = pd.read_csv('insurance.csv')
dataset.tail()

In [None]:
# Column names, dtypes and non-null counts of the raw data.
dataset.info()

In [None]:
# Summary statistics of the raw data (pandas default settings).
dataset.describe()

In [None]:
# Number of missing values per column.
dataset.isna().sum()

In [None]:
# Seeded random split: each row gets a uniform draw in [0, 1); rows whose draw is
# below 0.8 go to the training set, the remaining rows to the test set.
np.random.seed(42)
msk = np.random.rand(dataset.shape[0]) < 0.8

train, test = dataset[msk], dataset[~msk]

In [None]:
# Separate the 'expenses' target column from the feature columns in both splits.
train_dataset, train_labels = train.drop(columns=['expenses']), train['expenses']
test_dataset, test_labels = test.drop(columns=['expenses']), test['expenses']

In [None]:
# Print the shapes of the features and labels, training split first, then test split.
for split_part in (train_dataset, train_labels, test_dataset, test_labels):
    print(split_part.shape)

In [None]:
# Histogram of the training target ('expenses').
train_labels.hist()
# Print the training target values as text.
print(train_labels)

In [None]:
# Category counts of the three categorical features in the training split,
# drawn as one bar chart per feature.
fig, ax = plt.subplots(1, 3, figsize=(20, 10))

sex_plt = train_dataset['sex'].value_counts()
smoker_plt = train_dataset['smoker'].value_counts()
region_plt = train_dataset['region'].value_counts()

panels = zip(ax, [('sex', sex_plt), ('smoker', smoker_plt), ('region', region_plt)])
for panel, (column_name, counts) in panels:
    panel.bar(counts.index, counts.values)
    # Title and axis labels so each panel can be read on its own.
    panel.set(title=f'Training rows per {column_name}', xlabel=column_name, ylabel='count')

# Render explicitly; this also keeps the BarContainer repr out of the cell output.
plt.show()

In [None]:
# Transform the data of the training dataset:
#   * one-hot encode the categorical columns,
#   * min-max scale the numeric columns.
# Both transformers are fitted here, on the training split only.

categorical_cols = ['sex', 'smoker', 'region']
numeric_cols = ['age', 'bmi', 'children']

# Start from the untouched `train` split rather than from `train_dataset`, which
# this cell overwrites. That keeps the cell re-runnable: previously a second run
# raised KeyError because the categorical columns had already been dropped.
train_features_raw = train.drop(columns=['expenses']).reset_index(drop=True)

one_hot_enc = OneHotEncoder(handle_unknown='error')
codes = one_hot_enc.fit_transform(train_features_raw[categorical_cols]).toarray()
feature_names_one_hot = one_hot_enc.get_feature_names_out(categorical_cols)

minmaxscaler = MinMaxScaler()
transformed_train_dataset = minmaxscaler.fit_transform(train_features_raw[numeric_cols])

codes_df = pd.DataFrame(data=codes, columns=feature_names_one_hot)

# Replace the numeric columns with their scaled values, drop the raw categorical
# columns and append the one-hot columns. Both frames share a fresh 0..n-1 index,
# so the column-wise concat lines up row by row.
scaled_numeric = {
    col: transformed_train_dataset[:, position]
    for position, col in enumerate(numeric_cols)
}
train_dataset = pd.concat(
    [
        train_features_raw.assign(**scaled_numeric).drop(columns=categorical_cols),
        codes_df,
    ],
    axis=1,
)
train_dataset.head(10)

In [None]:
# Transform the test features with the encoder and scaler that were fitted on the
# training split. Only `transform` is called here; nothing is re-fitted on test data.

categorical_cols = ['sex', 'smoker', 'region']
numeric_cols = ['age', 'bmi', 'children']

# Start from the untouched `test` split rather than from `test_dataset`, which
# this cell overwrites. That keeps the cell re-runnable: previously a second run
# raised KeyError because the categorical columns had already been dropped.
test_features_raw = test.drop(columns=['expenses']).reset_index(drop=True)

codes = one_hot_enc.transform(test_features_raw[categorical_cols]).toarray()
feature_names_one_hot = one_hot_enc.get_feature_names_out(categorical_cols)

transformed_test_dataset = minmaxscaler.transform(test_features_raw[numeric_cols])

codes_df = pd.DataFrame(data=codes, columns=feature_names_one_hot)

# Replace the numeric columns with their scaled values, drop the raw categorical
# columns and append the one-hot columns. Both frames share a fresh 0..n-1 index,
# so the column-wise concat lines up row by row.
scaled_numeric = {
    col: transformed_test_dataset[:, position]
    for position, col in enumerate(numeric_cols)
}
test_dataset = pd.concat(
    [
        test_features_raw.assign(**scaled_numeric).drop(columns=categorical_cols),
        codes_df,
    ],
    axis=1,
)
test_dataset.head(10)

In [None]:
# Normalize the target labels to [0, 1]. Smaller targets give smaller gradients,
# which makes the weight updates and the training process more stable.
#
# The scaler is fitted on the training labels only and then applied to the test labels.
# Labels are taken from the raw `train` / `test` frames instead of from
# `train_labels` / `test_labels`, which this cell overwrites with NumPy arrays;
# previously a second run failed with AttributeError on `.values`.
#
# Any loss or metric computed against these labels is in scaled units; use
# `minmaxscaler_target.inverse_transform` to get back to expense units.
minmaxscaler_target = MinMaxScaler()
train_labels = minmaxscaler_target.fit_transform(train[['expenses']].to_numpy()).flatten()
test_labels = minmaxscaler_target.transform(test[['expenses']].to_numpy()).flatten()

In [None]:
# Plot the features

features = ['age', 'bmi', 'children']
features_encoded = [
    'sex_female', 'sex_male',
    'smoker_no', 'smoker_yes',
    'region_northeast', 'region_northwest', 'region_southeast', 'region_southwest',
]
print(features)

# Numeric features: joint plot against the training labels with a regression fit.
for numeric_col in features:
    sns.jointplot(x=numeric_col, y=train_labels, data=train_dataset, kind="reg")
plt.show()

# Boxplot the binary features
for figure_id, indicator_col in enumerate(features_encoded):
    plt.figure(figure_id)
    sns.boxplot(x=indicator_col, y=train_labels, data=train_dataset)
plt.show()

In [None]:
# sns.pairplot(train_dataset[features])

In [None]:
# Pearson correlation -> correlate the features with the expenses target label

#corr_matrix = train_dataset.corr(numeric_only=True)
#corr_matrix_age = train_dataset['age'].sort_values(ascending=False)

In [None]:
# Verify that everything is in order to pass to the model:
# for the train split and then the test split, print the feature shape,
# the label shape and the feature frame's column/dtype overview.
for split_features, split_labels in (
    (train_dataset, train_labels),  # Train dataset
    (test_dataset, test_labels),    # Test dataset
):
    print(split_features.shape)
    print(split_labels.shape)
    split_features.info()

In [None]:
# Model

# Seed TensorFlow's global generator so weight initialisation is repeatable.
# The NumPy seed set for the train/test split does not cover TensorFlow.
tf.random.set_seed(42)

# Fully connected regression network: three ReLU hidden layers (100, 50, 25 units)
# followed by a single linear output unit.
model = keras.Sequential(
    [
        # Explicit Input object instead of `input_shape=` on the first Dense layer,
        # which newer Keras releases warn about. One input per feature column.
        keras.Input(shape=(train_dataset.shape[1],)),
        layers.Dense(units=100, activation="relu"),
        layers.Dense(units=50, activation="relu"),
        layers.Dense(units=25, activation="relu"),
        layers.Dense(units=1),
    ]
)

# Optimise mean absolute error; 'mae' and 'mse' are also tracked as metrics, so
# `model.evaluate` returns three values (loss, mae, mse).
model.compile(optimizer='adam',
              loss=tf.keras.losses.MeanAbsoluteError(),
              metrics=['mae', 'mse'])

model.summary()

In [None]:
# Train with early stopping on a validation slice.
#
# * Validation data is held out from the TRAINING data via `validation_split`
#   (Keras takes the last 20% of the rows, before shuffling). The test set used to
#   be passed as validation data, which let it drive early stopping and best-weight
#   restoration, so the final test score was no longer an unbiased estimate.
# * `epochs` is an upper bound well above `patience`. With the previous
#   epochs == patience == 50 the callback could never stop training: it needs
#   `patience` non-improving epochs after the first one.
es = EarlyStopping(monitor='val_loss',
                   mode='min',
                   patience=50,
                   restore_best_weights=True)

history = model.fit(train_dataset, train_labels,
                    validation_split=0.2,
                    callbacks=[es],
                    epochs=500,
                    batch_size=50,
                    verbose=1)

print(model.metrics_names)

In [None]:
# RUN THIS CELL TO TEST YOUR MODEL.
# Test model by checking how well the model generalizes using the test set.
#
# The labels were min-max scaled before training, so Keras reports loss, mae and mse
# in scaled units. The MAE is converted back to expense units before it is
# compared with the 3500 threshold; comparing the scaled value would always pass.
loss, mae, mse = model.evaluate(test_dataset, test_labels, verbose=2)

# MinMaxScaler maps x -> x * scale_ + min_, so absolute errors are multiplied by
# scale_; dividing by it restores the original units.
mae_expenses = mae / minmaxscaler_target.scale_[0]

print("Testing set Mean Abs Error: {:5.2f} expenses".format(mae_expenses))

if mae_expenses < 3500:
  print("You passed the challenge. Great job!")
else:
  print("The Mean Abs Error must be less than 3500. Keep trying.")

# Plot predictions.
test_predictions = model.predict(test_dataset).flatten()

# Convert both axes back to expense units so the plot matches its axis labels.
true_expenses = minmaxscaler_target.inverse_transform(
    np.asarray(test_labels).reshape(-1, 1)).flatten()
predicted_expenses = minmaxscaler_target.inverse_transform(
    test_predictions.reshape(-1, 1)).flatten()

a = plt.axes(aspect='equal')
plt.scatter(true_expenses, predicted_expenses)
plt.xlabel('True values (expenses)')
plt.ylabel('Predictions (expenses)')
# Same limits on both axes so the diagonal marks a perfect prediction.
lims = [0, float(max(true_expenses.max(), predicted_expenses.max()))]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims,lims)
