In [None]:
# Import libraries. You may or may not use all of these.
!pip install -q git+https://github.com/tensorflow/docs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

In [None]:
# Import data
!wget https://cdn.freecodecamp.org/project-data/health-costs/insurance.csv
dataset = pd.read_csv('insurance.csv')
dataset.tail()

In [None]:
# Data Preprocessing

# Inspect the raw data, then one-hot encode the categorical columns into
# `dataset_processed`, which the later cells split, scale and train on.

# Display the first 5 rows of the dataset to understand its structure
print("First 5 rows of the dataset:")
print(dataset.head())

# Column names, dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("\nDataset Info:")
dataset.info()

# Check for missing values
print("\nMissing values per column:")
print(dataset.isnull().sum())

# Columns holding text categories that must become numbers before training.
categorical_cols = ['sex', 'smoker', 'region']

# One-hot encode: every category becomes its own 0/1 column. All dummies are
# kept (no drop_first); drop_first=True is an option to experiment with if
# the redundant column per feature turns out to hurt the model.
#
# dtype=float is deliberate: pandas 2.x produces bool dummy columns by default,
# and a frame mixing bool with numeric columns converts to an object array,
# which Keras cannot turn into a tensor when the frame is passed to
# model.fit() / model.evaluate().
print("\nConverting categorical data to numerical using one-hot encoding...")
dataset_processed = pd.get_dummies(dataset, columns=categorical_cols, dtype=float)

print("\nDataset after one-hot encoding:")
print(dataset_processed.head())
print("\nProcessed Dataset Info:")
dataset_processed.info()

# The categorical columns ('sex', 'smoker', 'region') are now replaced by
# numeric indicator columns such as 'sex_female', 'sex_male', etc.
# The 'expenses' column is the target variable.


In [None]:
# Cell 3
# Data Split

# scikit-learn helper that shuffles and partitions the rows.
from sklearn.model_selection import train_test_split

# This cell expects `dataset_processed`, the one-hot encoded DataFrame
# produced by the preprocessing step.

# Target (y): the 'expenses' column.
# Features (X): every remaining column.
y = dataset_processed['expenses']
X = dataset_processed.drop(columns=['expenses'])

print("Shape of features (X):", X.shape)
print("Shape of target (y):", y.shape)

# Hold out 20% of the rows for testing; the other 80% is used for training.
# A fixed random_state makes the shuffle, and therefore the split, identical
# on every run so results stay reproducible. Any integer works.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Aliases under the names the project and the final test cell expect.
train_dataset, test_dataset = X_train, X_test
train_labels, test_labels = y_train, y_test

print("\nShape of train_dataset:", train_dataset.shape)
print("Shape of test_dataset:", test_dataset.shape)
print("Shape of train_labels:", train_labels.shape)
print("Shape of test_labels:", test_labels.shape)

# Summary of what is now available:
#   train_dataset - features used for training (80% of the rows)
#   test_dataset  - features used for testing (20% of the rows)
#   train_labels  - expenses matching train_dataset row for row
#   test_labels   - expenses matching test_dataset row for row


In [None]:
# Feature Scaling

# Import the scaler
from sklearn.preprocessing import StandardScaler

# Only the genuinely numeric columns are scaled. The one-hot encoded columns
# ('sex_*', 'smoker_*', 'region_*') are already 0/1 indicators and are left
# untouched.
numerical_cols = ['age', 'bmi', 'children']

print(f"Scaling numerical columns: {numerical_cols}")

# StandardScaler standardizes features by removing the mean and scaling to
# unit variance.
scaler = StandardScaler()

# Work on independent copies. The frames handed back by train_test_split are
# row selections of the full feature frame, so assigning columns into them
# directly can raise pandas' SettingWithCopyWarning and also changes the
# objects that X_train / X_test still refer to. Copying first keeps this
# cell's effect limited to train_dataset and test_dataset.
train_dataset = train_dataset.copy()
test_dataset = test_dataset.copy()

# Fit ONLY on the training data so no statistics leak from the test set ...
train_dataset[numerical_cols] = scaler.fit_transform(train_dataset[numerical_cols])

# ... then reuse the already fitted scaler on the test data
# (.transform(), NOT .fit_transform()).
test_dataset[numerical_cols] = scaler.transform(test_dataset[numerical_cols])

print("\nNumerical features scaled.")

# train_dataset and test_dataset now carry scaled numerical columns; the
# one-hot encoded columns are unchanged.


In [None]:
# Model Definition and Compilation

# Seed Python, NumPy and TensorFlow in one call so that weight initialisation
# and batch shuffling repeat on a fresh "Restart & Run All", in line with the
# fixed random_state used for the train/test split.
# (tf.keras.utils.set_random_seed requires TensorFlow 2.7 or newer.)
tf.keras.utils.set_random_seed(42)

# One input value per feature column (every column except 'expenses').
n_features = train_dataset.shape[1]

# Sequential model: a linear stack of layers.
model = keras.Sequential([
    # Explicit Input object instead of passing input_shape= to the first Dense
    # layer; Keras 3 deprecates the latter for Sequential models and warns.
    keras.Input(shape=(n_features,)),
    # Two hidden layers let the model learn non-linear relationships.
    # The number of layers and units (64) are hyperparameters that can be tuned.
    layers.Dense(64, activation='relu'),
    layers.Dense(64, activation='relu'),
    # Regression output: a single unit with the default (linear) activation.
    layers.Dense(1)
])

# Compile the model: optimizer, loss and the metrics reported during
# training and evaluation.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)  # learning_rate is a hyperparameter

# Loss: Mean Squared Error. MSE penalizes large errors more strongly than
# Mean Absolute Error (MAE); MAE is the metric the project is judged on.
loss_function = 'mse'

# Metrics: keep exactly these two, in this order. The final test cell unpacks
# model.evaluate(...) as `loss, mae, mse`.
metrics_to_monitor = ['mae', 'mse']

model.compile(optimizer=optimizer,
              loss=loss_function,
              metrics=metrics_to_monitor)

# Print the model summary to see the layers and parameter counts
model.summary()

# The model is now defined and compiled, ready for training.


In [None]:
# Model Training

# model.fit() runs the training loop with these arguments:
#   x       - train_dataset, the training features
#   y       - train_labels, the true expenses for those rows
#   epochs  - number of full passes over the training data; more passes can
#             improve the fit but can also lead to overfitting
#   verbose - 0 = silent, 1 = progress bar, 2 = one line per epoch
# No validation_split / validation_data is used here: the project is judged
# on the held-out test_dataset in the final cell.

# Number of passes over the training set; adjust to experiment.
EPOCHS = 200

print("Starting model training...")

history = model.fit(
    x=train_dataset,
    y=train_labels,
    epochs=EPOCHS,
    verbose=1,
)

print("Model training finished.")

# `history` records the loss and metric values of every epoch and can be used
# to plot training progress; the final evaluation itself relies only on the
# test set.


In [None]:
# RUN THIS CELL TO TEST YOUR MODEL. DO NOT MODIFY CONTENTS.
# Test model by checking how well the model generalizes using the test set.
# The three-way unpacking below expects evaluate() to return the loss followed
# by exactly two metric values, in the order they were passed to
# model.compile() (here: 'mae', then 'mse').
loss, mae, mse = model.evaluate(test_dataset, test_labels, verbose=2)

print("Testing set Mean Abs Error: {:5.2f} expenses".format(mae))

# Pass criterion of the challenge: mean absolute error below 3500.
if mae < 3500:
  print("You passed the challenge. Great job!")
else:
  print("The Mean Abs Error must be less than 3500. Keep trying.")

# Plot predictions.
# predict() output is flattened to 1-D so it can be plotted against test_labels.
test_predictions = model.predict(test_dataset).flatten()

# Equal aspect ratio and identical axis limits, so the diagonal drawn at the
# end is the y = x line: points on it are perfect predictions.
a = plt.axes(aspect='equal')
plt.scatter(test_labels, test_predictions)
plt.xlabel('True values (expenses)')
plt.ylabel('Predictions (expenses)')
lims = [0, 50000]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims,lims)
