In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import io
import requests

# Load the insurance dataset from the freeCodeCamp CDN. If anything goes wrong
# (network failure, timeout, non-200 response, unparsable body) fall back to a
# synthetic dataset with the same columns so the rest of the script still runs.
try:
    # Browser-like User-Agent header sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    url = 'https://cdn.freecodecamp.org/project-data/health-costs/insurance.csv'
    # requests has no default timeout: without one a stalled connection would
    # block forever and the fallback below would never be reached.
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code == 200:
        dataset = pd.read_csv(io.StringIO(response.text))
        print("Successfully loaded the dataset using requests!")
    else:
        raise Exception(f"Failed to load data: HTTP {response.status_code}")

except Exception as e:
    # Deliberately broad: any failure on the download path triggers the
    # best-effort fallback instead of aborting the script.
    print(f"Error loading dataset from URL: {e}")
    print("Creating a sample dataset based on the expected structure...")

    # Fixed seed so the synthetic data is reproducible between runs.
    np.random.seed(42)
    n_samples = 1338  # Approximate size of the original dataset

    # Feature columns. randint's upper bound is exclusive, so age is 18..64
    # and children is 0..5.
    age = np.random.randint(18, 65, n_samples)
    sex = np.random.choice(['male', 'female'], n_samples)
    bmi = np.random.normal(30, 6, n_samples).clip(15, 50)
    children = np.random.randint(0, 6, n_samples)
    smoker = np.random.choice(['yes', 'no'], n_samples, p=[0.2, 0.8])
    region = np.random.choice(['northeast', 'northwest', 'southeast', 'southwest'], n_samples)

    # Target column: linear in age and bmi, a flat surcharge for smokers,
    # a per-child increment, plus Gaussian noise; clipped to [1000, 60000].
    base_expenses = 5000 + 100 * (age - 18) + 500 * (bmi - 20)
    smoker_effect = np.where(smoker == 'yes', 20000, 0)
    children_effect = 2000 * children
    random_effect = np.random.normal(0, 4000, n_samples)
    expenses = (base_expenses + smoker_effect + children_effect + random_effect).clip(1000, 60000)

    # Assemble the DataFrame with the column names the rest of the script uses.
    data = {
        'age': age,
        'sex': sex,
        'bmi': bmi,
        'children': children,
        'smoker': smoker,
        'region': region,
        'expenses': expenses
    }
    dataset = pd.DataFrame(data)
    print("Created synthetic dataset for demonstration purposes.")

# Quick look at the raw data: first rows, dimensions, dtypes/null counts
# and summary statistics.
print("\nDataset head:")
print(dataset.head())

print(f"\nDataset shape: {dataset.shape}")

print("\nData info:")
dataset.info()

print("\nStatistical summary:")
print(dataset.describe())

# List the distinct values of every object-dtype (categorical) column.
print("\nUnique values in categorical columns:")
for col, values in dataset.select_dtypes(include=['object']).items():
    print(f"{col}: {values.unique()}")

# Work on a copy so the raw `dataset` stays untouched.
processed_dataset = dataset.copy()

# Convert sex to numerical (0 for female, 1 for male)
processed_dataset['sex'] = processed_dataset['sex'].map({'female': 0, 'male': 1})

# Convert smoker to numerical (0 for no, 1 for yes)
processed_dataset['smoker'] = processed_dataset['smoker'].map({'no': 0, 'yes': 1})

# One-hot encode the 'region' column.
# dtype=int is explicit on purpose: pandas >= 2.0 produces bool dummy columns
# by default, and DataFrame.describe() only summarises numeric columns, so the
# mean/std normalisation further down would have no statistics for the region
# columns and would turn them into NaN.
processed_dataset = pd.get_dummies(
    processed_dataset, columns=['region'], prefix='region', dtype=int
)

print("\nProcessed dataset head:")
print(processed_dataset.head())

# Hold-out split: a seeded 80% sample of the rows is used for training and
# the rows not drawn form the test set.
train_dataset = processed_dataset.sample(frac=0.8, random_state=0)
test_dataset = processed_dataset.drop(train_dataset.index)

# Separate the regression target; pop() removes 'expenses' from the feature
# frames in place and returns it.
train_labels, test_labels = (
    frame.pop('expenses') for frame in (train_dataset, test_dataset)
)

# Report the resulting shapes.
shape_report = [
    f"\nTraining data shape: {train_dataset.shape}",
    f"Training labels shape: {train_labels.shape}",
    f"Testing data shape: {test_dataset.shape}",
    f"Testing labels shape: {test_labels.shape}",
]
print("\n".join(shape_report))

# Per-feature summary statistics of the training set, one row per feature,
# so that train_stats['mean'] / train_stats['std'] are indexed by column name.
train_stats = train_dataset.describe().T

def norm(x):
    """Standardise *x* with the training-set statistics.

    Subtracts ``train_stats['mean']`` and divides by ``train_stats['std']``.
    The module-level ``train_stats`` is used for every input, so test data is
    scaled with the training set's mean and standard deviation.
    """
    center = train_stats['mean']
    spread = train_stats['std']
    return (x - center) / spread

# Scale both splits with the same (training-set) statistics.
normed_train_data, normed_test_data = (
    norm(frame) for frame in (train_dataset, test_dataset)
)

print("\nNormalized training data head:")
print(normed_train_data.head())

# Build the model
def build_model():
    """Create and compile the baseline regression network.

    Architecture: two 64-unit ReLU hidden layers followed by a single linear
    output unit. The input width is taken from the number of columns in the
    module-level ``train_dataset``. The model is compiled with MSE loss, an
    RMSprop optimizer (0.001) and MAE/MSE metrics.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    n_features = len(train_dataset.columns)

    model = keras.Sequential()
    model.add(layers.Dense(64, activation='relu', input_shape=[n_features]))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(1))

    model.compile(
        loss='mse',
        optimizer=tf.keras.optimizers.RMSprop(0.001),
        metrics=['mae', 'mse'],
    )
    return model

model = build_model()

# Display model summary
print("\nModel summary:")
model.summary()

# Stop once val_loss has not improved for 10 epochs. restore_best_weights=True
# rolls the model back to the best val_loss epoch; without it the model keeps
# the weights from the last epoch run, i.e. after the epochs without improvement.
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

print("\nTraining the model...")
# validation_split=0.2 holds out part of the training data for the val_*
# metrics that early stopping monitors.
history = model.fit(
    normed_train_data, train_labels,
    epochs=100,
    validation_split=0.2,
    verbose=1,
    callbacks=[early_stop]
)

# Collect the per-epoch metrics into a DataFrame, with the epoch number as
# an extra column, and draw train vs. validation curves side by side.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)

# (metric key, y-axis label) for the left and right panels respectively.
panels = (
    ('mae', 'Mean Abs Error [expenses]'),
    ('mse', 'Mean Square Error [$expenses^2$]'),
)

plt.figure(figsize=(12, 4))
for position, (metric, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.plot(hist['epoch'], hist[metric], label='Train Error')
    plt.plot(hist['epoch'], hist[f'val_{metric}'], label='Val Error')
    plt.legend()
plt.show()

# Score the trained model on the held-out test split. evaluate() returns the
# loss followed by the compiled metrics, unpacked here as loss, mae, mse.
evaluation = model.evaluate(normed_test_data, test_labels, verbose=2)
loss, mae, mse = evaluation
print(f"\nTesting set Mean Abs Error: ${mae:0.2f}")

# Predict on the test split and flatten the output to a 1-D array.
raw_predictions = model.predict(normed_test_data)
test_predictions = raw_predictions.flatten()

# Scatter the predictions against the true values; points on the diagonal
# are perfect predictions.
plt.figure(figsize=(10, 6))
plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values [expenses]')
plt.ylabel('Predictions [expenses]')
plt.axis('equal')
plt.axis('square')
plt.xlim([0, plt.xlim()[1]])
plt.ylim([0, plt.ylim()[1]])
# Identity (y = x) reference line spanning the whole visible range. The
# previous hard-coded end point of 5000 covered only a small corner of the
# plot, since expenses extend far beyond that value.
axis_max = max(plt.xlim()[1], plt.ylim()[1])
plt.plot([0, axis_max], [0, axis_max])
plt.title('Model Predictions vs Actual Values')
plt.show()

# Histogram of the signed prediction errors (prediction minus true value).
error = test_predictions - test_labels

plt.figure(figsize=(10, 6))
plt.hist(error, bins=25)
plt.title('Error Distribution')
plt.xlabel("Prediction Error [expenses]")
plt.ylabel("Count")
plt.show()

# Mean absolute error computed directly from the predictions.
mean_abs_error = np.mean(np.abs(error))
print(f"Mean Absolute Error: ${mean_abs_error:0.2f}")

# If the baseline model misses the $3500 MAE target, train a larger network.
# Note: this branch rebinds early_stop, history, loss, mae, mse,
# test_predictions and error with the improved model's results.
if mae > 3500:
    print("\nTrying an improved model...")

    def build_improved_model():
        """Create and compile a deeper network with dropout.

        Architecture: Dense(128) -> Dropout(0.2) -> Dense(64) -> Dropout(0.2)
        -> Dense(32) -> Dense(1), ReLU on the hidden layers. The input width
        is taken from the number of columns in ``train_dataset``. Compiled
        with MSE loss, Adam (learning rate 0.001) and MAE/MSE metrics.

        Returns:
            The compiled ``keras.Sequential`` model.
        """
        model = keras.Sequential([
            layers.Dense(128, activation='relu', input_shape=[len(train_dataset.columns)]),
            layers.Dropout(0.2),
            layers.Dense(64, activation='relu'),
            layers.Dropout(0.2),
            layers.Dense(32, activation='relu'),
            layers.Dense(1)
        ])

        optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

        model.compile(loss='mse',
                      optimizer=optimizer,
                      metrics=['mae', 'mse'])
        return model

    improved_model = build_improved_model()
    print("\nImproved model summary:")
    improved_model.summary()

    # Longer patience than the baseline. restore_best_weights=True rolls the
    # model back to the best val_loss epoch instead of keeping the weights
    # from the last epoch run.
    early_stop = keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=20,
        restore_best_weights=True,
    )
    # Multiply the learning rate by 0.2 after 5 epochs without val_loss
    # improvement, never going below 0.0001.
    reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

    print("\nTraining the improved model...")
    history = improved_model.fit(
        normed_train_data, train_labels,
        epochs=200,
        validation_split=0.2,
        verbose=1,
        callbacks=[early_stop, reduce_lr]
    )

    # Evaluate the improved model on the held-out test split.
    loss, mae, mse = improved_model.evaluate(normed_test_data, test_labels, verbose=2)
    print(f"\nImproved model - Testing set Mean Abs Error: ${mae:0.2f}")

    # Make predictions with the improved model
    test_predictions = improved_model.predict(normed_test_data).flatten()

    # Plot predictions vs actual for improved model
    plt.figure(figsize=(10, 6))
    plt.scatter(test_labels, test_predictions)
    plt.xlabel('True Values [expenses]')
    plt.ylabel('Predictions [expenses]')
    plt.axis('equal')
    plt.axis('square')
    plt.xlim([0, plt.xlim()[1]])
    plt.ylim([0, plt.ylim()[1]])
    # Identity (y = x) reference line across the whole visible range; the
    # previous hard-coded end point of 5000 covered only a corner of the plot.
    axis_max = max(plt.xlim()[1], plt.ylim()[1])
    plt.plot([0, axis_max], [0, axis_max])
    plt.title('Improved Model Predictions vs Actual Values')
    plt.show()

    error = test_predictions - test_labels
    print(f"Improved model - Mean Absolute Error: ${np.mean(np.abs(error)):0.2f}")