In [359]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [360]:
df = pd.read_csv("insurance_data.csv")
df.head()

Unnamed: 0,age,affordibility,bought_insurance
0,22,1,0
1,25,0,0
2,47,1,1
3,52,0,0
4,46,1,1


In [361]:
df.shape

(28, 3)

# Task
Prepare the pandas DataFrame `df` for creating a TensorFlow Dataset, then convert it into a `tf.data.Dataset`, shuffle the dataset, and split it into training (70%), validation (15%), and test (15%) sets, finally verifying the splits.

## Prepare Data for TensorFlow

### Subtask:
Separate the features and the target variable from the pandas DataFrame `df` into `X` and `y` respectively, in preparation for creating a TensorFlow Dataset.


**Reasoning**:
Separate the feature columns ('age', 'affordibility') into `X` and the target column ('bought_insurance') into `y` from the DataFrame `df`.



In [362]:
X = df[['age', 'affordibility']]
y = df['bought_insurance']

X['age'] = X['age'] / 100

print("Features (X):")
print(X.head())
print("\nTarget (y):")
print(y.head())


Features (X):
    age  affordibility
0  0.22              1
1  0.25              0
2  0.47              1
3  0.52              0
4  0.46              1

Target (y):
0    0
1    0
2    1
3    0
4    1
Name: bought_insurance, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['age'] = X['age'] / 100


## Create TensorFlow Dataset

### Subtask:
Convert the prepared features (`X`) and target (`y`) into a `tf.data.Dataset` using `tf.data.Dataset.from_tensor_slices`.


**Reasoning**:
To create a TensorFlow Dataset, I will first convert the pandas DataFrame `X` and Series `y` into NumPy arrays, then use `tf.data.Dataset.from_tensor_slices` to create the dataset, and finally print its `element_spec` for verification.



In [363]:
X_numpy = X.values
y_numpy = y.values

dataset = tf.data.Dataset.from_tensor_slices((X_numpy, y_numpy))

print("TensorFlow Dataset created. Element spec:")
print(dataset.element_spec)

TensorFlow Dataset created. Element spec:
(TensorSpec(shape=(2,), dtype=tf.float64, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))


## Shuffle and Split Dataset

### Subtask:
Determine the total number of samples in the dataset and calculate the sizes for the training (70%), validation (15%), and test (15%) sets. Then, shuffle the entire dataset and split it into `train_ds`, `val_ds`, and `test_ds` using appropriate TensorFlow dataset methods like `skip` and `take`.


**Reasoning**:
First, I'll calculate the total number of samples from the `X_numpy` array and then determine the sizes for the training, validation, and test sets based on the specified percentages. Then, I will shuffle the entire dataset and split it into the respective training, validation, and test datasets using `take` and `skip` methods.



In [364]:
total_samples = X_numpy.shape[0]
train_size = int(0.7 * total_samples)
val_size = int(0.15 * total_samples)
test_size = total_samples - train_size - val_size # Ensure all samples are accounted for

print(f"Total samples: {total_samples}")
print(f"Training samples: {train_size}")
print(f"Validation samples: {val_size}")
print(f"Test samples: {test_size}")

shuffled_dataset = dataset.shuffle(buffer_size=total_samples, seed=42)

train_ds = shuffled_dataset.take(train_size)
val_ds = shuffled_dataset.skip(train_size).take(val_size)
test_ds = shuffled_dataset.skip(train_size + val_size).take(test_size)

print(f"\nTrain dataset size: {tf.data.experimental.cardinality(train_ds).numpy()}")
print(f"Validation dataset size: {tf.data.experimental.cardinality(val_ds).numpy()}")
print(f"Test dataset size: {tf.data.experimental.cardinality(test_ds).numpy()}")

Total samples: 28
Training samples: 19
Validation samples: 4
Test samples: 5

Train dataset size: 19
Validation dataset size: 4
Test dataset size: 5


## Verify Dataset Splits

### Subtask:
Print the number of elements or inspect a few samples from `train_ds`, `val_ds`, and `test_ds` to confirm that the splitting and shuffling were performed correctly.


**Reasoning**:
To verify the dataset splits, I will iterate through a few samples from each dataset (train, validation, and test) and print their features and labels. Additionally, I will reconfirm the number of elements in each dataset by printing their cardinalities.



In [365]:
print("\n--- Inspecting Training Dataset Samples ---")
for i, (features, labels) in enumerate(train_ds.take(1)):
    print(f"Sample {i+1} - Features: {features.shape}, Labels: {labels.shape}")

print("\n--- Inspecting Validation Dataset Samples ---")
for i, (features, labels) in enumerate(val_ds.take(1)):
    print(f"Sample {i+1} - Features: {features.shape}, Labels: {labels.shape}")

print("\n--- Inspecting Test Dataset Samples ---")
for i, (features, labels) in enumerate(test_ds.take(1)):
    print(f"Sample {i+1} - Features: {features.shape}, Labels: {labels.shape}")

print("\n--- Reconfirming Dataset Sizes ---")
print(f"Train dataset size: {tf.data.experimental.cardinality(train_ds).numpy()}")
print(f"Validation dataset size: {tf.data.experimental.cardinality(val_ds).numpy()}")
print(f"Test dataset size: {tf.data.experimental.cardinality(test_ds).numpy()}")


--- Inspecting Training Dataset Samples ---
Sample 1 - Features: (2,), Labels: ()

--- Inspecting Validation Dataset Samples ---
Sample 1 - Features: (2,), Labels: ()

--- Inspecting Test Dataset Samples ---
Sample 1 - Features: (2,), Labels: ()

--- Reconfirming Dataset Sizes ---
Train dataset size: 19
Validation dataset size: 4
Test dataset size: 5


In [366]:
train_ds.element_spec

(TensorSpec(shape=(2,), dtype=tf.float64, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [367]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds_batched = (
  train_ds
  .batch(train_size)
  .cache()
  .prefetch(AUTOTUNE)
)

val_ds_batched = (
  val_ds
  .batch(train_size)
  .cache()
  .prefetch(AUTOTUNE)
)

test_ds_batched = (
  test_ds
  .batch(train_size)
  .cache()
  .prefetch(AUTOTUNE)
)


# Full-batch datasets (one batch per epoch)
train_ds = train_ds.batch(train_size).cache().prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.batch(val_size).cache().prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.batch(test_size).cache().prefetch(tf.data.AUTOTUNE)

for feat, label in train_ds.take(1):
  print(feat.shape, label)

for feat, label in train_ds_batched.take(2):
  print(feat.shape, label)

(19, 2) tf.Tensor([1 1 0 0 1 0 1 1 1 1 0 1 1 0 0 0 0 1 0], shape=(19,), dtype=int64)
(19, 2) tf.Tensor([1 0 0 0 0 1 1 0 1 1 1 1 0 0 0 0 0 0 0], shape=(19,), dtype=int64)


In [368]:
def get_accuracy_loss(history):
  # Get the final training loss and accuracy
  final_train_loss = history.history['loss'][-1]
  final_train_accuracy = history.history['accuracy'][-1]

  # Get the final validation loss and accuracy (if validation data was provided)
  if 'val_loss' in history.history:
      final_val_loss = history.history['val_loss'][-1]
      final_val_accuracy = history.history['val_accuracy'][-1]

      print(f"\nFinal Training Loss: {final_train_loss:.4f}")
      print(f"Final Training Accuracy: {final_train_accuracy*100:.2f}%")
      print(f"Final Validation Loss: {final_val_loss:.4f}")
      print(f"Final Validation Accuracy: {final_val_accuracy*100:.2f}%")
  else:
      print(f"\nFinal Training Loss: {final_train_loss:.4f}")
      print(f"Final Training Accuracy: {final_train_accuracy*100:.2f}%")
      print("No validation data provided.")

In [369]:
mlp_model = tf.keras.Sequential(
    [
        tf.keras.layers.Input(shape=(2,)),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
)
mlp_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              loss='binary_crossentropy',
              metrics=['accuracy'])

print(mlp_model.summary())

None


In [370]:
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=20,
    restore_best_weights=True
)
history = mlp_model.fit(
    train_ds_batched,
    validation_data=val_ds_batched,
    epochs=800,
    verbose=0
    # callbacks=[early_stop]
)
get_accuracy_loss(history)


Final Training Loss: 0.6303
Final Training Accuracy: 63.16%
Final Validation Loss: 0.7931
Final Validation Accuracy: 0.00%


In [371]:
import numpy as np
import tensorflow as tf

# Seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

df = pd.read_csv("insurance_data.csv")

# Scale like the NumPy notebook
X = df[['age', 'affordibility']].copy()
X['age'] = X['age'] / 100
y = df['bought_insurance']

X_np = X.values.astype('float32')
y_np = y.values.astype('float32')

dataset = tf.data.Dataset.from_tensor_slices((X_np, y_np))

total = X_np.shape[0]
train_size = int(0.7 * total)
val_size = int(0.15 * total)
test_size = total - train_size - val_size

# One-time shuffle, then split
ds_shuffled = dataset.shuffle(buffer_size=total, seed=42, reshuffle_each_iteration=False)

train_ds = ds_shuffled.take(train_size)
val_ds = ds_shuffled.skip(train_size).take(val_size)
test_ds = ds_shuffled.skip(train_size + val_size).take(test_size)

# Full-batch datasets (one batch per epoch)
train_ds = train_ds.batch(train_size).cache().prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.batch(val_size).cache().prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.batch(test_size).cache().prefetch(tf.data.AUTOTUNE)

for feat, label in train_ds.take(1):
  print(feat.shape, label)

# Model with proper input shape
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(2,)),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=800,
    verbose=0
)

get_accuracy_loss(history)

# test_loss, test_acc = model.evaluate(test_ds, verbose=0)
# print(f"Test loss: {test_loss:.4f}, test acc: {test_acc:.4f}")

(19, 2) tf.Tensor([0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0.], shape=(19,), dtype=float32)

Final Training Loss: 0.6149
Final Training Accuracy: 68.42%
Final Validation Loss: 0.8892
Final Validation Accuracy: 0.00%
