# Part 1: Feature-Level Interpretability (30 marks)  
You will use the California Housing and the Adult Census Income datasets in this part. You 
should train one feed-forward neural network for each dataset and apply the following 
interpretability techniques:

## 1. Partial Dependence Plots (PDP) and Individual Conditional Expectation (ICE) plots (7 marks) 
### a. Use PDP to examine the average effect of at least two features. 

In [None]:
# PDP for Adult census dataset


In [3]:
#California Housing Data Set

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

print(f"Using TensorFlow version: {tf.__version__}")

# --- 1. Load and Prepare the Data ---

# Load the dataset
housing = fetch_california_housing()
X = housing.data
y = housing.target

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Split the data into training and testing sets
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Split the training data further into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42
)

# --- 2. Preprocess the Data (Scaling) ---

# Neural networks perform best when numerical features are scaled
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

print(f"\nTraining features shape after scaling: {X_train.shape}")

# --- 3. Build the Feed-Forward Neural Network ---

# Get the number of input features
n_features = X_train.shape[1]

# Define the model using the Keras Sequential API
model = keras.Sequential([
    # Input layer (or specify input_shape in the first hidden layer)
    # layers.InputLayer(input_shape=[n_features]),
    
    # First hidden layer with 32 neurons and ReLU activation
    layers.Dense(32, activation="relu", input_shape=[n_features]),
    
    # Second hidden layer with 16 neurons and ReLU activation
    layers.Dense(16, activation="relu"),
    
    # Output layer with 1 neuron (for regression)
    # No activation function is specified, so it uses a linear activation (default)
    # This is what we want for a regression output
    layers.Dense(1)
])

# Display a summary of the model's architecture
model.summary()

# --- 4. Compile the Model ---

# For regression, we typically use Mean Squared Error (MSE) as the loss function
# 'adam' is a popular and effective optimizer
# We'll also monitor Mean Absolute Error (MAE) for easier interpretation
model.compile(
    loss="mean_squared_error",  # or 'mse'
    optimizer="adam",
    metrics=["mean_absolute_error"] # or 'mae'
)

# --- 5. Train the Model ---

print("\n--- Starting Model Training ---")

# We use EarlyStopping to stop training once the model stops improving
# This helps prevent overfitting
early_stopping = keras.callbacks.EarlyStopping(
    patience=10,         # Stop if no improvement for 10 epochs
    restore_best_weights=True  # Restore weights from the best epoch
)

history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_data=(X_valid, y_valid),
    callbacks=[early_stopping],
    verbose=1 # Show training progress
)

print("--- Model Training Finished ---")

# --- 6. Evaluate the Model ---

# Evaluate the model's performance on the unseen test set
print("\n--- Evaluating Model on Test Set ---")
loss, mae = model.evaluate(X_test, y_test, verbose=0)

print(f"Test Set - Loss (MSE): {loss:.4f}")
print(f"Test Set - Mean Absolute Error (MAE): {mae:.4f}")

# --- 7. Make Predictions (Optional) ---

print("\n--- Sample Predictions ---")
# Get predictions for the first 5 houses in the test set
X_new = X_test[:5]
y_pred = model.predict(X_new)

print("Predictions   |   Actual Values")
print("---------------------------------")
for i in range(len(y_pred)):
    # The target values are in hundreds of thousands of dollars
    print(f"${y_pred[i][0]*100000:,.2f}   |   ${y_test[i]*100000:,.2f}")

Using TensorFlow version: 2.20.0
Features shape: (20640, 8)
Target shape: (20640,)

Training features shape after scaling: (13209, 8)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



--- Starting Model Training ---
Epoch 1/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 1.3404 - mean_absolute_error: 0.7940 - val_loss: 0.7391 - val_mean_absolute_error: 0.5458
Epoch 2/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.4557 - mean_absolute_error: 0.4831 - val_loss: 0.4300 - val_mean_absolute_error: 0.4705
Epoch 3/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.3970 - mean_absolute_error: 0.4467 - val_loss: 0.4997 - val_mean_absolute_error: 0.4615
Epoch 4/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.3749 - mean_absolute_error: 0.4340 - val_loss: 0.3865 - val_mean_absolute_error: 0.4429
Epoch 5/100
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.3607 - mean_absolute_error: 0.4255 - val_loss: 0.4069 - val_mean_absolute_error: 0.4441
Epoch 6/100
[1m413/413[0m [32m━━━━━━━━━

### b. Use ICE plots to explore individual predictions for at least two features. 

### c. Explain what insights PDP and ICE give about the model’s behaviour.

## 2. Permutation Feature Importance (PFI) (7 marks) 
### a. Use PFI to identify the most important features in the model. 


### b. Explain what the term “important” means when using the PFI method. 

## 3. Accumulated Local Effects (ALE) (9 marks) 
### a. Implement ALE plots to investigate the local effects of feature changes. 

### b. Compare ALE with PDP and discuss any differences in the interpretability of these techniques.

## 4. Global Surrogates (7 marks) 
### a. Build an interpretable model to approximate the predictions of the feed-forward neural network model. 

### b. Analyse the surrogate model's effectiveness and discuss when such approximations are helpful.