In [1]:
import torch
import torch.nn.functional as F
import torchmetrics

import lightning as L

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

from tests import test_category_embedder, test_house_price_regressor, print_dataset_info

  from .autonotebook import tqdm as notebook_tqdm


B1. Data
---

a. Load the dataset using pandas

b. Create train, test and validation splits

c. Encode categorical features

d. Scale continuous features

e. Implement a PyTorch dataset

f. Create PyTorch dataloaders

### B1(a) Load the data using pandas

In [1]:
# data = ... TODO

# Feature schema used throughout the notebook.

# Categorical features: ordinal-encoded in B1(c) and later fed to embedding tables.
categorical_cols = [
    "month",
    "town",
    "flat_model_type",
    "storey_range",
]

# Continuous features: standardised in B1(d).
continuous_cols = [
    "dist_to_nearest_stn",
    "dist_to_dhoby",
    "degree_centrality",
    "eigenvector_centrality",
    "remaining_lease_years",
    "floor_area_sqm",
]

### B1(b) Temporal split with validation

Create a temporal split:

1.	Train/Val pool: all rows with year <= 2020.

2.	From this pool, randomly sample 10% of the rows to form the validation set; the remaining 90% of the pool is the training set.

3.	Test set: all rows with year == 2021.

4.	Print shapes (n_rows, n_cols) for train_data, val_data, test_data.

In [3]:
# train_data, val_data, test_data = ... TODO

In [None]:
# DO NOT MODIFY
# Displays the shape of each split as the cell output; requires train_data,
# val_data and test_data to have been created in the cell above.
train_data.shape, val_data.shape, test_data.shape

### B1(c) Encode categorical features w/o leakage

1.	Create an `sklearn.preprocessing.OrdinalEncoder` named `oe` (later cells read `oe.categories_`) with `handle_unknown="use_encoded_value"` and `unknown_value=-1`.

2.	Fit on train_data[categorical_cols] only.

3.	Transform val_data and test_data using the fitted encoder.

4. Compute and print the cardinality (number of distinct codes) for each categorical column on train_data after encoding. (These will be used to define embedding tables later.)

### B1(d) Scale continuous features w/o leakage

1.	Create a `StandardScaler`.

2.	Fit on train_data[continuous_cols] only.

3.	Transform val_data and test_data.

4.	Report the mean and std of each continuous feature on the transformed train split (they should be ~0 and ~1).

Do not scale resale_price.

In [None]:
# TODO

In [None]:
# DO NOT MODIFY
# Displays the split shapes again, this time after the encoding/scaling cell.
# Presumably a sanity check that preprocessing did not add or drop rows/columns.
train_data.shape, val_data.shape, test_data.shape

### B1(e) Implement a PyTorch Dataset

Implement HDBPriceDataset(Dataset) with:
- `__init__(self, data, categorical_cols, continuous_cols, target_col="resale_price")`

- `__getitem__(idx)` returns a dict: {"x_cat": LongTensor, "x_cont": FloatTensor, "y": FloatTensor}

In [None]:
from torch.utils.data import DataLoader, Dataset


class HDBPriceDataset(Dataset):
    """Map-style dataset over an already encoded and scaled HDB price table.

    All columns are converted to tensors once, up front, so ``__getitem__``
    is a cheap index operation.
    """

    def __init__(self, data, categorical_cols, continuous_cols, target_col="resale_price"):
        """Convert the selected columns of ``data`` to tensors.

        Args:
            data (pd.DataFrame): Table holding integer-coded categorical
                columns, numeric continuous columns and the target column.
            categorical_cols (list): Names of the categorical columns.
            continuous_cols (list): Names of the continuous columns.
            target_col (str): Name of the regression target column.
        """
        # Cast explicitly: OrdinalEncoder emits float codes, while embedding
        # lookups require int64 indices.
        self.x_cat = torch.tensor(
            data[categorical_cols].to_numpy(dtype="int64"), dtype=torch.long
        )
        self.x_cont = torch.tensor(
            data[continuous_cols].to_numpy(dtype="float32"), dtype=torch.float32
        )
        self.y = torch.tensor(
            data[target_col].to_numpy(dtype="float32"), dtype=torch.float32
        )

    def __len__(self):
        """Return the number of rows in the dataset."""
        return self.y.shape[0]

    def __getitem__(self, idx):
        """Return one sample as a dict.

        Returns:
            dict: ``{"x_cat": LongTensor [n_categorical],
            "x_cont": FloatTensor [n_continuous], "y": FloatTensor scalar}``.
        """
        return {
            "x_cat": self.x_cat[idx],
            "x_cont": self.x_cont[idx],
            "y": self.y[idx],
        }

### B1(f) Create train, validation and test data loaders

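Create train, validation and test datasets and data loaders. Name the loaders `train_loader`, `val_loader` and `test_loader`, since later cells refer to them by these names. Use a batch size of 1024.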

In [4]:
batch_size = 1024

# train_dataset, val_dataset, test_dataset = ...

# The loaders must be named *_loader: the check cell below and Part B2 refer to
# train_loader / val_loader / test_loader.
# train_loader, val_loader, test_loader = ...

In [None]:
# DO NOT MODIFY
# Requires train_dataset, test_dataset, train_loader and test_loader to exist.
print_dataset_info(train_dataset, test_dataset, train_loader, test_loader)

B2. Model and training
---
a. Create a PyTorch Model

b. Write a LightningModule for fitting the model

c. Train the model

d. Predict using the trained model on the test set and calculate $R^2$

### B2(a) Create a PyTorch Model

In this part you will build a PyTorch model for tabular house-price regression that (i) embeds each categorical feature into a dense vector, (ii) concatenates all category embeddings with the standardized continuous features, and (iii) predicts resale_price via a small MLP.

You must implement two modules:
1.	CategoryEmbedder — one embedding table per categorical column plus optional dropout.

2.	HousePriceRegressor — uses CategoryEmbedder, concatenates with continuous features, then a 3-layer MLP.

You will then instantiate the model using provided cardinalities and embedding dimensions.

Use the same feature schema and encoded/scaled splits you produced in Part 1. Assume you already have train_loader, val_loader, test_loader, continuous_cols, oe (the fitted OrdinalEncoder), and batch_size.

A. Implement CategoryEmbedder (10 pts)

Create a module that receives:
- cardinalities: List[int] — number of distinct codes per categorical column (from oe.categories_).

- embed_dims: List[int] — embedding dimension for each categorical column (same length as cardinalities).

- embed_dropout: float — dropout applied after concatenation of all embeddings.

Requirements

1.	Construct a nn.ModuleList of nn.Embedding(num_embeddings=c, embedding_dim=d) for each (c, d) pair.

2.	In forward(x_cat), where x_cat has shape [B, C] (C = #categorical columns):

	- Look up each column i using its table: emb_i(x_cat[:, i]) → [B, d_i].

	- Concatenate along feature dimension → [B, sum(embed_dims)].

	- Apply Dropout(embed_dropout) and return.

In [None]:
import torch.nn as nn


class CategoryEmbedder(nn.Module):
    """Embed each categorical column with its own table and concatenate the results."""

    def __init__(self, cardinalities, embed_dims, embed_dropout=0.0):
        """Initialize the CategoryEmbedder.

        Args:
            cardinalities (list): List of integers representing the number of unique values for each categorical feature.
            embed_dims (list): List of integers representing the embedding dimensions for each categorical feature.
            embed_dropout (float): Dropout rate applied to the concatenated embeddings.

        Raises:
            ValueError: If ``cardinalities`` and ``embed_dims`` differ in length.
        """
        super().__init__()
        if len(cardinalities) != len(embed_dims):
            raise ValueError(
                "cardinalities and embed_dims must have the same length, got "
                f"{len(cardinalities)} and {len(embed_dims)}"
            )
        # One lookup table per categorical column, in column order.
        self.embeddings = nn.ModuleList(
            [
                nn.Embedding(num_embeddings=c, embedding_dim=d)
                for c, d in zip(cardinalities, embed_dims)
            ]
        )
        # Width of the concatenated output, exposed for downstream layer sizing.
        self.output_dim = sum(embed_dims)
        self.dropout = nn.Dropout(embed_dropout)

    def forward(self, x_cat):
        """Embed and concatenate all categorical columns.

        Args:
            x_cat (torch.LongTensor): Integer codes of shape ``[B, C]``, where
                column ``i`` is looked up in table ``i``. Every code must lie in
                ``[0, cardinalities[i])``; a code of -1 (the value an
                OrdinalEncoder with ``unknown_value=-1`` assigns to unseen
                categories) is out of range for ``nn.Embedding`` and raises at
                lookup time.

        Returns:
            torch.Tensor: Float tensor of shape ``[B, sum(embed_dims)]``.
        """
        embedded = [table(x_cat[:, i]) for i, table in enumerate(self.embeddings)]
        return self.dropout(torch.cat(embedded, dim=1))

In [None]:
# DO NOT MODIFY
# Runs the provided check (imported from the local `tests` module) on the class above.
test_category_embedder(CategoryEmbedder)

In [None]:
class HousePriceRegressor(nn.Module):
    """Tabular regressor: category embeddings + continuous features -> 3-layer MLP."""

    def __init__(self, cardinalities, embed_dims, n_continuous, emb_dropout=0.0, hidden_dims=(128, 64)):
        """Initialize the HousePriceRegressor.

        Args:
            cardinalities (list): List of integers representing the number of unique values for each categorical feature.
            embed_dims (list): List of integers representing the embedding dimensions for each categorical feature.
            n_continuous (int): Number of continuous features.
            emb_dropout (float, optional): Dropout rate for the embeddings. Defaults to 0.0.
            hidden_dims (tuple, optional): Widths of the two hidden layers of the MLP.
                Defaults to (128, 64).
        """
        super().__init__()
        self.embedder = CategoryEmbedder(
            cardinalities, embed_dims, embed_dropout=emb_dropout
        )
        # MLP input = all concatenated embeddings followed by the continuous features.
        in_features = sum(embed_dims) + n_continuous
        first_hidden, second_hidden = hidden_dims
        self.mlp = nn.Sequential(
            nn.Linear(in_features, first_hidden),
            nn.ReLU(),
            nn.Linear(first_hidden, second_hidden),
            nn.ReLU(),
            nn.Linear(second_hidden, 1),
        )

    def forward(self, x_cat, x_cont):
        """Predict one price per row.

        Args:
            x_cat (torch.LongTensor): Categorical codes of shape ``[B, C]``.
            x_cont (torch.FloatTensor): Continuous features of shape ``[B, n_continuous]``.

        Returns:
            torch.Tensor: Predictions of shape ``[B]`` (the trailing unit
            dimension of the last linear layer is squeezed away).
        """
        features = torch.cat([self.embedder(x_cat), x_cont], dim=1)
        return self.mlp(features).squeeze(-1)

In [None]:
# DO NOT MODIFY
# Runs the provided check (imported from the local `tests` module) on the class above.
test_house_price_regressor(HousePriceRegressor)

In [None]:
# DO NOT MODIFY
# One cardinality per categorical column: the number of categories that the
# fitted encoder `oe` stored for that column.
cardinalities = [len(categories) for categories in oe.categories_]
# Embedding width heuristic: half the cardinality (rounded up), capped at 50.
embedding_dims = [min(50, (c + 1) // 2) for c in cardinalities]
n_continuous = len(continuous_cols)

# Create model with proper cardinalities
model = HousePriceRegressor(
    cardinalities=cardinalities,
    embed_dims=embedding_dims,
    n_continuous=n_continuous,
    emb_dropout=0.1,
)

### B2(b) PyTorch LightningModule

Wrap your HousePriceRegressor into a PyTorch LightningModule so that training, validation, and testing are managed automatically.

What to do

1.	Initialize the module
    - Store hyperparameters (embedding sizes, number of continuous features, learning rate, etc.).

    - Create an instance of your HousePriceRegressor.

    - Choose a suitable regression loss function (e.g., MSE).

    - Set up at least one evaluation metric (e.g., MAE, RMSE).

2.	Forward method
    - Define a forward pass that takes categorical + continuous features and returns predictions.

3.	Training, Validation, and Test steps
    - In each step:
        - Extract features and target from the batch.

        - Run the model forward.

        - Compute the loss.

        - Log the loss and chosen metrics so they can be monitored.

4.	Prediction step
    - Implement a method to return predictions on new data without computing loss.

5.	Optimiser
    - Use a suitable optimiser (e.g., Adam) with a reasonable learning rate.

    - (Optional: add a scheduler if you wish to experiment.)

In [None]:
class HDBPriceLightningModule(L.LightningModule):
    """LightningModule wrapping HousePriceRegressor with MSE loss and MAE metrics."""

    def __init__(self, cardinalities, embedding_dims, n_continuous, emb_dropout=0.0, lr=1e-3):
        """Build the regressor, loss metrics and store hyperparameters.

        Args:
            cardinalities (list): Number of distinct codes per categorical column.
            embedding_dims (list): Embedding width per categorical column.
            n_continuous (int): Number of continuous features.
            emb_dropout (float, optional): Dropout rate on the concatenated embeddings.
            lr (float, optional): Learning rate for Adam. Defaults to 1e-3.
        """
        super().__init__()
        # Makes the constructor arguments available as self.hparams and in checkpoints.
        self.save_hyperparameters()
        self.model = HousePriceRegressor(
            cardinalities=cardinalities,
            embed_dims=embedding_dims,
            n_continuous=n_continuous,
            emb_dropout=emb_dropout,
        )
        # Separate metric objects per stage so their running states never mix.
        self.train_mae = torchmetrics.MeanAbsoluteError()
        self.val_mae = torchmetrics.MeanAbsoluteError()
        self.test_mae = torchmetrics.MeanAbsoluteError()

    def forward(self, x_cat, x_cont):
        """Return the regressor's predictions for a batch of features."""
        return self.model(x_cat, x_cont)

    def _shared_step(self, batch, stage):
        """Compute the MSE loss for one batch and log ``{stage}_loss`` / ``{stage}_mae``."""
        # Flatten both sides so [B] and [B, 1] shapes cannot broadcast into [B, B].
        target = batch["y"].view(-1)
        prediction = self(batch["x_cat"], batch["x_cont"]).view(-1)
        loss = F.mse_loss(prediction, target)

        metric = getattr(self, f"{stage}_mae")
        metric(prediction, target)

        n_rows = target.shape[0]
        self.log(
            f"{stage}_loss",
            loss,
            on_step=(stage == "train"),
            on_epoch=True,
            prog_bar=True,
            batch_size=n_rows,
        )
        self.log(
            f"{stage}_mae",
            metric,
            on_step=False,
            on_epoch=True,
            prog_bar=True,
            batch_size=n_rows,
        )
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss for one batch."""
        return self._shared_step(batch, "train")

    def validation_step(self, batch, batch_idx):
        """Log validation loss and MAE for one batch."""
        return self._shared_step(batch, "val")

    def test_step(self, batch, batch_idx):
        """Log test loss and MAE for one batch."""
        return self._shared_step(batch, "test")

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Return flattened predictions for one batch without computing a loss."""
        return self(batch["x_cat"], batch["x_cont"]).view(-1)

    def configure_optimizers(self):
        """Use Adam with the stored learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

In [None]:
# DO NOT MODIFY
# Note: the keyword here is `embedding_dims`, whereas HousePriceRegressor takes
# `embed_dims`; HDBPriceLightningModule.__init__ must accept these four keywords.
lightning_model = HDBPriceLightningModule(
    cardinalities=cardinalities,
    embedding_dims=embedding_dims,
    n_continuous=n_continuous,
    emb_dropout=0.1,
)

### B2(c) Train the model

What to do
1.	Checkpoints
    - Add a checkpoint callback that monitors validation loss and keeps the best models (e.g., top-k) plus the last checkpoint for resuming.

2.	Early stopping
    - Add early stopping on the same validation metric with a reasonable patience and a small min_delta (use a tiny positive value to require real improvement).

3.	Learning-rate tracking
    - Log the learning rate each epoch/step with a LR monitor so it’s visible in your logger.

4.	Experiment logging
    - Create a logger (e.g., TensorBoard) with a clear experiment name and save directory.

5.	Trainer configuration
    - Set a sensible number of epochs, devices/accelerator (CPU/GPU), and enable gradient clipping.

    - Turn on progress bar and model summary for visibility.

    - Choose an appropriate logging frequency for your dataset size.

6.	Fit & Test
- Call trainer.fit(lightning_model, train_loader, val_loader). Pass the LightningModule, not the bare `model`, which is a plain `nn.Module`.

- Optionally run trainer.test(lightning_model, test_loader) after training and print metrics.

7. Download the plots for the validation loss and your validation metric, and display them.

In [6]:
# Presumably the epoch cap to pass to the Trainer — TODO confirm when configuring it.
max_epochs = 50
# TODO:

### B2(d) Model prediction and evaluation

1.	Run prediction
    - Use trainer.predict(lightning_model, test_loader) to generate predictions on the held-out test set.

    - Concatenate predictions into a single tensor/array.

2. Report results
    - Print R² (rounded to 4 decimal places).

    - Interpret the value of R² briefly

In [7]:
# predictions = ...


# print(f"R^2 on test set =")

B3. Model Interpretability with Integrated Gradients
---

 What to do
1.	Choose an explainability method

    - Use Integrated Gradients (IG) from captum.attr.

    - Treat the continuous features (x_cont) as the primary inputs for attribution.

    - Pass categorical features (x_cat) as additional arguments to the forward function.

2.	Define a forward wrapper

    - Implement a wrapper function so IG knows how to call your model with both categorical and continuous features.

3.	Local explanation (per sample)

    - Run IG on a single batch from the training set.

    - Use a zero baseline for continuous features.

    - Plot the continuous features ranked by signed contribution for one example.

4.	Global explanation (across dataset)

    - Loop over the validation set to compute mean absolute attributions for each continuous feature.

    - Normalize by the number of samples.

    - Rank features by importance and plot a bar chart showing the most influential features globally.

5.	Interpret results

    - Report which features matter most for the model’s predictions.

    - Reflect briefly: Do these align with intuition (e.g., distance to station, floor area)?

In [None]:
from captum.attr import IntegratedGradients

# TODO ...