In [1]:
import sys
from pathlib import Path
import os
import random
import numpy as np
import torch
import torch.nn as nn
import pandas as pd

from torch.utils.data import DataLoader, Dataset, Subset
from torchvision import transforms


In [2]:
# Project root
CURRENT_DIR = Path.cwd().resolve()
PROJECT_ROOT = CURRENT_DIR if (CURRENT_DIR / "src").exists() else CURRENT_DIR.parent
os.chdir(PROJECT_ROOT)

if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

print("Project root:", PROJECT_ROOT)


Project root: D:\ML_PROJECTS\real-estate-multimodal


In [3]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)


In [4]:
image_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])
TABULAR_FEATURES = [
    "bedrooms","bathrooms","sqft_living","sqft_lot","floors",
    "waterfront","view","condition","grade","sqft_above",
    "sqft_basement","lat","long","sqft_living15","sqft_lot15",
    "sale_year","sale_month","basement_ratio","living_lot_ratio",
    "living_vs_neighbors","house_age","is_renovated"
]



In [5]:
from src.dataset import RealEstateDataset

train_dataset = RealEstateDataset(
    csv_path=PROJECT_ROOT / "data/train_processed.csv",
    images_dir=PROJECT_ROOT / "data/images/train",
    tabular_features=TABULAR_FEATURES,
    target_col="price_log",
    transform=image_transforms
)


In [6]:
from sklearn.model_selection import train_test_split

train_idx, val_idx = train_test_split(
    np.arange(len(train_dataset)),
    test_size=0.2,
    random_state=42,
    shuffle=True
)

train_ds = Subset(train_dataset, train_idx)
val_ds   = Subset(train_dataset, val_idx)

train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=16, shuffle=False)


In [7]:
from src.model import MultimodalRegressor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultimodalRegressor(tabular_dim=len(TABULAR_FEATURES)).to(device)


In [8]:
class ImageOnlyDataset(Dataset):
    def __init__(self, multimodal_dataset):
        self.base = multimodal_dataset

    def __len__(self):
        return len(self.base)

    def __getitem__(self, idx):
        image, _, target = self.base[idx]
        return image, target


In [9]:
class ImageOnlyRegressor(nn.Module):
    def __init__(self, image_encoder, image_feat_dim):
        super().__init__()
        self.image_encoder = image_encoder
        self.head = nn.Sequential(
            nn.Linear(image_feat_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, image):
        feat = self.image_encoder(image)
        return self.head(feat).squeeze(1)


In [10]:
image_only_dataset = ImageOnlyDataset(train_dataset)

train_img_ds = Subset(image_only_dataset, train_idx)
val_img_ds   = Subset(image_only_dataset, val_idx)

train_img_loader = DataLoader(train_img_ds, batch_size=16, shuffle=True)
val_img_loader   = DataLoader(val_img_ds, batch_size=16, shuffle=False)


In [11]:
image_encoder_out_dim = model.image_encoder.projection[0].out_features

image_only_model = ImageOnlyRegressor(
    model.image_encoder,
    image_feat_dim=image_encoder_out_dim
).to(device)


In [12]:
criterion = nn.SmoothL1Loss(beta=0.5)
optimizer = torch.optim.AdamW(
    image_only_model.parameters(),
    lr=3e-5,
    weight_decay=1e-4
)

for epoch in range(8):
    image_only_model.train()
    train_loss = 0.0

    for images, targets in train_img_loader:
        images = images.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        preds = image_only_model(images)
        loss = criterion(preds, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * images.size(0)

    train_loss /= len(train_img_loader.dataset)

    image_only_model.eval()
    val_loss = 0.0

    with torch.no_grad():
        for images, targets in val_img_loader:
            images = images.to(device)
            targets = targets.to(device)
            preds = image_only_model(images)
            loss = criterion(preds, targets)
            val_loss += loss.item() * images.size(0)

    val_loss /= len(val_img_loader.dataset)

    print(
        f"[Image-only] Epoch {epoch+1} | "
        f"Train RMSE: {np.sqrt(train_loss):.3f} | "
        f"Val RMSE: {np.sqrt(val_loss):.3f}"
    )

[Image-only] Epoch 1 | Train RMSE: 2.195 | Val RMSE: 1.442
[Image-only] Epoch 2 | Train RMSE: 0.610 | Val RMSE: 1.351
[Image-only] Epoch 3 | Train RMSE: 0.581 | Val RMSE: 1.345
[Image-only] Epoch 4 | Train RMSE: 0.559 | Val RMSE: 1.323
[Image-only] Epoch 5 | Train RMSE: 0.538 | Val RMSE: 1.278
[Image-only] Epoch 6 | Train RMSE: 0.520 | Val RMSE: 1.326
[Image-only] Epoch 7 | Train RMSE: 0.508 | Val RMSE: 1.186
[Image-only] Epoch 8 | Train RMSE: 0.495 | Val RMSE: 1.267


In [16]:
from src.model import MultimodalRegressor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultimodalRegressor(tabular_dim=len(TABULAR_FEATURES)).to(device)

checkpoint = torch.load("multimodal_best.pth", map_location=device)


In [17]:
# Extract only ResNet backbone weights
backbone_state = {
    k.replace("image_encoder.backbone.", ""): v
    for k, v in checkpoint.items()
    if k.startswith("image_encoder.backbone.")
}

# Load ONLY into backbone
model.image_encoder.backbone.load_state_dict(backbone_state)
model.eval()


MultimodalRegressor(
  (image_encoder): ImageEncoder(
    (backbone): ResNet(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): BasicBlock(
          (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu): ReLU(inplace=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): BasicBlock(
          (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  

In [18]:
# quick check: forward pass should work
image, tab, _ = val_ds[0]
with torch.no_grad():
    _ = model(
        image.unsqueeze(0).to(device),
        tab.unsqueeze(0).to(device)
    )

print("Forward pass OK")


Forward pass OK


In [19]:
target_layer = model.image_encoder.backbone.layer4


In [20]:
class GradCAM:
    def __init__(self, model, target_layer):
        self.model = model
        self.target_layer = target_layer
        self.gradients = None
        self.activations = None

        target_layer.register_forward_hook(self._save_activations)
        target_layer.register_backward_hook(self._save_gradients)

    def _save_activations(self, module, input, output):
        self.activations = output.detach()

    def _save_gradients(self, module, grad_input, grad_output):
        self.gradients = grad_output[0].detach()

    def generate(self, image, tabular):
        self.model.zero_grad()
        output = self.model(image, tabular)
        output.backward(torch.ones_like(output))

        weights = self.gradients.mean(dim=(2, 3), keepdim=True)
        cam = (weights * self.activations).sum(dim=1)
        cam = torch.relu(cam)

        cam -= cam.min()
        cam /= cam.max() + 1e-8

        return cam


In [21]:
from pathlib import Path
Path("outputs/gradcam").mkdir(parents=True, exist_ok=True)


In [None]:
import cv2
import matplotlib.pyplot as plt
import random

gradcam = GradCAM(model, target_layer)

sample_indices = random.sample(range(len(val_ds)), 8)

for idx in sample_indices:
    image, tabular, target = val_ds[idx]

    image_input = image.unsqueeze(0).to(device)
    tab_input   = tabular.unsqueeze(0).to(device)

    cam = gradcam.generate(image_input, tab_input)
    cam = cam.squeeze().cpu().numpy()

    img_np = image.permute(1, 2, 0).numpy()
    img_np = (img_np - img_np.min()) / (img_np.max() - img_np.min() + 1e-8)

    cam_resized = cv2.resize(
        cam,
        (img_np.shape[1], img_np.shape[0]),
        interpolation=cv2.INTER_CUBIC
    )

    heatmap = cv2.applyColorMap(
        np.uint8(255 * cam_resized),
        cv2.COLORMAP_JET
    )
    heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
    heatmap = heatmap / 255.0

    overlay = 0.6 * img_np + 0.4 * heatmap
    overlay = np.clip(overlay, 0, 1)

    plt.figure(figsize=(5, 5))
    plt.imshow(overlay)
    plt.axis("off")
    plt.title("Grad-CAM Overlay")

    plt.savefig(f"outputs/gradcam/sample_{idx}.png", bbox_inches="tight")
    plt.close()


  self._maybe_warn_non_full_backward_hook(args, result, grad_fn)


ValueError: operands could not be broadcast together with shapes (224,224,3) (7,7,3) 