<a href="https://colab.research.google.com/github/kramerkraus/2155-CP3-mkraus/blob/main/diffusionmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Read every split (train / val / test / test2) from the CSV directory
data_dir = '/content/2155-CP3-mkraus/dataset'
splits = load_dataset_splits(data_dir)

# The header row of the original training table provides the feature names
train_csv_path = os.path.join(data_dir, 'train_original.csv')
feature_names = list(pd.read_csv(train_csv_path).columns)
print(f"\n✓ Features loaded: {len(feature_names)} features")
# Only the first and last five names are shown to keep the output short
leading_names, trailing_names = feature_names[:5], feature_names[-5:]
print(f"Feature names: {leading_names}...{trailing_names}")

Loading dataset splits from: /content/2155-CP3-mkraus/dataset

Loading train split...
  ✓ train_original.csv: (2998, 37)
  ✓ train_imputed.csv: (2998, 37)
  ✓ train_missing_mask.csv: (2998, 37)

Loading val split...
  ✓ val_original.csv: (375, 37)
  ✓ val_imputed.csv: (375, 37)
  ✓ val_missing_mask.csv: (375, 37)

Loading test split...
  ✓ test_original.csv: (375, 37)
  ✓ test_imputed.csv: (375, 37)
  ✓ test_missing_mask.csv: (375, 37)

Loading test2 split...
  ✓ test2_imputed.csv: (417, 37)
  ✓ test2_missing_mask.csv: (417, 37)

✓ Features loaded: 37 features
Feature names: ['Feature 1', 'Feature 2', 'Feature 3', 'Feature 4', 'Feature 5']...['Feature 33', 'Feature 34', 'Feature 35', 'Feature 36', 'Feature 37']


In [None]:
# Dataset overview: unpack each split into named arrays and report shapes
print("\n" + "=" * 70, "DATASET ANALYSIS", "=" * 70, sep="\n")

# Splits that come with ground truth expose three aligned tables:
# the imputed features, the missing-value mask and the original values.
X_train, mask_train, X_train_original = (
    splits['train'][key] for key in ('imputed', 'missing_mask', 'original')
)
X_val, mask_val, X_val_original = (
    splits['val'][key] for key in ('imputed', 'missing_mask', 'original')
)
X_test, mask_test, X_test_original = (
    splits['test'][key] for key in ('imputed', 'missing_mask', 'original')
)

# Test2 has no 'original' table, so it cannot be evaluated locally
X_test2, mask_test2 = (splits['test2'][key] for key in ('imputed', 'missing_mask'))

print("\nData shapes:")
for split_label, split_array in (
    ("Training", X_train),
    ("Validation", X_val),
    ("Test", X_test),
):
    print(f"  - {split_label}: {split_array.shape}")
print(f"  - Test2: {X_test2.shape} (evaluation set - no ground truth)")


DATASET ANALYSIS

Data shapes:
  - Training: (2998, 37)
  - Validation: (375, 37)
  - Test: (375, 37)
  - Test2: (417, 37) (evaluation set - no ground truth)


In [None]:
# Data Preprocessing (Handle Missing Values)
#
# Reports the observed-value range of each split, then wraps every split in a
# TensorDataset / DataLoader pair of (features, observed-indicator mask).

print("\n" + "=" * 70, "DATA PREPROCESSING", "=" * 70, sep="\n")

print("Processing missing values and preparing data...")
print("Mask convention: True=missing, False=observed (in original masks)")

print("\n✓ Data preprocessing completed successfully")
for range_label, range_values, range_missing in (
    ("Training", X_train_original, mask_train),
    ("Validation", X_val_original, mask_val),
    ("Test", X_test_original, mask_test),
):
    # Restrict to observed cells so the range ignores the missing positions
    observed_values = range_values[~range_missing]
    print(f"  - {range_label} data range: [{observed_values.min():.3f}, {observed_values.max():.3f}]")

# Mini-batch size shared by all loaders
batch_size = 64
print(f"\nCreating data loaders with batch size: {batch_size}")


def _to_tensor_dataset(values, missing_mask):
    """Pair float features with a float mask where 1 = observed and 0 = missing.

    The stored masks use True = missing, so the mask is inverted here.
    """
    observed_indicator = (~missing_mask).astype(float)
    return TensorDataset(torch.FloatTensor(values), torch.FloatTensor(observed_indicator))


train_dataset = _to_tensor_dataset(X_train_original, mask_train)
val_dataset = _to_tensor_dataset(X_val_original, mask_val)
test_dataset = _to_tensor_dataset(X_test_original, mask_test)
# Test2 has no original table, so the imputed features are used instead
test2_dataset = _to_tensor_dataset(X_test2, mask_test2)

# Only the training loader shuffles; evaluation loaders keep row order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader, test2_loader = (
    DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)
    for eval_dataset in (val_dataset, test_dataset, test2_dataset)
)

# Preview a batch
sample_batch_data, sample_batch_mask = next(iter(train_loader))
print(f"\nSample batch shape: {sample_batch_data.shape}")
print(f"Sample batch mask shape: {sample_batch_mask.shape}")
# In the model tensors 0 marks a missing entry
sample_missing_fraction = (sample_batch_mask == 0).float().mean().item()
print(f"Sample batch missing percentage: {sample_missing_fraction*100:.1f}%")



DATA PREPROCESSING
Processing missing values and preparing data...
Mask convention: True=missing, False=observed (in original masks)

✓ Data preprocessing completed successfully
  - Training data range: [0.000, 1.000]
  - Validation data range: [0.000, 1.000]
  - Test data range: [0.000, 1.000]

Creating data loaders with batch size: 64

Sample batch shape: torch.Size([64, 37])
Sample batch mask shape: torch.Size([64, 37])
Sample batch missing percentage: 20.3%


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


# --------------------------------------------------
# 1. Small MLP backbone (denoiser)
# --------------------------------------------------

class MLPDenoiser(nn.Module):
    """Three-layer MLP that predicts the noise in a partially observed row.

    The network input is the concatenation ``[x_t, t / time_scale, mask]``,
    i.e. ``2 * input_dim + 1`` features, and the output has ``input_dim``
    features.

    Args:
        input_dim: number of features per row.
        hidden: width of the two hidden layers.
        time_scale: divisor applied to the raw timestep before it is fed to
            the network. Defaults to 1000.0 (the previously hard-coded value);
            pass the diffusion length to map timesteps into [0, 1).

    Raises:
        ValueError: if ``time_scale`` is not strictly positive.
    """

    def __init__(self, input_dim=37, hidden=256, time_scale=1000.0):
        super().__init__()
        if time_scale <= 0:
            raise ValueError(f"time_scale must be positive, got {time_scale}")
        self.time_scale = float(time_scale)
        self.net = nn.Sequential(
            nn.Linear(input_dim + 1 + input_dim, hidden),  # x + t + mask
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, input_dim),
        )

    def forward(self, x_t, t, mask):
        """Predict the noise for a batch of noised rows.

        Args:
            x_t: noised data, shape (batch, input_dim).
            t: time step. Either one value per row, shape (batch, 1) or
                (batch,), or a single value (scalar or shape (1, 1)) that is
                shared by the whole batch.
            mask: binary mask (1 = observed, 0 = missing), same shape as x_t.

        Returns:
            Tensor of shape (batch, input_dim).
        """
        t = torch.as_tensor(t, device=x_t.device)
        if t.dim() < 2:
            t = t.reshape(-1, 1)
        # A single shared timestep is broadcast so torch.cat sees matching
        # batch dimensions (samplers typically pass one t for all rows).
        if t.shape[0] != x_t.shape[0]:
            t = t.expand(x_t.shape[0], -1)
        t = t / self.time_scale       # normalize time
        inp = torch.cat([x_t, t, mask], dim=1)
        return self.net(inp)


# --------------------------------------------------
# 2. Diffusion core (betas, schedules, noise)
# --------------------------------------------------

def make_beta_schedule(T=1000, start=1e-4, end=0.02):
    """Build a linear noise schedule.

    Returns a 1-D tensor of ``T`` values evenly spaced from ``start`` to
    ``end`` (both endpoints included).
    """
    linear_betas = torch.linspace(start=start, end=end, steps=T)
    return linear_betas


class Diffusion(nn.Module):
    """DDPM-style diffusion model for mask-conditioned imputation.

    Holds the noise schedule (``betas``, ``alphas`` and their cumulative
    product ``alphas_cum``) as buffers and an ``MLPDenoiser`` as the noise
    predictor. ``forward`` returns the training loss; ``sample`` runs the
    reverse process to fill in missing entries.

    Args:
        input_dim: number of features per row.
        timesteps: number of diffusion steps ``T``.
    """

    def __init__(self, input_dim, timesteps=1000):
        super().__init__()
        self.T = timesteps
        betas = make_beta_schedule(timesteps)
        alphas = 1.0 - betas
        alphas_cum = torch.cumprod(alphas, dim=0)

        # Buffers follow the module across devices but are not trained
        self.register_buffer("betas", betas)
        self.register_buffer("alphas", alphas)
        self.register_buffer("alphas_cum", alphas_cum)

        self.model = MLPDenoiser(input_dim=input_dim)

    # -------------------------
    # q(x_t | x_0)
    # -------------------------
    def q_sample(self, x0, t, noise=None):
        """Draw x_t from the forward process in closed form.

        Args:
            x0: clean data, shape (batch, D).
            t: integer timesteps, shape (batch,), used to index the schedule.
            noise: optional noise tensor shaped like x0; sampled from a
                standard normal when omitted.

        Returns:
            Tuple ``(x_t, noise)`` where
            ``x_t = sqrt(a_bar_t) * x0 + sqrt(1 - a_bar_t) * noise``.
        """
        if noise is None:
            noise = torch.randn_like(x0)
        a_bar = self.alphas_cum[t].unsqueeze(1)
        return torch.sqrt(a_bar) * x0 + torch.sqrt(1 - a_bar) * noise, noise

    # -------------------------
    # Training step
    # -------------------------
    def forward(self, x0, mask):
        """Compute the noise-prediction loss for one batch.

        Args:
            x0: clean data (batch, D).
            mask: 1 = observed, 0 = missing, same shape as x0.

        Returns:
            Scalar loss: squared error between predicted and true noise,
            zeroed at observed positions and averaged over all batch * D
            entries.

        NOTE(review): the loss is taken only where mask == 0, so this assumes
        x0 holds meaningful target values at those positions. If the dataset
        stores a placeholder there, the model is trained toward the
        placeholder. Confirm against the data loader.
        """
        B = x0.shape[0]
        device = x0.device

        # Random time t for each sample
        t = torch.randint(0, self.T, (B,), device=device)

        # Noise forward
        xt, noise = self.q_sample(x0, t, noise=None)

        # Condition on observed values: they are fed to the denoiser clean
        xt = xt * (1 - mask) + x0 * mask

        # Predict noise
        noise_pred = self.model(xt, t.unsqueeze(1).float(), mask)

        # Loss only on missing entries
        loss = ((noise_pred - noise) ** 2 * (1 - mask)).mean()
        return loss

    # -------------------------
    # Sampling / imputation
    # -------------------------
    @torch.no_grad()
    def sample(self, x_obs, mask):
        """Impute missing entries by running the reverse diffusion process.

        Args:
            x_obs: rows of shape (batch, D). Entries where mask == 0 may hold
                anything; they are replaced by generated values.
            mask: 1 = observed, 0 = missing, broadcastable to x_obs.

        Returns:
            Tensor shaped like x_obs whose observed entries equal x_obs
            exactly and whose missing entries are sampled.
        """
        x = torch.randn_like(x_obs)
        batch_size = x.shape[0]

        for t in reversed(range(self.T)):
            bt = self.betas[t]
            at = self.alphas[t]
            a_bar = self.alphas_cum[t]

            # Conditioner: always respect observed values
            x = x * (1 - mask) + x_obs * mask

            # One timestep value per row so the denoiser can concatenate it
            # with the batch (a (1, 1) tensor only works for batch size 1).
            t_batch = torch.full(
                (batch_size, 1), float(t), dtype=x.dtype, device=x.device
            )
            noise_pred = self.model(x, t_batch, mask)

            # DDPM update step
            coef1 = 1 / torch.sqrt(at)
            coef2 = (1 - at) / torch.sqrt(1 - a_bar)

            x = coef1 * (x - coef2 * noise_pred)

            if t > 0:
                x = x + torch.sqrt(bt) * torch.randn_like(x)

        # The last update also rescales the observed columns, so restore them
        # before returning.
        return x * (1 - mask) + x_obs * mask


In [None]:
# Select the accelerator when one is present so this cell also runs on a
# fresh CPU-only kernel (no earlier cell shown here defines `device`).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Diffusion model over the 37 features with a 500-step noise schedule
model = Diffusion(input_dim=37, timesteps=500).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-4)

In [None]:
# Number of full passes over the training loader
NUM_EPOCHS = 200

model.train()
for epoch in range(NUM_EPOCHS):
    # Accumulate a sample-weighted sum so the logged value is the epoch mean
    # rather than whatever the last mini-batch happened to produce.
    loss_sum = 0.0
    rows_seen = 0

    for batch, mask in train_loader:
        batch = batch.to(device)
        mask = mask.to(device)

        loss = model(batch, mask)

        opt.zero_grad()
        loss.backward()
        opt.step()

        loss_sum += loss.item() * batch.shape[0]
        rows_seen += batch.shape[0]

    # max(..., 1) keeps the log line valid even if the loader yields nothing
    epoch_loss = loss_sum / max(rows_seen, 1)
    print(f"epoch {epoch} | loss {epoch_loss:.4f}")

In [None]:
# Draw several imputations for one test row.
# Uses the arrays defined in the dataset-analysis cell: X_test (imputed
# features) and mask_test (True = missing).
i = 0            # index of the test row to impute
N_SAMPLES = 20   # number of independent imputations to draw

x_incomplete = X_test[i]      # missing entries hold placeholder (imputed) values
mask = (mask_test[i] == 0)    # flip the convention: True/1 = observed, 0 = missing

x_incomplete = torch.as_tensor(x_incomplete, dtype=torch.float32, device=device)
mask = torch.as_tensor(mask, dtype=torch.float32, device=device)

model.eval()
samples = []

for _ in range(N_SAMPLES):
    x_gen = model.sample(x_incomplete.unsqueeze(0), mask.unsqueeze(0))
    # Keep observed entries exactly as given; only missing ones are generated
    x_gen = x_gen * (1 - mask) + x_incomplete * mask
    samples.append(x_gen.cpu().numpy())

samples = np.array(samples)   # (N_SAMPLES, 1, 37)