In [1]:
import torch
import torch.nn as nn
import pyro
import pyro.distributions as dist
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import ClippedAdam
import numpy as np
import pandas as pd
import glob, os
from sklearn.metrics import mean_squared_error

In [2]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# --- Load and prepare data ---
def load_data(data_dir, seq_len=18):
    paths = sorted(
        glob.glob(os.path.join(data_dir, "IPF_Final_*.csv")),
        key=lambda p: int(os.path.basename(p).split('_')[2].split('.')[0])
    )
    assert len(paths) >= seq_len + 1, f"Need at least {seq_len + 1} CSV files for full sequence + target."

    arrays = [np.loadtxt(path, delimiter=',', skiprows=1, usecols=(1, 2, 3)) for path in paths[:seq_len + 1]]
    X_seq = np.stack(arrays[:seq_len], axis=0)
    Y_arr = arrays[seq_len]

    x_tensor = torch.tensor(X_seq.transpose(1, 0, 2), dtype=torch.float32)
    y_tensor = torch.tensor(Y_arr, dtype=torch.float32)
    return x_tensor, y_tensor

data_dir = "/kaggle/input/traincs772"
seq_len = 18
x_full, y_full = load_data(data_dir, seq_len)
x_full = x_full.to(device)
y_full = y_full.to(device)


In [4]:
class MultiLSTMBackbone(nn.Module):
    def __init__(self, input_size=3, hidden_size=64):
        super().__init__()
        self.lstm_phi1 = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.lstm_phi = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.lstm_phi2 = nn.LSTM(input_size, hidden_size, batch_first=True)

        self.fc_phi1 = nn.Linear(hidden_size * 3, 1)
        self.fc_phi = nn.Linear(hidden_size * 3, 1)
        self.fc_phi2 = nn.Linear(hidden_size * 3, 1)

    def forward(self, x):
        _, (h1, _) = self.lstm_phi1(x)
        _, (h2, _) = self.lstm_phi(x)
        _, (h3, _) = self.lstm_phi2(x)
        h_cat = torch.cat([h1[-1], h2[-1], h3[-1]], dim=1)
        out1 = self.fc_phi1(h_cat)
        out2 = self.fc_phi(h_cat)
        out3 = self.fc_phi2(h_cat)
        return out1, out2, out3

In [5]:
def model_fn(x, y=None):
    y1, y2, y3 = (y[:, 0:1], y[:, 1:2], y[:, 2:3]) if y is not None else (None, None, None)

    _, (h1, _) = model.lstm_phi1(x)
    _, (h2, _) = model.lstm_phi(x)
    _, (h3, _) = model.lstm_phi2(x)
    h_combined = torch.cat([h1[-1], h2[-1], h3[-1]], dim=1)

    w1 = pyro.sample("w1", dist.Normal(model.fc_phi1.weight, 0.5).to_event(2))
    b1 = pyro.sample("b1", dist.Normal(model.fc_phi1.bias, 0.5).to_event(1))
    w2 = pyro.sample("w2", dist.Normal(model.fc_phi.weight, 0.5).to_event(2))
    b2 = pyro.sample("b2", dist.Normal(model.fc_phi.bias, 0.5).to_event(1))
    w3 = pyro.sample("w3", dist.Normal(model.fc_phi2.weight, 0.5).to_event(2))
    b3 = pyro.sample("b3", dist.Normal(model.fc_phi2.bias, 0.5).to_event(1))

    mean1 = h_combined @ w1.t() + b1
    mean2 = h_combined @ w2.t() + b2
    mean3 = h_combined @ w3.t() + b3

    with pyro.plate("data", x.shape[0]):
        pyro.sample("obs1", dist.Normal(mean1, 5.0).to_event(1), obs=y1)
        pyro.sample("obs2", dist.Normal(mean2, 5.0).to_event(1), obs=y2)
        pyro.sample("obs3", dist.Normal(mean3, 5.0).to_event(1), obs=y3)

In [6]:
def guide_fn(x, y=None):
    for i, head in enumerate([model.fc_phi1, model.fc_phi, model.fc_phi2], start=1):
        pyro.param(f"w{i}_loc", head.weight.detach().clone())
        pyro.param(f"w{i}_scale", torch.ones_like(head.weight) * 0.5, constraint=dist.constraints.positive)
        pyro.param(f"b{i}_loc", head.bias.detach().clone())
        pyro.param(f"b{i}_scale", torch.ones_like(head.bias) * 0.5, constraint=dist.constraints.positive)

        pyro.sample(f"w{i}", dist.Normal(pyro.param(f"w{i}_loc"), pyro.param(f"w{i}_scale")).to_event(2))
        pyro.sample(f"b{i}", dist.Normal(pyro.param(f"b{i}_loc"), pyro.param(f"b{i}_scale")).to_event(1))


# Instantiate model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultiLSTMBackbone().to(device)

In [7]:
# --- LSTM Backbone Pretraining ---
print("🔧 Pretraining deterministic LSTM backbone...")
backbone_optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

model.train()
for epoch in range(2000):
    backbone_optimizer.zero_grad()
    out1, out2, out3 = model(x_full)
    loss = (
        0.33 * loss_fn(out1, y_full[:, 0:1]) +
        0.33 * loss_fn(out2, y_full[:, 1:2]) +
        0.34 * loss_fn(out3, y_full[:, 2:3])
    )
    loss.backward()
    backbone_optimizer.step()
    if (epoch + 1) % 100 == 0:
        print(f"Epoch {epoch + 1} LSTM Backbone Loss: {loss:.2f}")

🔧 Pretraining deterministic LSTM backbone...
Epoch 100 LSTM Backbone Loss: 6546.93
Epoch 200 LSTM Backbone Loss: 4735.44
Epoch 300 LSTM Backbone Loss: 3853.09
Epoch 400 LSTM Backbone Loss: 3347.92
Epoch 500 LSTM Backbone Loss: 3042.70
Epoch 600 LSTM Backbone Loss: 2830.90
Epoch 700 LSTM Backbone Loss: 2670.02
Epoch 800 LSTM Backbone Loss: 2536.49
Epoch 900 LSTM Backbone Loss: 2409.76
Epoch 1000 LSTM Backbone Loss: 2280.72
Epoch 1100 LSTM Backbone Loss: 2163.22
Epoch 1200 LSTM Backbone Loss: 2050.39
Epoch 1300 LSTM Backbone Loss: 1939.17
Epoch 1400 LSTM Backbone Loss: 1832.63
Epoch 1500 LSTM Backbone Loss: 1727.07
Epoch 1600 LSTM Backbone Loss: 1596.29
Epoch 1700 LSTM Backbone Loss: 1488.92
Epoch 1800 LSTM Backbone Loss: 1380.52
Epoch 1900 LSTM Backbone Loss: 1277.77
Epoch 2000 LSTM Backbone Loss: 1185.04


In [17]:
# import math

# # --- Manual SGLD for the three linear heads ---

# # 1) Freeze your backbone
# for p in model.parameters():
#     p.requires_grad = False
model.eval()

# 2) Precompute the combined hidden representation for all data
with torch.no_grad():
    _, (h1, _) = model.lstm_phi1(x_full)
    _, (h2, _) = model.lstm_phi(x_full)
    _, (h3, _) = model.lstm_phi2(x_full)
    h_combined = torch.cat([h1[-1], h2[-1], h3[-1]], dim=1)  # shape (N, hidden*3)
    y_true = y_full  # shape (N, 3)

# 3) Initialize SGLD parameters from your pretrained heads
w1 = model.fc_phi1.weight.detach().clone().requires_grad_(True)
b1 = model.fc_phi1.bias.detach().clone().requires_grad_(True)
w2 = model.fc_phi.weight.detach().clone().requires_grad_(True)
b2 = model.fc_phi.bias.detach().clone().requires_grad_(True)
w3 = model.fc_phi2.weight.detach().clone().requires_grad_(True)
b3 = model.fc_phi2.bias.detach().clone().requires_grad_(True)

# 4) SGLD hyperparameters
lr = 1e-5            # step‐size (try between 1e-6 and 1e-4)
sigma_lik = 5.0      # your observation noise σ in the Normal likelihood
tau0 = 0.5           # prior std dev for weights/biases
n_iters = 5000
burn_in = 1000
thin = 10

# 5) Storage for posterior samples
samples = {'w1':[], 'b1':[], 'w2':[], 'b2':[], 'w3':[], 'b3':[]}

# 6) Run SGLD
for it in range(n_iters):
    # a) Compute negative log-posterior (up to constant)
    phi1 = h_combined @ w1.t() + b1     # (N,1)
    phi2 = h_combined @ w2.t() + b2
    phi3 = h_combined @ w3.t() + b3

    mse_term = ((y_true[:,0:1] - phi1)**2).sum() \
             + ((y_true[:,1:2] - phi2)**2).sum() \
             + ((y_true[:,2:3] - phi3)**2).sum()
    neg_log_like = mse_term / (2 * sigma_lik**2)

    prior_wb = (w1**2).sum() + (b1**2).sum() \
             + (w2**2).sum() + (b2**2).sum() \
             + (w3**2).sum() + (b3**2).sum()
    neg_log_prior = prior_wb / (2 * tau0**2)

    loss = neg_log_like + neg_log_prior
    loss.backward()

    # b) SGLD update: θ ← θ − η ∇θ + √(2η) ξ
    with torch.no_grad():
        for name, param in [('w1', w1), ('b1', b1),
                            ('w2', w2), ('b2', b2),
                            ('w3', w3), ('b3', b3)]:
            grad = param.grad
            noise = torch.randn_like(param) * math.sqrt(2 * lr)
            param -= lr * grad
            param += noise
            param.grad.zero_()

    # c) Collect after burn-in & thinning
    if it >= burn_in and (it - burn_in) % thin == 0:
        samples['w1'].append(w1.clone())
        samples['b1'].append(b1.clone())
        samples['w2'].append(w2.clone())
        samples['b2'].append(b2.clone())
        samples['w3'].append(w3.clone())
        samples['b3'].append(b3.clone())

# 7) Compute posterior means
w1_post = torch.stack(samples['w1'], dim=0).mean(0)
b1_post = torch.stack(samples['b1'], dim=0).mean(0)
w2_post = torch.stack(samples['w2'], dim=0).mean(0)
b2_post = torch.stack(samples['b2'], dim=0).mean(0)
w3_post = torch.stack(samples['w3'], dim=0).mean(0)
b3_post = torch.stack(samples['b3'], dim=0).mean(0)

# --- Inference with these posterior means ---
with torch.no_grad():
    phi1 = h_combined @ w1_post.t() + b1_post
    phi2 = h_combined @ w2_post.t() + b2_post
    phi3 = h_combined @ w3_post.t() + b3_post
    preds = torch.cat([phi1, phi2, phi3], dim=1).cpu().numpy()

rmse = np.sqrt(np.mean((preds - y_true.cpu().numpy())**2))
print(f"✅ Final Manual-SGLD RMSE: {rmse:.4f} degrees")

# 8) Save as CSV
df = pd.DataFrame(preds, columns=["Phi1","Phi","Phi2"])
df.insert(0, "Phase", 1)
df.to_csv("IPF_20_pred_manual_sgld.csv", index=False)
print("✅ Manual SGLD predictions saved to IPF_20_pred_manual_sgld.csv")


✅ Final Manual-SGLD RMSE: 14.7996 degrees
✅ Manual SGLD predictions saved to IPF_20_pred_manual_sgld.csv


In [20]:
import math
import os
import pandas as pd
import torch

# Ensure model, x_full, y_full are already defined and on the correct device
# model: your MultiLSTMBackbone instance
# x_full: input tensor of shape (N, seq_len, features)
# y_full: target tensor of shape (N, 3)

# --- Fully Stochastic SGLD over the entire network ---
# Hyperparameters
lr        = 1e-5    # SGLD step size
sigma_lik = 5.0     # observation noise std dev
tau0      = 0.5     # prior std dev on all weights
n_iters   = 5000    # total SGLD iterations
burn_in   = 1000    # burn-in period
thin      = 10      # thinning interval

# 1) Collect all trainable parameters
all_params  = list(model.parameters())
param_names = [f"p{i}" for i in range(len(all_params))]
# buffer to store samples
samples     = {name: [] for name in param_names}

# 2) Run SGLD
for it in range(n_iters):
    # a) Full forward pass
    out1, out2, out3 = model(x_full)                   # each shape (N,1)
    y_pred = torch.cat([out1, out2, out3], dim=1)      # shape (N,3)

    # b) Negative log-likelihood term
    mse = ((y_full - y_pred)**2).sum()
    neg_log_like = mse / (2 * sigma_lik**2)

    # c) Negative log-prior term (Gaussian prior on all params)
    prior_norm2 = sum((p**2).sum() for p in all_params)
    neg_log_prior = prior_norm2 / (2 * tau0**2)

    # d) Total loss = negative log-posterior
    loss = neg_log_like + neg_log_prior
    loss.backward()

    # e) SGLD update step: drift + diffusion
    with torch.no_grad():
        for name, param in zip(param_names, all_params):
            # gradient drift
            param -= lr * param.grad
            # injected Gaussian noise
            noise = torch.randn_like(param) * math.sqrt(2 * lr)
            param += noise
            # reset gradient
            param.grad.zero_()

    # f) Collect samples after burn-in & according to thinning
    if it >= burn_in and (it - burn_in) % thin == 0:
        for name, param in zip(param_names, all_params):
            samples[name].append(param.clone())

# 3) Compute posterior-mean for each parameter
post_means = {
    name: torch.stack(vals, dim=0).mean(0)
    for name, vals in samples.items()
}

# 4) Overwrite model parameters with posterior means
with torch.no_grad():
    for param, name in zip(all_params, param_names):
        param.copy_(post_means[name])

# 5) Final inference & saving predictions
model.eval()
with torch.no_grad():
    out1, out2, out3 = model(x_full)
    preds = torch.cat([out1, out2, out3], dim=1).cpu().numpy()

# Prepare DataFrame and save
os.makedirs('/mnt/data', exist_ok=True)
df = pd.DataFrame(preds, columns=["Phi1","Phi","Phi2"])
df.insert(0, "Phase", 1)
output_path = '/mnt/data/IPF_20_pred_full_sgld.csv'
df.to_csv(output_path, index=False)
print(f"✅ Saved fully stochastic SGLD predictions to {output_path}")


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn