In [1]:
!pip install --no-index /kaggle/input/datasets/kurshidbasheer/biopython-offline/biopython-1.83-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl


Processing /kaggle/input/datasets/kurshidbasheer/biopython-offline/biopython-1.83-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: biopython
Successfully installed biopython-1.83


In [2]:
!pip install --no-index /kaggle/input/datasets/kurshidbasheer/pyg-2-7-torch-2-9-cpu-py312-kur/torch_geometric-2.7.0-py3-none-any.whl

Processing /kaggle/input/datasets/kurshidbasheer/pyg-2-7-torch-2-9-cpu-py312-kur/torch_geometric-2.7.0-py3-none-any.whl
Installing collected packages: torch-geometric
Successfully installed torch-geometric-2.7.0


https://chatgpt.com/share/69977466-cdb0-8010-9dcb-34cf1f57b18c

In [3]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset

# Column index of each canonical RNA nucleotide in the one-hot encoding.
NUC_MAP = {'A':0, 'U':1, 'G':2, 'C':3}

def one_hot(seq):
    """One-hot encode a nucleotide sequence.

    Args:
        seq: iterable of single-character nucleotide symbols (typically a str).

    Returns:
        Float tensor of shape [len(seq), 4] whose columns follow NUC_MAP
        (A, U, G, C). A symbol that is not a key of NUC_MAP yields an
        all-zero row instead of being silently encoded as 'A'.
    """
    x = torch.zeros(len(seq), len(NUC_MAP))
    for i, s in enumerate(seq):
        col = NUC_MAP.get(s)
        # Unknown symbols keep their all-zero row so they cannot be confused
        # with a real nucleotide.
        if col is not None:
            x[i, col] = 1
    return x


class PairedRNADataset(Dataset):
    """Pairs each query RNA sequence with the coordinates of its reference structure.

    The sequence table must provide the columns ``target_id`` and ``sequence``.
    The label table must provide ``ID`` (formatted ``<structure id>_<residue
    index>``), ``copy`` and the coordinate columns ``x_1``, ``y_1``, ``z_1``.
    Only queries whose ``target_id`` has at least one label row are exposed.
    """

    def __init__(self, seq_csv, label_csv, use_copy=1, max_length=1000):
        """Load both tables and index the reference coordinates by structure ID.

        Args:
            seq_csv: path to the CSV holding the query sequences.
            label_csv: path to the CSV holding the per-residue coordinates.
            use_copy: only label rows whose ``copy`` column equals this value
                are kept.
            max_length: samples longer than this are randomly cropped to
                ``max_length`` residues in ``__getitem__``.
        """

        # -------- queries --------
        self.q_df = pd.read_csv(seq_csv)
        self.max_length = max_length

        # -------- reference structures (ID → coords, sorted) --------
        labels = pd.read_csv(label_csv, low_memory=False)
        labels = labels[labels["copy"] == use_copy]

        # Split on the LAST underscore only, so a structure ID that itself
        # contains "_" is kept intact. `.assign` builds a new frame instead of
        # writing into the filtered slice (no SettingWithCopyWarning).
        id_parts = labels["ID"].str.rsplit("_", n=1)
        labels = labels.assign(
            struct_id=id_parts.str[0],
            res_idx=id_parts.str[1].astype(int),
        )

        self.structures = {}

        for k, g in labels.groupby("struct_id"):
            # Rows must follow residue order so coords[i] lines up with seq[i].
            g = g.sort_values("res_idx")
            coords = g[["x_1", "y_1", "z_1"]].values.astype(np.float32)
            self.structures[k] = torch.from_numpy(coords)

        # keep only valid query indices
        self.valid_idx = [
            i for i, sid in enumerate(self.q_df["target_id"])
            if sid in self.structures
        ]

    def __len__(self):
        """Number of queries that have a reference structure."""
        return len(self.valid_idx)

    def __getitem__(self, i):
        """Return ``(sid, x, coords)`` for the i-th valid query.

        Returns:
            sid: the query's ``target_id``.
            x: float tensor [L, 5] - four one-hot nucleotide columns plus the
                residue position divided by L (position within the returned
                window, not within the full sequence).
            coords: float32 tensor [L, 3] holding ``x_1, y_1, z_1``; may
                contain NaN where the label table has missing values.
            L is capped at ``max_length``.
        """

        idx = self.valid_idx[i]
        row = self.q_df.iloc[idx]

        seq = row["sequence"]
        sid = row["target_id"]

        coords_full = self.structures[sid]

        # Truncate to the common length in case sequence and label counts differ.
        L = min(len(seq), coords_full.shape[0])

        seq = seq[:L]
        coords_full = coords_full[:L]

        if L > self.max_length:
            # Random contiguous window; the same slice is applied to the
            # sequence and the coordinates so they stay aligned.
            start = np.random.randint(0, L - self.max_length + 1)
            end = start + self.max_length

            seq = seq[start:end]
            coords = coords_full[start:end]
        else:
            coords = coords_full

        x = one_hot(seq)

        pos = torch.arange(len(seq)).float().unsqueeze(-1) / len(seq)
        x = torch.cat([x, pos], dim=1)

        return sid, x, coords

In [4]:
# Build the paired dataset from the training sequences and labels, keeping only
# label rows whose "copy" column equals 1.
ds = PairedRNADataset(
    "/kaggle/input/stanford-rna-3d-folding-2/train_sequences.csv",
    "/kaggle/input/stanford-rna-3d-folding-2/train_labels.csv",
    use_copy=1
)

# Number of queries that have a matching reference structure.
print(len(ds))

# Inspect the first sample: structure ID, feature shape, coordinate shape.
sid, x, coords = ds[0]
print(sid, x.shape, coords.shape)

5716
4TNA torch.Size([76, 5]) torch.Size([76, 3])


In [5]:
# Fetch the first sample again and print only the feature and coordinate shapes.
sid, x, c = ds[0]
print(x.shape, c.shape)

torch.Size([76, 5]) torch.Size([76, 3])


In [6]:
# Print the ID and tensor shapes of the first samples. The bound is capped at
# len(ds) so a dataset with fewer than 10 samples does not raise IndexError.
for i in range(min(10, len(ds))):
    sid, x, coords = ds[i]
    print(i, sid, x.shape, coords.shape)

0 4TNA torch.Size([76, 5]) torch.Size([76, 3])
1 6TNA torch.Size([76, 5]) torch.Size([76, 3])
2 1TRA torch.Size([76, 5]) torch.Size([76, 3])
3 1TN2 torch.Size([76, 5]) torch.Size([76, 3])
4 1TN1 torch.Size([76, 5]) torch.Size([76, 3])
5 2TRA torch.Size([75, 5]) torch.Size([75, 3])
6 3TRA torch.Size([75, 5]) torch.Size([75, 3])
7 4TRA torch.Size([76, 5]) torch.Size([76, 3])
8 1RNA torch.Size([14, 5]) torch.Size([14, 3])
9 1ELH torch.Size([25, 5]) torch.Size([25, 3])


In [7]:
# Query IDs that have no reference structure in the label table. The
# comprehension keeps its loop variable local, so the notebook-level `sid`
# is not overwritten by this check.
missing = [
    target_id
    for target_id in ds.q_df["target_id"]
    if target_id not in ds.structures
]

print("Missing structures:", len(missing))

Missing structures: 0


In [8]:
# Check that the ID returned by the dataset equals the ID stored in the query
# table for the same (valid) row. The bound is capped at len(ds) so a small
# dataset does not raise IndexError.
for i in range(min(10, len(ds))):
    idx = ds.valid_idx[i]
    sid_from_df = ds.q_df.iloc[idx]["target_id"]

    sid, _, _ = ds[i]

    print(sid_from_df, sid)

4TNA 4TNA
6TNA 6TNA
1TRA 1TRA
1TN2 1TN2
1TN1 1TN1
2TRA 2TRA
3TRA 3TRA
4TRA 4TRA
1RNA 1RNA
1ELH 1ELH


In [9]:
# Load the raw label table (no filtering on "copy") for direct inspection.
labels = pd.read_csv(
    "/kaggle/input/stanford-rna-3d-folding-2/train_labels.csv",
    low_memory=False
)

In [10]:
# Show the first label IDs that start with "4TNA_" to inspect the ID format.
tmp = labels[labels["ID"].str.startswith("4TNA_")]
print(tmp["ID"].head())

774838    4TNA_1
774839    4TNA_2
774840    4TNA_3
774841    4TNA_4
774842    4TNA_5
Name: ID, dtype: object


Graph building

In [11]:
import torch
from torch_geometric.data import Data

def build_graph(x, y):
    """Build a linear-chain graph that links each residue to its successor.

    Residue i is connected to residue i + 1 in both directions, in the order
    (0->1, 1->0, 1->2, 2->1, ...).

    Args:
        x: node-feature tensor with one row per residue (the dataset yields [L, 5]).
        y: target tensor with one row per residue (the dataset yields [L, 3]).

    Returns:
        A ``Data`` object holding ``x``, ``y`` and a long ``edge_index`` of
        shape [2, 2 * (N - 1)]. For N < 2 the edge index is an empty [2, 0]
        tensor rather than a malformed 1-D tensor.
    """
    N = x.size(0)
    n_links = max(N - 1, 0)

    src = torch.arange(n_links, dtype=torch.long, device=x.device)
    dst = src + 1

    forward = torch.stack([src, dst])    # [2, n_links]: i -> i + 1
    backward = torch.stack([dst, src])   # [2, n_links]: i + 1 -> i

    # Interleave forward/backward columns; the explicit target shape keeps the
    # reshape well-defined when there are no edges at all.
    edge_index = torch.stack([forward, backward], dim=2).reshape(2, 2 * n_links)

    return Data(x=x, edge_index=edge_index, y=y)

Sanity check

In [12]:
# Shape & edge check: build the chain graph for the first sample and print the
# shapes of its node features, edge index and targets.
sid, x, y = ds[0]

data = build_graph(x, y)

print("Nodes:", data.x.shape)
print("Edges:", data.edge_index.shape)
print("Y:", data.y.shape)

Nodes: torch.Size([76, 5])
Edges: torch.Size([2, 150])
Y: torch.Size([76, 3])


In [13]:
# Backbone connectivity check: the first six edge columns should pair each
# residue with its successor in both directions.
print(data.edge_index[:, :6])

tensor([[0, 1, 1, 2, 2, 3],
        [1, 0, 2, 1, 3, 2]])


In [14]:
# Alignment check: features and targets should have the same number of rows.
print(x.shape[0], y.shape[0])

76 76


Model Building

In [15]:
#Defining RNAGNN for the model
import torch
import torch.nn as nn
from torch_geometric.nn import GCNConv

class RNAGNN(nn.Module):
    """Stack of GCN layers that maps per-residue features to 3 outputs per node.

    Pipeline: linear embedding -> `layers` x (GCNConv + ReLU) -> LayerNorm ->
    linear head producing three values per node.
    """

    def __init__(self, in_dim=5, hidden=128, layers=4):
        """
        Args:
            in_dim: number of input features per node.
            hidden: width of the embedding and of every GCN layer.
            layers: number of GCNConv layers.
        """
        super().__init__()

        self.embed = nn.Linear(in_dim, hidden)

        self.convs = nn.ModuleList()
        for _ in range(layers):
            self.convs.append(GCNConv(hidden, hidden))

        self.norm = nn.LayerNorm(hidden)
        self.out = nn.Linear(hidden, 3)

    def forward(self, data):
        """Return a [num_nodes, 3] tensor for the graph in `data` (uses `data.x` and `data.edge_index`)."""
        h = self.embed(data.x)

        for layer in self.convs:
            h = torch.relu(layer(h, data.edge_index))

        return self.out(self.norm(h))

In [16]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Re-create the paired dataset; samples longer than max_length are randomly
# cropped to max_length residues when they are fetched.
dataset = PairedRNADataset(
    "/kaggle/input/stanford-rna-3d-folding-2/train_sequences.csv",
    "/kaggle/input/stanford-rna-3d-folding-2/train_labels.csv",
    max_length=1000
)

# One sample per batch, visited in shuffled order.
loader = DataLoader(dataset, batch_size=1, shuffle=True)

# Model with its default sizes, optimised with Adam at a learning rate of 1e-3.
model = RNAGNN().to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)


def center(x):
    """Subtract the per-column mean so the rows of `x` are centred on the origin."""
    centroid = x.mean(dim=0, keepdim=True)
    return x - centroid


# Put the model in training mode.
model.train()

# Smoke test: run one optimisation step on the first usable sample and print
# diagnostics (the loop ends at the `break` below).
for sid, x, y in loader:

    # The loader yields batches of one sample; index 0 drops the batch dimension.
    x = x[0].to(device)
    y = y[0].to(device)

    data = build_graph(x, y).to(device)

    # Clear gradients left over from the previous step; PyTorch accumulates
    # them across backward() calls otherwise.
    opt.zero_grad()

    pred = model(data)

    # Drop residues with missing (NaN) coordinates BEFORE centring. Centring
    # first would make the column means NaN, turn every row into NaN and
    # discard the whole sample.
    mask = ~torch.isnan(y).any(dim=1)

    pred = pred[mask]
    y    = y[mask]

    # Skip samples with fewer than two resolved residues: after centring there
    # would be nothing meaningful left to fit.
    if pred.shape[0] < 2:
        continue

    # Remove the translation component from prediction and target; both are
    # centred over the same set of resolved residues.
    pred = center(pred)
    y    = center(y)

    # The loss computation starts here: mean squared error between the centred
    # prediction and the centred ground truth.
    loss = F.mse_loss(pred, y)

    print("SID:", sid[0])
    print("pred shape:", pred.shape)
    print("gt shape  :", y.shape)
    print("loss      :", loss.item())

    # Backpropagation: compute the gradient of the loss w.r.t. every parameter.
    # (The "forward" counterpart is the model(data) call that produced `pred`.)
    loss.backward()
    # Cap the total gradient norm at 1.0 before the parameter update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    opt.step()

    break

SID: 6O7K
pred shape: torch.Size([1000, 3])
gt shape  : torch.Size([1000, 3])
loss      : 1286.3057861328125
