In [14]:
!pip install --no-index /kaggle/input/datasets/kurshidbasheer/biopython-offline/biopython-1.83-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

Processing /kaggle/input/datasets/kurshidbasheer/biopython-offline/biopython-1.83-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
biopython is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


In [15]:
!pip install --no-index /kaggle/input/datasets/kurshidbasheer/pyg-2-7-torch-2-9-cpu-py312-kur/torch_geometric-2.7.0-py3-none-any.whl

Processing /kaggle/input/datasets/kurshidbasheer/pyg-2-7-torch-2-9-cpu-py312-kur/torch_geometric-2.7.0-py3-none-any.whl
torch-geometric is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


# **RNA_Dataset Preparation** 

In [16]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from Bio.Seq import Seq  # ✅ Biopython import

NUC_MAP = {'A':0, 'U':1, 'G':2, 'C':3}

def clean_sequence(seq):
    """
    Biologically safe cleaning using Biopython.
    Removes unknown nucleotides and converts to uppercase.
    """
    seq_obj = Seq(seq.upper())
    cleaned = "".join([n for n in str(seq_obj) if n in NUC_MAP])
    return cleaned

def one_hot(seq):
    x = torch.zeros(len(seq), 4)
    for i, s in enumerate(seq):
        x[i, NUC_MAP[s]] = 1   # ✅ now guaranteed clean
    return x

class PairedRNADataset(Dataset):

    def __init__(self, seq_csv, label_csv, use_copy=1, max_length=1000):

        # -------- queries --------
        self.q_df = pd.read_csv(seq_csv)
        self.max_length = max_length

        # -------- reference structures (ID → coords, sorted) --------
        labels = pd.read_csv(label_csv, low_memory=False)
        labels = labels[labels["copy"] == use_copy]

        labels["struct_id"] = labels["ID"].str.split("_").str[0]
        labels["res_idx"]   = labels["ID"].str.split("_").str[1].astype(int)

        self.structures = {}

        for k, g in labels.groupby("struct_id"):
            g = g.sort_values("res_idx")   # ✅ critical fix
            coords = g[["x_1", "y_1", "z_1"]].values.astype(np.float32)
            self.structures[k] = torch.from_numpy(coords)

        # keep only valid query indices
        self.valid_idx = [
            i for i, sid in enumerate(self.q_df["target_id"])
            if sid in self.structures
        ]

    def __len__(self):
        return len(self.valid_idx)

    def __getitem__(self, i):

        idx = self.valid_idx[i]
        row = self.q_df.iloc[idx]

        seq = clean_sequence(row["sequence"])  # ✅ Biopython cleaning
        sid = row["target_id"]

        coords_full = self.structures[sid]

        L = len(seq)

        # (safety – ideally these should match)
        Lc = coords_full.shape[0]
        L = min(L, Lc)

        seq = seq[:L]
        coords_full = coords_full[:L]

        if L > self.max_length:
            start = np.random.randint(0, L - self.max_length + 1)
            end = start + self.max_length

            seq = seq[start:end]
            coords = coords_full[start:end]   # ✅ aligned crop
        else:
            coords = coords_full

        x = one_hot(seq)

        pos = torch.arange(len(seq)).float().unsqueeze(-1) / len(seq)
        x = torch.cat([x, pos], dim=1)

        return sid, x, coords

In [17]:
#Dataset object create
dataset = PairedRNADataset(
    seq_csv="/kaggle/input/stanford-rna-3d-folding-2/train_sequences.csv",
    label_csv="/kaggle/input/stanford-rna-3d-folding-2/train_labels.csv",
    use_copy=1,
    max_length=1000
)

# Total sequences
print("Total sequences:", len(dataset))

# Inspecting the sequence
sid, x, coords = dataset[0]
print("Target ID:", sid)
print("Sequence tensor shape:", x.shape)     # [L, 5] → 4 one-hot + 1 position
print("Coordinates shape:", coords.shape)   # [L, 3]

# Summaryof the first five sequence
for i in range(5):
    sid, x, coords = dataset[i]
    print(f"{i}: {sid}, x: {x.shape}, coords: {coords.shape}")

Total sequences: 5716
Target ID: 4TNA
Sequence tensor shape: torch.Size([76, 5])
Coordinates shape: torch.Size([76, 3])
0: 4TNA, x: torch.Size([76, 5]), coords: torch.Size([76, 3])
1: 6TNA, x: torch.Size([76, 5]), coords: torch.Size([76, 3])
2: 1TRA, x: torch.Size([76, 5]), coords: torch.Size([76, 3])
3: 1TN2, x: torch.Size([76, 5]), coords: torch.Size([76, 3])
4: 1TN1, x: torch.Size([76, 5]), coords: torch.Size([76, 3])


In [18]:
for i in range(10):
    sid, x, coords = dataset[i]
    print(i, sid, x.shape, coords.shape)

0 4TNA torch.Size([76, 5]) torch.Size([76, 3])
1 6TNA torch.Size([76, 5]) torch.Size([76, 3])
2 1TRA torch.Size([76, 5]) torch.Size([76, 3])
3 1TN2 torch.Size([76, 5]) torch.Size([76, 3])
4 1TN1 torch.Size([76, 5]) torch.Size([76, 3])
5 2TRA torch.Size([75, 5]) torch.Size([75, 3])
6 3TRA torch.Size([75, 5]) torch.Size([75, 3])
7 4TRA torch.Size([76, 5]) torch.Size([76, 3])
8 1RNA torch.Size([14, 5]) torch.Size([14, 3])
9 1ELH torch.Size([25, 5]) torch.Size([25, 3])


In [19]:
missing = []

for sid in dataset.q_df["target_id"]:
    if sid not in dataset.structures:
        missing.append(sid)

print("Missing structures:", len(missing))

Missing structures: 0


In [20]:
for i in range(10):
    idx = dataset.valid_idx[i]
    sid_from_df = dataset.q_df.iloc[idx]["target_id"]

    sid, _, _ = dataset[i]

    print(sid_from_df, sid)

4TNA 4TNA
6TNA 6TNA
1TRA 1TRA
1TN2 1TN2
1TN1 1TN1
2TRA 2TRA
3TRA 3TRA
4TRA 4TRA
1RNA 1RNA
1ELH 1ELH


In [21]:
labels = pd.read_csv(
    "/kaggle/input/stanford-rna-3d-folding-2/train_labels.csv",
    low_memory=False
)
tmp = labels[labels["ID"].str.startswith("4TNA_")]
print(tmp["ID"].head())

774838    4TNA_1
774839    4TNA_2
774840    4TNA_3
774841    4TNA_4
774842    4TNA_5
Name: ID, dtype: object


# **Graph Building**

In [22]:
import torch
from torch_geometric.data import Data

def build_graph(x, coords, k=2, self_loops=False):
    """
    Build PyG graph for RNA sequence with k-nearest neighbors in sequence.

    Args:
        x          : [L, F] node features (one-hot + pos)
        coords     : [L, 3] 3D coordinates
        k          : int, number of neighbors on each side
        self_loops : bool, whether to add self-loops

    Returns:
        PyG Data object with:
            x, edge_index, edge_attr, y
    """
    L = x.size(0)
    
    edge_index = []
    edge_attr = []

    for i in range(L):
        for j in range(max(0, i - k), min(L, i + k + 1)):
            if i == j and not self_loops:
                continue
            edge_index.append([i, j])
            edge_attr.append([abs(i - j)])

    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()  # [2, num_edges]
    edge_attr = torch.tensor(edge_attr, dtype=torch.float)                     # [num_edges, 1]

    return Data(
        x=x,
        edge_index=edge_index,
        edge_attr=edge_attr,
        y=coords
    )

Checking graph

In [23]:
# pick the first sequence from your dataset
sid, x, coords = dataset[0]

# build graph, for example k=2, no self-loops
data = build_graph(x, coords, k=2, self_loops=False)

print(f"Target ID: {sid}")
print(f"Number of nodes: {data.x.shape[0]} (should match sequence length: {x.shape[0]})")
print(f"Number of edges: {data.edge_index.shape[1]}")

# first 10 edges
print("First 10 edges:")
print(data.edge_index[:, :10])

# first 10 edge attributes (sequence distance)
print("First 10 edge attributes:")
print(data.edge_attr[:10])

# check alignment of nodes with coordinates
for i in range(min(5, data.x.shape[0])):
    print(f"Node {i} -> x: {data.x[i].tolist()}, coords: {data.y[i].tolist()}")

Target ID: 4TNA
Number of nodes: 76 (should match sequence length: 76)
Number of edges: 298
First 10 edges:
tensor([[0, 0, 1, 1, 1, 2, 2, 2, 2, 3],
        [1, 2, 0, 2, 3, 0, 1, 3, 4, 1]])
First 10 edge attributes:
tensor([[1.],
        [2.],
        [1.],
        [1.],
        [2.],
        [2.],
        [1.],
        [1.],
        [2.],
        [2.]])
Node 0 -> x: [0.0, 0.0, 1.0, 0.0, 0.0], coords: [25.83300018310547, 2.611999988555908, 55.90299987792969]
Node 1 -> x: [0.0, 0.0, 0.0, 1.0, 0.01315789483487606], coords: [31.16200065612793, 0.6359999775886536, 56.24800109863281]
Node 2 -> x: [0.0, 0.0, 1.0, 0.0, 0.02631578966975212], coords: [35.77199935913086, -0.4339999854564667, 53.6510009765625]
Node 3 -> x: [0.0, 0.0, 1.0, 0.0, 0.03947368264198303], coords: [38.604000091552734, -1.8949999809265137, 49.53200149536133]
Node 4 -> x: [1.0, 0.0, 0.0, 0.0, 0.05263157933950424], coords: [39.448001861572266, -1.7369999885559082, 44.23099899291992]


# Train/Val Split

In [24]:
from sklearn.model_selection import train_test_split

train_data, val_data = train_test_split(data_list, test_size=0.1, random_state=42)

print(len(train_data), len(val_data))

NameError: name 'data_list' is not defined

# DataLoader

In [None]:
from torch_geometric.loader import DataLoader

train_loader = DataLoader(train_data, batch_size=8, shuffle=True)
val_loader   = DataLoader(val_data, batch_size=8, shuffle=False)

# Baseline GNN Model

In [None]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class RNAGCN(nn.Module):
    def __init__(self, in_channels, hidden_dim=128):
        super().__init__()
        
        self.conv1 = GCNConv(in_channels, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, 3)  # output xyz
        
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = self.conv3(x, edge_index)
        
        return x

# Initialize Model

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

in_channels = train_data[0].x.size(1)

model = RNAGCN(in_channels).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

# Training Loop

In [None]:
def train():
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        batch = batch.to(device)
        
        optimizer.zero_grad()
        
        pred = model(batch)
        loss = criterion(pred, batch.y)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(train_loader)

# Validation

In [None]:
def evaluate():
    model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for batch in val_loader:
            batch = batch.to(device)
            pred = model(batch)
            loss = criterion(pred, batch.y)
            total_loss += loss.item()
    
    return total_loss / len(val_loader)

# Run Training

In [None]:
for epoch in range(20):
    train_loss = train()
    val_loss = evaluate()
    
    print(f"Epoch {epoch+1:02d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

# Coordinate Centering

In [None]:
def center_coordinates(coords):
    """
    Center coordinates to zero mean.
    """
    center = coords.mean(dim=0, keepdim=True)
    return coords - center


def build_graph(x, coords, k=2, self_loops=False):
    L = x.size(0)
    
    # Center coordinates
    coords = center_coordinates(coords)
    
    edge_index = []
    edge_attr = []

    for i in range(L):
        for j in range(max(0, i - k), min(L, i + k + 1)):
            if i == j and not self_loops:
                continue

            edge_index.append([i, j])

            # Edge distance feature
            dist = torch.norm(coords[i] - coords[j])
            edge_attr.append([dist.item()])

    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    edge_attr = torch.tensor(edge_attr, dtype=torch.float)

    return Data(
        x=x,
        edge_index=edge_index,
        edge_attr=edge_attr,
        y=coords
    )

# RMSD Metric Implementation

In [None]:
def compute_rmsd(pred, target):
    """
    pred, target: [N, 3]
    """
    return torch.sqrt(torch.mean(torch.sum((pred - target)**2, dim=1)))

In [None]:
# Update Evaluation
    model.eval()
    total_loss = 0
    total_rmsd = 0
    
    with torch.no_grad():
        for batch in val_loader:
            batch = batch.to(device)
            pred = model(batch)
            
            loss = criterion(pred, batch.y)
            rmsd = compute_rmsd(pred, batch.y)
            
            total_loss += loss.item()
            total_rmsd += rmsd.item()
    
    return total_loss / len(val_loader), total_rmsd / len(val_loader)

# Upgrade Model to Use Edge Distance

In [None]:
# Switch from GCN → GINE (edge-aware)
from torch_geometric.nn import GINEConv
import torch.nn as nn
import torch.nn.functional as F

class RNAGINE(nn.Module):
    def __init__(self, in_channels, hidden_dim=128):
        super().__init__()
        
        nn1 = nn.Sequential(
            nn.Linear(in_channels, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        )
        
        self.conv1 = GINEConv(nn1)
        
        nn2 = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        )
        
        self.conv2 = GINEConv(nn2)
        
        self.lin_out = nn.Linear(hidden_dim, 3)
        
    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        
        x = F.relu(self.conv1(x, edge_index, edge_attr))
        x = F.relu(self.conv2(x, edge_index, edge_attr))
        
        x = self.lin_out(x)
        return x

In [None]:
# Initialize
model = RNAGINE(in_channels).to(device)

# EGNN (Equivariant Model)

In [None]:
class EGNNLayer(nn.Module):
    def __init__(self, feat_dim):
        super().__init__()
        
        self.edge_mlp = nn.Sequential(
            nn.Linear(2*feat_dim + 1, feat_dim),
            nn.ReLU(),
            nn.Linear(feat_dim, feat_dim)
        )
        
        self.node_mlp = nn.Sequential(
            nn.Linear(feat_dim, feat_dim),
            nn.ReLU(),
            nn.Linear(feat_dim, feat_dim)
        )
        
    def forward(self, x, pos, edge_index):
        row, col = edge_index
        
        # distance squared
        diff = pos[row] - pos[col]
        dist2 = torch.sum(diff**2, dim=1, keepdim=True)
        
        edge_input = torch.cat([x[row], x[col], dist2], dim=1)
        m_ij = self.edge_mlp(edge_input)
        
        # coordinate update
        pos_update = diff * m_ij.mean(dim=1, keepdim=True)
        
        pos = pos + torch.zeros_like(pos).index_add_(0, row, pos_update)
        
        # node update
        agg = torch.zeros_like(x).index_add_(0, row, m_ij)
        x = x + self.node_mlp(agg)
        
        return x, pos

# Full EGNN Model

In [None]:
class RNAEGNN(nn.Module):
    def __init__(self, in_channels, hidden_dim=128, layers=3):
        super().__init__()
        
        self.embedding = nn.Linear(in_channels, hidden_dim)
        
        self.layers = nn.ModuleList([
            EGNNLayer(hidden_dim) for _ in range(layers)
        ])
        
        self.out = nn.Linear(hidden_dim, 3)
        
    def forward(self, data):
        x = self.embedding(data.x)
        pos = data.pos
        
        for layer in self.layers:
            x, pos = layer(x, pos, data.edge_index)
        
        return pos

# Initialize

In [None]:
model = RNAEGNN(in_channels).to(device)