In [1]:
!pip install --no-index /kaggle/input/datasets/kurshidbasheer/biopython-offline/biopython-1.83-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl


Processing /kaggle/input/datasets/kurshidbasheer/biopython-offline/biopython-1.83-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: biopython
Successfully installed biopython-1.83


In [2]:
!pip install --no-index /kaggle/input/datasets/kurshidbasheer/pyg-2-7-torch-2-9-cpu-py312-kur/torch_geometric-2.7.0-py3-none-any.whl

Processing /kaggle/input/datasets/kurshidbasheer/pyg-2-7-torch-2-9-cpu-py312-kur/torch_geometric-2.7.0-py3-none-any.whl
Installing collected packages: torch-geometric
Successfully installed torch-geometric-2.7.0


In [3]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from Bio.Seq import Seq
import random

**dataset.py**

Queries (sequences)
Source file: `train_sequences.csv`

In [4]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset

# Channel index assigned to each canonical RNA nucleotide.
NUC_MAP = {'A':0, 'U':1, 'G':2, 'C':3}

def one_hot(seq):
    """One-hot encode a nucleotide string.

    Args:
        seq: String (or other sized iterable of single characters) to encode.

    Returns:
        Float tensor of shape ``(len(seq), 4)``. Row ``i`` has a 1 in the
        column given by ``NUC_MAP`` for ``seq[i]``. Symbols that are not in
        ``NUC_MAP`` produce an all-zero row instead of being silently encoded
        as 'A'.
    """
    x = torch.zeros(len(seq), 4)
    for i, s in enumerate(seq):
        col = NUC_MAP.get(s)
        # Leave the row at zero for unknown symbols so they are not mistaken for 'A'.
        if col is not None:
            x[i, col] = 1
    return x


class QueryRNADataset(Dataset):
    """Dataset of query RNA sequences read from a CSV file.

    Each item is a ``(target_id, features)`` pair. ``features`` is a float
    tensor with one row per residue and five columns: the four channels
    returned by ``one_hot`` followed by the relative position ``i / n`` of the
    residue inside the (possibly cropped) sequence of length ``n``.

    Sequences longer than ``max_length`` are cropped to a random contiguous
    window of exactly ``max_length`` residues. The window start is drawn from
    NumPy's global random state, so seed ``np.random`` for reproducible crops.
    """

    def __init__(self, seq_csv, max_length=1000):
        """Load the sequence table.

        Args:
            seq_csv: Anything accepted by ``pd.read_csv``; the table must
                provide ``sequence`` and ``target_id`` columns.
            max_length: Maximum number of residues returned per item.
        """
        self.df = pd.read_csv(seq_csv)
        self.max_length = max_length

    def __len__(self):
        """Return the number of sequences in the table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(target_id, features)`` for the row at position ``idx``."""
        row = self.df.iloc[idx]
        seq = row["sequence"]
        sid = row["target_id"]

        L = len(seq)
        if L > self.max_length:
            # randint's upper bound is exclusive: the "+ 1" makes the last
            # valid window (start == L - max_length) reachable as well.
            start = np.random.randint(0, L - self.max_length + 1)
            seq = seq[start:start + self.max_length]

        x = one_hot(seq)

        # Relative position of every residue within the returned window.
        pos = torch.arange(len(seq)).float().unsqueeze(-1) / len(seq)
        x = torch.cat([x, pos], dim=1)

        return sid, x


Reference structures (library)
Source file: `train_labels.csv`

In [5]:
class ReferenceStructureDataset(Dataset):
    """Library of reference 3D structures read from a labels CSV.

    Rows are restricted to a single value of the ``copy`` column and grouped
    by the part of ``ID`` that precedes the first underscore. Every group is
    stored as a float32 array of shape ``(n_rows, 3)`` holding the
    ``x_1, y_1, z_1`` columns in file order.

    Each item is a ``(key, coords)`` pair where ``coords`` is that array
    converted to a tensor.
    """

    def __init__(self, label_csv, use_copy=1):
        """Load and index the label table.

        Args:
            label_csv: Anything accepted by ``pd.read_csv``; the table must
                provide ``ID``, ``copy``, ``x_1``, ``y_1`` and ``z_1`` columns.
            use_copy: Value of the ``copy`` column to keep.
        """
        labels = pd.read_csv(label_csv, low_memory=False)
        selected = labels[labels["copy"] == use_copy]

        # Group by a derived key Series instead of writing a new column onto
        # the filtered frame, which would trigger SettingWithCopyWarning.
        struct_ids = selected["ID"].str.split("_").str[0].rename("struct_id")

        self.structures = {
            key: group[["x_1", "y_1", "z_1"]].values.astype(np.float32)
            for key, group in selected.groupby(struct_ids)
        }
        self.keys = list(self.structures.keys())

    def __len__(self):
        """Return the number of distinct structures."""
        return len(self.keys)

    def __getitem__(self, idx):
        """Return ``(key, coords)`` for the structure at position ``idx``."""
        k = self.keys[idx]
        coords = torch.tensor(self.structures[k])

        return k, coords


Queries

In [6]:
# Instantiate the query dataset and sanity-check its size and first sample.
query_ds = QueryRNADataset("/kaggle/input/stanford-rna-3d-folding-2/train_sequences.csv")
print(len(query_ds))

sid, x = query_ds[0]  # (target id, per-residue feature tensor)
print(sid, x.shape)


5716
4TNA torch.Size([76, 5])


Reference library

In [7]:
# Build the reference library from copy 1 of the training labels, then inspect one entry.
ref_ds = ReferenceStructureDataset("/kaggle/input/stanford-rna-3d-folding-2/train_labels.csv", use_copy=1)
print(len(ref_ds))

k, c = ref_ds[0]  # (structure key, coordinate tensor)
print(k, c.shape)


5716
157D torch.Size([12, 3])


In [8]:
# Count how many label rows exist for each value of the `copy` column.
labels = pd.read_csv("/kaggle/input/stanford-rna-3d-folding-2/train_labels.csv", low_memory=False)
print(labels["copy"].value_counts())


copy
1      7384884
2       162331
3        25751
4        18193
5        13433
        ...   
116        458
117        458
118        458
119        458
120        458
Name: count, Length: 120, dtype: int64


https://chatgpt.com/share/69977466-cdb0-8010-9dcb-34cf1f57b18c