In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import h5py
from tqdm.auto import tqdm


In [2]:
# One-letter -> three-letter amino-acid codes, copied from
# https://github.com/biopython/biopython/blob/master/Bio/Data/IUPACData.py
protein_letters_1to3 = dict(
    A="Ala", C="Cys", D="Asp", E="Glu", F="Phe",
    G="Gly", H="His", I="Ile", K="Lys", L="Leu",
    M="Met", N="Asn", P="Pro", Q="Gln", R="Arg",
    S="Ser", T="Thr", V="Val", W="Trp", Y="Tyr",
)

# Reverse lookup keyed by the UPPER-CASE three-letter code.
protein_letters_3to1 = {
    three_letter.upper(): one_letter
    for one_letter, three_letter in protein_letters_1to3.items()
}
# Extra residue names mapped onto H / C.
# NOTE(review): presumably protonation-state / disulfide variants used by the
# input files -- confirm against the data source.
protein_letters_3to1.update({"HIE": "H", "HIP": "H", "HID": "H", "CYX": "C"})


In [3]:
# HDF5 files whose keys are gathered into `available_pdbids`.
h5files = [
    # "md_test_out.hdf5",
    "md_esm_if_0_5000.hdf5",
    "md_esm_if_5000_10000.hdf5",
    "md_esm_if_10000_.hdf5"
]
available_pdbids = []
for filename in h5files:
    # Open read-only explicitly: the files are only inspected here, and on
    # h5py versions older than 3.0 the implicit default mode was append/create,
    # which silently creates an empty file when a name is mistyped.
    with h5py.File(filename, "r") as f:
        # Iterating the keys view directly; no intermediate list is needed.
        available_pdbids.extend(f.keys())

In [4]:
# Total number of keys collected from the files in `h5files`
# (a key present in more than one file is counted once per file).
len(available_pdbids)

14453

In [5]:
def read_pdbid(train_idx):
    """Lazily yield the 4-character entries of a text file.

    Parameters
    ----------
    train_idx : str or path-like
        Path of the text file to read, one candidate entry per line.

    Yields
    ------
    str
        Each line with surrounding whitespace stripped, provided the stripped
        text is exactly 4 characters long; every other line is ignored.
    """
    with open(train_idx) as handle:
        stripped_lines = map(str.strip, handle)
        yield from (entry for entry in stripped_lines if len(entry) == 4)


# Root folder of the dataset; the split files are addressed relative to it.
DATADIR = Path("MiSaTo-dataset")

# Read the three split files in train / val / test order, materialising the
# generator returned by read_pdbid into a list for each one.
train_idx, val_idx, test_idx = (
    list(read_pdbid(DATADIR / relative_path))
    for relative_path in (
        "data/MD/splits/train_MD.txt",
        "data/MD/splits/val_MD.txt",
        "data/MD/splits/test_MD.txt",
    )
)

In [6]:
# For each split: how many ids it lists, and how many of those also occur in
# `available_pdbids`.
available_set = set(available_pdbids)
for label, split_ids in (
    ("train_idx:\t", train_idx),
    ("val_idx:\t", val_idx),
    ("test_idx:\t", test_idx),
):
    print(label, len(split_ids), "\t--->", len(set(split_ids) & available_set))

train_idx:	 13765 	---> 11759
val_idx:	 1595 	---> 1342
test_idx:	 1612 	---> 1352


In [18]:
# Collect, for every per-structure CSV, one record holding the full sequence
# and one record per chain, each tagged with the split its pdbid belongs to.
data_folders = [
    "out_big",
    "out_big_5000",
    "out_big_10000"
]
all_sequences_info = []  # one dict per output row

skipped_data = []  # stems of CSV files rejected by the NaN check below

# Sets so that the membership tests inside the loop are constant-time.
train_set = set(train_idx)
val_set = set(val_idx)
test_set = set(test_idx)

for data_folder in tqdm(data_folders):
    folder_path = Path(data_folder)
    collected_dir = folder_path / "pdb" / "collected"
    # glob() yields files in arbitrary, filesystem-dependent order; sorting
    # makes the row order of the collected records reproducible between runs.
    filenames = sorted(collected_dir.glob("*.csv"))
    for filename in filenames:
        # Named `residues` rather than `df` so the per-file frame does not
        # share a name with the notebook-level results table.
        residues = pd.read_csv(filename)
        if residues[["N", "CA", "C"]].isna().values.any():
            skipped_data.append(filename.stem)
            continue  # skip everything with NaNs
        # Plain dict indexing on purpose: an unmapped residue name raises
        # KeyError instead of silently leaving a gap in the sequence.
        residues["aa"] = residues.residue_name.apply(lambda x: protein_letters_3to1[x])
        full_sequence = "".join(residues.aa.values)
        # Per-chain sequences, chains kept in order of first appearance.
        chain_aa = {
            chain: "".join(residues.loc[residues.chain == chain, "aa"].values)
            for chain in residues.chain.unique()
        }
        pdbid = filename.stem
        # Stays None when the pdbid is listed in none of the three splits.
        split_name = None
        if pdbid in train_set:
            split_name = "train"
        elif pdbid in val_set:
            split_name = "val"
        elif pdbid in test_set:
            split_name = "test"

        # Whole-structure record: full=1, with chain=-1 as the "no chain" marker.
        all_sequences_info.append({
            "pdbid": pdbid,
            "full": 1,
            "chain": -1,
            "sequence": full_sequence,
            "split_name": split_name
        })
        # One record per individual chain: full=0.
        for chain, seq in chain_aa.items():
            all_sequences_info.append({
                "pdbid": pdbid,
                "full": 0,
                "chain": chain,
                "sequence": seq,
                "split_name": split_name
            })

  0%|          | 0/3 [00:00<?, ?it/s]

In [19]:
# Turn the list of record dicts into a table: one row per full-structure or
# per-chain sequence, with the dict keys as columns.
df = pd.DataFrame(all_sequences_info)

In [11]:
# Load the CSV that `filename` currently points to under its own name, for
# ad-hoc inspection only. Assigning it to `df` would replace the sequences
# table that the following cells count and write to disk when the notebook is
# executed top to bottom.
last_csv = pd.read_csv(filename)

In [23]:
# Shape of the array of distinct `pdbid` values in `df`, i.e. a 1-tuple
# holding the number of distinct ids.
df.pdbid.unique().shape

(14453,)

In [24]:
# Write `df` to disk; the row index is not written.
df.to_csv(
    "misato_sequences_info.csv",
    index=False,
)

In [28]:
# Write the skipped file stems, sorted, one per line, with no header row and
# no index column.
pd.DataFrame(sorted(skipped_data)).to_csv(
    "skipped_pdbids.csv",
    header=False,
    index=False,
)