In [1]:
import numpy as np
import sidechainnet as scn
from glob import glob
import os
import prody as pr
from sidechainnet.dataloaders.SCNProtein import SCNProtein
import tqdm.notebook as tqdm


In [2]:
TEST_DIR = "/net/pulsar/home/koes/jok120/openfold/data/test/cameo/20230103"
VALID_DIR = "/net/pulsar/home/koes/jok120/openfold/data/validation/cameo/20220116"

USE_DIR = TEST_DIR

In [3]:
basenames = [os.path.splitext(os.path.basename(f))[0] for f in glob(os.path.join(USE_DIR, "fasta_dir", "*.fasta"))]
seq_lengths = [len(open(os.path.join(USE_DIR, "fasta_dir", f"{basename}.fasta")).readlines()[1].strip()) for basename in basenames]

# Now filter both lists to only include sequences with length <= 700
basenames = [basename for basename, seq_length in zip(basenames, seq_lengths) if seq_length <= 700]
seq_lengths = [seq_length for seq_length in seq_lengths if seq_length <= 700]
cif_paths = [os.path.join(USE_DIR, "data_dir", f"{basename[:4]}.cif") for basename in basenames]

In [None]:
# Use prody to parse the corresponding protein chains from the cif files within TEST_DIR/data_dir
# chains = [pr.parseMMCIF(os.path.join(TEST_DIR, "data_dir", f"{basename[:4]}.cif"), \
#                         chain=basename.split("_")[1]) \
#         for basename in basenames[:3]]

# def get_scnproteins_from_cif_dir(cif_dir):
#     """Return a list of sidechainnet proteins from the cif files in cif_dir."""
#     basenames = [os.path.splitext(os.path.basename(f))[0] for f in glob(os.path.join(cif_dir, "*.cif"))]
#     chains = [pr.parseMMCIF(os.path.join(cif_dir, f"{basename[:4]}.cif"), \
#                             chain=basename.split("_")[1]) \
#             for basename in basenames]
#     scnproteins = [scn.SCNPDBProtein.from_prody_chain(c) for c in chains]
#     return scnproteins

In [None]:
p = SCNProtein.from_cif(
    '/net/pulsar/home/koes/jok120/openfold/data/test/cameo/20230103/data_dir/7e1b.cif',
    chain="A", 
    pdbid='7e1b', 
    include_resolution=False)

In [None]:
p.to_3Dmol()

In [None]:
print(p.seq)
"PDSADQA" in p.seq


# Identifying Gaps Via Prody

In [None]:
ag, header = pr.parseMMCIF(
    '/net/pulsar/home/koes/jok120/openfold/data/test/cameo/20230103/data_dir/7e1b.cif',
    chain="A",
    header=True)

In [None]:
# Identify the missing residues using the header/_pdbx_unobs_or_zero_occ_residues field
missing_residues = ag.getHierView().getHeader().get('_pdbx_unobs_or_zero_occ_residues')

In [None]:
header.keys()

In [None]:
header['A'].sequence

In [None]:
"PDSADQA" in header['A'].sequence

In [None]:
pr.parseSTAR('/net/pulsar/home/koes/jok120/openfold/data/test/cameo/20230103/data_dir/7e1b.cif')

In [None]:
failed_valid = []
for cif_path, basename in tqdm.tqdm(zip(cif_paths, basenames), total=len(basenames)):
    chain = basename.split("_")[1]
    pnid = basename
    try:
        p = SCNProtein.from_cif(
                                cif_path, 
                                chid=chain, 
                                pdbid=pnid,
                                include_resolution=False)
                                # return_sequences=True)
    except ValueError:
        failed_valid.append(basename)

In [None]:
len(failed_valid), len(basenames), len(basenames)-len(failed_valid)

In [4]:
failed = []
for cif_path, basename in tqdm.tqdm(zip(cif_paths, basenames), total=len(basenames), smoothing=0):
    chain = basename.split("_")[1]
    pnid = basename
    try:
        p = SCNProtein.from_cif(
                                cif_path, 
                                chid=chain, 
                                pdbid=pnid,
                                include_resolution=False,)
                                # return_sequences=True)
    except ValueError:
        failed.append(basename)

  0%|          | 0/708 [00:00<?, ?it/s]

In [6]:
len(failed), len(basenames), len(basenames)-len(failed) 

(583, 708, 125)

In [None]:
# Move the failed pkl, pdb, and png files out of the min data directory
for basename in failed:
    print(basename)
    for file in tqdm.tqdm_notebook(
        glob(os.path.join("/net/pulsar/home/koes/jok120/scnmin_evaltest230412/min", f"{basename}*")), total=len(failed)):
        !mv $file {file.replace("/min/", "/had_gaps/")}

In [17]:
# Create new data directories for the validation and test sets
NEW_VAL_STRUCTURES_DIR = "/net/pulsar/home/koes/jok120/openfold/data/validation/cameo/20220116/minimized/data_dir"
NEW_VAL_ALIGNMENTS_DIR = "/net/pulsar/home/koes/jok120/openfold/data/validation/cameo/20220116/minimized/alignments"
OLD_VAL_STRUCTURES_DIR = "/net/pulsar/home/koes/jok120/openfold/data/validation/cameo/20220116/data_dir"
OLD_VAL_ALIGNMENTS_DIR = "/net/pulsar/home/koes/jok120/openfold/data/validation/cameo/20220116/alignments"

for d in [NEW_VAL_STRUCTURES_DIR, NEW_VAL_ALIGNMENTS_DIR]:
    if not os.path.exists(d):
        os.makedirs(d)

NEW_TEST_STRUCTURES_DIR = "/net/pulsar/home/koes/jok120/openfold/data/test/cameo/20230103/minimized/data_dir"
NEW_TEST_ALIGNMENTS_DIR = "/net/pulsar/home/koes/jok120/openfold/data/test/cameo/20230103/minimized/alignments"
OLD_TEST_STRUCTURES_DIR = "/net/pulsar/home/koes/jok120/openfold/data/test/cameo/20230103/data_dir"
OLD_TEST_ALIGNMENTS_DIR = "/net/pulsar/home/koes/jok120/openfold/data/test/cameo/20230103/alignments"

for d in [NEW_TEST_STRUCTURES_DIR, NEW_TEST_ALIGNMENTS_DIR]:
    if not os.path.exists(d):
        os.makedirs(d)

# Copy the minimized pdb files to the new data directories, and copy the corresponding alignment files
min_valid_files = glob(os.path.join("/net/pulsar/home/koes/jok120/scnmin_eval230412/min", "*.pdb"))
min_test_files = glob(os.path.join("/net/pulsar/home/koes/jok120/scnmin_evaltest230412/min", "*.pdb"))

for f in tqdm.tqdm_notebook(min_valid_files, total=len(min_valid_files)):
    basename = os.path.splitext(os.path.basename(f))[0]
    print(basename)
    !cp $f {os.path.join(NEW_VAL_STRUCTURES_DIR, basename+".pdb")}
    !cp -R {os.path.join(OLD_VAL_ALIGNMENTS_DIR, basename)} {os.path.join(NEW_VAL_ALIGNMENTS_DIR, basename)}

for f in tqdm.tqdm_notebook(min_test_files, total=len(min_test_files)):
    basename = os.path.splitext(os.path.basename(f))[0]
    print(basename)
    !cp $f {os.path.join(NEW_TEST_STRUCTURES_DIR, basename+".pdb")}
    !cp -R {os.path.join(OLD_TEST_ALIGNMENTS_DIR, basename)} {os.path.join(NEW_TEST_ALIGNMENTS_DIR, basename)}

  0%|          | 0/23 [00:00<?, ?it/s]

7fbp_B
7kuw_A
7v5y_B
7bhy_A
7nf9_A
7ee3_C
7dfe_A
7puo_A
7atr_A
7dkk_A
7f7n_A
7mu9_A
7vnb_A
7prd_A
7ofn_A
7f0h_A
7l8n_A
7mwr_A
7mcc_A
7dmf_A
7dut_A
7wgk_A
7b7t_A


  0%|          | 0/100 [00:00<?, ?it/s]

7e4j_A
cp: cannot stat '/net/pulsar/home/koes/jok120/openfold/data/test/cameo/20230103/alignments/7e4j_A': No such file or directory
7lt7_A
cp: cannot stat '/net/pulsar/home/koes/jok120/openfold/data/test/cameo/20230103/alignments/7lt7_A': No such file or directory
7mnv_B
cp: cannot stat '/net/pulsar/home/koes/jok120/openfold/data/test/cameo/20230103/alignments/7mnv_B': No such file or directory
7mla_B
cp: cannot stat '/net/pulsar/home/koes/jok120/openfold/data/test/cameo/20230103/alignments/7mla_B': No such file or directory
7mnk_A
cp: cannot stat '/net/pulsar/home/koes/jok120/openfold/data/test/cameo/20230103/alignments/7mnk_A': No such file or directory
7eym_A
cp: cannot stat '/net/pulsar/home/koes/jok120/openfold/data/test/cameo/20230103/alignments/7eym_A': No such file or directory
7lxk_A
cp: cannot stat '/net/pulsar/home/koes/jok120/openfold/data/test/cameo/20230103/alignments/7lxk_A': No such file or directory
7eyl_A
cp: cannot stat '/net/pulsar/home/koes/jok120/openfold/data/te

# Minimizing Data

In [4]:
# Loop through all of the cif paths, and for each one, extract the chain corresponding to the basename
# Load the chain into a sidechainnet protein object, and save it to a pickle file in UNMIN_PATH
UNMIN_PATH = "/net/pulsar/home/koes/jok120/scnmin_evaltest230412/unmin"
MIN_PATH = "/net/pulsar/home/koes/jok120/scnmin_evaltest230412/min"
os.makedirs(UNMIN_PATH, exist_ok=True)
os.makedirs(MIN_PATH, exist_ok=True)
for cif_path,basename in tqdm.tqdm(zip(cif_paths, basenames), total=len(basenames)):
    chain = basename.split("_")[1]
    pnid = basename
    try:
        p = SCNProtein.from_cif(cif_path, chid=chain, pdbid=pnid, include_resolution=False)
        
    except ValueError as e:
        if "The observed sequence" not in str(e):
            print(e)
        continue
    p.pickle(os.path.join(UNMIN_PATH, f"{pnid}.pkl"))
    print(f"Saved {pnid} to {os.path.join(UNMIN_PATH, f'{pnid}.pkl')}")


  0%|          | 0/708 [00:00<?, ?it/s]

Saved 7p20_A to /net/pulsar/home/koes/jok120/scnmin_evaltest230412/unmin/7p20_A.pkl
Saved 7lxs_A to /net/pulsar/home/koes/jok120/scnmin_evaltest230412/unmin/7lxs_A.pkl
Saved 7fiw_B to /net/pulsar/home/koes/jok120/scnmin_evaltest230412/unmin/7fiw_B.pkl
Saved 7dup_A to /net/pulsar/home/koes/jok120/scnmin_evaltest230412/unmin/7dup_A.pkl
Saved 7x15_A to /net/pulsar/home/koes/jok120/scnmin_evaltest230412/unmin/7x15_A.pkl
Saved 7erv_A to /net/pulsar/home/koes/jok120/scnmin_evaltest230412/unmin/7erv_A.pkl
Saved 7lt7_A to /net/pulsar/home/koes/jok120/scnmin_evaltest230412/unmin/7lt7_A.pkl
Saved 7elf_C to /net/pulsar/home/koes/jok120/scnmin_evaltest230412/unmin/7elf_C.pkl
Saved 7vna_A to /net/pulsar/home/koes/jok120/scnmin_evaltest230412/unmin/7vna_A.pkl
Saved 7nqd_B to /net/pulsar/home/koes/jok120/scnmin_evaltest230412/unmin/7nqd_B.pkl
Saved 7znx_A to /net/pulsar/home/koes/jok120/scnmin_evaltest230412/unmin/7znx_A.pkl
Saved 7rxe_A to /net/pulsar/home/koes/jok120/scnmin_evaltest230412/unmin/7rx

In [None]:
p = SCNProtein.from_pkl(os.path.join(UNMIN_PATH, "7lxs_A.pkl"))

In [None]:
p.to_3Dmol()

In [None]:
p.fastbuild(add_hydrogens=True, inplace=True)

In [None]:
p.minimize()