<a href="https://colab.research.google.com/github/hadwin-357/ProteinMPNN_breakdown/blob/main/ProteinMPNN_breakdwon_001_traindata_structure.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Break down ProteinMPNN training data
This is the README from ProteinMPNN describing the training dataset:
[train_dataset](https://github.com/dauparas/ProteinMPNN/blob/main/training/README.md)



Training set for ProteinMPNN curated by Ivan Anishchenko.

Each PDB entry is represented as a collection of .pt files:
    PDBID_CHAINID.pt - contains CHAINID chain from PDBID
    PDBID.pt         - metadata and information on biological assemblies

PDBID_CHAINID.pt has the following fields:
    seq  - amino acid sequence (string)
    xyz  - atomic coordinates [L,14,3]
    mask - boolean mask [L,14]
    bfac - temperature factors [L,14]
    occ  - occupancy [L,14] (is 1 for most atoms, <1 if alternative conformations are present)

PDBID.pt:
    method        - experimental method (str)
    date          - deposition date (str)
    resolution    - resolution (float)
    chains        - list of CHAINIDs (there is a corresponding PDBID_CHAINID.pt file for each of these)
    tm            - pairwise similarity between chains (TM-score,seq.id.,rmsd from TM-align) [num_chains,num_chains,3]
    asmb_ids      - biounit IDs as in the PDB (list of str)
    asmb_details  - how the assembly was identified: author, or software, or something else (list of str)
    asmb_method   - PISA or something else (list of str)

    asmb_chains    - list of chains which each biounit is composed of (list of str, each str contains comma separated CHAINIDs)
    asmb_xformIDX  - (one per biounit) xforms to be applied to chains from asmb_chains[IDX], [n,4,4]
                     [n,:3,:3] - rotation matrices
                     [n,3,:3] - translation vectors

list.csv:
   CHAINID    - chain label, PDBID_CHAINID
   DEPOSITION - deposition date
   RESOLUTION - structure resolution
   HASH       - unique 6-digit hash for the sequence
   CLUSTER    - sequence cluster the chain belongs to (clusters were generated at seqID=30%)
   SEQUENCE   - reference amino acid sequence

valid_clusters.txt - clusters used for validation

test_clusters.txt - clusters used for testing

In [2]:
!wget https://files.ipd.uw.edu/pub/training_sets/pdb_2021aug02_sample.tar.gz

--2024-05-04 16:08:35--  https://files.ipd.uw.edu/pub/training_sets/pdb_2021aug02_sample.tar.gz
Resolving files.ipd.uw.edu (files.ipd.uw.edu)... 128.95.160.135, 128.95.160.134, 2607:4000:406::160:134, ...
Connecting to files.ipd.uw.edu (files.ipd.uw.edu)|128.95.160.135|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 49690915 (47M) [application/octet-stream]
Saving to: ‘pdb_2021aug02_sample.tar.gz’


2024-05-04 16:08:36 (38.5 MB/s) - ‘pdb_2021aug02_sample.tar.gz’ saved [49690915/49690915]



In [3]:
!tar xvf "pdb_2021aug02_sample.tar.gz"

./pdb_2021aug02_sample/
./pdb_2021aug02_sample/README
./pdb_2021aug02_sample/list.csv
./pdb_2021aug02_sample/pdb/
./pdb_2021aug02_sample/pdb/l3/
./pdb_2021aug02_sample/pdb/l3/5l3p.pt
./pdb_2021aug02_sample/pdb/l3/5l3g_A.pt
./pdb_2021aug02_sample/pdb/l3/5l3f.pt
./pdb_2021aug02_sample/pdb/l3/5l3r_B.pt
./pdb_2021aug02_sample/pdb/l3/4l3o_G.pt
./pdb_2021aug02_sample/pdb/l3/1l3b_E.pt
./pdb_2021aug02_sample/pdb/l3/3l3t_C.pt
./pdb_2021aug02_sample/pdb/l3/6l3y_A.pt
./pdb_2021aug02_sample/pdb/l3/5l3p_DB.pt
./pdb_2021aug02_sample/pdb/l3/3l36_B.pt
./pdb_2021aug02_sample/pdb/l3/2l3y_A.pt
./pdb_2021aug02_sample/pdb/l3/2l36.pt
./pdb_2021aug02_sample/pdb/l3/5l35_G.pt
./pdb_2021aug02_sample/pdb/l3/3l33_F.pt
./pdb_2021aug02_sample/pdb/l3/2l3n.pt
./pdb_2021aug02_sample/pdb/l3/2l3x.pt
./pdb_2021aug02_sample/pdb/l3/1l3r_B.pt
./pdb_2021aug02_sample/pdb/l3/1l3g_A.pt
./pdb_2021aug02_sample/pdb/l3/4l3e.pt
./pdb_2021aug02_sample/pdb/l3/5l39_E.pt
./pdb_2021aug02_sample/pdb/l3/4l3f_A.pt
./pdb_2021aug02_sample/pdb

In [4]:
import torch

# Load the entry-level metadata file (PDBID.pt) for 1l30 and list its fields.
# The path is relative to the working directory that the archive was downloaded
# and extracted into, so the cell does not depend on Colab's /content layout.
# NOTE(review): torch.load unpickles the file; only use it on archives from a trusted source.
data=torch.load('pdb_2021aug02_sample/pdb/l3/1l30.pt')
print(data.keys())

dict_keys(['method', 'date', 'resolution', 'chains', 'seq', 'id', 'asmb_chains', 'asmb_details', 'asmb_method', 'asmb_ids', 'asmb_xform0', 'tm'])


In [10]:
# Chain IDs listed in the entry metadata; per the README above, each one has a
# corresponding PDBID_CHAINID.pt file.
print(data['chains'])

['A']


In [11]:
# Load the per-chain file (PDBID_CHAINID.pt) for chain A of 1l30 and list its fields.
# The path is relative to the working directory that the archive was downloaded
# and extracted into, so the cell does not depend on Colab's /content layout.
# NOTE(review): torch.load unpickles the file; only use it on archives from a trusted source.
data_A=torch.load('pdb_2021aug02_sample/pdb/l3/1l30_A.pt')
print(data_A.keys())

dict_keys(['seq', 'xyz', 'mask', 'bfac', 'occ'])


In [14]:
# Print the first residue's row from each per-atom tensor, in the same order as
# the fields are documented: coordinates, mask, temperature factors, occupancy.
for field in ('xyz', 'mask', 'bfac', 'occ'):
    print(data_A[field][0])


tensor([[ 36.5060, -24.8400,   8.9400],
        [ 36.8400, -23.4580,   9.0180],
        [ 35.5980, -22.7920,   9.5260],
        [ 34.5200, -23.2510,   9.2150],
        [ 37.1590, -22.9150,   7.6350],
        [ 37.4560, -21.4620,   7.8160],
        [ 39.1430, -21.0160,   7.4100],
        [ 40.0850, -22.4920,   7.8700],
        [     nan,      nan,      nan],
        [     nan,      nan,      nan],
        [     nan,      nan,      nan],
        [     nan,      nan,      nan],
        [     nan,      nan,      nan],
        [     nan,      nan,      nan]])
tensor([1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.])
tensor([30.6300, 24.8100, 19.2000, 24.2300, 18.8300, 46.6300, 40.2400, 39.2400,
            nan,     nan,     nan,     nan,     nan,     nan])
tensor([1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.])


In [16]:
# Axes: (sequence length, atom slots per residue, 3-D coordinates)
print(data_A['xyz'].shape)

torch.Size([164, 14, 3])


In [17]:
# Lookup tables that assign every heavy atom of the 20 standard amino acids a
# fixed position within its residue; this is where the atom index along the
# second axis of the per-chain tensors comes from.

# Three-letter residue codes, in the order shared by every table below.
RES_NAMES = [
    'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE',
    'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL',
]

# One-letter codes, position-aligned with RES_NAMES.
RES_NAMES_1 = 'ARNDCQEGHILKMFPSTWYV'

# Conversions between the two naming schemes.
to1letter = dict(zip(RES_NAMES, RES_NAMES_1))
to3letter = dict(zip(RES_NAMES_1, RES_NAMES))

# Atom names per residue, position-aligned with RES_NAMES; an atom's position
# in its tuple is its index within the residue.
ATOM_NAMES = [
    ("N", "CA", "C", "O", "CB"),  # ALA
    ("N", "CA", "C", "O", "CB", "CG", "CD", "NE", "CZ", "NH1", "NH2"),  # ARG
    ("N", "CA", "C", "O", "CB", "CG", "OD1", "ND2"),  # ASN
    ("N", "CA", "C", "O", "CB", "CG", "OD1", "OD2"),  # ASP
    ("N", "CA", "C", "O", "CB", "SG"),  # CYS
    ("N", "CA", "C", "O", "CB", "CG", "CD", "OE1", "NE2"),  # GLN
    ("N", "CA", "C", "O", "CB", "CG", "CD", "OE1", "OE2"),  # GLU
    ("N", "CA", "C", "O"),  # GLY
    ("N", "CA", "C", "O", "CB", "CG", "ND1", "CD2", "CE1", "NE2"),  # HIS
    ("N", "CA", "C", "O", "CB", "CG1", "CG2", "CD1"),  # ILE
    ("N", "CA", "C", "O", "CB", "CG", "CD1", "CD2"),  # LEU
    ("N", "CA", "C", "O", "CB", "CG", "CD", "CE", "NZ"),  # LYS
    ("N", "CA", "C", "O", "CB", "CG", "SD", "CE"),  # MET
    ("N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ"),  # PHE
    ("N", "CA", "C", "O", "CB", "CG", "CD"),  # PRO
    ("N", "CA", "C", "O", "CB", "OG"),  # SER
    ("N", "CA", "C", "O", "CB", "OG1", "CG2"),  # THR
    ("N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "CE2", "CE3", "NE1", "CZ2", "CZ3", "CH2"),  # TRP
    ("N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "OH"),  # TYR
    ("N", "CA", "C", "O", "CB", "CG1", "CG2"),  # VAL
]

# (one-letter residue, atom index) -> (three-letter residue, atom name)
idx2ra = {
    (one, slot): (three, atom)
    for one, three, atoms in zip(RES_NAMES_1, RES_NAMES, ATOM_NAMES)
    for slot, atom in enumerate(atoms)
}

# (three-letter residue, atom name) -> atom index
aa2idx = {
    (three, atom): slot
    for three, atoms in zip(RES_NAMES, ATOM_NAMES)
    for slot, atom in enumerate(atoms)
}
# OXT is mapped to index 3 for every residue, i.e. the same index as the backbone "O".
aa2idx.update(dict.fromkeys(((three, 'OXT') for three in RES_NAMES), 3))

In [18]:
# Tryptophan's atom tuple is the longest one in ATOM_NAMES; count its atoms.
trp = (
    "N", "CA", "C", "O",                  # backbone
    "CB", "CG", "CD1", "CD2", "CE2",      # side chain
    "CE3", "NE1", "CZ2", "CZ3", "CH2",
)
print(len(trp))

14


In [22]:
# Dump the full (one-letter residue, atom index) -> (three-letter residue, atom name)
# table to show how the atoms of each residue are indexed.
print(idx2ra)

{('A', 0): ('ALA', 'N'), ('A', 1): ('ALA', 'CA'), ('A', 2): ('ALA', 'C'), ('A', 3): ('ALA', 'O'), ('A', 4): ('ALA', 'CB'), ('R', 0): ('ARG', 'N'), ('R', 1): ('ARG', 'CA'), ('R', 2): ('ARG', 'C'), ('R', 3): ('ARG', 'O'), ('R', 4): ('ARG', 'CB'), ('R', 5): ('ARG', 'CG'), ('R', 6): ('ARG', 'CD'), ('R', 7): ('ARG', 'NE'), ('R', 8): ('ARG', 'CZ'), ('R', 9): ('ARG', 'NH1'), ('R', 10): ('ARG', 'NH2'), ('N', 0): ('ASN', 'N'), ('N', 1): ('ASN', 'CA'), ('N', 2): ('ASN', 'C'), ('N', 3): ('ASN', 'O'), ('N', 4): ('ASN', 'CB'), ('N', 5): ('ASN', 'CG'), ('N', 6): ('ASN', 'OD1'), ('N', 7): ('ASN', 'ND2'), ('D', 0): ('ASP', 'N'), ('D', 1): ('ASP', 'CA'), ('D', 2): ('ASP', 'C'), ('D', 3): ('ASP', 'O'), ('D', 4): ('ASP', 'CB'), ('D', 5): ('ASP', 'CG'), ('D', 6): ('ASP', 'OD1'), ('D', 7): ('ASP', 'OD2'), ('C', 0): ('CYS', 'N'), ('C', 1): ('CYS', 'CA'), ('C', 2): ('CYS', 'C'), ('C', 3): ('CYS', 'O'), ('C', 4): ('CYS', 'CB'), ('C', 5): ('CYS', 'SG'), ('Q', 0): ('GLN', 'N'), ('Q', 1): ('GLN', 'CA'), ('Q', 2)