In [1]:
import pathlib
import torch
from absl import app
from absl import flags
from esm import pretrained, MSATransformer, FastaBatchedDataset
from tqdm import tqdm
from Bio import PDB
from typing import List

In [2]:
def pdb_to_sequences(pdb_filepath: str) -> List[str]:
    pdb_parser = PDB.PDBParser()
    structure = pdb_parser.get_structure("X", pdb_filepath)
    # Using C-N
    ppb = PDB.PPBuilder()
    polypeptide_sequences = [
        str(pp.get_sequence()) for pp in ppb.build_peptides(structure)
    ]
    return polypeptide_sequences

In [5]:
sequencias = pdb_to_sequences(path)
sequencias



['RPTFYRQELNKTIWEVPERYQNLSPVG',
 'SVCAAFDTKTGLRVAVKKLSRPFQSIIHAKRTYRELRLLKHMKHENVIGLLDVFTPARSLEEFNDVYLVTHLMGADLNNIVKCQKLTDDHVQFLIYQILRGLKYIHSADIIHRDLKPSNLAVNEDCELKILDF',
 'ATRWYRAPEIMLNWMHYNQTVDIWSVGCIMAELLTGRTLFPGTDHIDQLKLILRLVGTPGAELLKKISSESARNYIQSLTQMPKMNFANVFIGANPLAVDLLEKMLVLDSDKRITAAQALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPP']

In [6]:
print(sum([len(element) for element in sequencias]))

329


In [7]:
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
batch_converter = alphabet.get_batch_converter()

NameError: name 'esm' is not defined

In [8]:
len("KALTARQQEVFDLIRDHISQTGMPPTRAEIAQRLGFRSPNAAEEHLKALARKGVIEIVSGASRGIRLLQEE")

71

In [4]:
from rdkit import Chem
path = "../../dataset2016/3hec/3hec_protein.pdb"
path_pocket = "../../dataset2016/3hec/3hec_pocket.pdb"

In [10]:
molecule = Chem.MolFromPDBFile(path,flavor=2,sanitize=True, removeHs=True)
print(molecule.GetNumAtoms())
print(molecule.GetNumBonds(), molecule.GetNumBonds()*2)

2757
2729 5458


In [34]:
molecule = Chem.MolFromPDBFile(path_pocket, flavor=2,sanitize=True, removeHs=True)

In [42]:
atom = molecule.GetAtoms()[100]
atom_res = atom.GetPDBResidueInfo()
print(molecule.GetNumAtoms())
print(atom_res.GetResidueNumber(), atom_res.GetResidueName(), atom_res.GetChainId())

453
55 LEU  


In [11]:
from torchdrug import data
protein = data.Protein.from_pdb(path, atom_feature="position", bond_feature="length", residue_feature="symbol")



In [12]:
protein

Protein(num_atom=2757, num_bond=5458, num_residue=418)

In [13]:
protein.residue_feature.shape

torch.Size([418, 21])

In [34]:
[data.Protein.id2residue[resid] for resid in protein.residue_type.tolist()]

['ARG',
 'PRO',
 'THR',
 'PHE',
 'TYR',
 'ARG',
 'GLN',
 'GLU',
 'LEU',
 'ASN',
 'LYS',
 'THR',
 'ILE',
 'TRP',
 'GLU',
 'VAL',
 'PRO',
 'GLU',
 'ARG',
 'TYR',
 'GLN',
 'ASN',
 'LEU',
 'SER',
 'PRO',
 'VAL',
 'GLY',
 'SER',
 'VAL',
 'CYS',
 'ALA',
 'ALA',
 'PHE',
 'ASP',
 'THR',
 'LYS',
 'THR',
 'GLY',
 'LEU',
 'ARG',
 'VAL',
 'ALA',
 'VAL',
 'LYS',
 'LYS',
 'LEU',
 'SER',
 'ARG',
 'PRO',
 'PHE',
 'GLN',
 'SER',
 'ILE',
 'ILE',
 'HIS',
 'ALA',
 'LYS',
 'ARG',
 'THR',
 'TYR',
 'ARG',
 'GLU',
 'LEU',
 'ARG',
 'LEU',
 'LEU',
 'LYS',
 'HIS',
 'MET',
 'LYS',
 'HIS',
 'GLU',
 'ASN',
 'VAL',
 'ILE',
 'GLY',
 'LEU',
 'LEU',
 'ASP',
 'VAL',
 'PHE',
 'THR',
 'PRO',
 'ALA',
 'ARG',
 'SER',
 'LEU',
 'GLU',
 'GLU',
 'PHE',
 'ASN',
 'ASP',
 'VAL',
 'TYR',
 'LEU',
 'VAL',
 'THR',
 'HIS',
 'LEU',
 'MET',
 'GLY',
 'ALA',
 'ASP',
 'LEU',
 'ASN',
 'ASN',
 'ILE',
 'VAL',
 'LYS',
 'CYS',
 'GLN',
 'LYS',
 'LEU',
 'THR',
 'ASP',
 'ASP',
 'HIS',
 'VAL',
 'GLN',
 'PHE',
 'LEU',
 'ILE',
 'TYR',
 'GLN',
 'ILE',


In [31]:
for residue_id, chain_id in zip(protein.residue_type.tolist(), protein.chain_id.tolist()):
    print("%s: %s" % (data.Protein.id2residue[residue_id], chain_id))

ARG: 1
PRO: 1
THR: 1
PHE: 1
TYR: 1
ARG: 1
GLN: 1
GLU: 1
LEU: 1
ASN: 1
LYS: 1
THR: 1
ILE: 1
TRP: 1
GLU: 1
VAL: 1
PRO: 1
GLU: 1
ARG: 1
TYR: 1
GLN: 1
ASN: 1
LEU: 1
SER: 1
PRO: 1
VAL: 1
GLY: 1
SER: 1
VAL: 1
CYS: 1
ALA: 1
ALA: 1
PHE: 1
ASP: 1
THR: 1
LYS: 1
THR: 1
GLY: 1
LEU: 1
ARG: 1
VAL: 1
ALA: 1
VAL: 1
LYS: 1
LYS: 1
LEU: 1
SER: 1
ARG: 1
PRO: 1
PHE: 1
GLN: 1
SER: 1
ILE: 1
ILE: 1
HIS: 1
ALA: 1
LYS: 1
ARG: 1
THR: 1
TYR: 1
ARG: 1
GLU: 1
LEU: 1
ARG: 1
LEU: 1
LEU: 1
LYS: 1
HIS: 1
MET: 1
LYS: 1
HIS: 1
GLU: 1
ASN: 1
VAL: 1
ILE: 1
GLY: 1
LEU: 1
LEU: 1
ASP: 1
VAL: 1
PHE: 1
THR: 1
PRO: 1
ALA: 1
ARG: 1
SER: 1
LEU: 1
GLU: 1
GLU: 1
PHE: 1
ASN: 1
ASP: 1
VAL: 1
TYR: 1
LEU: 1
VAL: 1
THR: 1
HIS: 1
LEU: 1
MET: 1
GLY: 1
ALA: 1
ASP: 1
LEU: 1
ASN: 1
ASN: 1
ILE: 1
VAL: 1
LYS: 1
CYS: 1
GLN: 1
LYS: 1
LEU: 1
THR: 1
ASP: 1
ASP: 1
HIS: 1
VAL: 1
GLN: 1
PHE: 1
LEU: 1
ILE: 1
TYR: 1
GLN: 1
ILE: 1
LEU: 1
ARG: 1
GLY: 1
LEU: 1
LYS: 1
TYR: 1
ILE: 1
HIS: 1
SER: 1
ALA: 1
ASP: 1
ILE: 1
ILE: 1
HIS: 1
ARG: 1
ASP: 1
LEU: 1
LYS: 1

In [40]:
(protein.chain_id == 1).sum()

tensor(329)

In [41]:
(protein.chain_id == 0).sum()

tensor(89)