In [1]:
import sys
sys.path.append('..')

In [2]:
import os

In [3]:
from DomainPrediction import BaseProtein

In [4]:
# Relative path two levels above the notebook's working directory
# (presumably the repository root — TODO confirm).
root = '../..'
# Data directory; later cells read the input structure and FASTA files from here.
data_path = os.path.join(root, 'Data/')

In [6]:
## Read Protein
# Load the GxpS structure file from the data directory through the project's
# BaseProtein wrapper; later cells call protein.get_residues(<index list>) on it.
protein = BaseProtein(file=os.path.join(data_path, 'GxpS_ATC_AF.pdb'))
# 0-based residue index lists for the three GxpS domains. The trailing comment
# on each line gives the same span as a 1-based, inclusive residue range.
A = list(range(33, 522))    ## 34-522
C = list(range(637, 1067))  ## 638-1067
T = list(range(538, 608))   ## 539-608

In [7]:
from typing import Dict, List
from tqdm import tqdm
import numpy as np
import torch

from transformers import AutoTokenizer, EsmForProteinFolding
from transformers.models.esm.openfold_utils import OFProtein, atom14_to_atom37, to_pdb

from DomainPrediction.utils import helper

2024-09-10 16:48:22.714441: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-10 16:48:23.094333: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-10 16:48:23.097109: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-10 16:48:23.793605: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# Report whether PyTorch can see a CUDA device from this kernel.
torch.cuda.is_available()

True

In [9]:
class esmFold():
    """Convenience wrapper around the Hugging Face ``facebook/esmfold_v1`` model.

    The model and tokenizer are loaded once in ``__init__``. The instance can
    then fold a single sequence (``get_structure``), fold every record of a
    FASTA file (``structures_from_fasta``) and write a prediction to disk
    (``output_to_pdb``).
    """

    def __init__(self, device='cpu') -> None:
        """Load ESMFold and, if requested, move it to a CUDA device.

        Args:
            device: ``'cpu'`` (default), the legacy alias ``'gpu'``, or any
                device spec accepted by ``torch.device`` (``'cuda'``,
                ``'cuda:0'``, a ``torch.device`` instance). Previously only
                the literal ``'gpu'`` enabled CUDA, so ``'cuda'`` silently
                ran on the CPU.

        Raises:
            RuntimeError: if ``device`` is a string torch cannot parse.
        """
        self.model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1")
        self.tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
        # Stored exactly as given so existing code that reads `.device` keeps working.
        self.device = device

        target = self._resolve_device(device)
        if target.type == 'cuda':
            self.model = self.model.to(target)
            # Chunk the trunk computation on GPU (same setting as before).
            self.model.trunk.set_chunk_size(256)

    @staticmethod
    def _resolve_device(device):
        """Translate a user-facing device spec into a ``torch.device``."""
        if device is None:
            return torch.device('cpu')
        if isinstance(device, str) and device.strip().lower() == 'gpu':
            # 'gpu' is this class's historical spelling for CUDA.
            return torch.device('cuda')
        return torch.device(device)

    @staticmethod
    def _strip_pdb_ext(file: str) -> str:
        """Return ``file`` without a trailing ``.pdb`` extension (case-insensitive)."""
        stem, ext = os.path.splitext(file)
        return stem if ext.lower() == '.pdb' else file

    def forward(self, inputs):
        """Run the model on tokenized ``inputs`` without tracking gradients.

        Args:
            inputs: mapping of keyword arguments for the model, e.g. the
                tokenizer output.

        Returns:
            Whatever ``self.model(**inputs)`` returns.
        """
        target = self._resolve_device(self.device)
        if target.type == 'cuda':
            # Build a new mapping instead of overwriting the caller's tensors in place.
            inputs = {
                key: (value.to(target) if torch.is_tensor(value) else value)
                for key, value in inputs.items()
            }

        with torch.no_grad():
            outputs = self.model(**inputs)

        return outputs

    def structures_from_fasta(self, file: str, save_path: str):
        """Fold every record of a FASTA file and save one prediction per record.

        Args:
            file: path of the FASTA file, read with ``helper.read_fasta``.
            save_path: output directory; created if missing. Each record is
                written under ``<save_path>/<record id>`` by ``output_to_pdb``.
        """
        records = helper.read_fasta(file)

        if save_path:
            # Avoid failing on the first write because the directory is missing.
            os.makedirs(save_path, exist_ok=True)

        for rec in tqdm(records):
            outputs = self.get_structure(str(rec.seq))
            # Separate name so the `file` argument is not shadowed.
            out_stem = os.path.join(save_path, rec.id)
            self.output_to_pdb(outputs, out_stem)

    def get_structure(self, sequence: str):
        """Tokenize one amino-acid sequence and return the raw model output."""
        inputs = self.tokenizer([sequence], return_tensors="pt", add_special_tokens=False)
        outputs = self.forward(inputs)

        return outputs

    @staticmethod
    def output_to_pdb(output: Dict, file: str, save_meta: bool = True):
        '''
            Write a single-structure prediction to ``<file>.pdb`` and, when
            ``save_meta`` is true, its predicted aligned error and pTM to
            ``<file>.meta.npz``.

            A trailing ``.pdb`` on ``file`` is dropped before the extensions
            are appended, so passing ``'x.pdb'`` writes ``x.pdb`` (not
            ``x.pdb.pdb``) and ``x.meta.npz``.

            Args:
                output: model output mapping; every value must support
                    ``.to("cpu").numpy()``.
                file: output path, with or without the ``.pdb`` extension.
                save_meta: also write the ``.meta.npz`` file.

            Raises:
                AssertionError: if the batch does not hold exactly one structure.

            Adapted from https://github.com/huggingface/transformers/blob/979d24e7fd82a10d1457d500bef8ec3b5ddf2f8a/src/transformers/models/esm/modeling_esmfold.py#L2292
        '''
        output = {k: v.to("cpu").numpy() for k, v in output.items()}

        # Only one structure is ever written, so reject other batch sizes up front.
        assert output["aatype"].shape[0] == 1

        final_atom_positions = atom14_to_atom37(output["positions"][-1], output)
        final_atom_mask = output["atom37_atom_exists"]

        pred = OFProtein(
            aatype=output["aatype"][0],
            atom_positions=final_atom_positions[0],
            atom_mask=final_atom_mask[0],
            # Offset by one, as in the upstream implementation this is adapted from.
            residue_index=output["residue_index"][0] + 1,
            b_factors=output["plddt"][0],
        )
        pdb_text = to_pdb(pred)

        meta = {
            "predicted_aligned_error" : output["predicted_aligned_error"][0],
            "ptm" : output["ptm"]
        }

        stem = esmFold._strip_pdb_ext(file)

        with open(stem + '.pdb', "w") as f:
            f.write(pdb_text)

        if save_meta:
            # np.savez appends '.npz', giving '<stem>.meta.npz'.
            np.savez(stem + '.meta', **meta)

In [10]:
# Build the ESMFold wrapper with its default device ('cpu').
# NOTE(review): the CUDA check earlier in this notebook printed True — pass
# device='gpu' here if GPU inference was intended; confirm before changing.
esmfold = esmFold()



In [None]:
## save pdb from a seq
# Fold the residues of `protein` selected by the index list `T`.
# NOTE(review): `T` is rebound by a later cell, so the result depends on which
# definition ran last.
output = esmfold.get_structure(protein.get_residues(T))
# output_to_pdb appends the extensions itself: this writes 'test.pdb' plus
# 'test.meta.npz' relative to the current working directory.
esmfold.output_to_pdb(output, file='test')

In [None]:
## save pdbs from a fasta file
# Directory that receives one prediction per FASTA record.
save_path = os.path.join(data_path, 'esm_experiments/gen_1000/pdbs')
# FASTA file holding the sequences to fold.
gen = os.path.join(data_path, 'esm_experiments/gen_1000/esm_inp_seq_1000.T.fasta')
# Folds every record in `gen`; files are named after each record id.
esmfold.structures_from_fasta(file=gen, save_path=save_path)

In [12]:
## save pdb A domain
# Extract the sequence once and reuse it for both the printout and the fold.
a_domain_seq = protein.get_residues(A)
print(f'A domain : {a_domain_seq}')
output = esmfold.get_structure(a_domain_seq)
# output_to_pdb appends '.pdb' / '.meta.npz' itself, so pass the stem only
# (the old '...EF.pdb' argument produced 'GxpS_A_EF.pdb.pdb'). The path is built
# from data_path rather than a user-specific absolute path.
file = os.path.join(data_path, 'GxpS_A_EF')
esmfold.output_to_pdb(output, file=file)

A domain : VCVHQLFEQQIEKTPDAIAVIYENQTLSYAELNARANRLAHQLIALGVAPDQRVAICVTRSLARIIGLLAVLKAGGAYVPLDPAYPGERLAYMLTDATPVILMADNVGRAALSEDILATLTVLDPNTLLEQPDHNPQVSGLTPQHLAYVIYTSGSTGRPKGVMIEHRSVVNLTLTQITQFDVCATSRMLQFASFGFDASVWEIMMALSCGAMLVIPTETVRQDPQRLWRYLEEQAITHACLTPAMFHDGTDLPAIAIKPTLIFAGEAPSPALFQALCSRADLFNAYGPTEITVCATTWDCPADYTGGVIPIGSPVANKRLYLLDEHRQPVPLGTVGELYIGGVGVARGYLNRPELTAERFLNDPFSDETNARMYRAGDLARYLPDGNLVFVGRNDQQVKIRGFRIEPGEIEARLVEHSEVSEALVLALGDGQDKRLVAYVVALADDGLATKLREHLSDILPDYMIPAAFVRLDAFPLTPNGKLDRRSLP


In [13]:
## save pdb C domain
# Extract the sequence once and reuse it for both the printout and the fold.
c_domain_seq = protein.get_residues(C)
print(f'C domain : {c_domain_seq}')
output = esmfold.get_structure(c_domain_seq)
# output_to_pdb appends '.pdb' / '.meta.npz' itself, so pass the stem only
# (the old '...EF.pdb' argument produced 'GxpS_C_EF.pdb.pdb'). The path is built
# from data_path rather than a user-specific absolute path.
file = os.path.join(data_path, 'GxpS_C_EF')
esmfold.output_to_pdb(output, file=file)

C domain : QAEIDRIVEQVPGGIANIQDIYALSPLQDGILFHHLLANEGDPYLLITQQAFADRPLLNRYLAAVQQVVDRHDILRTAFIWEGLSVPAQVICRQAPLSVTELTLNPADGAISNQLAQRFDPRRHRIDLNQAPLLRFVVAQESDGRWILLQLLHHLIGDHTTLEVMNSEVQACLLGQMDSLPAPVPFRHLVAQARQGVSQAEHTRFFTDMLAEVDEPTLLFGLAEAHHDGSQVTESHRMLTAGLNERLRGQARRLGVSVAALCHLAWAQVLSRTSGQTQVVFGTVLFGRMQAGEGSDSGMGLFINTLPLRLDIDNTPVRDSVRAAHSRLAGLLEHEHASLALAQRCSGVESGTPLFNALLNYRHNTQPVTPDEIVSGIEFLGAQERTNYPFVLSVEDSGSDLGLTAQVVQPFDPERICGYMQQALASLVQA


In [14]:
pdb_6mfw_seq = "QQVEMTPDHVAVVDRGQSLTYKQLNERANQLAHHLRGKGVKPDDQVAIMLDKSLDMIVSILAVMKAGGAYVPIDPDYPGERIAYMLADSSAAILLTNALHEEKANGACDIIDVHDPDSYSENTNNLPHVNRPDDLVYVMYTSGSTGLAKGVMIEHHNLVNFCEWYRPYFGVTPADKALVYSSFSFDGSALDIFTHLLAGAALHIVPSERKYDLDALNDYCNQEGITISYLPTGAAEQFMQMDNQSFRVVITGGDVLKKIERNGTYKLYNGYGPTECTIMVTMFEVDKPYANIPIGKPIDRTRILILDEALALQPIGVAGELFIVGEGLGRGYLNRPELTAEKFIVHPQTGERMYRTGDRARFLPDGNIEFLGRLDNLVKIRGYRIEPGEIEPFLMNHPLIELTTVLAKEQADGRKYLVGYYVAPEEIPHGELREWLGNDLPDYMIPTYFVHMKAFPLTANGKVDRRALPDVQADAELLGEDYVAPTDELEQQLAQVWSHVLGIPQMGIDDHFLERGGDSIKVMQLIHQLKNIGLSLRYDQLFTHPTIRQLKRLLTEQKQVSLEPLRELDEQAEYETSAVEKRMYIIQQQDVESIAYNVVYTINFPLTVDTEQIRVALEQLVLRHEGLRSTYHMRGDEIVKRIVPRAELSFVRQTGEEESVQSLLAEQIKPFDLAKAPLLRAGVIETADKKVLWFDSHHILLDGLSKSILARELQALLGQQVLSPVEKTYKSFARWQNEWFASDEYEQQIAYWKTLLQGELPAVQLPTKKRPPQLTFDGAIQMYRVNPEITRKLKATAAKHDLTLYMLMLTIVSIWLSKMNSDSNQVILGTVTDGRQHPDTRELLGMFVNTLPLLLSIDHEESFLHNLQQVKAKLLPALQNQYVPFDKILEAARVKREGNRHPLFDVMFMMQGAPETELESNMHHINAGISKFDLTLEVLERENGLNIVFEYNTHLFDEGMILRMVAQFEHLLLQAVHGLDQQVKRFELV"


In [23]:
# 0-based index lists into `pdb_6mfw_seq`, one per region of interest.
# NOTE: A, C and T rebind the names used earlier for the GxpS index lists, so
# cells that use them depend on which definition ran last.
Acore = list(range(333))
Asub = list(range(388, 462))
A = list(range(462))
C = list(range(569, 989))
T = list(range(490, 554))

In [25]:
## save pdb A domain
# Build the sub-sequence once and reuse it for both the printout and the fold.
a_domain_seq = ''.join(pdb_6mfw_seq[i] for i in A)
print(f"A domain : {a_domain_seq}")
output = esmfold.get_structure(a_domain_seq)
# output_to_pdb appends '.pdb' / '.meta.npz' itself, so pass the stem only
# (the old '...EF.pdb' argument produced '6mfw_A_EF.pdb.pdb'). The path is built
# from data_path rather than a user-specific absolute path.
file = os.path.join(data_path, '6mfw_A_EF')
esmfold.output_to_pdb(output, file=file)

A domain : QQVEMTPDHVAVVDRGQSLTYKQLNERANQLAHHLRGKGVKPDDQVAIMLDKSLDMIVSILAVMKAGGAYVPIDPDYPGERIAYMLADSSAAILLTNALHEEKANGACDIIDVHDPDSYSENTNNLPHVNRPDDLVYVMYTSGSTGLAKGVMIEHHNLVNFCEWYRPYFGVTPADKALVYSSFSFDGSALDIFTHLLAGAALHIVPSERKYDLDALNDYCNQEGITISYLPTGAAEQFMQMDNQSFRVVITGGDVLKKIERNGTYKLYNGYGPTECTIMVTMFEVDKPYANIPIGKPIDRTRILILDEALALQPIGVAGELFIVGEGLGRGYLNRPELTAEKFIVHPQTGERMYRTGDRARFLPDGNIEFLGRLDNLVKIRGYRIEPGEIEPFLMNHPLIELTTVLAKEQADGRKYLVGYYVAPEEIPHGELREWLGNDLPDYMIPTYFVHMKAFPLTANGK


In [26]:
## save pdb C domain
# Build the sub-sequence once and reuse it for both the printout and the fold.
c_domain_seq = ''.join(pdb_6mfw_seq[i] for i in C)
print(f"C domain : {c_domain_seq}")
output = esmfold.get_structure(c_domain_seq)
# output_to_pdb appends '.pdb' / '.meta.npz' itself, so pass the stem only
# (the old '...EF.pdb' argument produced '6mfw_C_EF.pdb.pdb'). The path is built
# from data_path rather than a user-specific absolute path.
file = os.path.join(data_path, '6mfw_C_EF')
esmfold.output_to_pdb(output, file=file)

C domain : EQAEYETSAVEKRMYIIQQQDVESIAYNVVYTINFPLTVDTEQIRVALEQLVLRHEGLRSTYHMRGDEIVKRIVPRAELSFVRQTGEEESVQSLLAEQIKPFDLAKAPLLRAGVIETADKKVLWFDSHHILLDGLSKSILARELQALLGQQVLSPVEKTYKSFARWQNEWFASDEYEQQIAYWKTLLQGELPAVQLPTKKRPPQLTFDGAIQMYRVNPEITRKLKATAAKHDLTLYMLMLTIVSIWLSKMNSDSNQVILGTVTDGRQHPDTRELLGMFVNTLPLLLSIDHEESFLHNLQQVKAKLLPALQNQYVPFDKILEAARVKREGNRHPLFDVMFMMQGAPETELESNMHHINAGISKFDLTLEVLERENGLNIVFEYNTHLFDEGMILRMVAQFEHLLLQAVHGLDQQVKRFELV
