In [None]:
# mamba create -n esmfold python=3.10 ; mamba activate esmfold
# ! pip install --upgrade transformers py3Dmol accelerate

In [8]:
from transformers import AutoTokenizer, EsmForProteinFolding

# The tokenizer and the folding model are loaded from the same checkpoint and
# share one on-disk cache directory, so both strings are spelled out only once.
_checkpoint = "facebook/esmfold_v1"
_cache_dir = "/central/groups/MazmanianLab/joeB/cache"

tokenizer = AutoTokenizer.from_pretrained(_checkpoint, cache_dir=_cache_dir)
model = EsmForProteinFolding.from_pretrained(
    _checkpoint,
    cache_dir=_cache_dir,
    low_cpu_mem_usage=True,
)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of EsmForProteinFolding were not initialized from the model checkpoint at facebook/esmfold_v1 and are newly initialized: ['esm.contact_head.regression.bias', 'esm.contact_head.regression.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Folding a single chain

In [13]:
test_protein = "MKKRKKLFTCLLSLTLIFTMMPLTAKPANAAVNLTALYIIDSQGGTSEAVDVNQETIVDDSGGWKWEKASSTLTLDGFDGEYIEANGDLNIVVKGSNTVTIPANPDKNHVYGINITNGELKITGEGTTPSLTVTQTGFSKENLSVDGIDGRDGLVVTDCEIKIRFDGTEVYAGTGILSSNGRFELNGTASLDIELKNGQEYTRGIGRGVDASTSGDISISVDGAGEKCFGVGGLSASGSGNVYIRVPKGRAIDGSLVISEGAGNIEFEGFLYLDSYEDSAFSRKSSFEIPGNKKIIKVDASGSEVAGKCGFIYKEFTNLDYGVYLLDEDGNKVAKGKIITQANNPLTFMHSDVLDMGTLEVGTSYRGRKFHGLVSGGKAPYTFTVENLPDGLNLFQHNNYIDFFAYIAGEPTTPGPGGIITLTVTDANNMSDSISIVYDGVVKPPKYITVGEDKFEDSQNMTPTTGNWSYEAETKTLTLNSYNGGIIKSEEGLNIKVKGNNTITIPANPPATVCGINTESGILTITGEGSSPTLNILQTNLSREGSLYATGIDGRDGLEITDCSVKIKLNGAQQYDANGIGSSNGDFYLNGMASLDIDIKNAGDYSYGIGRGVRANTSGNIVVKVDGPGNPIIGISALYTLGSGDINISVPKGRAIGGPINISENAGTIYFDGSLKISTSDEQIFTYSDRFTIPGNKKIVKVDGSGNEITGKCGFIYKEFTSRDDGVYLVDEVGNKVVKGKIKTQANNPLSFMKSDVLDIETLEVDKYYRGKKIHGLVSGGKAPYKFTAKDLPEGLKLVESSRPDEFYAYVAGTPTAEQPAGTFTLTVTDANNATASIPVQYGAVTVPKGVTGLTLNESELTLANGSTATLTATVIPDDATIKTVQWSSSDTSVAAVDDSGNIKTNAPGKTVITATTKQGNFSKSCTVYVKEDKPNATIDYQYETLVGLEINEEYRISGDGVNDTFIVTGISYPIPEAWLGKTLKLTKTNAESESNSDEQVLIIPARPAAPTGIGTVDASAYYTNDGKLTGLKIGMEYCVHGTQNWYSDVEGEVTGLKTGEYEIRVKETDSSFIGHPTFVTIGYKSLALADDTAYAIPEGVVETEIEEVDISKAVKGGRTPYVFSKTSGPDWLQVDGQGKITGTRPSTETAATTATIKVTDKDNTVQTLTITVGAVTKPKGVSMGGKVKSYNPSNPITIQLMQSGAEVYKTTIAAETGSGQVTQNFSFGTVMPGTYDLVVTKDAHLAYTIKNVVVGEAPIDLTTMTDKAYSTITLLCGDIDGNGYINSTDLGIILKGQNYGKPSNTAGVEPAADLDGNGYINSTDLGIVLQGQHYGKSAVSVDYA"
# Tokenize the single sequence as a batch of one, without special tokens, and
# keep only the 'input_ids' entry of the tokenizer result.
tokenized_input = tokenizer(
    [test_protein],
    add_special_tokens=False,
    return_tensors="pt",
)['input_ids']

Running Model

In [14]:
import torch

# Run one forward pass with gradient tracking disabled.
# The token ids are first moved to the device that holds the model's weights,
# so the call works whether the model stayed on the CPU or was moved to a GPU
# (moving a tensor to the device it is already on is a no-op).
with torch.no_grad():
    model_device = next(model.parameters()).device
    output = model(tokenized_input.to(model_device))

Function to convert results into PDB format

In [9]:
from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37

def convert_outputs_to_pdb(outputs):
    """Render each prediction in a batch of folding outputs as PDB text.

    Args:
        outputs: Mapping of model output tensors. The keys read here are
            "positions", "atom37_atom_exists", "aatype", "residue_index",
            "plddt" and, when present, "chain_index".

    Returns:
        A list with one PDB-formatted string per entry along the first axis
        of ``outputs["aatype"]``.
    """
    # The atom14 -> atom37 conversion receives the last element of "positions"
    # together with the full tensor mapping; its result is then moved to NumPy.
    atom37_positions = atom14_to_atom37(outputs["positions"][-1], outputs)
    atom37_positions = atom37_positions.cpu().numpy()

    # Everything below works on CPU NumPy arrays.
    arrays = {name: tensor.to("cpu").numpy() for name, tensor in outputs.items()}
    atom_mask = arrays["atom37_atom_exists"]
    has_chain_index = "chain_index" in arrays

    def _to_pdb_string(idx):
        """Build the protein record for batch entry ``idx`` and format it."""
        protein = OFProtein(
            aatype=arrays["aatype"][idx],
            atom_positions=atom37_positions[idx],
            atom_mask=atom_mask[idx],
            # The stored residue index is shifted up by one before writing.
            residue_index=arrays["residue_index"][idx] + 1,
            # The per-entry "plddt" values are passed as the B-factor field.
            b_factors=arrays["plddt"][idx],
            chain_index=arrays["chain_index"][idx] if has_chain_index else None,
        )
        return to_pdb(protein)

    return [_to_pdb_string(idx) for idx in range(arrays["aatype"].shape[0])]

In [16]:
# Convert the single-chain prediction; the helper returns a list of PDB strings.
pdb = convert_outputs_to_pdb(output)
# Bare last expression: the notebook echoes the whole list as the cell output.
pdb

['PARENT N/A\nATOM      1  N   MET A   1     -56.052 -50.812   4.920  1.00  0.64           N  \nATOM      2  CA  MET A   1     -56.459 -50.706   3.522  1.00  0.69           C  \nATOM      3  C   MET A   1     -55.312 -50.194   2.658  1.00  0.64           C  \nATOM      4  CB  MET A   1     -57.671 -49.783   3.383  1.00  0.59           C  \nATOM      5  O   MET A   1     -54.912 -49.034   2.773  1.00  0.58           O  \nATOM      6  CG  MET A   1     -58.997 -50.463   3.681  1.00  0.58           C  \nATOM      7  SD  MET A   1     -60.386 -49.271   3.810  1.00  0.51           S  \nATOM      8  CE  MET A   1     -60.679 -48.929   2.053  1.00  0.53           C  \nATOM      9  N   LYS A   2     -54.286 -50.946   1.872  1.00  0.72           N  \nATOM     10  CA  LYS A   2     -53.602 -52.162   1.441  1.00  0.73           C  \nATOM     11  C   LYS A   2     -52.520 -51.848   0.411  1.00  0.71           C  \nATOM     12  CB  LYS A   2     -54.602 -53.164   0.862  1.00  0.65           C  \nAT

In [33]:
# Write every PDB string, back to back with no separator, into one file in the
# current working directory.
with open("output_structure.pdb", "w") as pdb_file:
    pdb_file.writelines(pdb)

In [17]:
import py3Dmol

# Create a viewer (width 800, height 400) that loads 3Dmol.js from the given URL.
view = py3Dmol.view(js='https://3dmol.org/build/3Dmol.js', width=800, height=400)
# Add the concatenated PDB text to the viewer as a single 'pdb'-format model.
view.addModel("".join(pdb), 'pdb')
# Draw as a cartoon with 'spectrum' colouring.
# NOTE(review): {'model': -1} presumably selects the most recently added model —
# confirm against the 3Dmol.js documentation.
view.setStyle({'model': -1}, {"cartoon": {'color': 'spectrum'}})

<py3Dmol.view at 0x7f59ed832b90>

In [35]:
# The plddt field is scaled from 0-1 on earlier versions of ESMFold but will be updated
# to match AlphaFold's scale of 0-100 in future versions.
# We check here so that this code will work on either:

# Choose colour-scale bounds that match whichever pLDDT scale the output uses:
# a maximum of at most 1.0 is treated as the 0-1 scale, anything else as 0-100.
plddt_is_fractional = torch.max(output['plddt']) <= 1.0
vmin, vmax = (0.5, 0.95) if plddt_is_fractional else (50, 95)

# Recolour the cartoon by the 'b' property, using the bounds chosen above.
plddt_colorscheme = {'prop':'b','gradient': 'roygb','min': vmin,'max': vmax}
view.setStyle({'cartoon': {'colorscheme': plddt_colorscheme}})


<py3Dmol.view at 0x7f59ed832b90>

## Bulk Predictions

In [1]:
! pip install requests pandas tqdm

Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/e3/59/35a2892bf09ded9c1bf3804461efe772836a5261ef5dfb4e264ce813ff99/pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2023.3-py2.py3-none-any.whl (502 kB)
Collecting tzdata>=2022.1 (from pandas)
  Using cached tzdata-2023.3-py2.py3-none-any.whl (341 kB)
Downloading pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.0.3 pytz-2023.3 tzdata-2023.3


In [2]:
import requests

# UniProtKB streaming endpoint. URL-decoded, the request parameters are:
#   compressed=true, fields=accession,sequence, format=tsv
#   query=((taxonomy_id:83333) AND (reviewed:true) AND (length:[128 TO 512]) AND (cc_subunit:monomer))
uniprot_url = "https://rest.uniprot.org/uniprotkb/stream?compressed=true&fields=accession%2Csequence&format=tsv&query=%28%28taxonomy_id%3A83333%29%20AND%20%28reviewed%3Atrue%29%20AND%20%28length%3A%5B128%20TO%20512%5D%29%20AND%20%28cc_subunit%3Amonomer%29%29"

The history saving thread hit an unexpected error (OperationalError('disk I/O error')).History will not be written to the database.


In [4]:
# Download the result set.
# The (connect, read) timeout keeps a stalled connection from hanging the kernel
# indefinitely, and raise_for_status() surfaces an HTTP error here instead of
# letting an error body reach the gzip/TSV parsing step later on.
uniprot_request = requests.get(uniprot_url, timeout=(10, 120))
uniprot_request.raise_for_status()

In [5]:
from io import BytesIO
import pandas

# Wrap the downloaded bytes in a file-like object for pandas to read from.
bio = BytesIO(uniprot_request.content)

# Parse the gzip-compressed, tab-separated table, then discard every row that
# contains a missing value (dropna() with default arguments acts on rows).
df = pandas.read_csv(bio, sep='\t', compression='gzip').dropna()
df

Unnamed: 0,Entry,Sequence
0,P00393,MTTPLKKIVIVGGGAGGLEMATQLGHKLGRKKKAKITLVDRNHSHL...
1,P00811,MFKTTLCALLITASCSTFAAPQQINDIVHRTITPLIEQQKIPGMAV...
2,P00903,MILLIDNYDSFTWNLYQYFCELGADVLVKRNDALTLADIDALKPQK...
3,P00914,MTTHLVWFRQDLRLHDNLALAAACRNSSARVLALYIATPRQWATHN...
4,P00926,MENAKMNSLIAQYPLVKDLVALKETTWFNPGTTSLAEGLPYVGLTE...
...,...,...
293,C5A132,MSHPALTQLRALRYCKEIPALDPQLLDWLLLEDSMTKRFEQQGKTV...
294,P27862,MESWLIPAAPVTVVEEIKKSRFITMLAHTDGVEAAKAFVESVRAEH...
295,P34209,MNITPFPTLSPATIDAINVIGQWLAQDDFSGEVPYQADCVILAGNA...
296,P76116,MHLRHLFSSRLRGSLLLGSLLVVSSFSTQAAEEMLRKAVGKGAYEM...


In [6]:
# Keep only the first 10 rows of the table for the steps that follow.
df = df.head(10)

Batch Tokenization

In [10]:
# Tokenize every sequence in the table without padding or special tokens and
# keep only the 'input_ids' entry of the tokenizer result.
ecoli_tokenized = tokenizer(
    df["Sequence"].tolist(),
    add_special_tokens=False,
    padding=False,
)['input_ids']


Looping over each sequence in the table

In [12]:
import torch
from tqdm import tqdm

# Fold the sequences one at a time, collecting each result as a dict of CPU tensors.
outputs = []

# Take the device from the model's own weights instead of hard-coding 'cuda':
# the inputs then always match the model, and the loop also runs on a machine
# without a GPU or when the model was never moved off the CPU.
model_device = next(model.parameters()).device

with torch.no_grad():
    for input_ids in tqdm(ecoli_tokenized):
        # Add a leading axis of size 1 so the ids form a batch of one.
        input_ids = torch.tensor(input_ids, device=model_device).unsqueeze(0)
        output = model(input_ids)
        # Store each result tensor on the CPU rather than on the model's device.
        outputs.append({key: val.cpu() for key, val in output.items()})

NameError: name 'torch' is not defined