In [2]:
from huggingface_hub import login
from esm.models.esm3 import ESM3
import esm


In [3]:
from esm.sdk.api import (
    ESM3InferenceClient,
    ESMProtein,
    ESMProteinError,
    ESMProteinTensor,
    GenerationConfig,
    LogitsConfig,
    LogitsOutput,
    SamplingConfig,
    SamplingTrackConfig,
)
from esm.tokenization.function_tokenizer import (
    InterProQuantizedTokenizer as EsmFunctionTokenizer,
)
from esm.utils.structure.protein_chain import ProteinChain
from esm.utils.types import FunctionAnnotation

## Add Immune Molecule InterPro Labels

In [4]:
# InterPro accessions taken from the InterPro page of PDB entry 4KRL.
# Each accession appears exactly once so that looping over `labels` never
# repeats the same generation request.
labels = [
    "IPR036179",
    "IPR013106",
    "IPR003599",
    "IPR050199",
    "IPR007110",
    "IPR013783",  # Ig-like fold
]


In [5]:
# Keep only the accessions known to the ESM3 function tokenizer.
# The tokenizer is built once up front: constructing it inside the
# comprehension condition would re-instantiate it for every label.
function_tokenizer = EsmFunctionTokenizer()
supported_interpro_labels = set(function_tokenizer.interpro_labels)
labels = [label_id for label_id in labels if label_id in supported_interpro_labels]

Fetching 22 files:   0%|          | 0/22 [00:00<?, ?it/s]

In [7]:
# Last accession that survived the tokenizer filter; the generation loop
# below uses this entry as its function annotation.
labels[-1]  # Ig like fold

'IPR013783'

In [51]:
import os

# Remote ESM3 client. The API token is read from the ESM_API_KEY environment
# variable so that no credential is stored in the notebook; an unset variable
# falls back to the empty token.
# Local alternative: ESM3.from_pretrained("esm3-open").to("cuda")  # or "cpu"
model: ESM3InferenceClient = esm.sdk.client(
    "esm3-large-2024-03", token=os.environ.get("ESM_API_KEY", "")
)

results = []

# The PDB file does not change between runs, so parse it a single time.
protein_chain = ProteinChain.from_pdb("4krl_chain_a.pdb")

for label in [labels[-1]]:  # Ig like fold
    for t in [0.7, 1, 2]:
        # Build a fresh prompt for every run so nothing carries over between runs.
        protein = ESMProtein.from_protein_chain(protein_chain)
        # Mask all 122 positions; only the structure and the function
        # annotation below are left to condition the generation.
        protein.sequence = "_" * 122
        protein.function_annotations = [
            FunctionAnnotation(label=label, start=1, end=122),
        ]
        protein_result = model.generate(
            protein,
            GenerationConfig(track="sequence", num_steps=8, temperature=t),
        )
        # Append inside the temperature loop: one entry per (label, temperature)
        # run, instead of keeping only the last temperature for each label.
        results.append(protein_result)

Retrying... Attempt 1 after 1.0s due to: (502, 'Failure in generate: <html>\r\n<head><title>502 Bad Gateway</title></head>\r\n<body>\r\n<center><h1>502 Bad Gateway</h1></center>\r\n</body>\r\n</html>\r\n')


In [61]:
import pandas as pd

# Flatten the generation results into one record per successfully generated protein.
rl = []
skipped_results = []
for r in results:
    try:
        rl.append({"sequence": r.sequence, "InterPro_label": r.function_annotations})
    except AttributeError:
        # The entry lacks .sequence / .function_annotations (presumably an
        # ESMProteinError returned for a failed request - TODO confirm).
        # Keep it for inspection instead of discarding it silently.
        skipped_results.append(r)

if skipped_results:
    print(f"Skipped {len(skipped_results)} of {len(results)} results without a sequence")

result_df = pd.DataFrame(rl)

In [64]:
# Display the generated sequences alongside their function annotations.
result_df

Unnamed: 0,sequence,InterPro_label
0,PVHLEESGGGTVLAGATSVLSCHGSGWIYENYGLGWFRQAAGQKTE...,"[FunctionAnnotation(label='IPR036179', start=1..."
1,PVELIERGGGTVGFGSELPLSCTASGAMFHHIGDTWFRQMPGRSRE...,"[FunctionAnnotation(label='IPR013106', start=1..."
2,QEQLLEFGGGREELGGSLRLSCASSGVTFHNYGMGWFRQAPGKGCM...,"[FunctionAnnotation(label='IPR003599', start=1..."
3,DMKLIESGGGFVARGGSLSLSCAASGLIRSGYGMGWFREATGKERG...,"[FunctionAnnotation(label='IPR013783', start=1..."


In [None]:
sequence = "QVKLEESGGGSVQTGGSLRLTCAASGRTSRSYGMGWFRQAPGKEREFVSGISWRGDSTGYADSVKGRFTISRDNAKNTVDLQMNSLKPEDTAIYYCAAAAGSAWYGTLYEYDYWGQGTQVTV"

# Half-open [start, end) index ranges that are blanked out with the "_" mask
# character; every other residue is kept as-is.
mask_regions = [(27, 36), (50, 58), (99, 114)]

list_seq = list(sequence)
for region_start, region_end in mask_regions:
    # Replace the slice with exactly as many mask characters as it spans,
    # so the overall sequence length is preserved.
    list_seq[region_start:region_end] = ["_"] * (region_end - region_start)
masked_seq = "".join(list_seq)

In [None]:
# Structure prompt from the 4KRL chain A file, paired with the partially
# masked sequence so that only the "_" positions are left to be generated.
protein_chain = ProteinChain.from_pdb("4krl_chain_a.pdb")
protein = ESMProtein.from_protein_chain(protein_chain)
protein.sequence = masked_seq

# Single sequence-track generation at temperature 0.7.
protein_result = model.generate(
    protein,
    GenerationConfig(
        track="sequence",
        num_steps=8,
        temperature=0.7,
    ),
)

In [None]:
# Sampling temperatures for the masked-sequence runs: two repeats of each value.
temps = [temperature for temperature in (2, 3, 0.7, 1) for _ in range(2)]

In [None]:
# One sequence-track generation per entry in `temps`, all from the same
# masked-sequence prompt; results are kept in the same order as `temps`.
results2 = [
    model.generate(
        protein,
        GenerationConfig(track="sequence", num_steps=8, temperature=temperature),
    )
    for temperature in temps
]