In [1]:
import os

import pandas as pd

Foldseek installation (in any environment)

In [None]:
!conda install -c conda-forge -c bioconda foldseek

In [2]:
# Root of the local CARE repository checkout; the split CSVs used by the download and
# folding steps are read from under it. Set the CARE_DIRECTORY environment variable to
# point at a different checkout; the original path is kept as the default.
CARE_directory = os.environ.get('CARE_DIRECTORY', '/disk1/jyang4/repos/CARE')

### Download structures from the AF database corresponding to sequences in the training set for task 1

In [6]:
# Load the task 1 training split; its `Entry` column supplies the UniProt IDs used
# to build the reference structure set.
swissprot = pd.read_csv(f'{CARE_directory}/splits/task1/protein_train.csv')
# Possible speed-up: cluster these proteins first so there are fewer reference structures to search against.
swissprot

Unnamed: 0.1,Unnamed: 0,Entry,Entry Name,Sequence,EC number,Length,EC All,clusterRes30,clusterRes50,clusterRes70,clusterRes90,EC3,EC2,EC1
0,0,A0A009IHW8,ABTIR_ACIB9,MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENA...,3.2.2.6,269,3.2.2.6,A1AY86,A0A009IHW8,A0A009IHW8,A0A009IHW8,3.2.2,3.2,3
1,1,A0A023I7E1,ENG1_RHIMI,MRFQVIVAAATITMITSYIPGVASQSTSDGDDLFVPVSNFDPKSIF...,3.2.1.39,796,3.2.1.39,D4AZ24,A0A023I7E1,A0A023I7E1,A0A023I7E1,3.2.1,3.2,3
2,2,A0A024SC78,CUTI1_HYPJR,MRSLAILTTLLAGHAFAYPKPAPQSVNRRDWPSINEFLSELAKVMP...,3.1.1.74,248,3.1.1.74,A8QPD8,A8QPD8,A8QPD8,A0A024SC78,3.1.1,3.1,3
3,3,A0A024SH76,GUX2_HYPJR,MIVGILTTLATLATLAASVPLEERQACSSVWGQCGGQNWSGPTCCA...,3.2.1.91,471,3.2.1.91,B2AE04,A1CCN4,A0A024SH76,A0A024SH76,3.2.1,3.2,3
4,4,A0A044RE18,BLI_ONCVO,MYWQLVRILVLFDCLQKILAIEHDSICIADVDDACPEPSHTVMRLR...,3.4.21.75,693,3.4.21.75,Q9VBC7,A0A044RE18,A0A044RE18,A0A044RE18,3.4.21,3.4,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184524,185990,Q05115,AMDA_BORBO,MQQASTPTIGMIVPPAAGLVPADGARLYPDLPFIASGLGLGSVTPE...,4.1.1.76,240,4.1.1.76,Q05115,Q05115,Q05115,Q05115,4.1.1,4.1,4
184525,185991,Q6HX62,Y3032_BACAN,MGQNQFRWSNEQLREHVEIIDGTRSPHKLLKNATYLNSYIREWMQA...,3.5.4.2,584,3.5.4.2,Q9KF49,Q6HX62,Q6HX62,Q6HX62,3.5.4,3.5,3
184526,185992,Q6L032,Y1085_PICTO,MLLKNIKISNDYNIFMIIASRKPSLKDIYKIIKVSKFDEPADLIIE...,3.5.4.2,573,3.5.4.2,Q9KF49,Q6L032,Q6L032,Q6L032,3.5.4,3.5,3
184527,185993,Q94MV8,VG56_BPLZ5,MAHFNECAHLIEGVDKANRAYAENIMHNIDPLQVMLDMQRHLQIRL...,3.6.1.12,172,3.6.1.12,P39262,P39262,Q94MV8,Q94MV8,3.6.1,3.6,3


In [None]:
# Build one gs:// URL per unique training-set entry and save them, newline-separated,
# as the manifest that the gsutil download cell reads.
output_file = 'uniprot_cif_paths.txt'
uniprot_ids = swissprot['Entry'].unique()

file_paths = []
for accession in uniprot_ids:
    file_paths.append(f"gs://public-datasets-deepmind-alphafold-v4/AF-{accession}-F1-model_v4.cif")

manifest_text = '\n'.join(file_paths)
with open(output_file, 'w') as file:
    file.write(manifest_text)

In [None]:
!cat uniprot_cif_paths.txt | gsutil -m cp -I reference_structures

In [4]:
# Task 1 test splits whose structures are later searched against the reference database.
datasets = ["30_protein_test", "30-50_protein_test", "price_protein_test", "promiscuous_protein_test"]

for dataset in datasets:
    df = pd.read_csv(f'{CARE_directory}/splits/task1/{dataset}.csv')

    uniprot_ids = df['Entry'].unique()
    file_paths = [f"gs://public-datasets-deepmind-alphafold-v4/AF-{u}-F1-model_v4.cif" for u in uniprot_ids]

    # Each split gets its own working directory. Create it up front so that writing
    # the manifest does not fail with FileNotFoundError when the directory is missing.
    os.makedirs(dataset, exist_ok=True)
    output_file = dataset + '/uniprot_cif_paths.txt'

    with open(output_file, 'w') as file:
        file.write('\n'.join(file_paths))

In [None]:
!cat uniprot_cif_paths.txt | gsutil -m cp -I structures

Fold the test-set sequences that are missing from the AlphaFold database download with ESMFold

In [3]:
import torch
from transformers import AutoTokenizer, EsmForProteinFolding
from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37

def convert_outputs_to_pdb(outputs):
    """Turn a batch of folding-model outputs into one ``to_pdb`` record per sequence.

    Args:
        outputs: Mapping of model outputs. The keys read here are "positions",
            "aatype", "atom37_atom_exists", "residue_index", "plddt" and, when
            present, "chain_index". Every value must support ``.to("cpu").numpy()``.

    Returns:
        A list holding one ``to_pdb`` result for each index along the first axis
        of ``outputs["aatype"]``.
    """
    # Expand the last element of "positions" from the atom14 to the atom37 layout while
    # the values are still tensors, then move everything over to NumPy.
    atom37_positions = atom14_to_atom37(outputs["positions"][-1], outputs).cpu().numpy()
    arrays = {key: value.to("cpu").numpy() for key, value in outputs.items()}

    has_chain_index = "chain_index" in arrays

    def build_protein(idx):
        # Assemble the protein object for batch element `idx`.
        return OFProtein(
            aatype=arrays["aatype"][idx],
            atom_positions=atom37_positions[idx],
            atom_mask=arrays["atom37_atom_exists"][idx],
            residue_index=arrays["residue_index"][idx] + 1,  # shift residue numbers up by one
            b_factors=arrays["plddt"][idx],  # pLDDT values go into the B-factor field
            chain_index=arrays["chain_index"][idx] if has_chain_index else None,
        )

    return [to_pdb(build_protein(idx)) for idx in range(arrays["aatype"].shape[0])]

In [4]:
# Load the ESMFold tokenizer and model from the local cache directory and move the model to the GPU.
device = torch.device("cuda")
checkpoint_kwargs = dict(cache_dir="/disk1/jyang4/repos/ProCALM/data/pretrained_ESMFold")
tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1", **checkpoint_kwargs)
model = EsmForProteinFolding.from_pretrained(
    "facebook/esmfold_v1",
    low_cpu_mem_usage=True,
    **checkpoint_kwargs,
)
model = model.to(device)

# Tips from the tutorial to speed up the calculation and reduce memory usage:
# allow TF32 matmuls on CUDA and run the `model.esm` submodule in half precision.
torch.backends.cuda.matmul.allow_tf32 = True
model.esm = model.esm.half()
# Optional, currently disabled: model.trunk.set_chunk_size(64)

Some weights of EsmForProteinFolding were not initialized from the model checkpoint at facebook/esmfold_v1 and are newly initialized: ['esm.contact_head.regression.bias', 'esm.contact_head.regression.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
import os
# Test splits to fold with ESMFold in the next cell. "price_protein_test" is left out of
# this list; swap in the commented-out line below to process that split on its own.
datasets = ["30_protein_test", "30-50_protein_test", "promiscuous_protein_test"] #"price_protein_test"
#datasets = ["price_protein_test"]

In [16]:
# Entries that could not be folded, keyed "<dataset>/<entry>" -> repr of the exception.
failed_entries = {}

for dataset in datasets:
    df = pd.read_csv(f'{CARE_directory}/splits/task1/{dataset}.csv')

    # Create the structure directory before listing it, so a split with no
    # downloaded AlphaFold models does not raise FileNotFoundError.
    structure_dir = f"{dataset}/structures"
    os.makedirs(structure_dir, exist_ok=True)

    #only predict the structures that could not be loaded from the AF2 database
    # Downloaded models are named AF-<entry>-F1-model_v4.cif, so the entry is the second
    # '-'-separated field. File names without a '-' are skipped instead of crashing.
    AF_entries = set()
    for file_name in os.listdir(structure_dir):
        fields = file_name.split("-")
        if len(fields) > 1:
            AF_entries.add(fields[1])

    # Keep only the rows with no downloaded structure (set lookup instead of a list scan per row).
    df = df[~df['Entry'].isin(AF_entries)]
    sequences = df['Sequence'].values
    entries = df['Entry'].values

    for sequence, entry in zip(sequences, entries):
        print(entry)
        try:
            tokenized_input = tokenizer([sequence], return_tensors="pt", add_special_tokens=False)['input_ids'].to(device)
            with torch.no_grad():
                output = model(tokenized_input)

            pdb = convert_outputs_to_pdb(output)
            name = "ESM-" + entry + ".pdb"
            with open(f"{structure_dir}/" + name, "w") as f:
                f.write("".join(pdb))
        except Exception as err:
            # Best effort: one failing sequence must not stop the whole run, but it is
            # recorded and reported instead of being dropped silently. Catching
            # Exception (not a bare except) lets KeyboardInterrupt stop the loop.
            failed_entries[f"{dataset}/{entry}"] = repr(err)
            print(f"  could not fold {entry}: {err!r}")

Q5UQI3
O55716
Q7T6Y9
Q9Z4Q7
Q6U6H1
Q196W4
Q6GZN7
P41469
F5HB62
Q6QGP4
Q8V3T6
P30314
P03697
Q5I143
P82678
Q5UQD1
Q5UQ96
O83759
P03627
P03631
C0HM52
P13065
Q196Z5
Q6GZT8
L7WGA7
P10447
P0DW60
P0DOK0
Q2HRB6
Q9J7Z0
Q715L4
A0A8V1ABE9
Q5NVA2
Q5PU49


### Generate a foldseek database

In [None]:
!foldseek createdb reference_structures/ reference_DB/reference_DB

### Loop through and search for proteins in the test set

Should be very fast (on the order of seconds to minutes)

In [None]:
!foldseek easy-search 30_protein_test/structures reference_DB/reference_DB 30_protein_test/aln tmpFolder --max-seqs 1

!foldseek easy-search 30-50_protein_test/structures reference_DB/reference_DB 30-50_protein_test/aln tmpFolder --max-seqs 1

!foldseek easy-search price_protein_test/structures reference_DB/reference_DB price_protein_test/aln tmpFolder --max-seqs 1

!foldseek easy-search promiscuous_protein_test/structures reference_DB/reference_DB promiscuous_protein_test/aln tmpFolder --max-seqs 1

## Process the alignment results and map to EC numbers

In [1]:
import os
import pandas as pd

In [2]:
# Test splits to post-process; each one needs a Foldseek alignment file at <name>/aln.
names= ['price_protein_test', 'promiscuous_protein_test', '30_protein_test', '30-50_protein_test']

In [3]:
# Lookup table from the `Entry` column to the `EC All` column, used to label retrieved hits.
ref_withEC = pd.read_csv('../../processed_data/protein2EC.csv')
entry2EC = {entry: ec_all for entry, ec_all in zip(ref_withEC['Entry'], ref_withEC['EC All'])}

In [4]:
def return_entry(entry):
    """Return the second '-'-separated field of an identifier, if there is one.

    For a name such as ``AF-O13819-F1-model_v4`` this yields ``O13819``; a name
    that contains no '-' is returned unchanged.
    """
    fields = entry.split('-')
    return fields[1] if len(fields) > 1 else entry

In [5]:
# NOTE(review): this directory is named BLAST although the cell only reads Foldseek
# alignment files and writes nothing into it. Confirm that this is intentional.
os.makedirs('../results_summary/BLAST', exist_ok=True)

# Parse each split's alignment table and append the query / hit entry IDs. The frame of
# the last split stays bound to `results` for inspection.
for name in names:
    df = pd.read_csv(f'../../splits/task1/{name}.csv')
    results = pd.read_csv(name + '/aln', sep='\t', header=None)
    results = results.assign(**{
        'Entry': results[0].map(return_entry),
        'Retrieved Entry': results[1].map(return_entry),
    })

In [6]:
# Show the alignment table of the last split processed above, with the parsed entry columns appended.
results

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,Entry,Retrieved Entry
0,AF-O13819-F1-model_v4,AF-Q9N589-F1-model_v4,0.313,526,341,0,6,531,35,531,1.908000e-44,1549,O13819,Q9N589
1,AF-Q3Z8V0-F1-model_v4,AF-Q7VYQ1-F1-model_v4,0.331,179,114,0,2,173,15,193,2.722000e-16,571,Q3Z8V0,Q7VYQ1
2,AF-Q98PE5-F1-model_v4,AF-Q8EVK2-F1-model_v4,0.424,158,90,0,1,157,1,158,4.884000e-21,746,Q98PE5,Q8EVK2
3,AF-Q8MYF1-F1-model_v4,AF-Q9P7J8-F1-model_v4,0.448,342,185,0,120,456,222,563,1.394000e-38,1463,Q8MYF1,Q9P7J8
4,AF-P34329-F1-model_v4,AF-P38659-F1-model_v4,0.460,606,326,0,14,618,38,643,8.497000e-82,2505,P34329,P38659
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
549,AF-I3R637-F1-model_v4,AF-Q42997-F1-model_v4,0.330,524,338,0,2,525,78,583,2.217000e-47,1563,I3R637,Q42997
550,AF-P47718-F1-model_v4,AF-P47298-F1-model_v4,0.377,122,75,0,8,129,9,129,5.949000e-16,595,P47718,P47298
551,AF-Q8D3C8-F1-model_v4,AF-Q6FCY1-F1-model_v4,0.443,431,240,0,1,431,1,431,1.325000e-55,2334,Q8D3C8,Q6FCY1
552,AF-Q44243-F1-model_v4,AF-Q5HLX6-F1-model_v4,0.322,412,273,0,25,436,8,410,2.015000e-47,1660,Q44243,Q5HLX6


In [7]:
os.makedirs('../results_summary/Foldseek', exist_ok=True)

# For every split: attach the retrieved Foldseek hit to each test protein, look up the
# hit's EC annotation and save the prediction table.
for name in names:
    df = pd.read_csv(f'../../splits/task1/{name}.csv')

    # Columns 0 and 1 of the alignment file hold the query and hit names; reduce both to entry IDs.
    results = pd.read_csv(name + '/aln', sep='\t', header=None)
    results = results.assign(**{
        'Entry': results[0].map(return_entry),
        'Retrieved Entry': results[1].map(return_entry),
    })

    # Left join keeps every test protein, including those absent from the alignment file
    # (their retrieved entry and prediction stay NaN).
    merged = df.merge(results[['Entry', 'Retrieved Entry']], on='Entry', how='left')
    # The predicted EC column ends up under the integer label 0 in the saved table.
    merged = (
        merged
        .assign(**{'EC Predicted': merged['Retrieved Entry'].map(entry2EC)})
        .rename(columns={'EC Predicted': 0})
    )

    out_path = f'../results_summary/Foldseek/{name}_results_df.csv'
    merged[['Entry', 'EC number', 0]].to_csv(out_path, index=False)

In [8]:
# Show the merged prediction table of the last split processed above; column 0 holds the predicted EC.
merged

Unnamed: 0,Entry,Entry Name,Sequence,EC number,Length,EC All,clusterRes30,clusterRes50,clusterRes70,clusterRes90,EC3,EC2,EC1,Retrieved Entry,0
0,Q0KBD2,DEND_CUPNH,MNVLITGGAGFLGLQLARLLLQRGTLNLDGQPVAIKRLTLLDVVAP...,1.1.1.410,324,1.1.1.410,P44094,Q0KBD2,Q0KBD2,Q0KBD2,1.1.1,1.1,1,P44094,1.1.1.410
1,D4GYH5,AGLM_HALVD,MELSIIGSGYVGTTIAACFAELGHDVVNVDIDEDIVASLNDGQAPI...,1.1.1.22,430,1.1.1.22,O34862,D4GYH5,D4GYH5,D4GYH5,1.1.1,1.1,1,P51585,1.1.1.132
2,P9WQC7,ADHB_MYCTU,MKTKGALIWEFNQPWSVEEIEIGDPRKDEVKIQMEAAGMCRSDHHL...,1.1.1.1,375,1.1.1.1,P80094,P9WQC7,P9WQC7,P9WQC7,1.1.1,1.1,1,P9WQB9,1.1.1.1
3,Q94AX4,DLD_ARATH,MAFASKFARSKTILSFLRPCRQLHSTPKSTGDVTVLSPVKGRRRLP...,1.1.2.4,567,1.1.2.4,Q12627,Q94AX4,Q94AX4,Q94AX4,1.1.2,1.1,1,Q86WU2,1.1.2.4
4,B8MKR3,CYB2_TALSN,MARVLDAAEVAKHNTPESCWVILYGKVYDVTEFISSHPGGVKVILR...,1.1.2.3,496,1.1.2.3,B8MKR3,B8MKR3,B8MKR3,B8MKR3,1.1.2,1.1,1,P32953,1.1.99.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555,Q9WXX0,RBSA1_THEMA,MFPLLAFRGDRMEILKAKGIVKRFPGVVAVDNVDFEVYENEIVSLI...,7.5.2.7,520,7.5.2.7,A4WER4,Q9WXX0,Q9WXX0,Q9WXX0,7.5.2,7.5,7,Q0BG60,7.5.2.11; 7.5.2.7
556,Q831L8,TAGH_ENTFA,MEKELKVRTKLLTKEYSLAQTRIDKLKTLFSVFQNKVPTFWALKGV...,7.5.2.4,447,7.5.2.4,Q03SI5,Q831L8,Q831L8,Q831L8,7.5.2,7.5,7,Q03SI5,7.5.2.4
557,Q9C8T1,AB1I_ARATH,MSIRRPQIPRLLLQNVSCMRNAQQILRHVNVSLHDGGALVLTGTNG...,7.6.2.5,229,7.6.2.5,Q2G9A9,Q9C8T1,Q9C8T1,Q9C8T1,7.6.2,7.6,7,Q2SE49,7.6.2.5
558,Q0A808,CCMA_ALKEH,MGDTALHLVAAPALEATGLQVARGGRPLFRGLGFRLARGGLLCVRG...,7.6.2.5,236,7.6.2.5,Q2G9A9,Q0A808,Q0A808,Q0A808,7.6.2,7.6,7,Q5P3L0,7.6.2.5
