In [3]:
import pickle
import pandas as pd
import numpy as np
import torch
import os
from torch.utils.data import DataLoader
from clipzyme import CLIPZyme
from clipzyme import ReactionDataset

### Note: Run this from the CLIPZyme directory

Set the variable below to the absolute path of your CARE directory (ending in `/CARE`). Follow the CLIPZyme installation instructions, but install the `clipzyme` package with `python -m pip install -e .` instead.

For training, replace `clipzyme/datasets/enzymemap.py` with the version provided in CARE. You can also remove the validation dataloader in `scripts/main.py` to speed things up.

In [4]:
# Absolute path to the CARE repository root (ending in `/CARE`). Set the
# CARE_DIRECTORY environment variable to override the default without editing
# the notebook. A trailing slash is stripped because later cells build paths
# such as f"{CARE_directory}/splits/...".
CARE_directory = os.environ.get('CARE_DIRECTORY', '/disk1/jyang4/repos/CARE').rstrip('/')

# Step 0: Preparation

In [None]:
# Build the list of AlphaFold structure files to download: one
# `AF-<Entry>-F1-model_v4.cif` object per unique `Entry` value in the clustered
# protein table. CARE_directory has no trailing slash, so the separator before
# `processed_data` must be written explicitly.
swissprot = pd.read_csv(f'{CARE_directory}/processed_data/protein2EC_clustered50.csv')
uniprot_ids = swissprot['Entry'].unique()
file_paths = [f"gs://public-datasets-deepmind-alphafold-v4/AF-{u}-F1-model_v4.cif" for u in uniprot_ids]
output_file = 'uniprot_cif_paths.txt'

# One URL per line; the next cell pipes this file into `gsutil cp -I`.
with open(output_file, 'w') as file:
    file.write('\n'.join(file_paths))

# The multi-source form of `gsutil cp` requires an existing destination directory.
os.makedirs('files/AF_structures', exist_ok=True)

Download the AlphaFold (AF2) database structures from Google Cloud.

In [None]:
!cat uniprot_cif_paths.txt | gsutil -m cp -I files/AF_structures

 # Step 1: Training

 Move the configs from the CARE repo to `configs/train/` in the CLIPZyme repo. Train three different models for each of the different splits.

In [None]:
!python scripts/dispatcher.py -c configs/train/CARE_clip_egnn_easy.json -l ./logs/

# Step 2: Inference

### Convert CARE splits to the format for CLIPZyme inference

In [3]:
def process_reaction(reaction):
    """Remove bare proton species (`[H+]`) from both sides of a reaction SMILES.

    The input must contain exactly one `>>` separator; anything else makes the
    two-way unpacking below raise ValueError. Only components that are exactly
    `[H+]` are removed; every other component keeps its original order.

    Returns the rebuilt `reactants>>products` string.
    """
    lhs, rhs = reaction.split(">>")

    def _without_protons(side):
        # Components of one side are "."-separated.
        return ".".join(species for species in side.split(".") if species != "[H+]")

    return "{}>>{}".format(_without_protons(lhs), _without_protons(rhs))

In [5]:
# Convert each CARE task-2 test split into the CSV layout read for CLIPZyme
# inference: a cleaned `reaction` column plus protein columns holding one fixed
# placeholder protein (only the reactions matter in these files).
for dataset in ("easy_reaction_test", "medium_reaction_test", "hard_reaction_test"):
    df = pd.read_csv(f'{CARE_directory}/splits/task2/{dataset}.csv').rename(
        columns={"Mapped Reaction": "reaction"}
    )

    # Keep only the reaction column, with bare [H+] species stripped out.
    df = df["reaction"].apply(process_reaction).to_frame()

    # filler values
    df = df.assign(
        sequence="MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENARIQSKLSDLQKKKIDIDNKLLKEKQNLIKEEILERKKLEVLTKKQQKDEIEHQKKLKREIDAIKASTQYITDVSISSYNNTIPETEPEYDLFISHASEDKEDFVRPLAETLQQLGVNVWYDEFTLKVGDSLRQKIDSGLRNSKYGTVVLSTDFIKKDWTNYELDGLVAREMNGHKMILPIWHKITKNDVLDYSPNLADKVALNTSVNSIEEIAHQLADVILNR",
        protein_id="A0A009IHW8",
        cif="files/AF_structures/AF-A0A009IHW8-F1-model_v4.cif",
    )

    df.to_csv(f'files/{dataset}.csv', index=False)

In [33]:
# Build the protein screening CSV: rename the CARE columns to `sequence` and
# `protein_id`, and point each protein at its AlphaFold structure file.
df = (
    pd.read_csv(f"{CARE_directory}/processed_data/protein2EC_clustered50.csv")
    .rename(columns={"Sequence": "sequence", "Entry": "protein_id"})
)
df["cif"] = [f"files/AF_structures/AF-{pid}-F1-model_v4.cif" for pid in df["protein_id"]]

# Every row gets the same fixed atom-mapped reaction as a filler value.
df["reaction"] = "[CH2:1]=[CH:2][CH2:3][N:4]1[CH2:5][CH2:6][C@:7]23[c:8]4[c:9]5[cH:10][cH:11][c:12]([OH:13])[c:14]4[O:15][C@H:16]2[C@@H:17]([OH:18])[CH2:19][CH2:20][C@@:21]3([OH:22])[C@H:23]1[CH2:24]5.[NH2:25][C:26](=[O:27])[c:28]1[cH:29][cH:30][cH:31][n+:32]([C@@H:33]2[O:34][C@H:35]([CH2:36][O:37][P:38](=[O:39])([OH:40])[O:41][P:42](=[O:43])([OH:44])[O:45][CH2:46][C@H:47]3[O:48][C@@H:49]([n:50]4[cH:51][n:52][c:53]5[c:54]([NH2:55])[n:56][cH:57][n:58][c:59]45)[C@H:60]([O:61][P:62](=[O:63])([OH:64])[OH:65])[C@@H:66]3[OH:67])[C@@H:68]([OH:69])[C@H:70]2[OH:71])[cH:72]1>>[CH2:1]=[CH:2][CH2:3][N:4]1[CH2:5][CH2:6][C@:7]23[c:8]4[c:9]5[cH:10][cH:11][c:12]([OH:13])[c:14]4[O:15][C@H:16]2[C:17](=[O:18])[CH2:19][CH2:20][C@@:21]3([OH:22])[C@H:23]1[CH2:24]5.[NH2:25][C:26](=[O:27])[C:28]1=[CH:72][N:32]([C@@H:33]2[O:34][C@H:35]([CH2:36][O:37][P:38](=[O:39])([OH:40])[O:41][P:42](=[O:43])([OH:44])[O:45][CH2:46][C@H:47]3[O:48][C@@H:49]([n:50]4[cH:51][n:52][c:53]5[c:54]([NH2:55])[n:56][cH:57][n:58][c:59]45)[C@H:60]([O:61][P:62](=[O:63])([OH:64])[OH:65])[C@@H:66]3[OH:67])[C@@H:68]([OH:69])[C@H:70]2[OH:71])[CH:31]=[CH:30][CH2:29]1"
df.to_csv("files/protein2EC_clustered50.csv", index=False)

### Run the dispatcher to extract representations for reactions and proteins
Move the relevant configs from the CARE repo to the CLIPZyme repo.

In [None]:
#extract protein representations
!python scripts/dispatcher.py -c configs/screening_proteins_easy.json -l ./logs/screening/

In [None]:
#extract reaction representations
!python scripts/dispatcher.py -c configs/screening_reactions_easy.json -l ./logs/screening/

Some of the reactions and proteins will not be successfully loaded by the dataset. Just drop these for now.

### Compile the extracted representations

Functions to process the protein and reaction representations. Missing embeddings are skipped, and the protein embeddings are averaged per EC number to form the cluster centers.

In [6]:
def process_proteins(split, embedding_dim=1280):
    """Average the extracted protein embeddings per EC number and save them.

    Reads `results/CARE_protein/last/<split>/sample_sample_<i>.protein.pt` for
    every row `i` of `files/protein2EC_clustered50.csv`, averages the embeddings
    of all rows sharing an `EC number`, and writes the result, ordered like
    `EC_list.txt`, to
    `<CARE_directory>/task2_baselines/CLIPZyme/output/<split>_split/representations/all_ECs_cluster_centers.npy`
    as a dict with the single key `protein_repr_array`.

    Args:
        split: Name of the split sub-directory (e.g. 'easy', 'medium', 'hard').
        embedding_dim: Length of each embedding vector (defaults to 1280).

    Returns:
        Array of shape (number of ECs in EC_list.txt, embedding_dim). ECs with
        no available protein embedding keep an all-zero row.
    """
    path = 'results/CARE_protein/last/{}'.format(split)

    df = pd.read_csv('files/protein2EC_clustered50.csv')
    length = len(df)
    # ndmin=1 keeps a one-line EC list iterable instead of a 0-d array.
    EClist = np.loadtxt(f"{CARE_directory}/processed_data/EC_list.txt", dtype=str, ndmin=1)

    embeddings = np.zeros((length, embedding_dim))
    failed = []
    for i in range(length):
        try:
            # map_location keeps GPU-saved tensors loadable on any machine.
            loaded = torch.load(f'{path}/sample_sample_{i}.protein.pt', map_location='cpu')
        except FileNotFoundError:
            # No embedding was written for this protein; skip it.
            failed.append(i)
            continue
        embeddings[i] = loaded.detach().numpy() if torch.is_tensor(loaded) else np.asarray(loaded)

    if failed:
        print(f'{split}: skipped {len(failed)} of {length} proteins with no saved embedding')

    # Remember each row's position in `embeddings`, then drop the rows that failed.
    df['index'] = df.index
    df = df.drop(failed)

    ec2index = df.groupby('EC number')['index'].apply(list).to_dict()

    cluster_centers = np.zeros((len(EClist), embedding_dim))
    for i, ec in enumerate(EClist):
        indices = ec2index.get(ec)
        if indices:
            # Average together the embeddings for each EC number.
            cluster_centers[i] = embeddings[indices].mean(axis=0)

    results = {"protein_repr_array": cluster_centers}
    output_dir = f'{CARE_directory}/task2_baselines/CLIPZyme/output/{split}_split/representations'
    os.makedirs(output_dir, exist_ok=True)
    np.save(f'{output_dir}/all_ECs_cluster_centers.npy', results)

    return cluster_centers

def process_reactions(split, embedding_dim=1280):
    """Stack the extracted reaction embeddings for one test split and save them.

    Reads `results/CARE_reaction/last/<split>_reaction_test/sample_sample_<i>.reaction.pt`
    for every row `i` of `files/<split>_reaction_test.csv` and writes the stacked
    array to
    `<CARE_directory>/task2_baselines/CLIPZyme/output/<split>_split/representations/<split>_reaction_test_representations.npy`
    as a dict with the single key `reaction_repr_array`.

    Args:
        split: Split name (e.g. 'easy', 'medium', 'hard').
        embedding_dim: Length of each embedding vector (defaults to 1280).

    Returns:
        Array of shape (number of reactions, embedding_dim). Reactions with no
        saved embedding keep an all-zero row so row order matches the CSV.
    """
    dataset = split + '_reaction_test'
    path = 'results/CARE_reaction/last/' + dataset

    df = pd.read_csv('files/{}.csv'.format(dataset))
    length = len(df)

    embeddings = np.zeros((length, embedding_dim))
    failed = []
    for i in range(length):
        try:
            # map_location keeps GPU-saved tensors loadable on any machine.
            loaded = torch.load(f'{path}/sample_sample_{i}.reaction.pt', map_location='cpu')
        except FileNotFoundError:
            # No embedding was written for this reaction; leave its row as zeros.
            failed.append(i)
            continue
        embeddings[i] = loaded.detach().numpy() if torch.is_tensor(loaded) else np.asarray(loaded)

    if failed:
        print(f'{split}: {len(failed)} of {length} reactions have no saved embedding')

    results = {"reaction_repr_array": embeddings}
    output_dir = f'{CARE_directory}/task2_baselines/CLIPZyme/output/{split}_split/representations'
    # Create the directory here as well so this function does not depend on
    # process_proteins having been run first for the same split.
    os.makedirs(output_dir, exist_ok=True)
    np.save(f'{output_dir}/{split}_reaction_test_representations.npy', results)

    return embeddings

In [7]:
# Compile and save the protein cluster centers and the reaction embeddings for
# each split. Both result names are rebound on every pass, so only the arrays
# of the last split remain in scope after the loop.
for split in ('easy', 'medium', 'hard'):
    cluster_centers = process_proteins(split)
    embeddings = process_reactions(split)

## Code below is only an example
### Alternative way to run inference (slower)

In [2]:
# Load the trained checkpoint, place it on the second GPU ("cuda:1") and switch
# it to evaluation mode.
model = (
    CLIPZyme(checkpoint_path="checkpoints/74d55ed2e3506862b41906157d03193c/last.ckpt")
    .to("cuda:1")
    .eval()
)

In [34]:
# Protein dataset and loader
# --------------------------
# Reads the protein screening CSV; protein_cache_dir is optional and is where
# the processed protein graphs are cached.
protein_dataset = ReactionDataset(
    dataset_file_path="files/protein2EC_clustered50.csv",
    esm_dir="files/esm2_dir",
    protein_cache_dir="files/AF_graphs",
)
protein_dataloader = DataLoader(dataset=protein_dataset, batch_size=128)

Building dataset: 100%|██████████████████████████████████████| 29327/29327 [00:54<00:00, 539.51it/s]


In [None]:
# Run `model.extract_protein_features` on every batch from the protein loader.
# NOTE: `protein_hiddens` is rebound on each iteration, so only the output for
# the final batch is left once the loop ends; collect the outputs in a list if
# all of them are needed.
for batch in protein_dataloader:
  protein_hiddens = model.extract_protein_features(batch) 

In [30]:
# Reaction dataset and loader
# ---------------------------
# Reads the converted easy-split test reactions; protein_cache_dir is optional
# and is where the processed protein graphs are cached.
reaction_dataset = ReactionDataset(
    dataset_file_path="files/easy_reaction_test.csv",
    esm_dir="files/esm2_dir",
    protein_cache_dir="files/AF_graphs",
)
reaction_dataloader = DataLoader(dataset=reaction_dataset, batch_size=128)

Building dataset: 100%|█████████████████████████████████████████| 393/393 [00:00<00:00, 1061.96it/s]


In [6]:
# Iterate the batched DataLoader, as the protein loop does. Iterating
# `reaction_dataset` directly hands the model single, un-collated samples.
# NOTE(review): the RuntimeError recorded for this cell reports CPU and cuda:1
# tensors being mixed. If it persists with the loader, the batch may also need
# to be moved to the model's device; confirm.
# `reaction_hiddens` is rebound on each iteration, so only the last batch's
# output remains after the loop.
for batch in reaction_dataloader:
  reaction_hiddens = model.extract_reaction_features(batch)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:1! (when checking argument for argument mat2 in method wrapper_CUDA_mm)