In [None]:
import pickle
import pandas as pd
from clipzyme import CLIPZyme
import numpy as np
import torch

For the CLIPZyme installation run this command instead:

python -m pip install -e .

1. Move all files from the `reformatted` folder to the CLIPZyme repo under `CLIPZyme/files`
2. Move this notebook to `CLIPZyme` in the CLIPZyme repo

### Retrieve the protein structures from the AF database

In [None]:
# Copy every structure file listed in uniprot_cif_paths.txt into files/AF_structures.
# `gsutil cp -I` reads the list of source paths from stdin; `-m` performs the copies in parallel.
!cat uniprot_cif_paths.txt | gsutil -m cp -I files/AF_structures

### Extract protein representations
This only needs to be done once, because the model is the same for each split.

In [4]:
# Stack the per-sample protein embeddings written under `path` into one array.
# Row i of `embeddings` lines up with row i of the CSV; rows whose embedding
# file is missing stay all-zero and their positions are collected in `failed`.
path = 'results/pretrained/clipzyme_model/protein2EC_cluster50'

df = pd.read_csv('files/protein2EC_cluster50.csv')
length = len(df)
# NOTE(review): machine-specific absolute path; point it at your own CARE checkout.
EClist = np.loadtxt("/disk1/jyang4/repos/CARE/processed_data/EC_list.txt", dtype=str)

embeddings = np.zeros((length, 1280))
failed = []
for i in range(length):
    embedding_file = f'{path}/sample_sample_{i}.protein.pt'
    try:
        embeddings[i] = torch.load(embedding_file)
    except FileNotFoundError:
        # Only a missing file is an expected failure. Anything else (corrupt
        # file, unexpected shape, interrupt) should surface instead of being
        # misreported as "not found".
        print(f'{embedding_file} not found')
        failed.append(i)

results/pretrained/clipzyme_model/protein2EC_cluster50/sample_sample_155.protein.pt not found
results/pretrained/clipzyme_model/protein2EC_cluster50/sample_sample_200.protein.pt not found
results/pretrained/clipzyme_model/protein2EC_cluster50/sample_sample_209.protein.pt not found
results/pretrained/clipzyme_model/protein2EC_cluster50/sample_sample_290.protein.pt not found
results/pretrained/clipzyme_model/protein2EC_cluster50/sample_sample_293.protein.pt not found
results/pretrained/clipzyme_model/protein2EC_cluster50/sample_sample_298.protein.pt not found
results/pretrained/clipzyme_model/protein2EC_cluster50/sample_sample_299.protein.pt not found
results/pretrained/clipzyme_model/protein2EC_cluster50/sample_sample_303.protein.pt not found
results/pretrained/clipzyme_model/protein2EC_cluster50/sample_sample_304.protein.pt not found
results/pretrained/clipzyme_model/protein2EC_cluster50/sample_sample_305.protein.pt not found
results/pretrained/clipzyme_model/protein2EC_cluster50/sampl

In [5]:
# Report how many samples had no embedding file.
print(len(failed))
# Record each row's label in an 'index' column, then remove the rows listed in
# `failed`. The labels are not reset by the drop, so they can still be used to
# look rows up after filtering.
# `errors='ignore'` keeps the cell re-runnable: on a second run the failed
# labels are already gone and a plain drop() would raise KeyError.
df = df.assign(index=df.index).drop(index=failed, errors='ignore')

522


In [6]:
# Map every EC number to the list of 'index' values of the proteins that still
# have an embedding (rows with missing embeddings were dropped from `df`).
ec2index = df.groupby('EC number')['index'].apply(list).to_dict()

# One row per entry of EClist, in EClist order: the mean embedding of all
# proteins annotated with that EC number.
cluster_centers = np.zeros((len(EClist), 1280))
for i, ec in enumerate(EClist):
    indices = ec2index.get(ec)
    if indices is None:
        # No embedded protein for this EC number: its row stays all-zero.
        # An explicit lookup is used instead of a bare `except:` so that
        # genuine errors (e.g. an out-of-range index) are not silently hidden.
        continue
    cluster_centers[i] = np.mean(embeddings[indices], axis=0)

cluster_centers

array([[ 0.00695404,  0.00906996, -0.0018562 , ...,  0.0009524 ,
        -0.00348147,  0.01772444],
       [ 0.00653346,  0.0184211 , -0.03409347, ...,  0.00590318,
         0.00092498,  0.03337769],
       [ 0.01728218,  0.00888956, -0.01027402, ...,  0.00378615,
        -0.00412735,  0.02607958],
       ...,
       [ 0.0434782 , -0.02031397,  0.00936925, ..., -0.00888089,
        -0.00672255,  0.03892438],
       [ 0.00718313, -0.00829206,  0.02199208, ..., -0.00457703,
        -0.01323812, -0.00293109],
       [ 0.04028151, -0.0029525 ,  0.01158899, ..., -0.00164447,
        -0.00932373,  0.02943208]])

In [7]:
# Persist the per-EC cluster centers under the "protein_repr_array" key.
# np.save receives a dict here, so the file holds a pickled object; reading it
# back needs np.load(..., allow_pickle=True).
results = {"protein_repr_array": cluster_centers}
np.save(
    '/disk1/jyang4/repos/CARE/task2_baselines/CLIPZyme/output/easy_split/representations/all_ECs_cluster_centers.npy',
    results,
)

### Extract reaction representations

In [3]:
# For each test split, stack the per-sample reaction embeddings into one array
# (row i matches row i of the split's CSV) and save it under the
# "reaction_repr_array" key. Rows whose embedding file is missing stay all-zero
# so that row alignment with the CSV is preserved.
for split in ['easy', 'medium', 'hard']:
    # The results directories are named '<split>_reaction_test' (with an
    # underscore), as the save filename below also uses; plain concatenation
    # of split + 'reaction_test' pointed at a non-existent location.
    # NOTE(review): the CSV name is assumed to match the directory name, as it
    # does for the protein data — confirm against the files in `files/`.
    dataset = f'{split}_reaction_test'
    path = f'results/pretrained/clipzyme_model/{dataset}'

    df = pd.read_csv(f'files/{dataset}.csv')
    length = len(df)

    embeddings = np.zeros((length, 1280))
    failed = []
    for i in range(length):
        embedding_file = f'{path}/sample_sample_{i}.reaction.pt'
        try:
            embeddings[i] = torch.load(embedding_file)
        except FileNotFoundError:
            # Only a missing file is an expected failure; other errors should
            # surface rather than be misreported as "not found".
            print(f'{embedding_file} not found')
            failed.append(i)

    # np.save receives a dict, so the file holds a pickled object; reading it
    # back needs np.load(..., allow_pickle=True).
    # NOTE(review): machine-specific absolute output path.
    results = {"reaction_repr_array": embeddings}
    np.save('/disk1/jyang4/repos/CARE/task2_baselines/CLIPZyme/output/{}_split/representations/{}_reaction_test_representations.npy'.format(split, split), results)

results/pretrained/clipzyme_model/hard_reaction_test/sample_sample_16.reaction.pt not found
results/pretrained/clipzyme_model/hard_reaction_test/sample_sample_63.reaction.pt not found
results/pretrained/clipzyme_model/hard_reaction_test/sample_sample_128.reaction.pt not found
results/pretrained/clipzyme_model/hard_reaction_test/sample_sample_129.reaction.pt not found
results/pretrained/clipzyme_model/hard_reaction_test/sample_sample_130.reaction.pt not found
results/pretrained/clipzyme_model/hard_reaction_test/sample_sample_131.reaction.pt not found
results/pretrained/clipzyme_model/hard_reaction_test/sample_sample_132.reaction.pt not found
results/pretrained/clipzyme_model/hard_reaction_test/sample_sample_133.reaction.pt not found
results/pretrained/clipzyme_model/hard_reaction_test/sample_sample_134.reaction.pt not found
results/pretrained/clipzyme_model/hard_reaction_test/sample_sample_135.reaction.pt not found
results/pretrained/clipzyme_model/hard_reaction_test/sample_sample_136.r

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.01971458, 0.        , 0.        , ..., 0.04135529, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.09832948, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.09652392]])