After ATOMICA was trained for 3 epochs to predict made-up affinities, this notebook uses the trained model to predict affinities for additional items.

In [12]:
import numpy as np
import json
import os
import pandas as pd
import sys
import torch
from tqdm import tqdm

sys.path.append("..")

import models

from data.dataset import PDBBindBenchmark
from trainers.abs_trainer import Trainer
from models import AffinityPredictor

In [17]:
# Locations of the configuration files, the fine-tuned checkpoint and the input items
pretrain_config_path = "../model_weights/pretrain_model_config.json"
cktp_config_path = "../checkpoints/version_14/args.json"
weights_path = "../checkpoints/version_14/checkpoint/epoch35_step36.ckpt"
data_path = "../data/other/divergent_items.pkl"

# Parse the pretraining configuration JSON into a plain Python object
with open(pretrain_config_path) as f:
    pretrain_config = json.load(f)


class Args:
    """Attribute-style view of a JSON configuration file.

    Each top-level key of the JSON object stored at ``cfg_path`` becomes an
    instance attribute holding the corresponding value.
    """

    def __init__(self, cfg_path):
        with open(cfg_path, "r") as cfg_file:
            loaded = json.load(cfg_file)
        # Args defines no __slots__ or properties, so filling the instance
        # dict directly is equivalent to calling setattr for each pair.
        vars(self).update(loaded.items())


# Hyperparameters read from the checkpoint's args.json
args = Args(cktp_config_path)

# Keyword expected by AffinityPredictor.load_from_pretrained -> attribute name on `args`
_param_sources = {
    "num_affinity_pred_layers": "num_pred_layers",
    "affinity_pred_dropout": "pred_dropout",
    "affinity_pred_hidden_size": "pred_hidden_size",
    "num_projector_layers": "num_projector_layers",
    "projector_dropout": "projector_dropout",
    "projector_hidden_size": "projector_hidden_size",
    "bottom_global_message_passing": "bottom_global_message_passing",
    "global_message_passing": "global_message_passing",
    "k_neighbors": "k_neighbors",
    "dropout": "dropout",
    "block_embedding_size": "block_embedding_size",
    "block_embedding0_size": "block_embedding0_size",
    "block_embedding1_size": "block_embedding1_size",
}
add_params = {param: getattr(args, attr) for param, attr in _param_sources.items()}

# Supported activation names mapped to their torch module classes; only the
# selected one is instantiated.
_nonlinearities = {
    "relu": torch.nn.ReLU,
    "gelu": torch.nn.GELU,
    "elu": torch.nn.ELU,
}
if args.pred_nonlinearity not in _nonlinearities:
    raise NotImplementedError(f"Nonlinearity {args.pred_nonlinearity} not implemented")
add_params["nonlinearity"] = _nonlinearities[args.pred_nonlinearity]()

In [18]:
# Build the affinity predictor from the checkpoint at weights_path, passing the
# settings collected in add_params as keyword arguments.
model = AffinityPredictor.load_from_pretrained(weights_path, **add_params)

  pretrained_model: DenoisePretrainModel = torch.load(pretrain_ckpt, map_location="cpu")


Pretrained model params: hidden_size=32,
               edge_size=32, k_neighbors=4, 
               n_layers=4, bottom_global_message_passing=False,
               global_message_passing=True, 
               fragmentation_method=PS_300




In [19]:
# Print the module tree of the loaded model to inspect its layers.
print(model)

AffinityPredictor(
  (mse_loss): MSELoss()
  (block_embedding): BlockEmbedding(
    (block_embedding): Embedding(440, 32)
    (atom_embedding): Embedding(121, 32)
  )
  (edge_embedding_bottom): Embedding(4, 32)
  (edge_embedding_top): Embedding(4, 32)
  (encoder): ATOMICAEncoder(
    (encoder): InteractionModule(
      (edge_embedder): Sequential(
        (0): GaussianEmbedding()
        (1): Linear(in_features=32, out_features=32, bias=True)
        (2): ReLU()
        (3): Dropout(p=0.0, inplace=False)
        (4): Linear(in_features=32, out_features=32, bias=True)
      )
      (layers): ModuleList(
        (0): TensorProductConvLayer(
          (tp): FullyConnectedTensorProduct(32x0e x 1x0e+1x1o+1x2e -> 32x0e+16x1o+16x2e | 2048 paths | 2048 weights)
          (fc): Sequential(
            (0): Linear(in_features=128, out_features=128, bias=True)
            (1): ReLU()
            (2): Dropout(p=0.0, inplace=False)
            (3): Linear(in_features=128, out_features=2048, bias=Tr

In [20]:
# Report the model size: all parameters, then only those with requires_grad set
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_params} parameters in total")
print(f"{trainable_params} trainable parameters in total")

7458017 parameters in total
7439809 trainable parameters in total


In [21]:
# Build the benchmark dataset from the file at data_path.
# NOTE(review): presumably this reads the same pickled items that are loaded
# into test_items below — confirm that both keep the same order.
dataset = PDBBindBenchmark(data_path)

In [22]:
# Load the raw items from the same file the dataset was built from (data_path),
# so the two can never point at different files.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
test_items = pd.read_pickle(data_path)

# Show the affinity record of the first two items as a sanity check; slicing
# avoids an IndexError when the file holds fewer than two items.
for item in test_items[:2]:
    print(item.get("affinity"))

{'value': 71.19000000000003, 'neglog_aff': 1.0}
{'value': 60.60000000000002, 'neglog_aff': 0.0}


In [33]:
# Results table with one row per test item: id, true neglog affinity and a
# placeholder for the prediction, which the inference loop fills in later.
# Rows are collected in a list and the DataFrame is built once, instead of
# growing it row by row, which re-allocates on every insert.
# Row order follows test_items; the inference loop writes predictions back by
# position, so this order must match the order of `dataset`.
records = []
for item in tqdm(test_items):
    # Normalise the identifier: upper-case it and strip the "_A_BC" suffix
    item_id = item.get("id").upper()
    item_id = item_id.replace("_A_BC", "")
    true_neglog = item.get("affinity").get("neglog_aff")

    records.append(
        {
            "id": item_id,
            "true_neglog": true_neglog,
            "predicted_neglog": None,
        }
    )

# Passing `columns` keeps the three columns even when test_items is empty
output_df = pd.DataFrame(records, columns=["id", "true_neglog", "predicted_neglog"])

output_df

100%|██████████| 2/2 [00:00<00:00, 707.78it/s]


Unnamed: 0,id,true_neglog,predicted_neglog
0,AAAAAAAAAAAAAAAAAAAAA,1.0,
1,GGGGGGGGGGGGGGGGGGGGG,0.0,


In [34]:
import csv

batch_size = 1  # Adjust batch size as needed

output_path = "affinity_predictions.csv"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Switch dropout-style layers to inference behaviour; harmless if infer() already does so.
model.eval()

# Predictions are written back by row position, so the dataset and the results
# table must describe the same items in the same order.
if len(dataset) != len(output_df):
    raise ValueError(
        f"dataset has {len(dataset)} items but output_df has {len(output_df)} rows; "
        "predictions cannot be aligned by position"
    )

for i in range(0, len(dataset), batch_size):
    # Dataset positions covered by this batch (the last batch may be shorter)
    batch_indices = range(i, min(i + batch_size, len(dataset)))
    batch = PDBBindBenchmark.collate_fn([dataset[j] for j in batch_indices])
    batch = Trainer.to_device(batch, device)
    if "label" not in batch:
        # NOTE(review): presumably infer() needs a "label" entry even at
        # inference time — confirm. A first pass with a placeholder label is
        # used to learn the prediction shape, then a zero label of that shape
        # is attached. Both tensors are created on `device` so they live on
        # the same device as the rest of the batch.
        with torch.no_grad():
            dummy_pred = model.infer({**batch, "label": torch.zeros(1, device=device)})
        label_shape = dummy_pred[1].shape if isinstance(dummy_pred, tuple) else dummy_pred.shape
        batch["label"] = torch.zeros(label_shape, device=device)

    with torch.no_grad():
        prediction = model.infer(batch)
    # infer() may return a tuple; in that case the predictions are its second element
    pred_values = prediction[1] if isinstance(prediction, tuple) else prediction
    pred_values = pred_values.cpu().numpy().flatten()

    # Write every prediction of the batch to its own row. Writing only
    # pred_values[0] at row i would drop all but the first prediction of each
    # batch whenever batch_size > 1.
    for row, value in zip(batch_indices, pred_values):
        output_df.loc[row, "predicted_neglog"] = value

    # Release per-batch tensors before the next iteration
    del batch, prediction
    torch.cuda.empty_cache()

tensor([0.9436], device='cuda:0')
tensor([-0.0708], device='cuda:0')


In [35]:
# Display the full results table (no filtering is applied), now with the
# predicted_neglog column filled in by the inference loop.
output_df

Unnamed: 0,id,true_neglog,predicted_neglog
0,AAAAAAAAAAAAAAAAAAAAA,1.0,0.943614
1,GGGGGGGGGGGGGGGGGGGGG,0.0,-0.07078


In [36]:
# Spearman rank correlation between the true and the predicted neglog affinities
from scipy.stats import spearmanr

true_values = output_df["true_neglog"]
predicted_values = output_df["predicted_neglog"]
spearman_corr = spearmanr(true_values, predicted_values)
rho, p_value = spearman_corr.correlation, spearman_corr.pvalue
print(f"Spearman correlation: {rho}, p-value: {p_value}")

Spearman correlation: 0.9999999999999999, p-value: nan


In [37]:
# Write the results table to output_path as CSV; index=False omits the
# DataFrame index column from the file.
output_df.to_csv(output_path, index=False)