In [25]:
from emir.emir.estimators import KNIFEEstimator

ModuleNotFoundError: No module named 'knife_estimator'

In [12]:
import os

from models.moleculenet_models import GNN, GNN_graphpred
from data.moleculenet_encoding import mol_to_graph_data_obj_simple
import datamol as dm
import torch
from torch_geometric.data import DataLoader
import torch_geometric.nn.pool as tgp

MODEL_PARAMS = {
    "num_layer": 5,
    "emb_dim": 300,
    "JK": "last",
    "drop_ratio": 0.5,
    "gnn_type": "gin",
}

In [13]:
df = dm.data.freesolv()
dataloader = DataLoader(
    [mol_to_graph_data_obj_simple(dm.to_mol(smiles))for smiles in df["smiles"]],
    batch_size=32,
    shuffle=False
)



In [19]:
@torch.no_grad()
def get_embeddings_from_model(
        path:str = "backbone_pretrained_models/GROVER/grover.pth",
        pooling_method = tgp.global_mean_pool
):
    embeddings = []
    molecule_model = GNN(**MODEL_PARAMS)
    molecule_model.load_state_dict(torch.load(path))
    for b in dataloader:
        embeddings.append(
            pooling_method(molecule_model(b.x, b.edge_index, b.edge_attr), b.batch)
        )
    embeddings = torch.cat(embeddings, dim=0)
    return embeddings

In [20]:
MODEL_PATH = "backbone_pretrained_models"
MODELS = {}
# For every directory in the folder
for model_name in os.listdir(MODEL_PATH):
    # For every file in the directory
    for file_name in os.listdir(os.path.join(MODEL_PATH, model_name)):
        # If the file is a .pth file
        if file_name.endswith(".pth"):
            MODELS[model_name] = os.path.join(MODEL_PATH, model_name, file_name)

In [21]:
embeddings = {}
for model_name, model_path in MODELS.items():
    embeddings[model_name] = get_embeddings_from_model(model_path)

In [23]:
from molfeat.trans.fp import FPVecTransformer
from molfeat.trans import MoleculeTransformer
threeD_method_fpvec = ["usrcat", "electroshape", "usr"]
threeD_method_moleculetransf = ["cats3d",]
fpvec_method = ["ecfp-count", "ecfp",  "estate", "erg", "rdkit", "topological", "avalon", "maccs"]
moleculetransf_method = ["scaffoldkeys", "cats2d", ]
pharmac_method = ["cats", "default", "gobbi", "pmapper"]

for name in fpvec_method:
    transformer = FPVecTransformer(kind=name, dtype=float)
    embeddings[name] = transformer(df["smiles"])

In [24]:
embeddings

{'ContextPred': tensor([[ 0.0992, -0.0972,  0.0474,  ..., -0.0591, -0.1079, -0.0863],
         [ 0.0373, -0.1306,  0.0129,  ..., -0.1211, -0.0520,  0.0513],
         [ 0.0000, -0.0608, -0.0546,  ...,  0.3204,  0.1716,  0.0168],
         ...,
         [ 0.1591, -0.2408,  0.1733,  ..., -0.1686, -0.3030, -0.2356],
         [-0.4808,  0.4689, -0.4982,  ...,  0.6292,  0.4905,  0.1037],
         [ 0.1403, -0.2114,  0.1815,  ..., -0.7472, -0.1426, -0.7459]]),
 'GPT-GNN': tensor([[ 2.8532e-03,  1.9947e-03, -1.9738e-03,  ...,  2.8086e-03,
          -2.5471e-01, -4.1803e-03],
         [-5.4672e-03,  3.8277e-03,  3.3451e-03,  ...,  1.9955e-04,
           2.0333e-01, -1.4783e-03],
         [-4.0884e-03,  5.5009e-03, -1.0612e-03,  ...,  2.1441e-04,
           1.2011e+00, -3.7273e-03],
         ...,
         [-4.3468e-03, -3.5937e-03, -1.7544e-03,  ...,  6.6943e-04,
          -1.6718e+00, -1.8010e-03],
         [ 9.2165e-03,  5.3083e-03,  0.0000e+00,  ..., -1.7212e-03,
           3.8953e-01, -2.2919