In [None]:
!pip install torch torchvision
!pip install transformers
!pip install rdkit
!pip install selfies

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from scipy.spatial.distance import cdist, cosine
from scipy.stats import pearsonr, spearmanr
from tqdm import tqdm
from rdkit import Chem

import logging
import random
from pathlib import Path
from typing import Dict, Optional, Tuple

import numpy as np
import pandas as pd
import selfies

In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline

# Smoke test: load a pretrained masked-LM checkpoint together with its tokenizer,
# push one SMILES string through the model and print part of the result.
#any model weights from the link above will work here
model = AutoModelForMaskedLM.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
# NOTE(review): `input` rebinds the Python builtin of the same name at notebook
# level; any later cell calling the builtin input() will hit this tensor instead.
input = tokenizer.encode("Cc1cc(=O)[nH]c(=S)[nH]1", return_tensors="pt")
output  = model(input)
# Last field of the model output, then its last entry (the printed value below
# is a 2-D tensor that still carries a grad_fn).
print(output[-1][-1])

tensor([[-2.4201, -1.6458, -2.8918,  ..., -1.0299, -1.1793, -3.9012],
        [-1.9452, -1.2956, -1.2577,  ..., -0.9928, -3.0398, -3.3492],
        [-2.4343, -1.0198, -3.4737,  ..., -1.7579, -0.1069, -3.0434],
        ...,
        [-2.8679, -1.6012, -1.9900,  ..., -1.5939, -0.1952, -2.5827],
        [-2.5229, -2.5303, -3.0652,  ..., -2.5504, -1.7058, -3.2212],
        [-2.2633, -2.4018, -2.4664,  ..., -1.0951, -1.1390, -2.8362]],
       grad_fn=<SelectBackward0>)


In [None]:
# Two example SMILES strings kept around for manual experiments; neither name is
# referenced in the visible part of the notebook.
testmol = "c1cc2ccc3cccc4ccc(c1)c2c34"
# Same string that is encoded in the model smoke-test cell.
testmol2 = "Cc1cc(=O)[nH]c(=S)[nH]1"

In [None]:
def get_pairwise_chemical_similarities(mol_start, mol_end):
    """Score how similar two molecules are using three fingerprint comparisons.

    Args:
        mol_start: first molecule, as accepted by the RDKit fingerprint functions.
        mol_end: second molecule, same kind of object as ``mol_start``.
    Returns:
        tuple: ``(morgan_dice, morgan_tanimoto, rdkit_similarity)`` where
            * ``morgan_dice`` is ``DataStructs.DiceSimilarity`` of the two
              Morgan fingerprints,
            * ``morgan_tanimoto`` is ``DataStructs.FingerprintSimilarity`` of the
              two Morgan bit vectors (presumably Tanimoto, the metric the
              original variable name refers to -- confirm against RDKit docs),
            * ``rdkit_similarity`` is ``DataStructs.FingerprintSimilarity`` of
              the two ``Chem.RDKFingerprint`` fingerprints.
    """
    radius = 2  # 2 is closest to ECFP

    # Morgan fingerprints compared with the Dice coefficient.
    dice_score = DataStructs.DiceSimilarity(
        AllChem.GetMorganFingerprint(mol_start, radius),
        AllChem.GetMorganFingerprint(mol_end, radius),
    )

    # Bit-vector variant of the Morgan fingerprint for FingerprintSimilarity.
    tanimoto_score = DataStructs.FingerprintSimilarity(
        AllChem.GetMorganFingerprintAsBitVect(mol_start, radius),
        AllChem.GetMorganFingerprintAsBitVect(mol_end, radius),
    )

    # RDKit's own topological fingerprint, same similarity function.
    rdk_score = DataStructs.FingerprintSimilarity(
        Chem.RDKFingerprint(mol_start),
        Chem.RDKFingerprint(mol_end),
    )

    return dice_score, tanimoto_score, rdk_score



In [None]:
def compute_distances(start: pd.Series, end: pd.Series) -> Tuple[float, float, float]:
    """Compute three distances between two 1-D vectors of equal length.

    Args:
        start: first vector (any 1-D sequence of numbers).
        end: second vector, same length as ``start``.
    Returns:
        Tuple[float, float, float]: ``(euclidean, manhattan, cosine)`` distances
        between ``start`` and ``end``.
    """
    # cdist works on 2-D inputs, so each vector is wrapped as a single-row matrix
    # and the only entry of the resulting 1x1 matrix is read back below.
    euclid = cdist([start], [end], "euclidean")
    manhattan = cdist([start], [end], "cityblock")
    cos = cosine(start, end)
    return euclid[0][0], manhattan[0][0], cos

In [None]:
def get_correlation(distances1, distances2, prefix: Optional[str] = None):
    """Correlate two equally long series with Pearson and Spearman statistics.

    Args:
        distances1: first sequence of numbers.
        distances2: second sequence of numbers.
        prefix: optional label put in front of the result keys, separated by a
            space; ``None`` or an empty string leaves the keys unprefixed.
    Returns:
        dict: two entries, keyed ``"Pearson"`` and ``"Spearman"`` (or
        ``"<prefix> Pearson"`` / ``"<prefix> Spearman"``), holding the objects
        returned by ``pearsonr`` and ``spearmanr`` respectively.
    """
    named_results = (
        ("Pearson", pearsonr(distances1, distances2)),
        ("Spearman", spearmanr(distances1, distances2)),
    )
    label_prefix = f"{prefix} " if prefix else ""
    return {f"{label_prefix}{name}": result for name, result in named_results}

In [None]:
def canonize_smile(input_str: str, remove_identities: bool = True) -> Optional[str]:
    """Canonize SMILES string
    Args:
        input_str (str): SMILES input string
        remove_identities (bool): when True, the atom-map number of every atom
            is reset to 0 before the SMILES is written out.
    Returns:
        Optional[str]: canonized SMILES string, or None when
        ``Chem.MolFromSmiles`` returns None for ``input_str``.
    """
    mol = Chem.MolFromSmiles(input_str)
    if mol is None:
        return None
    if remove_identities:
        # Plain loop: the call is made for its side effect only, so there is no
        # reason to build a throwaway list of return values.
        for atom in mol.GetAtoms():
            atom.SetAtomMapNum(0)
    return Chem.MolToSmiles(mol)

In [None]:
def sample_synonym(
    seed: Optional[int] = None, min_size: int = 5, max_size: int = 30
) -> Tuple[str, str, str, str]:
    """Sample a random molecule whose random and canonical spellings differ.

    Random SELFIES symbols are drawn (without replacement) from the semantic
    robust alphabet, decoded to SMILES and canonized. Sampling repeats until both
    the SMILES and the SELFIES spelling of the canonical form differ from the
    randomly generated one.

    Args:
        seed: value passed to ``random.seed``. Note that this reseeds the
            module-level ``random`` generator on every call.
        min_size: smallest number of SELFIES symbols to draw.
        max_size: largest number of SELFIES symbols to draw; must not exceed the
            alphabet size because symbols are drawn without replacement.
    Returns:
        Tuple[str, str, str, str]:
        ``(rnd_smiles, canon_smiles, rnd_selfies, canon_selfies)``.
    """
    random.seed(seed)
    # Sort the alphabet once: the original wrapped it in list(), i.e. it is not
    # an ordered sequence, and the iteration order of a set of strings can
    # change between interpreter runs. Sorting makes a given seed reproducible.
    alphabet = sorted(selfies.get_semantic_robust_alphabet())
    while True:
        size = random.randint(min_size, max_size)
        rnd_selfies = "".join(random.sample(alphabet, size))
        rnd_smiles = selfies.decoder(rnd_selfies)
        rnd_smiles = rnd_smiles.replace("-1", "-").replace("+1", "+")
        canon_smiles = canonize_smile(rnd_smiles)
        if canon_smiles is None:
            # canonize_smile could not parse the candidate; encoding None would
            # raise, so draw a new candidate instead.
            continue
        rnd_selfies = selfies.encoder(rnd_smiles)
        canon_selfies = selfies.encoder(canon_smiles)
        if (canon_smiles != rnd_smiles) and (canon_selfies != rnd_selfies):
            return rnd_smiles, canon_smiles, rnd_selfies, canon_selfies

In [None]:
def sample_random_molecules(
    amount: int = 1000, overcompensation_factor: float = 1.1
) -> pd.DataFrame:
    """Sample ``amount`` random molecules with pairwise distinct canonical SMILES.

    Candidates are generated by ``sample_synonym`` for consecutive seeds starting
    at a fixed base seed. Because duplicates are dropped afterwards, slightly
    more candidates than ``amount`` are drawn; if that is still not enough the
    factor is increased by 10% and only the additional seeds are sampled.

    Args:
        amount: number of molecules to return.
        overcompensation_factor: initial ratio of drawn candidates to ``amount``.
    Returns:
        pd.DataFrame: the first ``amount`` unique rows (in seed order) with the
        columns ``rnd_smiles``, ``canon_smiles``, ``rnd_selfies`` and
        ``canon_selfies``.
    """
    SEED = 6217
    columns = ["rnd_smiles", "canon_smiles", "rnd_selfies", "canon_selfies"]
    molecule_list = []
    while True:
        stop = int(SEED + amount * overcompensation_factor)
        # Continue after the last seed already sampled instead of starting over:
        # each seed yields the same molecule every time, so re-drawing the
        # earlier seeds would only repeat work.
        for seed in range(SEED + len(molecule_list), stop):
            molecule_list.append(sample_synonym(seed))
        df = pd.DataFrame(molecule_list, columns=columns).drop_duplicates(
            ["canon_smiles"]
        )
        if df.shape[0] >= amount:
            return df.iloc[:amount]
        logging.info(
            f"Overcompensation factor of {overcompensation_factor} was not enough."
        )
        overcompensation_factor = overcompensation_factor * 1.1

In [None]:
def get_embeddings(tokenizer, SMILES_starts_nn, SMILES_ends_nn):
    """Run every SMILES string through the notebook-level ``model``.

    Args:
        tokenizer: object whose ``encode(text, return_tensors="pt")`` turns one
            SMILES string into the model input.
        SMILES_starts_nn: iterable of SMILES strings (first group).
        SMILES_ends_nn: iterable of SMILES strings (second group).
    Returns:
        tuple: ``(mol_starts_out, mol_ends_out)``, two lists with one tensor per
        input string, each being ``model(encoded)[-1][-1]``.

    Notes:
        * The model is read from the global name ``model``; it is not a parameter.
        * NOTE(review): with a masked-LM head the last output field is presumably
          the per-token vocabulary logits rather than hidden-state embeddings --
          confirm this is the intended representation.
        * See https://stackoverflow.com/questions/63323464/how-to-get-the-correct-embedding-from-roberta-transformers
    """
    import torch  # local import: torch is not imported at the top of the notebook

    def _forward_all(smiles_group):
        # One forward pass per SMILES string, keeping only the selected output.
        outputs = []
        for smiles in np.array(smiles_group):
            token_ids = tokenizer.encode(smiles, return_tensors="pt")
            outputs.append(model(token_ids)[-1][-1])
        return outputs

    # Inference only: without no_grad every forward pass would keep its autograd
    # graph alive for as long as the returned tensor exists.
    with torch.no_grad():
        mol_starts_out = _forward_all(SMILES_starts_nn)
        mol_ends_out = _forward_all(SMILES_ends_nn)
    return mol_starts_out, mol_ends_out

In [None]:
def get_chemical_similarities(SMILES_starts, SMILES_ends):
    """Compute fingerprint similarities for paired SMILES strings.

    The i-th entry of ``SMILES_starts`` is compared with the i-th entry of
    ``SMILES_ends``; pairing uses ``zip``, so surplus entries of the longer
    input are ignored.

    Args:
        SMILES_starts: iterable of SMILES strings.
        SMILES_ends: iterable of SMILES strings.
    Returns:
        np.ndarray: array of shape ``(n_pairs, 3)`` whose columns are the three
        values returned by ``get_pairwise_chemical_similarities``. For empty
        input the shape is ``(0, 3)`` so column indexing still works.
    """
    mol_starts = [Chem.MolFromSmiles(smiles) for smiles in SMILES_starts]
    mol_ends = [Chem.MolFromSmiles(smiles) for smiles in SMILES_ends]
    similarities = [
        get_pairwise_chemical_similarities(mol_start, mol_end)
        for mol_start, mol_end in zip(mol_starts, mol_ends)
    ]
    # Explicit reshape keeps the (n_pairs, 3) layout even when there are no pairs.
    return np.array(similarities).reshape(len(similarities), 3)

MAIN: correlate fingerprint similarities with distances between model outputs

In [None]:
# Draw 1000 random molecules and keep the randomly generated SMILES spelling.
rndm_mols = sample_random_molecules(1000)
rndm_smiles = rndm_mols["rnd_smiles"]

# Split the SMILES into two groups: the first (larger-or-equal) half are the
# "starts", everything after that the "ends".
top = int(np.ceil(len(rndm_smiles) / 2))
bottom = int(len(rndm_smiles) - top)
smiles_starts = rndm_smiles.iloc[:top]
smiles_ends = rndm_smiles.iloc[top:]

# Fingerprint similarities of each (start, end) pair, one column per measure.
chemical_similarities = get_chemical_similarities(smiles_starts, smiles_ends)
morgan_dice, morgan_tanimoto, rdkit_distance = (
    chemical_similarities[:, column] for column in range(3)
)
print(len(morgan_dice))




500


In [None]:
# Load the checkpoint and its tokenizer.
model = AutoModelForMaskedLM.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
print("len smiles", len(smiles_starts))

# Model outputs for both groups of SMILES strings.
smiles_starts_embs, smiles_ends_embs = get_embeddings(
    tokenizer, smiles_starts, smiles_ends
)
print("lenembs", len(smiles_starts_embs))

# Convert the tensors to numpy arrays.
smiles_starts_embs = [tensor.detach().numpy() for tensor in smiles_starts_embs]
smiles_ends_embs = [tensor.detach().numpy() for tensor in smiles_ends_embs]

# Pairwise distances, computed on the last row of each molecule's output.
distances = [
    compute_distances(emb_start[-1], emb_end[-1])
    for emb_start, emb_end in zip(smiles_starts_embs, smiles_ends_embs)
]
print("lendistances", len(distances))

# Correlate every chemical similarity measure with every distance measure.
distances_des = ['euclid', 'manhattan', 'cosine']
sims = [morgan_dice, morgan_tanimoto, rdkit_distance]
sims_des = ['morgan_dice', 'morgan_tanimoto', 'rdkit_distance']
for sim_name, sim in zip(sims_des, sims):
  for pos, distance_name in enumerate(distances_des):
    print(f"correlation {sim_name} and {distance_name}")
    cor = get_correlation(sim, [distance[pos] for distance in distances])
    print(cor)


len smiles 500
lenembs 500
lendistances 500
correlation morgan_dice and euclid
{'Pearson': PearsonRResult(statistic=-0.2005003353464174, pvalue=6.240839434236782e-06), 'Spearman': SignificanceResult(statistic=-0.2239168620727789, pvalue=4.219573115203927e-07)}
correlation morgan_dice and manhattan
{'Pearson': PearsonRResult(statistic=-0.21016856044986923, pvalue=2.12875379912294e-06), 'Spearman': SignificanceResult(statistic=-0.23193513689950032, pvalue=1.5633747350410637e-07)}
correlation morgan_dice and cosine
{'Pearson': PearsonRResult(statistic=-0.19315278143490988, pvalue=1.3661004444972695e-05), 'Spearman': SignificanceResult(statistic=-0.2243577734008039, pvalue=3.9991513858116745e-07)}
correlation morgan_tanimoto and euclid
{'Pearson': PearsonRResult(statistic=-0.2083041663796247, pvalue=2.6298669340162356e-06), 'Spearman': SignificanceResult(statistic=-0.23349190225743494, pvalue=1.2838154109773206e-07)}
correlation morgan_tanimoto and manhattan
{'Pearson': PearsonRResult(stat

Testing: sanity checks on a single molecule


In [None]:
# Sanity checks: the same SMILES string is encoded twice, so output1 and output2
# come from identical inputs.
input1 = tokenizer.encode("Cc1cc(=O)[nH]c(=S)[nH]1", return_tensors='pt')
output1  = model(input1)
print(output1[-1][-1])
input2 = tokenizer.encode("Cc1cc(=O)[nH]c(=S)[nH]1", return_tensors='pt')
output2  = model(input2)
print(output2[-1][-1])
# Detach from autograd and convert to numpy before handing the data to pandas/scipy.
conv = (output2[-1][-1]).detach().numpy()
print(pd.DataFrame(conv))
# Pairwise Euclidean distances between all rows of conv and themselves.
test = cdist(pd.DataFrame(conv),pd.DataFrame(conv),"euclid")
print("euclid cdist test",test)


starttensor = output1[-1][-1].detach().numpy()
#print(starttensor)
# startpd is not used again in the visible part of this cell.
startpd = pd.DataFrame(starttensor)
#https://stackoverflow.com/questions/63323464/how-to-get-the-correct-embedding-from-roberta-transformers

# end is not used again in the visible part of this cell.
end = output2[-1][-1]
# Each check compares starttensor with itself; entry [0][0] is the distance of
# the first row to itself.
euclid = cdist(starttensor, starttensor, "euclid")
print("euclid",euclid[0][0])
manhattan = cdist(starttensor, starttensor, "cityblock")
print("manha",manhattan[0][0])
cos = cdist(starttensor, starttensor, "cosine")
print("cos",cos[0][0])

In [None]:
# Test: run one SMILES string through tokenizer and model, then check that
# compute_distances works on the resulting vector compared with itself.
smil = ["Cc1cc(=O)[nH]c(=S)[nH]1"]
smin = pd.DataFrame(smil)
# Select the single column so that the array holds plain strings; slicing rows
# only would give a 2-D array whose items are 1-element arrays, not strings.
smiin = smin.iloc[:1, 0].to_numpy()
# The model expects a tensor of token ids, not a Python list of tensors.
input2 = tokenizer.encode(str(smiin[0]), return_tensors="pt")
output2 = model(input2)
conv = output2[-1][-1].detach().numpy()
# compute_distances takes 1-D vectors, so compare the last row with itself
# (the same row the correlation cell uses).
dist_test = compute_distances(conv[-1], conv[-1])
print(dist_test)