# Extension of ETH dataset dataframe to include:
- mapping atoms of mol-file to SMILES is correct
- SELFIES
- SELFIES mapping

In [1]:
import pandas as pd
from tokenisation import tokenize_dataset, get_tokenizer
from constants import (
    TOKENIZER_PATH
)
from typing import List
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/ifender/miniconda3/envs/fairseq_git2/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'pytorch_lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [2]:
def clean_SMILES(SMILES_tok):
    """Cleaning of SMILES tokens input from hydrogens and digits

    Args:
        SMILES_tok (_list_): List of SMILES_tokens for a given SMILES

    Returns:
        _list,list_: Processed SMILES_token list and list of positions in input tokens list that were kept 
        (needed to distinguish which embeddings are relevant)
    """
    SMILES_tok_prep = list()
    struc_toks = r"()=:~1234567890#/\\"
    posToKeep = list()
    pos = 0
    for i in range(len(SMILES_tok)):
        # If token is bracketed, keep as is, even if contains H or other atoms or numbers
        if SMILES_tok[i].startswith("[") and SMILES_tok[i].endswith("]"):
            SMILES_tok_prep.append(SMILES_tok[i])
            posToKeep.append(pos)
        # when it's an H in the SMILES, ignore, cannot deal
        elif SMILES_tok[i] != "H" and SMILES_tok[i] != "h" and not SMILES_tok[i].isdigit() and not SMILES_tok[i].isspace():
            if not any(elem in struc_toks for elem in SMILES_tok[i]):
                if SMILES_tok[i] != "-":
                    SMILES_tok_prep.append(SMILES_tok[i])
                    # keep pos where you keep SMILES token
                    posToKeep.append(pos)
        pos += 1
    assert(len(posToKeep) == (len(SMILES_tok_prep))
           ), f"Length of positions-to-keep-array ({len(posToKeep)}) and length of SMILES_tok_prep ({len(SMILES_tok_prep)}) are not the same"
    print("SMILES_tok: ", SMILES_tok)
    print("posToKeep: ", posToKeep)
    print("SMILES_tok_prep: ", SMILES_tok_prep)

    return SMILES_tok_prep, posToKeep

def get_tokenized_SMILES(task_SMILES: List[str]):
    """Tokenize SMILES string

    Args:
        input_list of strings (str): List of SMILES input string

    Returns:
        dict: dictionary that links canonize SMILES string
    """
    tokenizer = get_tokenizer(TOKENIZER_PATH)
    print(f"tokenizer {tokenizer}")
    smi_toks = tokenize_dataset(tokenizer, task_SMILES, False)
    smi_toks = [smi_tok.split() for smi_tok in smi_toks]
    print(f"SMILES tokens: {smi_toks[0]}")
    smiles_dict = dict(zip(task_SMILES,smi_toks))
    return smiles_dict

In [3]:
# 
csv_data = "/data/jgut/SMILES_or_SELFIES/dash_dataset.csv"
df = pd.read_csv(csv_data)
#print(df.head())
# show first 10 lines
print(df.head(10))
df = df[:10]

   Unnamed: 0  atom_idx element  mulliken     resp1     resp2      dual  \
0           0         0       C -0.290919 -0.287521 -0.287521 -0.001670   
1           1         1       N  0.419352  0.458319  0.458319  0.195391   
2           2         2       O -0.397038 -0.535125 -0.535125 -0.351755   
3           3         0       C -0.387290 -0.449151 -0.449151  0.157883   
4           4         1       C  0.022985  0.052905  0.052905  0.220494   
5           5         2       C -0.261951 -0.473223 -0.473223 -0.052606   
6           6         3       C  0.001897  0.634123  0.631025 -0.002175   
7           7         4       N -0.591325 -1.196668 -1.196668 -0.486405   
8           8         0       C -0.387167 -0.462836 -0.462836  0.164734   
9           9         1       C  0.022232  0.069244  0.069244  0.222098   

   cnf_idx  mbis_dipole_strength     DASH_IDX            comp_key      SMILES  \
0        0              0.042085   QMUGS500_1   conf_00QMUGS500_1  C#[N+][O-]   
1        0  

In [4]:
print(len(df))

10


In [5]:
# add a column of the SMILES as tokenized SMILES to df
smiles_dict = get_tokenized_SMILES(list(df["SMILES"].unique().tolist()))
df["tokenized_SMILES"] = df["SMILES"].map(smiles_dict)
print(df['SMILES'].unique())

# clean the tokenized SMILES,get a list of cleaned SMILES
df['cleaned_tokenized_SMILES'] = df['tokenized_SMILES'].apply(lambda x: clean_SMILES(x)[0])


tokenizer PreTrainedTokenizerFast(name_or_path='/data/jgut/SMILES_or_SELFIES/tokenizer/smiles_atom_isomers', vocab_size=432, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)})


100%|██████████| 2/2 [00:00<00:00, 130.70it/s]

SMILES tokens: ['C', '#', '[N+]', '[O-]']
['C#[N+][O-]' 'C=C=CCN']
SMILES_tok:  ['C', '#', '[N+]', '[O-]']
posToKeep:  [0, 2, 3]
SMILES_tok_prep:  ['C', '[N+]', '[O-]']
SMILES_tok:  ['C', '#', '[N+]', '[O-]']
posToKeep:  [0, 2, 3]
SMILES_tok_prep:  ['C', '[N+]', '[O-]']
SMILES_tok:  ['C', '#', '[N+]', '[O-]']
posToKeep:  [0, 2, 3]
SMILES_tok_prep:  ['C', '[N+]', '[O-]']
SMILES_tok:  ['C', '=', 'C', '=', 'C', 'C', 'N']
posToKeep:  [0, 2, 4, 5, 6]
SMILES_tok_prep:  ['C', 'C', 'C', 'C', 'N']
SMILES_tok:  ['C', '=', 'C', '=', 'C', 'C', 'N']
posToKeep:  [0, 2, 4, 5, 6]
SMILES_tok_prep:  ['C', 'C', 'C', 'C', 'N']
SMILES_tok:  ['C', '=', 'C', '=', 'C', 'C', 'N']
posToKeep:  [0, 2, 4, 5, 6]
SMILES_tok_prep:  ['C', 'C', 'C', 'C', 'N']
SMILES_tok:  ['C', '=', 'C', '=', 'C', 'C', 'N']
posToKeep:  [0, 2, 4, 5, 6]
SMILES_tok_prep:  ['C', 'C', 'C', 'C', 'N']
SMILES_tok:  ['C', '=', 'C', '=', 'C', 'C', 'N']
posToKeep:  [0, 2, 4, 5, 6]
SMILES_tok_prep:  ['C', 'C', 'C', 'C', 'N']
SMILES_tok:  ['C', '='




In [7]:

print(df)

   Unnamed: 0  atom_idx element  mulliken     resp1     resp2      dual  \
0           0         0       C -0.290919 -0.287521 -0.287521 -0.001670   
1           1         1       N  0.419352  0.458319  0.458319  0.195391   
2           2         2       O -0.397038 -0.535125 -0.535125 -0.351755   
3           3         0       C -0.387290 -0.449151 -0.449151  0.157883   
4           4         1       C  0.022985  0.052905  0.052905  0.220494   
5           5         2       C -0.261951 -0.473223 -0.473223 -0.052606   
6           6         3       C  0.001897  0.634123  0.631025 -0.002175   
7           7         4       N -0.591325 -1.196668 -1.196668 -0.486405   
8           8         0       C -0.387167 -0.462836 -0.462836  0.164734   
9           9         1       C  0.022232  0.069244  0.069244  0.222098   

   cnf_idx  mbis_dipole_strength     DASH_IDX            comp_key      SMILES  \
0        0              0.042085   QMUGS500_1   conf_00QMUGS500_1  C#[N+][O-]   
1        0  

In [None]:
# Select rows up to the first repeated 0 in 'atom_idx' and the next set and the next set
first_smiles = df.iloc[0]['SMILES']
rows = []
seen_zero = False
for idx, row in df.iterrows():
    if seen_zero and row['atom_idx'] == 0:
        break
    rows.append(row)
    if row['atom_idx'] == 0:
        seen_zero = True

selected_df = pd.DataFrame(rows)
print(selected_df)

# Check if all selected rows have the same SMILES
if selected_df['SMILES'].nunique() == 1:
    smiles_toks = selected_df.iloc[0]['cleaned_tokenized_SMILES']
    elements = selected_df['element'].tolist()
    # Compare order of elements to SMILES string
    #print("SMILES:", smiles_toks)
    #print("Elements:", elements)
    assert len(elements) == len(smiles_toks), f"Length mismatch: {len(elements)} != {len(smiles_toks)}"
    # check through elements and SMILES letters
    for elem, tok in zip(elements, smiles_toks):
        #print(f"Element: {elem}, SMILES token: {tok}")
        assert(tok[1] if tok.startswith("[") else tok[0].lower() == elem[0].lower()), f"Atom assignment failed: {tok} != {elem}"
    # You may want to implement a more sophisticated mapping here
else:
    print("Selected rows do not have the same SMILES. Something went wrong, these will not further be considered.")

   Unnamed: 0  atom_idx element  mulliken     resp1     resp2      dual  \
0           0         0       C -0.290919 -0.287521 -0.287521 -0.001670   
1           1         1       N  0.419352  0.458319  0.458319  0.195391   
2           2         2       O -0.397038 -0.535125 -0.535125 -0.351755   

   cnf_idx  mbis_dipole_strength    DASH_IDX           comp_key      SMILES  \
0        0              0.042085  QMUGS500_1  conf_00QMUGS500_1  C#[N+][O-]   
1        0              0.200745  QMUGS500_1  conf_00QMUGS500_1  C#[N+][O-]   
2        0              0.048333  QMUGS500_1  conf_00QMUGS500_1  C#[N+][O-]   

      CHEMBL_ID    tokenized_SMILES cleaned_tokenized_SMILES  
0  CHEMBL185198  [C, #, [N+], [O-]]          [C, [N+], [O-]]  
1  CHEMBL185198  [C, #, [N+], [O-]]          [C, [N+], [O-]]  
2  CHEMBL185198  [C, #, [N+], [O-]]          [C, [N+], [O-]]  
SMILES: ['C', '[N+]', '[O-]']
Elements: ['C', 'N', 'O']
SMILES letters: ['C', '[N+]', '[O-]']
Element: C, SMILES token: C
Element:

In [None]:
df_groupedbySMILES = df.groupby("SMILES")
print(len(df_groupedbySMILES))
valid_groups = []

for smiles, group in df_groupedbySMILES:
    try:
        elements = group['element'].tolist()
        smiles_toks = group.iloc[0]['cleaned_tokenized_SMILES']
        # Compare order of elements to SMILES string
        print("SMILES:", smiles_toks)
        print("Elements:", elements)
        assert len(elements) == len(smiles_toks), f"Length mismatch: {len(elements)} != {len(smiles_toks)}"
        # check through elements and SMILES letters
        for elem, tok in zip(elements, smiles_toks):
            #print(f"Element: {elem}, SMILES token: {tok}")
            assert(tok[1] if tok.startswith("[") else tok[0].lower() == elem[0].lower()), f"Atom assignment failed: {tok} != {elem}"
        #add to valid groups
        
    # You may want to implement a more sophisticated mapping here
    except AssertionError as e:
        print(f"AssertionError: {e}. Skipping SMILES: {smiles}")
        continue
else:
    print("Selected rows do not have the same SMILES. Something went wrong, these will not further be considered.")
   