# Extension of ETH dataset dataframe to include:
- mapping atoms of mol-file to SMILES is correct
- SELFIES
- SELFIES mapping

In [15]:
import pandas as pd
from tokenisation import tokenize_dataset, get_tokenizer
from constants import (
    TOKENIZER_PATH
)
from typing import List
import numpy as np
from SMILES_to_SELFIES_mapping import canonize_smiles, generate_mapping, generate_mappings_for_task_SMILES_to_SELFIES

In [16]:
def clean_SMILES(SMILES_tok):
    """Cleaning of SMILES tokens input from hydrogens and digits

    Args:
        SMILES_tok (_list_): List of SMILES_tokens for a given SMILES

    Returns:
        _list,list_: Processed SMILES_token list and list of positions in input tokens list that were kept 
        (needed to distinguish which embeddings are relevant)
    """
    SMILES_tok_prep = list()
    struc_toks = r"()=:~1234567890#/\\"
    posToKeep = list()
    pos = 0
    for i in range(len(SMILES_tok)):
        # If token is bracketed, keep as is, even if contains H or other atoms or numbers
        if SMILES_tok[i].startswith("[") and SMILES_tok[i].endswith("]"):
            SMILES_tok_prep.append(SMILES_tok[i])
            posToKeep.append(pos)
        # when it's an H in the SMILES, ignore, cannot deal
        elif SMILES_tok[i] != "H" and SMILES_tok[i] != "h" and not SMILES_tok[i].isdigit() and not SMILES_tok[i].isspace():
            if not any(elem in struc_toks for elem in SMILES_tok[i]):
                if SMILES_tok[i] != "-":
                    SMILES_tok_prep.append(SMILES_tok[i])
                    # keep pos where you keep SMILES token
                    posToKeep.append(pos)
        pos += 1
    assert(len(posToKeep) == (len(SMILES_tok_prep))
           ), f"Length of positions-to-keep-array ({len(posToKeep)}) and length of SMILES_tok_prep ({len(SMILES_tok_prep)}) are not the same"
    print("SMILES_tok: ", SMILES_tok)
    print("posToKeep: ", posToKeep)
    print("SMILES_tok_prep: ", SMILES_tok_prep)

    return SMILES_tok_prep, posToKeep

def get_tokenized_SMILES(task_SMILES: List[str]):
    """Tokenize SMILES string

    Args:
        input_list of strings (str): List of SMILES input string

    Returns:
        dict: dictionary that links canonize SMILES string
    """
    tokenizer = get_tokenizer(TOKENIZER_PATH)
    print(f"tokenizer {tokenizer}")
    smi_toks = tokenize_dataset(tokenizer, task_SMILES, False)
    smi_toks = [smi_tok.split() for smi_tok in smi_toks]
    print(f"SMILES tokens: {smi_toks[0]}")
    smiles_dict = dict(zip(task_SMILES,smi_toks))
    return smiles_dict

In [7]:
# 1. Read in ETH dataset as of now
csv_data = "/data/jgut/SMILES_or_SELFIES/dash_dataset.csv"
orig_df = pd.read_csv(csv_data)
print("Len before filtering out all the non-zero conformation indeces: ",len(orig_df))

# 2. Filter out all the data that has cnf_idx unequal to 0, we only want the 0 conformations
df_onlycnf0 = orig_df[orig_df['cnf_idx'] == 0]
print("Len after filtering out all the non-zero conformation indeces: ",len(df_onlycnf0))

print("Unique SMILES in dataframe: ", df_onlycnf0['SMILES'].nunique())
#print(df.head())
# show first 10 lines
print(df_onlycnf0.head(10))

#limit amount for now
#df = df[:100]

Len before filtering out all the non-zero conformation indeces:  1216627
Len after filtering out all the non-zero conformation indeces:  410568
Unique SMILES in dataframe:  16058
    Unnamed: 0  atom_idx element  mulliken     resp1     resp2      dual  \
0            0         0       C -0.290919 -0.287521 -0.287521 -0.001670   
1            1         1       N  0.419352  0.458319  0.458319  0.195391   
2            2         2       O -0.397038 -0.535125 -0.535125 -0.351755   
3            3         0       C -0.387290 -0.449151 -0.449151  0.157883   
4            4         1       C  0.022985  0.052905  0.052905  0.220494   
5            5         2       C -0.261951 -0.473223 -0.473223 -0.052606   
6            6         3       C  0.001897  0.634123  0.631025 -0.002175   
7            7         4       N -0.591325 -1.196668 -1.196668 -0.486405   
18          18         0       C -0.335717 -0.263623 -0.158985 -0.002946   
19          19         1       C -0.057196  0.322966  0.32449

In [14]:
# 3. Find SMILES with repeated atom indices because filtering with cnf_idx==0 just isn't enough
repeated_count = 0
smiles_with_repeats = set()
for smiles, group in df_onlycnf0.groupby("SMILES"):
    repeated = group['atom_idx'].duplicated(keep=False)# keep False marks all duplicates as True
    if repeated.any():
        print(f"SMILES: {smiles} has repeated atom_idx values!")
        print(group[repeated])
        smiles_with_repeats.add(smiles)
        repeated_count += 1
print(f"Total SMILES with repeated atom_idx: {repeated_count}")

# Excluding SMILES that have repeated atom indices and thereby conflicting assignments
df = df_onlycnf0[~df_onlycnf0['SMILES'].isin(smiles_with_repeats)]
print(f"Original df with only cnf_idx=0: {len(df_onlycnf0)} and after filtering repeated atom indices: {len(df)}")
print(len(df_onlycnf0)-(len(df_onlycnf0[df_onlycnf0['SMILES'].isin(smiles_with_repeats)])), len(df))
assert (((len(df_onlycnf0)-(len(df_onlycnf0[df_onlycnf0['SMILES'].isin(smiles_with_repeats)]))) == len(df)))

SMILES: C(=C/c1c[nH]c2ccccc12)\c1nccc2ccccc12 has repeated atom_idx values!
       Unnamed: 0  atom_idx element  mulliken     resp1     resp2      dual  \
96682       96682         0       C -0.333409 -0.352943 -0.352943 -0.102599   
96683       96683         1       C -0.195437 -0.040475 -0.040475  0.055935   
96684       96684         2       C  0.103759 -0.058088 -0.058088 -0.108043   
96685       96685         3       C -0.098838 -0.210778 -0.210778 -0.030810   
96686       96686         4       N -0.174250 -0.278429 -0.278429 -0.057414   
96687       96687         5       C  0.198440  0.122461  0.122461 -0.008095   
96688       96688         6       C -0.286046 -0.273765 -0.273765 -0.023457   
96689       96689         7       C -0.097331 -0.148216 -0.148216 -0.026235   
96690       96690         8       C -0.176356 -0.219470 -0.219470 -0.006756   
96691       96691         9       C -0.226802 -0.280331 -0.280331 -0.037221   
96692       96692        10       C  0.027914  0.154697

In [23]:
# 4. For every SMILES create a SELFIES and create mapping between SMILES and SELFIES
# A) For this we need to ensure that SMILES are canonical
print("Checking for non-canonical SMILES and filtering them out")
unique_SMILES = df['SMILES'].unique().tolist()
# canonize those SMILES and tell me they turn out to be the same as before
task_SMILES_canonized = [canonize_smiles(smile) for smile in unique_SMILES]
print("Canonized SMILES:")
#print(task_SMILES_canonized)
# compare to unique_SMILES
mismatches_num = 0
mismatches_dfSMILES = []
for original, canonized in zip(unique_SMILES, task_SMILES_canonized):
    if (original != canonized):
        print(f"MISMATCH: Original: {original}, Canonized: {canonized}")
        mismatches_num += 1
        mismatches_dfSMILES.append((original))
print(f"Total mismatches found: {mismatches_num}")

# filter out the mismatching SMILES from df
outcome_num = len(df) - len(df[df['SMILES'].isin(mismatches_dfSMILES)])
print(f"Original df size: {len(df)} minus mismatches: {mismatches_num} should be {len(df) - len(df[df['SMILES'].isin(mismatches_dfSMILES)])}")
df = df[~df['SMILES'].isin(mismatches_dfSMILES)]
print(len(df))
assert (len(df) == outcome_num), "We have a number mismatch"

# B) For every SMILES create a SELFIES and create mapping between SMILES and SELFIES
print("Mapping SMILES and SELFIES")
unique_SMILES = df['SMILES'].unique().tolist()
smiles_to_selfies_mapping = generate_mappings_for_task_SMILES_to_SELFIES(unique_SMILES)
print("--Mapped")
print("Create dict of SMILES that have both atom types and SELFIES matching")
nonmappings_nu = 0
smilestoatomtypestoselfies_dikt = dict()
nomapping_smiles = []
for smiles in unique_SMILES:
    #print('atom types: ',atom_types)
    selfies = smiles_to_selfies_mapping.get(smiles, {}).get('selfiesstr_tok_map', (None, None, None))[0]
    selfies_toks = smiles_to_selfies_mapping.get(smiles, {}).get('selfiesstr_tok_map', (None, None, None))[1]
    selfies_map = smiles_to_selfies_mapping.get(smiles, {}).get('selfiesstr_tok_map', (None, None, None))[2]
    #print('selfies map: ',selfies_map)
    #check that neither is empty
    if selfies_map is not None: # add to original df
        df.loc[df['SMILES'] == smiles, 'selfies'] = [selfies] * len(df.loc[df['SMILES'] == smiles])
        df.loc[df['SMILES'] == smiles, 'selfies_toks'] = [selfies_toks] * len(df.loc[df['SMILES'] == smiles])
        # assign same map to several rows with same SMILES
        df.loc[df['SMILES'] == smiles, 'selfies_map'] = [selfies_map] * len(df.loc[df['SMILES'] == smiles])
    else:
        nonmappings_nu += 1
        nomapping_smiles.append(smiles)
        df.loc[df['SMILES'] == smiles, 'selfies'] = None
        df.loc[df['SMILES'] == smiles, 'selfies_toks'] = None
        df.loc[df['SMILES'] == smiles, 'selfies_map'] = None
print(f"Number of SMILES without mapping to SELFIES: {nonmappings_nu}")
allmap_num = len(df) - len(df[df['SMILES'].isin(nomapping_smiles)])
print(f"{len(df) - len(df[df['SMILES'].isin(nomapping_smiles)])}")
 # And filter out all the none mapping SMILES
df = df[df['selfies_map'].notna()]
print(len(df))
assert allmap_num == len(df), "We have a number mismatch"



Checking for non-canonical SMILES and filtering them out
Canonized SMILES:
Total mismatches found: 0
Original df size: 397911 minus mismatches: 0 should be 397911
397911
Mapping SMILES and SELFIES
--Mapped
Create dict of SMILES that have both atom types and SELFIES matching


ValueError: Must have equal len keys and value when setting with an ndarray

In [None]:
#4. Add tokenized SMILES and tokenized SMILES without structural tokens to df

# add a column of the SMILES as tokenized SMILES to df
smiles_dict = get_tokenized_SMILES(list(df["SMILES"].unique().tolist()))
df["tokenized_SMILES"] = df["SMILES"].map(smiles_dict)
print(df['SMILES'].unique(), len(df['SMILES'].unique()))

# clean the tokenized SMILES,get a list of cleaned SMILES
df['cleaned_tokenized_SMILES'] = df['tokenized_SMILES'].apply(lambda x: clean_SMILES(x)[0])


tokenizer PreTrainedTokenizerFast(name_or_path='/data/jgut/SMILES_or_SELFIES/tokenizer/smiles_atom_isomers', vocab_size=432, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)})


100%|██████████| 8/8 [00:00<00:00, 4850.31it/s]

SMILES tokens: ['C', '#', '[N+]', '[O-]']
['C#[N+][O-]' 'C=C=CCN' 'CCOC=O' 'NCC(=O)O' 'NNC(N)=O' 'N#CC#CCN'
 'N=C(N)C1CC1' 'CC(=O)C(=O)O'] 8
SMILES_tok:  ['C', '#', '[N+]', '[O-]']
posToKeep:  [0, 2, 3]
SMILES_tok_prep:  ['C', '[N+]', '[O-]']
SMILES_tok:  ['C', '#', '[N+]', '[O-]']
posToKeep:  [0, 2, 3]
SMILES_tok_prep:  ['C', '[N+]', '[O-]']
SMILES_tok:  ['C', '#', '[N+]', '[O-]']
posToKeep:  [0, 2, 3]
SMILES_tok_prep:  ['C', '[N+]', '[O-]']
SMILES_tok:  ['C', '=', 'C', '=', 'C', 'C', 'N']
posToKeep:  [0, 2, 4, 5, 6]
SMILES_tok_prep:  ['C', 'C', 'C', 'C', 'N']
SMILES_tok:  ['C', '=', 'C', '=', 'C', 'C', 'N']
posToKeep:  [0, 2, 4, 5, 6]
SMILES_tok_prep:  ['C', 'C', 'C', 'C', 'N']
SMILES_tok:  ['C', '=', 'C', '=', 'C', 'C', 'N']
posToKeep:  [0, 2, 4, 5, 6]
SMILES_tok_prep:  ['C', 'C', 'C', 'C', 'N']
SMILES_tok:  ['C', '=', 'C', '=', 'C', 'C', 'N']
posToKeep:  [0, 2, 4, 5, 6]
SMILES_tok_prep:  ['C', 'C', 'C', 'C', 'N']
SMILES_tok:  ['C', '=', 'C', '=', 'C', 'C', 'N']
posToKeep:  [0, 2, 4




In [None]:
# 3. Group by SMILES and make sure that assignments of atoms are same as expected
df_groupedbySMILES = df.groupby("SMILES")
print(len(df_groupedbySMILES))
valid_groups = []

for smiles, group in df_groupedbySMILES:
    try:
        elements = group['element'].tolist()
        smiles_toks = group.iloc[0]['cleaned_tokenized_SMILES']
        # Compare order of elements to SMILES string
        print("SMILES:", smiles_toks)
        print("Elements:", elements)
        assert len(elements) == len(smiles_toks), f"Length mismatch: {len(elements)} != {len(smiles_toks)}"
        # check through elements and SMILES letters
        for elem, tok in zip(elements, smiles_toks):
            #print(f"Element: {elem}, SMILES token: {tok}")
            assert(tok[1] if tok.startswith("[") else tok[0].lower() == elem[0].lower()), f"Atom assignment failed: {tok} != {elem}"
        #add to valid groups
        
    # You may want to implement a more sophisticated mapping here
    except AssertionError as e:
        print(f"AssertionError: {e}. Skipping SMILES: {smiles}")
        continue
else:
    print("Selected rows do not have the same SMILES. Something went wrong, these will not further be considered.")
   