## Map the dictionary of SMILES, tokenized SMILES, and atom types (as returned by `2_SMILEStoAtomAssignments.py`) to SELFIES, without plotting

File created August 2025; use `fairseq_git2` as the environment.

In [2]:
import os, sys
import json
import logging
from typing import List
from tokenisation import tokenize_dataset, get_tokenizer
from pathlib import Path
from fairseq_utils2 import compute_model_output, compute_model_output_RoBERTa, compute_random_model_output, load_dataset, load_model
from fairseq.data import Dictionary
from SMILES_to_SELFIES_mapping import canonize_smiles, generate_mapping, generate_mappings_for_task_SMILES_to_SELFIES
from itertools import chain
from constants import SEED
import numpy as np
import matplotlib.colors as mcolors
from matplotlib.ticker import MultipleLocator
from collections import Counter
import pandas as pd

from constants import (
    TASK_PATH,
    MOLNET_DIRECTORY,
    TOKENIZER_PATH
)

  from .autonotebook import tqdm as notebook_tqdm
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/ifender/miniconda3/envs/fairseq_git2/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'pytorch_lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [14]:
def check_lengths(smi_toks, embeds):
    """Check that every molecule has as many embeddings as tokens.

    Tokens and embeddings are paired with ``zip``, so surplus entries on
    either side are ignored. For every pair:

    * equal lengths: the embedding is kept,
    * different lengths: the mismatch is printed and the embedding is
      replaced by ``None``,
    * tokens or embedding is ``None``: ``None`` is stored as a placeholder,
      so the checked list stays index-aligned with ``smi_toks`` (these
      pairs are not counted in the printed statistics).

    Args:
        smi_toks: Iterable holding one token list (or ``None``) per molecule.
        embeds: List whose first element holds one embedding sequence (or
            ``None``) per molecule. On a mismatch every element of the
            sequence is unpacked as a 2-tuple and its second item is printed.

    Returns:
        The same ``embeds`` object, with ``embeds[0]`` replaced by the
        checked list.
    """
    samenums = 0
    diffnums = 0
    smismaller = 0
    new_embs = []
    for smi, embs in zip(smi_toks, embeds[0]):
        if embs is None or smi is None:
            # Nothing to compare; keep a placeholder so that position i of
            # the result still belongs to molecule i.
            new_embs.append(None)
            continue
        if len(smi) == len(embs):
            samenums += 1
            new_embs.append(embs)
            continue
        print(f"smilen: {len(smi)} emblen: {len(embs)}")
        embs_signs = [emb1 for (emb0, emb1) in embs]
        print(f"smi: {smi} \nemb: {embs_signs} \nwith len diff {len(smi)-len(embs)}")
        diffnums += 1
        new_embs.append(None)
        if len(smi) < len(embs):
            smismaller += 1
    embeds[0] = new_embs
    if diffnums:
        # Summary is only printed when at least one mismatch was found.
        print(
            f"same numbers between tokens and embeddings: {samenums} and different number betqween tokens and embeddings: {diffnums} of which smiles tokens have smaller length: {smismaller}")
        perc = (diffnums/(diffnums+samenums))*100
        print(
            "percentage of embeddings not correct compared to smiles: {:.2f}%".format(perc))
    return embeds

def get_embeddings(task: str, specific_model_path: str, data_path: str, cuda: int, task_reps: List[str]):
    """Compute the model output for every representation of a task.

    The model is loaded with ``load_model`` unless the model path contains
    "random". The dataset is read from ``data_path / "input0" / "test"`` and
    the dictionary from ``data_path / "input0" / "dict.txt"``.

    Args:
        task (str): Task name; only used in the length-mismatch message.
        specific_model_path (str): Checkpoint path. The substrings "bart",
            "roberta" and "random" in it decide which branches run.
        data_path (str): Directory containing ``input0``; a plain string or
            a ``pathlib.Path`` is accepted.
        cuda (int): Passed through to ``load_model``.
        task_reps (List[str]): Molecule representations, in the same order
            as the preprocessed dataset.

    Returns:
        list: One entry per matching model family ("bart" and/or "roberta"
        in the model path); each entry is element ``[2]`` of the tuple
        returned by the respective ``compute_model_output*`` helper.

    Raises:
        ValueError: If ``specific_model_path`` or ``data_path`` is ``None``.
        AssertionError: If ``task_reps`` and the loaded dataset differ in
            length.
    """
    print("in get embeddings")
    # Ensure specific_model_path and data_path are not None
    if specific_model_path is None:
        raise ValueError("specific_model_path cannot be None")
    if data_path is None:
        raise ValueError("data_path cannot be None")

    # The "/" joins below need a Path; a plain string would raise TypeError.
    data_path = Path(data_path)
    print("data path: ", data_path)
    # NOTE(review): for a path containing "random" no model is loaded, so
    # `model` is unbound below if that path also contains "bart"/"roberta".
    if "random" not in str(specific_model_path):
        print("loading model")
        model = load_model(specific_model_path, data_path, cuda)
        print("model loaded")
        model.zero_grad()
    data_path = data_path / "input0" / "test"
    # True for classification, false for regression
    print("loading dataset with datapath: ", data_path)
    dataset = load_dataset(data_path, True)
    dict_file = str(data_path.parent / "dict.txt")
    print("datapath srcdict:", dict_file)
    source_dictionary = Dictionary.load(dict_file)

    # only works if run on whole dataset
    assert len(task_reps) == len(
        dataset
    ), f"Real and filtered dataset {task} do not have same length: len(task_reps): {len(task_reps)} vs. len(dataset):{len(dataset)} ."

    # Materialise the representations; they must be in dataset order.
    text = list(task_reps)
    embeds = []
    tokenizer = None
    if "bart" in str(specific_model_path):
        embeds.append(
            compute_model_output(
                dataset,
                model,
                text,
                source_dictionary,
                False,
                False,
                True,  # true for embeddings
                True,  # true for eos_embeddings
                tokenizer,
            )[2]
        )
    if "roberta" in str(specific_model_path):
        embeds.append(
            compute_model_output_RoBERTa(
                dataset,
                model,
                text,
                source_dictionary,
                False,
                False,
                True,  # true for embeddings
                True,  # true for eos_embeddings
                tokenizer,
            )[2]
        )
    return embeds

def get_embeddings_from_model(task, traintype, model, rep, reps, listoftokenisedreps, data_path=None):
    """Resolve the checkpoint for a model and return length-checked embeddings.

    Only ``traintype == "pretrained"`` with a non-"random" ``model`` is wired
    up; there is no branch for finetuned checkpoints. Every other combination
    raises ``ValueError``.

    Args:
        task: Task name, forwarded to ``get_embeddings``.
        traintype: Training stage; only "pretrained" resolves to a checkpoint.
        model: "BART" selects the BART checkpoint, "random" selects none, any
            other value selects the RoBERTa checkpoint.
        rep: "smiles" or "selfies"; selects the checkpoint sub-folder prefix.
        reps: Molecule representations, forwarded to ``get_embeddings``.
        listoftokenisedreps: One token list per representation, used by
            ``check_lengths``.
        data_path: Optional directory of the preprocessed data; defaults to
            ``/scratch/ifender/SOS_tmp/``.

    Returns:
        The result of ``check_lengths`` applied to the computed embeddings.

    Raises:
        ValueError: If no checkpoint path could be determined.
    """
    pretrained_TASK_MODEL_PATH = Path("/data/jgut/SMILES_or_SELFIES/prediction_models")

    # Sub-folder prefix of the checkpoint directory for the representation.
    subfolder = ""
    if rep == "smiles":
        subfolder = "smiles_atom_standard"
    elif rep == "selfies":
        subfolder = "selfies_atom_standard"

    # Start from None so that unsupported combinations reach the explicit
    # ValueError below instead of failing with UnboundLocalError.
    specific_model_path = None
    if model != "random" and traintype == "pretrained":
        architecture = "bart" if model == "BART" else "roberta"
        specific_model_path = (
            pretrained_TASK_MODEL_PATH
            / f"{subfolder}_{architecture}"
            / "checkpoint_last.pt"
        )
    print("specific model path: ", specific_model_path)

    if specific_model_path is None:
        raise ValueError("specific_model_path cannot be None")
    if data_path is None:
        data_path = Path("/scratch/ifender/SOS_tmp/")

    # The fourth argument is get_embeddings' `cuda` parameter.
    embeds = get_embeddings(task, specific_model_path, data_path, False, reps)
    checked_embeds = check_lengths(listoftokenisedreps, embeds)
    print("got the embeddings")
    return checked_embeds

1. Load assigned atom types

In [3]:
# Read the SMILES -> atom type assignments that were saved as JSON
diktfolder = "/home/ifender/SOS/SMILES_or_SELFIES/atomtype_embedding_visualisation/assignment_dicts/dikt_pretraindataset.json"
with open(diktfolder, 'r') as json_file:
    loaded_dikt = json.load(json_file)

# Preview the first six entries and flag those without kept positions
n = 0
for n, (k, v) in enumerate(loaded_dikt.items(), start=1):
    print(f"SMILES: {k} -> Atom Types: {v}")
    if v['posToKeep'] is None:
        print("NONE!")
    if n > 5:
        break


SMILES: CN(C)C(=O)c1ccc2oc(-c3cc(Cl)cc(Cl)c3Cl)nc2c1 -> Atom Types: {'posToKeep': None, 'smi_clean': None, 'atom_types': None, 'max_penalty': None}
NONE!
SMILES: CC(C)C1CCC(C(=O)Cc2cn(C)nn2)CC1 -> Atom Types: {'posToKeep': [0, 1, 3, 5, 7, 8, 9, 11, 14, 16, 17, 19, 20, 22, 24, 25, 28, 29], 'smi_clean': ['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'O', 'C', 'c', 'c', 'n', 'C', 'n', 'n', 'C', 'C'], 'atom_types': ['c3', 'c3', 'c3', 'c3', 'c3', 'c3', 'c3', 'c', 'o', 'c3', 'cc', 'cd', 'na', 'c3', 'nd', 'nc', 'c3', 'c3'], 'max_penalty': 67.2}
SMILES: Cc1ccc(C)c(C(C#N)Cc2c[nH]c3ccccc23)c1 -> Atom Types: {'posToKeep': [0, 1, 3, 4, 5, 7, 9, 11, 13, 15, 17, 18, 20, 21, 22, 24, 25, 26, 27, 28, 32], 'smi_clean': ['C', 'c', 'c', 'c', 'c', 'C', 'c', 'C', 'C', 'N', 'C', 'c', 'c', '[nH]', 'c', 'c', 'c', 'c', 'c', 'c', 'c'], 'atom_types': ['c3', 'ca', 'ca', 'ca', 'ca', 'c3', 'ca', 'c3', 'c1', 'n1', 'c3', 'cc', 'cd', 'na', 'ca', 'ca', 'ca', 'ca', 'ca', 'ca', 'ca'], 'max_penalty': 223.0}
SMILES: Cc1ccc(C)c([C

In [4]:
# Number of SMILES entries in the loaded assignment dictionary
print(len(loaded_dikt))

4000


2. Load all the SMILES and generate mapping between SMILES and SELFIES

In [3]:
csv = '/data/jgut/SMILES_or_SELFIES/processed/isomers/full_deduplicated_isomers.csv'
df = pd.read_csv(csv)
# Only the first `limit` molecules are processed. The slice used to hard-code
# 50 while an unused `limit = 4000` sat next to it; `limit` is now the single
# place to change the sample size (value kept at 50, i.e. what actually ran).
limit = 50
task_SMILES = df['SMILES'].tolist()[:limit]
task_SMILES = [canonize_smiles(smiles) for smiles in task_SMILES]

# Layout of the result:
# mappings[smiles]['selfiesstr_tok_map'] = (selfies_str, tokenised_selfies, mapping)
smiles_to_selfies_mapping = generate_mappings_for_task_SMILES_to_SELFIES(task_SMILES)
# The mapping links (SELFIES index, SELFIES token) to the position of the
# corresponding SMILES token, e.g. (2, '[C]'): 3
for k, v in smiles_to_selfies_mapping.items():
    print(f"SMILES: {k} -> SELFIES: {v}")
    selfies = v['selfiesstr_tok_map'][0]
    selfies_toks = v['selfiesstr_tok_map'][1]
    selfies_map = v['selfiesstr_tok_map'][2]
    print(f"{selfies}")
    print(f"{selfies_toks}")
    print(f"{selfies_map}")

SMILES: CN(C)C(=O)c1ccc2oc(-c3cc(Cl)cc(Cl)c3Cl)nc2c1 -> SELFIES: {'selfiesstr_tok_map': ('[C][N][Branch1][C][C][C][=Branch1][C][=O][C][=C][C][=C][O][C][Branch1][S][C][=C][C][Branch1][C][Cl][=C][C][Branch1][C][Cl][=C][Ring1][Branch2][Cl][=N][C][Ring1][=C][=C][Ring2][Ring1][C]', ['[C]', '[N]', '[Branch1]', '[C]', '[C]', '[C]', '[=Branch1]', '[C]', '[=O]', '[C]', '[=C]', '[C]', '[=C]', '[O]', '[C]', '[Branch1]', '[S]', '[C]', '[=C]', '[C]', '[Branch1]', '[C]', '[Cl]', '[=C]', '[C]', '[Branch1]', '[C]', '[Cl]', '[=C]', '[Ring1]', '[Branch2]', '[Cl]', '[=N]', '[C]', '[Ring1]', '[=C]', '[=C]', '[Ring2]', '[Ring1]', '[C]'], None)}
[C][N][Branch1][C][C][C][=Branch1][C][=O][C][=C][C][=C][O][C][Branch1][S][C][=C][C][Branch1][C][Cl][=C][C][Branch1][C][Cl][=C][Ring1][Branch2][Cl][=N][C][Ring1][=C][=C][Ring2][Ring1][C]
['[C]', '[N]', '[Branch1]', '[C]', '[C]', '[C]', '[=Branch1]', '[C]', '[=O]', '[C]', '[=C]', '[C]', '[=C]', '[O]', '[C]', '[Branch1]', '[S]', '[C]', '[=C]', '[C]', '[Branch1]', '[C]'

In [5]:
# Rebuild the mapping and gather SELFIES strings and token lists in SMILES order
smiles_to_selfies_mapping = generate_mappings_for_task_SMILES_to_SELFIES(task_SMILES)
selfies_tokenised = []
selfies = []
maps_num = 0  # counts the SMILES for which a mapping exists
for key, entry in smiles_to_selfies_mapping.items():
    tok_map = entry['selfiesstr_tok_map']
    selfies_tokenised.append(tok_map[1])
    selfies.append(tok_map[0])
    if tok_map[2] is not None:
        maps_num += 1
        # NOTE(review): key[val] indexes the SMILES *string* by character; if
        # val is a token position, this shows the wrong symbol for SMILES with
        # multi-character tokens such as Cl or [C@@H] -- confirm.
        for key2, val in tok_map[2].items():
            print(f"SELFIES index:{key2[0]} with token:{key2[1]}\tmaps to SMILES token at pos: {val} in SMILES: {key[val]}")
    print()





SELFIES index:0 with token:[C]	maps to SMILES token at pos: 0 in SMILES: C
SELFIES index:1 with token:[C]	maps to SMILES token at pos: 1 in SMILES: c
SELFIES index:2 with token:[=N]	maps to SMILES token at pos: 3 in SMILES: n
SELFIES index:3 with token:[O]	maps to SMILES token at pos: 4 in SMILES: o
SELFIES index:4 with token:[C]	maps to SMILES token at pos: 5 in SMILES: c
SELFIES index:7 with token:[C]	maps to SMILES token at pos: 7 in SMILES: C
SELFIES index:8 with token:[=C]	maps to SMILES token at pos: 9 in SMILES: c
SELFIES index:11 with token:[S]	maps to SMILES token at pos: 11 in SMILES: S
SELFIES index:14 with token:[=O]	maps to SMILES token at pos: 14 in SMILES: O
SELFIES index:17 with token:[=O]	maps to SMILES token at pos: 18 in SMILES: O
SELFIES index:18 with token:[N]	maps to SMILES token at pos: 20 in SMILES: N
SELFIES index:19 with token:[C]	maps to SMILES token at pos: 21 in SMILES: C
SELFIES index:20 with token:[C]	maps to SMILES token at pos: 23 in SMILES: C
SELFI

3. Merge smiles_to_selfies_mapping with the atom types to exclude SMILES for which the atom type assignment or the mapping failed
--> yields the list of SMILES and SELFIES to compute embeddings for

In [6]:
# Keep only SMILES that have both an atom type assignment and a SELFIES mapping.
smilestoatomtypestoselfies_dikt = dict()
for smiles in task_SMILES:
    print(smiles)
    assignment = loaded_dikt.get(smiles, {})
    atom_types = assignment.get('atom_types', None)
    print('atom types: ', atom_types)
    # Look the mapping tuple up once. The SELFIES string gets its own name so
    # it does not overwrite the `selfies` list collected in an earlier cell.
    tok_map = smiles_to_selfies_mapping.get(smiles, {}).get('selfiesstr_tok_map', (None, None, None))
    selfies_str = tok_map[0]
    selfies_toks = tok_map[1]
    selfies_map = tok_map[2]
    print('selfies map: ', selfies_map)
    # check that neither is empty
    if selfies_map is not None and atom_types is not None:
        # Entry = all keys of the loaded assignment plus the SELFIES string,
        # its tokens and the SELFIES -> SMILES mapping.
        smilestoatomtypestoselfies_dikt[smiles] = {**assignment, 'selfies': selfies_str, 'selfies_toks': selfies_toks, 'selfies_map': selfies_map}
print(smilestoatomtypestoselfies_dikt)

CN(C)C(=O)c1ccc2oc(-c3cc(Cl)cc(Cl)c3Cl)nc2c1
atom types:  None
selfies map:  None
CC(C)C1CCC(C(=O)Cc2cn(C)nn2)CC1
atom types:  ['c3', 'c3', 'c3', 'c3', 'c3', 'c3', 'c3', 'c', 'o', 'c3', 'cc', 'cd', 'na', 'c3', 'nd', 'nc', 'c3', 'c3']
selfies map:  None
Cc1ccc(C)c(C(C#N)Cc2c[nH]c3ccccc23)c1
atom types:  ['c3', 'ca', 'ca', 'ca', 'ca', 'c3', 'ca', 'c3', 'c1', 'n1', 'c3', 'cc', 'cd', 'na', 'ca', 'ca', 'ca', 'ca', 'ca', 'ca', 'ca']
selfies map:  None
Cc1ccc(C)c([C@@H](C#N)Cc2c[nH]c3ccccc23)c1
atom types:  ['c3', 'ca', 'ca', 'ca', 'ca', 'c3', 'ca', 'c3', 'c1', 'n1', 'c3', 'cc', 'cd', 'na', 'ca', 'ca', 'ca', 'ca', 'ca', 'ca', 'ca']
selfies map:  None
Cc1noc(C)c1S(=O)(=O)NC1CCOc2ccccc21
atom types:  None
selfies map:  {(0, '[C]'): 0, (1, '[C]'): 1, (2, '[=N]'): 3, (3, '[O]'): 4, (4, '[C]'): 5, (7, '[C]'): 7, (8, '[=C]'): 9, (11, '[S]'): 11, (14, '[=O]'): 14, (17, '[=O]'): 18, (18, '[N]'): 20, (19, '[C]'): 21, (20, '[C]'): 23, (21, '[C]'): 24, (22, '[O]'): 25, (23, '[C]'): 26, (24, '[=C]'): 28,

In [7]:
# Dump the merged dictionary, its size, and a readable listing per SMILES
print(smilestoatomtypestoselfies_dikt)
print(len(smilestoatomtypestoselfies_dikt))
for k in smilestoatomtypestoselfies_dikt:
    v = smilestoatomtypestoselfies_dikt[k]
    shown_atom_types = v.get('atom_types', [])
    shown_penalty = v.get('max_penalty', 0)
    shown_selfies = v.get('selfies', [])
    shown_selfies_toks = v.get('selfies_toks', [])
    shown_selfies_map = v.get('selfies_map', [])
    print(f"SMILES: {k}")
    print(f"Atom Types: {shown_atom_types}")
    print(f"Penalty assigned: {shown_penalty}")
    print(f"SELFIES: {shown_selfies}")
    print(f"SELFIES Tokens: {shown_selfies_toks}")
    print(f"SELFIES Map: {shown_selfies_map}")
    print()
# View on the keys: the SMILES that passed both the assignment and the mapping
finaltaskSMILES = smilestoatomtypestoselfies_dikt.keys()

{'CC(C)NC(=S)N1CCOC(C)C1C': {'posToKeep': [0, 1, 3, 5, 6, 9, 11, 13, 14, 15, 16, 18, 20, 22], 'smi_clean': ['C', 'C', 'C', 'N', 'C', 'S', 'N', 'C', 'C', 'O', 'C', 'C', 'C', 'C'], 'atom_types': ['c3', 'c3', 'c3', 'ns', 'cs', 's', 'n', 'c3', 'c3', 'os', 'c3', 'c3', 'c3', 'c3'], 'max_penalty': 0.0, 'selfies': '[C][C][Branch1][C][C][N][C][=Branch1][C][=S][N][C][C][O][C][Branch1][C][C][C][Ring1][#Branch1][C]', 'selfies_toks': ['[C]', '[C]', '[Branch1]', '[C]', '[C]', '[N]', '[C]', '[=Branch1]', '[C]', '[=S]', '[N]', '[C]', '[C]', '[O]', '[C]', '[Branch1]', '[C]', '[C]', '[C]', '[Ring1]', '[#Branch1]', '[C]'], 'selfies_map': {(0, '[C]'): 0, (1, '[C]'): 1, (4, '[C]'): 3, (5, '[N]'): 5, (6, '[C]'): 6, (9, '[=S]'): 9, (10, '[N]'): 11, (11, '[C]'): 13, (12, '[C]'): 14, (13, '[O]'): 15, (14, '[C]'): 16, (17, '[C]'): 18, (18, '[C]'): 20, (21, '[C]'): 22}}, 'CC(C)NC(=S)N1CCO[C@@H](C)[C@H]1C': {'posToKeep': [0, 1, 3, 5, 6, 9, 11, 13, 14, 15, 16, 18, 20, 22], 'smi_clean': ['C', 'C', 'C', 'N', 'C', 'S

In [8]:
# Show the SMILES kept after merging (keys view of the merged dictionary)
print(finaltaskSMILES)

dict_keys(['CC(C)NC(=S)N1CCOC(C)C1C', 'CC(C)NC(=S)N1CCO[C@@H](C)[C@H]1C'])


4. Get embeddings for SMILES and for SELFIES from both models, BART and RoBERTa ("clean" means without structural tokens, so that the number of tokens equals the number of embeddings)

In [15]:
# go through merged dictionary of atom types and selfies mappings and get embeddings for those
# --> probably have to get embeddings for all anyway, because problems otherwise
# we need to get the tokenized versions of SMILES again as not saved previously
# get tokenized version of dataset
# The tokenised SMILES were not saved earlier, so the dataset is tokenised
# again here. Embeddings are requested for all SMILES in task_SMILES, not
# only for the merged subset.
tokenizer = get_tokenizer(TOKENIZER_PATH)
print(f"tokenizer {tokenizer}")
# Each tokenised SMILES comes back as one string; split it into single tokens
smi_toks = [
    tokenised.split()
    for tokenised in tokenize_dataset(tokenizer, task_SMILES, False)
]
smiles_dict = dict(zip(task_SMILES, smi_toks))

for k, v in smiles_dict.items():
    print(f"SMILES: {k}")
    print(f"Tokens: {v}")


# Settings for the embedding run
task = 'pretrained'        # free-form task label
traintype = "pretrained"   # training stage of the checkpoint
model = "BART"             # first BART; RoBERTa is in the disabled part below
rep = "smiles"             # or "selfies" to embed SELFIES
embeds_smiles = get_embeddings_from_model(
    task=task,
    traintype=traintype,
    model=model,
    rep=rep,
    reps=smiles_dict.keys(),
    listoftokenisedreps=smiles_dict.values(),
)

'''
map_embeddings_to_atomtypes(dikt,task_SMILES)


# model roberta
model ="roberta"

embeds_smiles = get_embeddings_from_model(task, traintype, model, rep, smiles_dict.keys(), smiles_dict.values())

map_embeddings_to_atomtypes(dikt,task_SMILES)



embeds_selfies = get_embeddings_from_model(task, traintype, model, rep, selfies, selfies_tokenised)

map_selfies_embeddings_to_smiles(embeds_selfies, smiles_to_selfies_mapping, dikt)
'''

tokenizer PreTrainedTokenizerFast(name_or_path='/data/jgut/SMILES_or_SELFIES/tokenizer/smiles_atom_isomers', vocab_size=432, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)})


100%|██████████| 50/50 [00:00<00:00, 2025.37it/s]

SMILES: CN(C)C(=O)c1ccc2oc(-c3cc(Cl)cc(Cl)c3Cl)nc2c1
Tokens: ['C', 'N', '(', 'C', ')', 'C', '(', '=', 'O', ')', 'c', '1', 'c', 'c', 'c', '2', 'o', 'c', '(', '-', 'c', '3', 'c', 'c', '(', 'Cl', ')', 'c', 'c', '(', 'Cl', ')', 'c', '3', 'Cl', ')', 'n', 'c', '2', 'c', '1']
SMILES: CC(C)C1CCC(C(=O)Cc2cn(C)nn2)CC1
Tokens: ['C', 'C', '(', 'C', ')', 'C', '1', 'C', 'C', 'C', '(', 'C', '(', '=', 'O', ')', 'C', 'c', '2', 'c', 'n', '(', 'C', ')', 'n', 'n', '2', ')', 'C', 'C', '1']
SMILES: Cc1ccc(C)c(C(C#N)Cc2c[nH]c3ccccc23)c1
Tokens: ['C', 'c', '1', 'c', 'c', 'c', '(', 'C', ')', 'c', '(', 'C', '(', 'C', '#', 'N', ')', 'C', 'c', '2', 'c', '[nH]', 'c', '3', 'c', 'c', 'c', 'c', 'c', '2', '3', ')', 'c', '1']
SMILES: Cc1ccc(C)c([C@@H](C#N)Cc2c[nH]c3ccccc23)c1
Tokens: ['C', 'c', '1', 'c', 'c', 'c', '(', 'C', ')', 'c', '(', '[C@@H]', '(', 'C', '#', 'N', ')', 'C', 'c', '2', 'c', '[nH]', 'c', '3', 'c', 'c', 'c', 'c', 'c', '2', '3', ')', 'c', '1']
SMILES: Cc1noc(C)c1S(=O)(=O)NC1CCOc2ccccc21
Tokens: ['C', 'c




model loaded
loading dataset with datapath:  /scratch/ifender/SOS_tmp/input0/test
datapath dict location: /scratch/ifender/SOS_tmp/input0/dict.txt
dikt was loaded, now to indexed dataset..datapath location: /scratch/ifender/SOS_tmp/input0
datapath srcdict: /scratch/ifender/SOS_tmp/input0/dict.txt


AssertionError: Real and filtered dataset pretrained do not have same length: len(task_reps): 50 vs. len(dataset):9989182 .