### Run in the `rdkit2019` environment

In [4]:
import argparse
import os
import pandas as pd
import random
from rdkit import Chem
from tqdm import tqdm
from tqdm.contrib import tzip
from e_smiles import get_e_smiles, merge_smiles, get_edit_from_e_smiles, merge_smiles_only, get_e_smiles_with_check
from e_smiles import run_get_p_b_l_check, run_get_p_b_l_forward, get_b_smiles_check, iso_to_symbo, symbo_to_iso
from tqdm.contrib.concurrent import process_map

# Lookup tables between symbol strings and their numeric codes (stored as str).
# A code is the sum of one value from each of three groups; the matching symbol
# is the concatenation tilde-part + letter-part + greek-part.
_code_symbol_pairs = [
    (str(greek_val + letter_val + tilde_val), tilde_sym + letter_sym + greek_sym)
    for greek_val, greek_sym in zip([0, 200, 400, 600, 100, 300, 500, 700],
                                    ['', 'α', 'β', 'γ', 'δ', 'αδ', 'βδ', 'γδ'])
    for letter_val, letter_sym in zip([0, 10, 20, 30], ['', 'r', 's', '?'])
    for tilde_val, tilde_sym in zip([0, 9], ['', '~'])
]

# symbol -> code, inserted longest symbol first (4 chars down to 1); the empty
# symbol never matches a width and is therefore left out.
dic_str_to_num = {}
for width in (4, 3, 2, 1):
    dic_str_to_num.update(
        (symbol, code) for code, symbol in _code_symbol_pairs if len(symbol) == width
    )

# code -> symbol, inserted longest code first (3 digits down to 1); the pair
# with an empty symbol (code '0') is skipped.
dic_num_to_str = {}
for width in (3, 2, 1):
    dic_num_to_str.update(
        (code, symbol)
        for code, symbol in _code_symbol_pairs
        if symbol and len(code) == width
    )

rdkit version:2019.03.2


#### Prepare Prompt SMILES by introducing one specific hint (src data)

In [10]:
# Train (aug_100)
# Builds the prompt-augmented source file for the training split:
# read the augmented CSV -> extract per-reaction records in parallel ->
# keep one randomly chosen hint per record -> render prompt SMILES ->
# write one space-separated token line per record.
augmentated_df = pd.read_csv("datasets/50k_aug/2023_1_8_train_r100.csv")
df = augmentated_df.reset_index(drop=True)

# Process every reaction string with 20 workers, then drop results that
# contain the marker 'error'.
p_b_l = process_map(run_get_p_b_l_check, df['reactants>reagents>production'], max_workers=20)
p_b_l = [record for record in p_b_l if 'error' not in record]
print(f"{len(p_b_l)}/{len(df)}")

# Keep element 0 unchanged, keep a single random entry of element 1 as the
# hint, and leave the remaining five slots empty.
# NOTE: random is not seeded here, so the chosen hints differ between runs.
p_b_l_prompt = [
    [record[0], [random.choice(record[1])], [], [], [], [], []]
    for record in p_b_l
]

prompt_smiles_lis = process_map(get_b_smiles_check, tqdm(p_b_l_prompt), max_workers=20)
# NOTE(review): this filter looks for the literal 'prompt_smiles_lis', while the
# filter above looks for 'error' -- confirm which marker get_b_smiles_check
# emits on failure; as written this may never drop anything.
prompt_smiles_lis = [smiles for smiles in prompt_smiles_lis if 'prompt_smiles_lis' not in smiles]
prompt_smiles_lis = [iso_to_symbo(prompt_smiles, dic_num_to_str) for prompt_smiles in tqdm(prompt_smiles_lis)]
# Report how many prompt SMILES survived (previously re-printed len(p_b_l)).
print(f"{len(prompt_smiles_lis)}/{len(df)}")

# Write into file
src = [" ".join(s) for s in prompt_smiles_lis]
src_file_path = "datasets/50k_ReactSeq_with_prompt/aug100_train/src_aug100_train.txt"
os.makedirs(os.path.dirname(src_file_path), exist_ok=True)

# The symbol table contains non-ASCII characters (e.g. 'α', 'δ'), so fix the
# encoding instead of relying on the platform default.
with open(src_file_path, "w", encoding="utf-8") as f:
    f.writelines(line + '\n' for line in src)

  


  0%|          | 0/4000800 [00:00<?, ?it/s]

3994522/4000800


  
100%|██████████| 3994522/3994522 [02:47<00:00, 23793.42it/s]


  0%|          | 0/3994522 [00:00<?, ?it/s]

100%|██████████| 3994522/3994522 [00:43<00:00, 92775.72it/s]

3994522/4000800





In [None]:
# Valid (aug_20)
# Builds the prompt-augmented source file for the validation split:
# read the augmented CSV -> drop a fixed set of rows -> extract per-reaction
# records in parallel -> keep one randomly chosen hint per record -> render
# prompt SMILES -> write one space-separated token line per record.
augmentated_df = pd.read_csv("datasets/50k_aug/2023_1_1_eval_r20.csv")

# The same six offsets are removed from each of 20 consecutive strides of 5001
# rows. NOTE(review): presumably 5001 is the row count of one augmentation
# copy -- confirm against the CSV.
idx_to_drop = [2302, 2527, 2950, 4368, 4863, 4890]
rows_to_drop = [j * 5001 + i for j in range(20) for i in idx_to_drop]
df = augmentated_df.drop(rows_to_drop)
df = df.reset_index(drop=True)
# Not used further in this cell; kept because later cells may read it.
rxn_class_list = [f"class_{n}" for n in df['class']]

# Process every reaction string with 20 workers, then drop results that
# contain the marker 'error'.
p_b_l = process_map(run_get_p_b_l_forward, df['reactants>reagents>production'], max_workers=20)
p_b_l = [record for record in p_b_l if 'error' not in record]
print(f"{len(p_b_l)}/{len(df)}")

# Keep element 0 unchanged, keep a single random entry of element 1 as the
# hint, and leave the remaining five slots empty.
# NOTE: random is not seeded here, so the chosen hints differ between runs.
p_b_l_prompt = [
    [record[0], [random.choice(record[1])], [], [], [], [], []]
    for record in p_b_l
]

prompt_smiles_lis = process_map(get_b_smiles_check, tqdm(p_b_l_prompt), max_workers=20)
# NOTE(review): this filter looks for the literal 'prompt_smiles_lis', while the
# filter above looks for 'error' -- confirm which marker get_b_smiles_check
# emits on failure; as written this may never drop anything.
prompt_smiles_lis = [smiles for smiles in prompt_smiles_lis if 'prompt_smiles_lis' not in smiles]
prompt_smiles_lis = [iso_to_symbo(prompt_smiles, dic_num_to_str) for prompt_smiles in tqdm(prompt_smiles_lis)]
# Report how many prompt SMILES survived (previously re-printed len(p_b_l)).
print(f"{len(prompt_smiles_lis)}/{len(df)}")

# Write into file
src = [" ".join(s) for s in prompt_smiles_lis]
src_file_path = "datasets/50k_ReactSeq_with_prompt/aug20_val/src_aug20_val.txt"
os.makedirs(os.path.dirname(src_file_path), exist_ok=True)
# The symbol table contains non-ASCII characters (e.g. 'α', 'δ'), so fix the
# encoding instead of relying on the platform default.
with open(src_file_path, "w", encoding="utf-8") as f:
    f.writelines(line + '\n' for line in src)

  if sys.path[0] == "":


  0%|          | 0/99900 [00:00<?, ?it/s]

99900/99900


100%|██████████| 99900/99900 [00:04<00:00, 24576.11it/s]


  0%|          | 0/99900 [00:00<?, ?it/s]

100%|██████████| 99900/99900 [00:01<00:00, 81180.94it/s]

99900/99900





In [8]:
# Test (aug_20)
# Builds the prompt-augmented source file for the test split:
# read the augmented CSV -> drop a fixed set of rows -> extract per-reaction
# records in parallel -> keep one randomly chosen hint per record -> render
# prompt SMILES -> write one space-separated token line per record.
augmentated_df = pd.read_csv("datasets/50k_aug/2023_1_1_test_r20.csv")

# The same seven offsets are removed from each of 20 consecutive strides of
# 5007 rows. NOTE(review): presumably 5007 is the row count of one
# augmentation copy -- confirm against the CSV.
idx_to_drop = [822, 1282, 1490, 1558, 2810, 3487, 4958]
rows_to_drop = [j * 5007 + i for j in range(20) for i in idx_to_drop]
df = augmentated_df.drop(rows_to_drop)
df = df.reset_index(drop=True)
# Not used further in this cell; kept because later cells may read it.
rxn_class_list = [f"class_{n}" for n in df['class']]

# Process every reaction string with 20 workers, then drop results that
# contain the marker 'error'.
p_b_l = process_map(run_get_p_b_l_forward, df['reactants>reagents>production'], max_workers=20)
p_b_l = [record for record in p_b_l if 'error' not in record]
print(f"{len(p_b_l)}/{len(df)}")

# Keep element 0 unchanged, keep a single random entry of element 1 as the
# hint, and leave the remaining five slots empty.
# NOTE: random is not seeded here, so the chosen hints differ between runs.
p_b_l_prompt = [
    [record[0], [random.choice(record[1])], [], [], [], [], []]
    for record in p_b_l
]

prompt_smiles_lis = process_map(get_b_smiles_check, tqdm(p_b_l_prompt), max_workers=20)
# NOTE(review): this filter looks for the literal 'prompt_smiles_lis', while the
# filter above looks for 'error' -- confirm which marker get_b_smiles_check
# emits on failure; as written this may never drop anything.
prompt_smiles_lis = [smiles for smiles in prompt_smiles_lis if 'prompt_smiles_lis' not in smiles]
prompt_smiles_lis = [iso_to_symbo(prompt_smiles, dic_num_to_str) for prompt_smiles in tqdm(prompt_smiles_lis)]
# Report how many prompt SMILES survived (previously re-printed len(p_b_l)).
print(f"{len(prompt_smiles_lis)}/{len(df)}")

# Write into file
src = [" ".join(s) for s in prompt_smiles_lis]
src_file_path = "datasets/50k_ReactSeq_with_prompt/aug20_test/src_aug20_test.txt"
os.makedirs(os.path.dirname(src_file_path), exist_ok=True)

# The symbol table contains non-ASCII characters (e.g. 'α', 'δ'), so fix the
# encoding instead of relying on the platform default.
with open(src_file_path, "w", encoding="utf-8") as f:
    f.writelines(line + '\n' for line in src)

  if sys.path[0] == "":


  0%|          | 0/100000 [00:00<?, ?it/s]

100000/100000


100%|██████████| 100000/100000 [00:04<00:00, 23567.58it/s]


  0%|          | 0/100000 [00:00<?, ?it/s]

100%|██████████| 100000/100000 [00:01<00:00, 80062.63it/s]

100000/100000





### Target data are the same as `50k_ReactSeq` (tgt data)

In [None]:
# Copy the existing target files of the 50k_ReactSeq dataset into the
# 50k_ReactSeq_with_prompt directories (test, val, train), keeping the file names.
# The destination directories must already exist; cp does not create them.
# NOTE(review): the src files written for this dataset omit records whose
# extraction result contained 'error' -- confirm the copied tgt files cover
# exactly the same rows so src and tgt stay line-aligned.
!cp "datasets/50k_ReactSeq/aug20_test/tgt_aug20_test.txt" "datasets/50k_ReactSeq_with_prompt/aug20_test/tgt_aug20_test.txt"
!cp "datasets/50k_ReactSeq/aug20_val/tgt_aug20_val.txt" "datasets/50k_ReactSeq_with_prompt/aug20_val/tgt_aug20_val.txt"
!cp "datasets/50k_ReactSeq/aug100_train/tgt_aug100_train.txt" "datasets/50k_ReactSeq_with_prompt/aug100_train/tgt_aug100_train.txt"