# Write the SMILES you want to predict here

In [None]:
# SMILES strings of the alkenes to predict, one entry per molecule.
smiles_list = [
    'C[C@@H]1CC[C@@H](C(C)C)C=C1',
    'C[C@@H]1CC[C@H](C(C)C)C=C1',
]
# Project label; it becomes the name of the results sub-directory.
project_name = 'p-menth-2-ene'

# Then run all of the following cells

## Install packages

In [4]:
! nvidia-smi

Sat Dec 27 02:58:57 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   69C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
# Install PyTorch 2.6.0 with the matching torchvision / torchaudio releases from the CUDA 12.4 wheel index.
! uv pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124
# General-purpose dependencies. NOTE(review): these versions are not pinned, so a fresh
# environment may resolve to different releases than the ones this notebook was run with.
! uv pip install easydict rdkit tensorboard tqdm wandb pandas scikit-fingerprints
! uv pip install pytorch_lightning --no-build-isolation
# PyTorch Geometric companion packages, taken from the wheel index for torch 2.6.0 + cu124.
! uv pip install torch-cluster torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-2.6.0+cu124.html --no-build-isolation
! uv pip install torch-geometric==2.6.1 --no-build-isolation

[2mUsing Python 3.12.12 environment at: /usr[0m
[2mAudited [1m3 packages[0m [2min 97ms[0m[0m
[2mUsing Python 3.12.12 environment at: /usr[0m
[2mAudited [1m7 packages[0m [2min 105ms[0m[0m
[2mUsing Python 3.12.12 environment at: /usr[0m
[2mAudited [1m1 package[0m [2min 108ms[0m[0m
[2mUsing Python 3.12.12 environment at: /usr[0m
[2mAudited [1m3 packages[0m [2min 100ms[0m[0m
[2mUsing Python 3.12.12 environment at: /usr[0m
[2mAudited [1m1 package[0m [2min 95ms[0m[0m


In [3]:
! git clone https://github.com/jackie-illiilli/Alkene_Ozone_Predict

fatal: destination path 'Alkene_Ozone_Predict' already exists and is not an empty directory.


In [6]:
import sys
sys.path.append("/content/Alkene_Ozone_Predict")

## Import packages

In [7]:
import os, glob, shutil, pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
from rdkit import Chem
import torch
from DFTStructureGenerator import mol_manipulation, logfile_process, xtb_process, FormatConverter, Kwon
from TSDiff.Preprocess import preprocess as tsdiff_preprocess
from TSDiff.train import train as tsdiff_train
from TSDiff.sampling import sample as tsdiff_sample
from confrankplus.train import train as confrankplus_train
from confrankplus.train import test as confrankplus_test

## Run

In [None]:

# All artefacts of this run are written below a per-project results directory.
root_dir = f'/content/results/{project_name}'

# One row per input SMILES: 'Index' is its position in smiles_list, 'Ene' the SMILES itself.
df = pd.DataFrame(
    {'Index': list(range(len(smiles_list))), 'Ene': list(smiles_list)},
    columns=['Index', 'Ene'],
)

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(root_dir, exist_ok=True)
df.to_csv(f'{root_dir}/Final.csv', index=False)

In [None]:
# Build the structure packs, the restriction dictionaries and the per-site table (new_df)
# for every alkene in df. Molecule files are not written to disk (save_mol=False).
(
    gs_pack,
    gs2_pack,
    ts_pack,
    ts2_pack,
    restrict_dict,
    restrict_dict_2,
    new_df,
) = Kwon.generate_mol_input(df, mol_dir=f'{root_dir}/Mols', save_mol=False, seed=3)

# The first two entries of each pack are parallel sequences (molecules, names);
# index the molecules by name so later cells can look them up.
ts_mol_dict = dict(zip(ts_pack[1], ts_pack[0]))
ts2_mol_dict = dict(zip(ts2_pack[1], ts2_pack[0]))

In [None]:
# ts_mol_dict = {}
# ts2_mol_dict = {}
# for mol_file in glob.glob(f'data/{project_name}/Mols/*.mol'):
#     if 'ts1' in mol_file:
#         mol = Chem.MolFromMolFile(mol_file, removeHs=False)
#         mol_name = os.path.basename(mol_file).split('.mol')[0]
#         ts_mol_dict[mol_name] = mol
#     elif 'ts2' in mol_file:
#         mol = Chem.MolFromMolFile(mol_file, removeHs=False)
#         mol_name = os.path.basename(mol_file).split('.mol')[0]
#         ts2_mol_dict[mol_name] = mol

In [12]:
# Collect, for every row of new_df, the ts1/ts2 guess geometries together with the
# atom-mapped reaction string of the corresponding step, then write them to <root_dir>/row.
all_ts_guess = []
All_AAM = []
All_Name = []
for row_id, row in tqdm(new_df.iterrows(), total=len(new_df)):
    Index, smiles = int(row['Index']), row['Ene']
    Site_A, Site_B = int(row['Site_A']), int(row['Site_B'])
    Z_Pos, Rot = int(row['Z_Pos']), int(row['Rot'])

    # Pick the atom mapping whose site pair matches this row. The for/else fails loudly
    # when nothing matches instead of silently reusing target_AAM from the previous row.
    AAMs = Kwon.smiles_to_AAM(smiles, return_mediate=True)
    for AAM in AAMs:
        if AAM[1][0] == Site_A and AAM[1][1] == Site_B:
            target_AAM = AAM[0]
            break
    else:
        raise ValueError(
            f'No atom mapping found for {smiles!r} at sites ({Site_A}, {Site_B})'
        )

    # presumably the mapping reads "reactant>>intermediate>>product" when
    # return_mediate=True — TODO confirm against Kwon.smiles_to_AAM
    aam_parts = target_AAM.split(">>")
    reactant_mol = Chem.MolFromSmarts(aam_parts[0])
    reactant_atom_map_list = [atom.GetAtomMapNum() for atom in reactant_mol.GetAtoms()]
    # Element symbols re-ordered by ascending atom-map number.
    reactant_atom_list = np.array([atom.GetSymbol() for atom in reactant_mol.GetAtoms()])[np.argsort(reactant_atom_map_list)]
    # Step 1 joins the first two parts, step 2 joins everything from the second part on.
    target_AAMs = [">>".join(aam_parts[:2]), ">>".join(aam_parts[1:])]

    for idx, ts_prefix in enumerate(('ts1', 'ts2')):
        # The prefix is bound to a name first: reusing the quote character inside an
        # f-string replacement field only parses on Python >= 3.12.
        mol_name = f'{ts_prefix}_{Index:05}_{Site_A:03}_{Site_B:03}_{Z_Pos}_{Rot}'
        # A row may have no ts1 guess of its own and is then skipped; a missing ts2
        # guess is not tolerated and raises KeyError on the lookup below.
        if idx == 0 and mol_name not in ts_mol_dict:
            continue
        ts_guess_mol = ts_mol_dict[mol_name] if idx == 0 else ts2_mol_dict[mol_name]
        ts_guess_position = np.array(ts_guess_mol.GetConformer().GetPositions())
        all_ts_guess.append({
            'name': mol_name,
            'atomlist': reactant_atom_list,
            'positions': ts_guess_position
        })
        All_AAM.append(target_AAMs[idx])
        All_Name.append(mol_name)

# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(f'{root_dir}/row', exist_ok=True)
FormatConverter.write_xyz_file(f'{root_dir}/row/ts_guess.xyz', all_ts_guess)
new_df.to_csv(f'{root_dir}/row/Detail.csv', index=False)
new_df_ = pd.DataFrame({'Index': np.arange(len(All_AAM)), 'AAM': All_AAM, 'name': All_Name})
new_df_.to_csv(f'{root_dir}/row/Input.csv', index=False)

16it [00:00, 335.73it/s]


In [13]:
# Pre-process the TS guesses for TSDiff. The final argument lists every row of
# Input.csv; the recorded run reports all of them in the test split.
processed_dir = os.path.join(root_dir, 'processed')
input_csv_path = os.path.join(root_dir, 'row', 'Input.csv')
ts_guess_xyz_path = os.path.join(root_dir, 'row', 'ts_guess.xyz')
trained_feat_dict_path = '/content/Alkene_Ozone_Predict/Trained_model/feat_dict.pkl'
all_row_indices = np.arange(len(new_df_))

tsdiff_preprocess(
    processed_dir,
    input_csv_path,
    None,
    ts_guess_xyz_path,
    trained_feat_dict_path,
    None,
    None,
    all_row_indices,
)

  edge_index = torch.tensor([row, col], dtype=torch.long)
24it [00:00, 252.76it/s]

Preprocessing done. Train: 0, Valid: 0, Test: 24





In [14]:
# Run TSDiff sampling on the pre-processed test set with the shipped checkpoint;
# the returned path is kept in sample_path for the next cell.
processed_dir = os.path.join(root_dir, 'processed')
sampling_args = dict(
    ckpt='/content/Alkene_Ozone_Predict/Trained_model/tsdiff.ckpt',
    test_set=os.path.join(processed_dir, 'test_data.pkl'),
    feat_dict=os.path.join(processed_dir, 'feat_dict.pkl'),
    save_dir=processed_dir,
)
sample_path = tsdiff_sample(sampling_args)

[2025-12-27 02:45:21,017::sample::INFO] Sampling arguments: {'ckpt': '/content/Alkene_Ozone_Predict/Trained_model/tsdiff.ckpt', 'device': 'cuda', 'batch_size': 500, 'resume': False, 'save_traj': False, 'save_dir': '/content/results/p-menth-2-ene/processed', 'feat_dict': '/content/results/p-menth-2-ene/processed/feat_dict.pkl', 'test_set': '/content/results/p-menth-2-ene/processed/test_data.pkl', 'start_idx': 0, 'end_idx': 99999, 'repeat': 1, 'from_ts_guess': True, 'denoise_from_time_t': 1500, 'noise_from_time_t': None, 'clip': 1000.0, 'n_steps': 1500, 'sampling_type': 'ld', 'eta': 1.0, 'step_lr': 1e-07, 'seed': 2022}
INFO:sample:Sampling arguments: {'ckpt': '/content/Alkene_Ozone_Predict/Trained_model/tsdiff.ckpt', 'device': 'cuda', 'batch_size': 500, 'resume': False, 'save_traj': False, 'save_dir': '/content/results/p-menth-2-ene/processed', 'feat_dict': '/content/results/p-menth-2-ene/processed/feat_dict.pkl', 'test_set': '/content/results/p-menth-2-ene/processed/test_data.pkl', 'sta

  0%|          | 0/1 [00:00<?, ?it/s]

Start with zero-noise
Denoise from t=1500 to t=0
Initial Position
tensor([[ 3.2178,  1.0352, -0.3402],
        [ 1.6956,  1.3761, -0.6322],
        [ 0.9433,  1.7774,  0.7067],
        ...,
        [-1.1525, -0.8364,  1.8397],
        [ 0.3633, -0.3711,  3.0958],
        [ 1.0970,  0.5970,  2.4473]], device='cuda:0')


sample: 0it [00:00, ?it/s]

  bgraph_adj = torch.sparse.LongTensor(edge_index, edge_type, torch.Size([N, N]))
[2025-12-27 02:46:39,310::sample::INFO] Sampling completed! Total 24 conformations saved to:
INFO:sample:Sampling completed! Total 24 conformations saved to:
[2025-12-27 02:46:39,312::sample::INFO]     /content/results/p-menth-2-ene/processed/samples_all.pkl
INFO:sample:    /content/results/p-menth-2-ene/processed/samples_all.pkl


Generated Position
tensor([[ 3.6368,  0.0700,  0.3109],
        [ 2.1530,  0.3795,  0.0763],
        [ 1.4455,  0.8354,  1.3645],
        ...,
        [-0.4393, -1.7336,  1.1790],
        [ 1.1680, -1.5067,  2.3250],
        [ 1.6443, -0.5154,  1.6021]], device='cuda:0')


In [15]:
# Convert the TSDiff samples into the list-of-dicts format saved as <root_dir>/cfrk/test.pt.
train_data, val_data, test_data = [], [], []

# NOTE: pickle.load can execute arbitrary code. sample_path is the file written by
# tsdiff_sample above; never point this at a pickle from an untrusted source.
with open(sample_path, "rb") as f:
    results = pickle.load(f)

# Use a dedicated name: rebinding the global `df` here would make the earlier
# generate_mol_input cell operate on the wrong table when it is re-run.
input_df = pd.read_csv(os.path.join(root_dir, 'row', 'Input.csv'), index_col='Index')
for row_id, row in input_df.iterrows():
    each_result = results[row_id]
    rxn_index = each_result.rxn_index
    smiles = each_result.smiles
    AAM = row['AAM']
    # The samples must line up one-to-one with the rows of Input.csv.
    assert AAM == smiles and rxn_index == row_id, (
        f'Sample {row_id} does not match Input.csv (rxn_index={rxn_index})'
    )

    # The name is split on "_": the first two fields form the ensemble id,
    # the remaining fields form the conformer id.
    log_name = row['name'].split("_")
    confid = '_'.join(log_name[2:])
    ensbid = '_'.join(log_name[:2])
    charge = sum(atom.GetFormalCharge() for atom in each_result.rdmol[0].GetAtoms())
    symbol_list = each_result.atom_type
    # as_tensor(...).detach().clone() produces the same detached copy as torch.tensor(...)
    # without the copy-construct UserWarning when the source is already a tensor.
    # NOTE(review): positions are read from `ts_guess`; confirm that tsdiff_sample stores
    # the geometry intended for ranking in that field.
    data = {
        'confid': confid,
        'ensbid': ensbid,
        'total_charge': torch.tensor(charge, dtype=torch.float32),
        'z': torch.as_tensor(symbol_list, dtype=torch.long).detach().clone(),
        'pos': torch.as_tensor(each_result.ts_guess, dtype=torch.float32).detach().clone(),
    }
    test_data.append(data)

# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(os.path.join(root_dir, 'cfrk'), exist_ok=True)
torch.save(test_data, os.path.join(root_dir, 'cfrk', 'test.pt'))

  'z':torch.tensor(symbol_list, dtype=torch.long),
  'pos':torch.tensor(each_result.ts_guess, dtype=torch.float32),


In [None]:
# Score the saved test set with the shipped ConfRank checkpoint on GPU 0.
# The return value is kept in cfrk_pred and loaded with torch.load in the next cell.
cfrk_dir = os.path.join(root_dir, 'cfrk')
cfrk_pred = confrankplus_test(
    project_name=cfrk_dir,
    best_ckpt_path='/content/Alkene_Ozone_Predict/Trained_model/confrank.ckpt',
    test_set=os.path.join(cfrk_dir, 'test.pt'),
    gpu_id=0,
)

Using torch_cluster for computing neighborlists.


  data.z = torch.tensor(raw_dict['z'], dtype=torch.long)  # Or keep as data.z if not using as x
  data.pos = torch.tensor(raw_dict['pos'], dtype=torch.float)  # (num_atoms, 3)
  data.total_charge = torch.tensor(raw_dict['total_charge'], dtype=torch.float)


Prediction saved to /content/results/p-menth-2-ene/cfrk/Pred.pt


In [10]:
# Attach the predicted ts1 / ts2 energies to every row of Detail.csv and write it back.
target_df_test = pd.read_csv(f'{root_dir}/row/Detail.csv')
pred_energies_dict = torch.load(cfrk_pred)
pred_TS = []
pred_TS2 = []
for row_id, row in target_df_test.iterrows():
    # Cast to int so the zero-padded names match the ones built when the guesses were written.
    Index, Site_A, Site_B = int(row['Index']), int(row['Site_A']), int(row['Site_B'])
    Z_Pos, Rot = int(row['Z_Pos']), int(row['Rot'])

    # ts1: look the entry up under this row's site order first, then under the swapped
    # order; if neither exists the row gets NaN.
    ts1_name = f'ts1_{Index:05}_{Site_A:03}_{Site_B:03}_{Z_Pos}_{Rot}'
    if ts1_name not in pred_energies_dict:
        ts1_name = f'ts1_{Index:05}_{Site_B:03}_{Site_A:03}_{Z_Pos}_{Rot}'
    pred_TS.append(pred_energies_dict.get(ts1_name, np.nan))

    # ts2: no fallback — a missing entry raises KeyError, exactly as before (the former
    # NaN branch for ts2 could never be reached and has been removed).
    ts2_name = f'ts2_{Index:05}_{Site_A:03}_{Site_B:03}_{Z_Pos}_{Rot}'
    pred_TS2.append(pred_energies_dict[ts2_name])

target_df_test['pred_TS'] = pred_TS
target_df_test['pred_TS2'] = pred_TS2
target_df_test.to_csv(f'{root_dir}/row/Detail.csv', index=False)

In [11]:
# Reduce the per-site predictions in Detail.csv to one energy difference per alkene.
detail_df = pd.read_csv(f'{root_dir}/row/Detail.csv')

# Column triple handed to calc_DDG; only predicted energies exist in this notebook.
# To compare against DFT references, add the columns below to detail_df and prepend
# ['TS_G(kcal/mol)', 'TS2_G(kcal/mol)', 'B-A Energy'] to this list:
# detail_df['TS_G(kcal/mol)'] = 627.5 * detail_df['TS_G']
# detail_df['TS2_G(kcal/mol)'] = 627.5 * detail_df['TS2_G']
column_TS1_TS2 = [['pred_TS', 'pred_TS2', 'B-A Energy_pred']]

# NOTE(review): banned_ene=[9] is hard-coded; if it refers to the 'Index' of an input
# alkene, the tenth entry of a longer smiles_list would be excluded — confirm with Kwon.calc_DDG.
result_df = Kwon.calc_DDG(result_df=pd.read_csv(f'{root_dir}/Final.csv'),
                          detail_df=detail_df,
                          column_TS1_TS2=column_TS1_TS2,
                          banned_ene=[9])


In [12]:
result_df

Unnamed: 0,Index,Ene,A_site,B_site,B-A Energy_pred
0,0,C[C@@H]1CC[C@@H](C(C)C)C=C1,8,9,-0.818539
1,1,C[C@@H]1CC[C@H](C(C)C)C=C1,8,9,1.1213


# Read the results here

In [None]:
from rdkit.Chem.Draw import rdMolDraw2D

# NOTE(review): 0.38 is an unexplained scaling factor applied to the predicted energy
# difference before it is converted to a ratio — confirm its origin.
DG_SCALE = 0.38
# Number of molecules drawn per grid image.
MOLS_PER_IMAGE = 48

opts = rdMolDraw2D.MolDrawOptions()
opts.annotationFontScale = 1
opts.setAtomNoteColour((1.0, 0.0, 0.0, 1.0))
all_mols = []
site_As = []
site_Bs = []
# Only rows without missing values can be annotated.
valid_rows = result_df.dropna()
for row_id, row in valid_rows.iterrows():
    ene = row['Ene']
    A_site = int(row['A_site'])
    B_site = int(row['B_site'])
    mol = Chem.MolFromSmiles(ene)
    pred_dG = row['B-A Energy_pred'] * DG_SCALE
    ratio = Kwon.G_to_K(pred_dG)

    # Convert the ratio into percentage shares for the two sites.
    ratio_b = ratio / (1 + ratio) * 100
    ratio_a = 100 - ratio_b
    site_As.append(ratio_a)
    site_Bs.append(ratio_b)
    mol.GetAtomWithIdx(A_site).SetProp('atomNote', f'{ratio_a:.1f}')
    mol.GetAtomWithIdx(B_site).SetProp('atomNote', f'{ratio_b:.1f}')
    all_mols.append(mol)

# Align on the index of the rows that were actually processed. Assigning the bare lists
# raised a length-mismatch error as soon as dropna() removed a row; those rows now get NaN.
result_df['Ratio_A'] = pd.Series(site_As, index=valid_rows.index, dtype=float)
result_df['Ratio_B'] = pd.Series(site_Bs, index=valid_rows.index, dtype=float)

# Write next to the other outputs of this run. The previous relative 'Data/<project>'
# directory is never created by this notebook, so the save failed. index=False matches
# the other CSV files written here.
os.makedirs(root_dir, exist_ok=True)
result_df.to_csv(f'{root_dir}/Final.csv', index=False)

# Draw the annotated molecules in grids of at most MOLS_PER_IMAGE structures each.
svgs = []
for start in range(0, len(all_mols), MOLS_PER_IMAGE):
    svgs.append(Chem.Draw.MolsToGridImage(all_mols[start:start + MOLS_PER_IMAGE], molsPerRow=6, subImgSize=(200,200), useSVG=True, drawOptions=opts))
for svg in svgs:
    display(svg)
