# Retrain empirical tree

In [16]:
import sys
sys.path.insert(0,"/home/gridsan/hwpang/Software/RMG-Py/")
sys.path.insert(0,"..")

import os
import json

import random
import time as time
import pandas as pd
import numpy as np
from pathlib import Path

from rmgpy.molecule.group import Group
from rmgpy.data.base import Entry
from rmgpy.data.thermo import ThermoData, ThermoDatabase

from tree.thermo import ThermoGroups, average_thermo_data
from tree.utils import make_mol
from tree.parameters import Ts

# Get data

In [17]:
# Read the HBI dataset (thermo values plus their "unc_*" uncertainty columns).
hbi_unc_csv = "../data/hbi_unc.csv"
hbi_unc_df = pd.read_csv(hbi_unc_csv)
# Bare last expression so the notebook renders a preview of the frame.
hbi_unc_df

Unnamed: 0,radical_smiles,resonance_radical_smiles,resonance_radical_num_rotatable_bonds,radical_H298 (kcal/mol),radical_Sint298 (cal/mol/K),radical_source,radical_level_of_theory,radical_Cp300 (cal/mol/K),radical_Cp400 (cal/mol/K),radical_Cp500 (cal/mol/K),...,unc_closed_shell_Cp1500 (cal/mol/K),unc_HBI_H298 (kcal/mol),unc_HBI_Sint298 (cal/mol/K),unc_HBI_Cp300 (cal/mol/K),unc_HBI_Cp400 (cal/mol/K),unc_HBI_Cp500 (cal/mol/K),unc_HBI_Cp600 (cal/mol/K),unc_HBI_Cp800 (cal/mol/K),unc_HBI_Cp1000 (cal/mol/K),unc_HBI_Cp1500 (cal/mol/K)
0,[O]C(=O)OC(O)(O)O,[O]C(=O)OC(O)(O)O,4,-223.514126,95.999974,dong_pio_liang.py,CBS-QB3,33.926878,38.387673,41.508496,...,2.0,5.2,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
1,[O]C(O)(O)OC(=O)O,[O]C(O)(O)OC(=O)O,3,-223.423390,92.781436,dong_pio_liang.py,CBS-QB3,32.531179,37.569659,41.498443,...,2.0,5.2,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
2,O=C(O)O[C](O)O,O=C(O)O[C](O)O,3,-187.322869,89.517068,dong_pio_liang.py,CBS-QB3,28.973242,34.553073,38.746721,...,2.0,5.2,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
3,CC(=O)OC(OO)C(=O)C(O)O[O],CC(=O)OC(OO)C(=O)C(O)O[O],8,-185.660577,132.633380,dong_pio_liang.py,CBS-QB3,56.569703,66.028226,73.603744,...,2.0,5.2,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
4,O=[C]OC(O)(O)O,O=[C]OC(O)(O)O,5,-180.497563,90.031975,dong_pio_liang.py,CBS-QB3,32.378191,36.844343,39.334179,...,2.0,5.2,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2804,C1=C[CH]C=1,[C]1=CC=C1,0,156.106582,65.385768,dong_pio_liang.py,CBS-QB3,14.653111,18.488757,21.562144,...,2.0,5.2,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
2805,C=C=C1C=[C]C1,C=C=C1C=[C]C1,0,157.733085,77.587625,dong_pio_liang.py,CBS-QB3,22.675255,28.333884,32.976246,...,2.0,5.2,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
2806,[CH2]C1=CC#CC1,[CH2]C1=CC#CC1,1,163.498344,75.963844,dong_pio_liang.py,CBS-QB3,22.606638,28.495118,33.391100,...,2.0,5.2,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
2807,[CH2]C1=CC#CC1,C=C1[CH]C#CC1,0,163.498344,75.963844,dong_pio_liang.py,CBS-QB3,22.606638,28.495118,33.391100,...,2.0,5.2,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0


In [18]:
# Name of the train/test split to use; it selects the JSON file read below and
# is reused later in the output path of the saved tree.
split = "cluster"

# The split file holds a two-element sequence: training indices, then test indices.
split_json = f"../data/splits/{split}.json"
with open(split_json, "r") as split_file:
    split_indices = json.load(split_file)
train_inds, test_inds = split_indices

In [19]:
# Keep only the rows whose index labels belong to the training split (all columns).
train_df = hbi_unc_df.loc[train_inds]

In [20]:
# Convert every resonance-radical SMILES string of the training set with
# make_mol and collect the results in a plain list (same order as train_df).
mols = train_df["resonance_radical_smiles"].apply(make_mol).to_list()

In [21]:
def _hbi_correction_from_row(row):
    """Build the HBI ``ThermoData`` correction for one row of ``train_df``.

    Each quantity is paired with its ``unc_*`` column as a "+|-" uncertainty.
    Heat capacities are read at every temperature in ``Ts``.
    """
    h298 = (row["HBI_H298 (kcal/mol)"], "kcal/mol", "+|-", row["unc_HBI_H298 (kcal/mol)"])
    s298 = (row["HBI_Sint298 (cal/mol/K)"], "cal/mol/K", "+|-", row["unc_HBI_Sint298 (cal/mol/K)"])
    cp_values = [row[f"HBI_Cp{T} (cal/mol/K)"] for T in Ts]
    cp_uncertainties = [row[f"unc_HBI_Cp{T} (cal/mol/K)"] for T in Ts]
    # Record provenance; the "Thermo library: ../" prefix is stripped from the
    # closed-shell source before it goes into the comment.
    radical_source = row['radical_source']
    closed_shell_source = row['closed_shell_thermo_source'].replace('Thermo library: ../', '')
    return ThermoData(
        H298=h298,
        S298=s298,
        Cpdata=(cp_values, "cal/mol/K", "+|-", cp_uncertainties),
        Tdata=(Ts, "K"),
        comment=f"Radical thermo from {radical_source} and closed shell thermo from {closed_shell_source}",
    )


# One correction per training row, in the same order as train_df (and thus mols).
HBI_corrections = train_df.apply(_hbi_correction_from_row, axis=1).to_list()

In [22]:
# Pair each training molecule with its HBI correction as (molecule, correction) tuples.
mols_corrections_all = [(mol, correction) for mol, correction in zip(mols, HBI_corrections)]

# Generate tree

In [23]:
# Train on the full set of (molecule, correction) pairs.
mols_corrections = mols_corrections_all
# Number of parallel jobs handed to the tree routines below (1 = sequential).
n_jobs = 1

In [24]:
# Fresh ThermoDatabase instance; the next cell uses its local_context and
# global_context when loading the radical group file.
thermo_database = ThermoDatabase()

In [25]:
# Load the existing radical group tree from the RMG-database checkout.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
radical_groups_path = "/home/gridsan/hwpang/Software/RMG-database/input/thermo/groups/radical.py"
tree = ThermoGroups().load(radical_groups_path, thermo_database.local_context, thermo_database.global_context)

# Step 1: map the training molecules onto the tree's group templates.
# perf_counter() is a monotonic clock, so the reported duration cannot be
# distorted by system clock adjustments during this long-running step.
start = time.perf_counter()
template_mol_map = tree.get_molecule_matches(mols_corrections=mols_corrections,
                                             exact_matches_only=False, n_jobs=n_jobs)
end = time.perf_counter()
print("Mol mapping:")
print(end-start)

# tree.regularize(template_mol_map)

# Step 2: rebuild the group corrections from the template -> molecule mapping.
start = time.perf_counter()
tree.make_corrections_from_template_mol_map(template_mol_map, n_jobs=n_jobs)
end = time.perf_counter()
print("Make corrections:")
print(end-start)

# tree.check_tree()

# Disabled: grafts the RJ2_triplet and RJ3 subtrees from thermo_database onto
# this tree. NOTE(review): `deepcopy` is not imported in the imports cell, and
# this presumably needs thermo_database.groups["radical"] to be loaded first --
# confirm before re-enabling.
# def add_children(old_node, new_node):
#     for old_child in old_node.children:
#         tree.entries[old_child.label] = deepcopy(old_child)
#         tree.entries[old_child.label].parent = new_node
#         new_node.children.append(tree.entries[old_child.label])
#         add_children(old_child, tree.entries[old_child.label])
# tree.entries["RJ2_triplet"] = deepcopy(thermo_database.groups["radical"].entries["RJ2_triplet"])
# tree.entries["RJ2_triplet"].parent = tree.entries["Root"]
# tree.entries["Root"].children.append(tree.entries["RJ2_triplet"])
# add_children(thermo_database.groups["radical"].entries["RJ2_triplet"], tree.entries["RJ2_triplet"])

# tree.entries["RJ3"] = deepcopy(thermo_database.groups["radical"].entries["RJ3"])
# tree.entries["RJ3"].parent = tree.entries["Root"]
# tree.entries["Root"].children.append(tree.entries["RJ3"])
# add_children(thermo_database.groups["radical"].entries["RJ3"], tree.entries["RJ3"])

# Step 3: save the retrained tree. Create the run directory first so the save
# does not depend on it already existing on a fresh checkout.
tree_path = f"../models/split-{split}_run-retrain_empirical/tree.py"
os.makedirs(os.path.dirname(tree_path), exist_ok=True)
tree.save(tree_path)

INFO:root:Getting molecule matches...
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 2470 out of 2470 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.2s remaining:    0.0s


Mol mapping:
71.4522020816803


[Parallel(n_jobs=1)]: Done 1994 out of 1994 | elapsed:    0.9s finished


Make corrections:
1.7062225341796875
