# Predict

In [1]:
import sys
sys.path.insert(0,"/home/gridsan/hwpang/Software/RMG-Py/")
sys.path.insert(0,"..")

import json
import pandas as pd

from rmgpy.data.thermo import ThermoDatabase, ThermoGroups

from tree.thermo import ThermoGroups as SIDTThermoGroups
from tree.utils import make_mol
from tree.parameters import Ts



# Load test data

In [2]:
hbi_unc_df = pd.read_csv("../data/hbi_unc.csv")

In [3]:
with open("../data/splits/random.json", "r") as f:
    train_inds, test_inds = json.load(f)

In [4]:
test_df = hbi_unc_df.loc[test_inds, :]
test_df

Unnamed: 0,radical_smiles,resonance_radical_smiles,resonance_radical_num_rotatable_bonds,radical_H298 (kcal/mol),radical_Sint298 (cal/mol/K),radical_source,radical_level_of_theory,radical_Cp300 (cal/mol/K),radical_Cp400 (cal/mol/K),radical_Cp500 (cal/mol/K),...,unc_closed_shell_Cp1500 (cal/mol/K),unc_HBI_H298 (kcal/mol),unc_HBI_Sint298 (cal/mol/K),unc_HBI_Cp300 (cal/mol/K),unc_HBI_Cp400 (cal/mol/K),unc_HBI_Cp500 (cal/mol/K),unc_HBI_Cp600 (cal/mol/K),unc_HBI_Cp800 (cal/mol/K),unc_HBI_Cp1000 (cal/mol/K),unc_HBI_Cp1500 (cal/mol/K)
14,[O]C(=O)OC(=O)O,[O]C(=O)OC(=O)O,0,-164.838453,87.670293,dong_pio_liang.py,CBS-QB3,23.387634,26.613081,29.284902,...,1.0,1.697056,1.414214,1.414214,1.414214,1.414214,1.414214,1.414214,1.414214,1.414214
15,CC(=O)COC(O[O])OC(C)=O,CC(=O)COC(O[O])OC(C)=O,7,-164.222931,134.520962,dong_pio_liang.py,CBS-QB3,51.800544,60.936453,68.795453,...,2.0,4.176123,2.236068,2.236068,2.236068,2.236068,2.236068,2.236068,2.236068,2.236068
18,[O]OC(=O)OC(=O)O,[O]OC(=O)OC(=O)O,0,-155.105883,93.562651,dong_pio_liang.py,CBS-QB3,29.543449,33.217979,35.694866,...,1.0,1.697056,1.414214,1.414214,1.414214,1.414214,1.414214,1.414214,1.414214,1.414214
25,C[C]1OC(=O)C(O)(C=O)O1,C[C]1OC(=O)C(O)(C=O)O1,3,-151.205886,98.996040,dong_pio_liang.py,CBS-QB3,38.751751,46.650997,52.369635,...,2.0,4.176123,2.236068,2.236068,2.236068,2.236068,2.236068,2.236068,2.236068,2.236068
36,CC1(OO)COC(O[O])C(=O)O1,CC1(OO)COC(O[O])C(=O)O1,4,-137.513199,113.097378,dong_pio_liang.py,CBS-QB3,46.328845,56.831030,65.250112,...,2.0,4.176123,2.236068,2.236068,2.236068,2.236068,2.236068,2.236068,2.236068,2.236068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2748,C#CC1([CH2])OO1,C#CC1([CH2])OO1,1,96.427971,79.485861,dong_pio_liang.py,CBS-QB3,24.423412,29.090718,32.193888,...,2.0,4.176123,2.236068,2.236068,2.236068,2.236068,2.236068,2.236068,2.236068,2.236068
2769,C#CCC=[C]C,C#CCC=[C]C,2,116.422647,89.067407,dong_pio_liang.py,CBS-QB3,26.955486,32.081066,36.841331,...,2.0,4.176123,2.236068,2.236068,2.236068,2.236068,2.236068,2.236068,2.236068,2.236068
2772,CC=C=C1[CH]C1,CC=C=C1[CH]C1,1,119.785415,85.628683,dong_pio_liang.py,CBS-QB3,26.996656,32.629649,37.664892,...,2.0,4.176123,2.236068,2.236068,2.236068,2.236068,2.236068,2.236068,2.236068,2.236068
2773,CC=C=C1[CH]C1,CC=[C]C1=CC1,2,119.785415,85.628683,dong_pio_liang.py,CBS-QB3,26.996656,32.629649,37.664892,...,1.0,1.697056,1.414214,1.414214,1.414214,1.414214,1.414214,1.414214,1.414214,1.414214


In [5]:
mols = test_df["resonance_radical_smiles"].apply(make_mol)
mols = mols.to_list()

# Load thermo database

In [6]:
def make_prediction(thermo_database, tree, mol):
    atoms = {"*": atom for atom in mol.atoms if atom.radical_electrons==1}
    return thermo_database._add_group_thermo_data(None, tree, mol, atoms)[0]

In [7]:
thermo_database = ThermoDatabase()

# Predict with SIDT tree

In [8]:
model_dir = "../models/split-random_run-retrain-empirical"
sidt_tree = SIDTThermoGroups().load(f"{model_dir}/tree.py", thermo_database.local_context, thermo_database.global_context)

In [9]:
thermos = [make_prediction(thermo_database, sidt_tree, mol) for mol in mols]

In [10]:
test_result_df = test_df[["resonance_radical_smiles"]]
test_result_df["HBI_H298 (kcal/mol)"] = [thermo.H298.value_si/4184 for thermo in thermos]
test_result_df["unc_HBI_H298 (kcal/mol)"] = [thermo.H298.uncertainty_si/4184 for thermo in thermos]
test_result_df["HBI_Sint298 (cal/mol/K)"] = [thermo.S298.value_si/4.184 for thermo in thermos]
test_result_df["unc_HBI_Sint298 (cal/mol/K)"] = [thermo.S298.uncertainty_si/4.184 for thermo in thermos]
for i, T in enumerate(Ts):
    test_result_df[f"HBI_Cp{T} (cal/mol/K)"] = [thermo.Cpdata.value_si[i]/4.184 for thermo in thermos]
    test_result_df[f"unc_HBI_Cp{T} (cal/mol/K)"] = [thermo.Cpdata.uncertainty_si[i]/4.184 for thermo in thermos]
test_result_df["comment"] = [thermo.comment for thermo in thermos]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [11]:
split = "random"
run_name = "test"
test_result_df.to_csv(f"{model_dir}/test.csv", index=False)