# Get corresponding closed shells and data

In [1]:
import sys
sys.path.insert(0,"/home/gridsan/hwpang/Software/RMG-Py/")
sys.path.insert(0,"..")

import random
import os
import time
import math
from copy import deepcopy
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm

from rmgpy.data.thermo import ThermoDatabase, ThermoLibrary, ThermoData, remove_thermo_data, add_thermo_data, NASA
from rmgpy.molecule import Molecule
from rmgpy.species import Species
from rmgpy import settings
from rmgpy import constants

from tree.utils import load_thermo_lib_by_path, generate_thermo
from tree.parameters import Ts



# Load resonance radicals

In [2]:
resonance_radical_df = pd.read_csv("../data/resonance_radicals.csv")

# Generate closed shell structures

In [3]:
closed_shell_df = resonance_radical_df[["resonance_radical_smiles"]]

In [4]:
def generate_closed_shell_smi(smi):
    mol = Molecule().from_smiles(smi)
    mol.saturate_radicals()
    mol.atoms.sort()
    return mol.to_smiles()

generate_closed_shell_smi("CC1OC1OC[O]")

'CC1OC1OCO'

In [5]:
closed_shell_df["closed_shell_smiles"] = closed_shell_df["resonance_radical_smiles"].apply(generate_closed_shell_smi)
closed_shell_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,resonance_radical_smiles,closed_shell_smiles
0,[O]C(=O)OC(O)(O)O,O=C(O)OC(O)(O)O
1,[O]C(O)(O)OC(=O)O,O=C(O)OC(O)(O)O
2,O=C(O)O[C](O)O,O=C(O)OC(O)O
3,CC(=O)OC(OO)C(=O)C(O)O[O],CC(=O)OC(OO)C(=O)C(O)OO
4,O=[C]OC(O)(O)O,O=COC(O)(O)O
...,...,...
2800,[C]1=CC=C1,C1=CC=C1
2801,C=C=C1C=[C]C1,C=C=C1C=CC1
2802,[CH2]C1=CC#CC1,CC1=CC#CC1
2803,C=C1[CH]C#CC1,C=C1CC#CC1


In [6]:
closed_shell_df.to_csv("../data/closed_shells.csv", index=False)

# Generate closed shell thermo

In [7]:
lib_paths = [
    "../data/dong_pio_liang.py",
    "../data/pang.py",
    "../data/johnson_cbs-qb3.py",
    "../data/johnson_g4.py",
]

In [8]:
# load thermo database

thermo_database = ThermoDatabase()
thermo_database.load_groups(os.path.join(settings["database.directory"], "thermo", "groups"))
for path in lib_paths:
    load_thermo_lib_by_path(path, thermo_database)
    thermo_database.library_order.remove(path)
thermo_database.library_order = lib_paths + thermo_database.library_order

The thermodynamics library ../data/dong_pio_liang.py is loaded.
The thermodynamics library ../data/pang.py is loaded.
The thermodynamics library ../data/johnson_cbs-qb3.py is loaded.
The thermodynamics library ../data/johnson_g4.py is loaded.


In [9]:
# takes ~1 min 30 sec
closed_shell_thermos = [generate_thermo(thermo_database, smi, resonance=False) for smi in tqdm(closed_shell_df["closed_shell_smiles"])]


100%|██████████| 2805/2805 [00:37<00:00, 73.92it/s] 


In [10]:
closed_shell_df["closed_shell_H298 (kcal/mol)"] = [thermo[0] for thermo in closed_shell_thermos]
closed_shell_df["closed_shell_S298 (cal/mol/K)"] = [thermo[1] for thermo in closed_shell_thermos]
for i, T in enumerate(Ts):
    closed_shell_df[f"closed_shell_Cp{T} (cal/mol/K)"] = [thermo[2][i] for thermo in closed_shell_thermos]
closed_shell_df["closed_shell_Sint298 (cal/mol/K)"] = [thermo[3] for thermo in closed_shell_thermos]
closed_shell_df["closed_shell_thermo_comment"] = [thermo[4] for thermo in closed_shell_thermos]
closed_shell_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,resonance_radical_smiles,closed_shell_smiles,closed_shell_H298 (kcal/mol),closed_shell_S298 (cal/mol/K),closed_shell_Cp300 (cal/mol/K),closed_shell_Cp400 (cal/mol/K),closed_shell_Cp500 (cal/mol/K),closed_shell_Cp600 (cal/mol/K),closed_shell_Cp800 (cal/mol/K),closed_shell_Cp1000 (cal/mol/K),closed_shell_Cp1500 (cal/mol/K),closed_shell_Sint298 (cal/mol/K),closed_shell_thermo_comment
0,[O]C(=O)OC(O)(O)O,O=C(O)OC(O)(O)O,-272.617591,98.406448,31.562314,36.827897,41.095602,44.234417,48.821415,51.772505,56.215621,100.589618,Thermo group additivity estimation: group(O2s-...
1,[O]C(O)(O)OC(=O)O,O=C(O)OC(O)(O)O,-272.617591,98.406448,31.562314,36.827897,41.095602,44.234417,48.821415,51.772505,56.215621,100.589618,Thermo group additivity estimation: group(O2s-...
2,O=C(O)O[C](O)O,O=C(O)OC(O)O,-231.363843,94.127419,27.214474,31.965583,36.032333,39.309560,44.059044,47.141300,50.458700,94.127419,Thermo group additivity estimation: group(O2s-...
3,CC(=O)OC(OO)C(=O)C(O)O[O],CC(=O)OC(OO)C(=O)C(O)OO,-234.219312,145.598548,52.706616,62.438776,71.442505,78.070363,87.443365,95.103078,102.679981,145.026864,Thermo group additivity estimation: group(O2s-...
4,O=[C]OC(O)(O)O,O=COC(O)(O)O,-216.499044,92.684651,26.935163,30.962696,34.637667,37.592447,42.375430,45.749560,50.979006,94.867820,Thermo group additivity estimation: group(O2s-...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2800,[C]1=CC=C1,C1=CC=C1,104.333500,57.818020,15.504000,19.920000,23.474000,26.261000,30.329000,33.199000,37.574000,61.950300,Thermo group additivity estimation: group(Cds-...
2801,C=C=C1C=[C]C1,C=C=C1C=CC1,91.366874,50.394958,22.416919,29.489266,35.000461,39.185145,45.162263,49.240044,55.323950,51.772385,Thermo group additivity estimation: group(Cs-(...
2802,[CH2]C1=CC#CC1,CC1=CC#CC1,131.227000,72.956831,23.409000,30.017000,34.619000,39.828000,46.529000,51.356000,56.349000,75.140000,Thermo group additivity estimation: group(Cs-(...
2803,C=C1[CH]C#CC1,C=C1CC#CC1,136.767000,72.435147,22.819000,29.017000,34.189000,38.448000,45.419000,50.576000,55.959000,75.190000,Thermo group additivity estimation: group(Cs-(...


In [11]:
def thermo_source(comment):
    if "Thermo group additivity estimation" in comment:
        return "GAV"
    else:
        return comment
closed_shell_df["closed_shell_thermo_source"] = closed_shell_df["closed_shell_thermo_comment"].apply(thermo_source)
closed_shell_df

Unnamed: 0,resonance_radical_smiles,closed_shell_smiles,closed_shell_H298 (kcal/mol),closed_shell_S298 (cal/mol/K),closed_shell_Cp300 (cal/mol/K),closed_shell_Cp400 (cal/mol/K),closed_shell_Cp500 (cal/mol/K),closed_shell_Cp600 (cal/mol/K),closed_shell_Cp800 (cal/mol/K),closed_shell_Cp1000 (cal/mol/K),closed_shell_Cp1500 (cal/mol/K),closed_shell_Sint298 (cal/mol/K),closed_shell_thermo_comment,closed_shell_thermo_source
0,[O]C(=O)OC(O)(O)O,O=C(O)OC(O)(O)O,-272.617591,98.406448,31.562314,36.827897,41.095602,44.234417,48.821415,51.772505,56.215621,100.589618,Thermo group additivity estimation: group(O2s-...,GAV
1,[O]C(O)(O)OC(=O)O,O=C(O)OC(O)(O)O,-272.617591,98.406448,31.562314,36.827897,41.095602,44.234417,48.821415,51.772505,56.215621,100.589618,Thermo group additivity estimation: group(O2s-...,GAV
2,O=C(O)O[C](O)O,O=C(O)OC(O)O,-231.363843,94.127419,27.214474,31.965583,36.032333,39.309560,44.059044,47.141300,50.458700,94.127419,Thermo group additivity estimation: group(O2s-...,GAV
3,CC(=O)OC(OO)C(=O)C(O)O[O],CC(=O)OC(OO)C(=O)C(O)OO,-234.219312,145.598548,52.706616,62.438776,71.442505,78.070363,87.443365,95.103078,102.679981,145.026864,Thermo group additivity estimation: group(O2s-...,GAV
4,O=[C]OC(O)(O)O,O=COC(O)(O)O,-216.499044,92.684651,26.935163,30.962696,34.637667,37.592447,42.375430,45.749560,50.979006,94.867820,Thermo group additivity estimation: group(O2s-...,GAV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2800,[C]1=CC=C1,C1=CC=C1,104.333500,57.818020,15.504000,19.920000,23.474000,26.261000,30.329000,33.199000,37.574000,61.950300,Thermo group additivity estimation: group(Cds-...,GAV
2801,C=C=C1C=[C]C1,C=C=C1C=CC1,91.366874,50.394958,22.416919,29.489266,35.000461,39.185145,45.162263,49.240044,55.323950,51.772385,Thermo group additivity estimation: group(Cs-(...,GAV
2802,[CH2]C1=CC#CC1,CC1=CC#CC1,131.227000,72.956831,23.409000,30.017000,34.619000,39.828000,46.529000,51.356000,56.349000,75.140000,Thermo group additivity estimation: group(Cs-(...,GAV
2803,C=C1[CH]C#CC1,C=C1CC#CC1,136.767000,72.435147,22.819000,29.017000,34.189000,38.448000,45.419000,50.576000,55.959000,75.190000,Thermo group additivity estimation: group(Cs-(...,GAV


In [14]:
closed_shell_df.to_csv("../data/closed_shell_data.csv", index=False)

# Get stats

In [12]:
closed_shell_df["closed_shell_thermo_source"].value_counts()

GAV                                           1824
Thermo library: ../data/dong_pio_liang.py      873
Thermo library: ../data/pang.py                 81
Thermo library: ../data/johnson_g4.py           22
Thermo library: ../data/johnson_cbs-qb3.py       5
Name: closed_shell_thermo_source, dtype: int64

In [16]:
873+81+5

959

In [15]:
sum(closed_shell_df["closed_shell_thermo_source"].value_counts())

2805