In [1]:
# Table builders from the `thermo` package; each is called below with an InChI string.
from thermo.datasheet import tabulate_constants, tabulate_gas, tabulate_liq, tabulate_solid
import pandas as pd
import numpy as np
import warnings
# Silences every warning for the whole kernel session (including pandas
# deprecation warnings raised by the cells below).
warnings.filterwarnings("ignore")
from tqdm import tqdm

import sys
# Put the parent directory on the module search path so the project-local
# `DatasetScripts` package can be imported from this notebook's directory.
sys.path.append('../')
# NOTE(review): dfToDict is not referenced in the cells visible here - confirm it is still needed.
from DatasetScripts.MiscScripts.dfToDict import dfToDict

# Reference for the tabulate_* helpers:
# https://thermo.readthedocs.io/_modules/thermo/datasheet.html#tabulate_streams

In [2]:
def fetchData(functions, inchi):
    """
    Fetches data from a list of functions and combines them into a single dataframe.

    Each function is called with the InChI string and must return a DataFrame.
    The frames are stacked row-wise, the former index is exposed as the
    "Compound" column and the InChI string is inserted as the first column.
    Literal "None" strings are converted to NaN, and any column that then holds
    exactly two distinct values (compared as strings) is forward-filled.

    Parameters:
    functions (list): List of functions to use to fetch data. Each one takes the
        InChI string and returns a DataFrame.
    inchi (str): InChI string of the compound to fetch data for.

    Returns:
    combinedDF (DataFrame): DataFrame containing data from all functions that
        succeeded.

    Raises:
    ValueError: If none of the functions returned data for this compound.
    """
    dfList = [] # List to store dataframes from each function

    for func in functions: # Iterate through each function and fetch data
        try:
            df = func(inchi)
            dfList.append(df.reset_index())
        except Exception:
            # Best effort: a source that cannot handle this compound is skipped.
            # `Exception` (not a bare except) keeps Ctrl-C / SystemExit working.
            continue

    if not dfList:
        # pd.concat([]) would raise an opaque ValueError; keep the type, improve the message.
        raise ValueError(f"No data could be fetched for {inchi}")

    combinedDF = pd.concat(dfList, axis=0) # Combine all dataframes into a single dataframe
    combinedDF = combinedDF.rename(columns={"index": "Compound"})
    combinedDF.insert(0, "InChI", inchi)

    for col in combinedDF.columns:
        # Replace "None" with NaN. Assign the result back instead of calling
        # replace(..., inplace=True) on the column selection: the chained
        # in-place form does not update the frame under pandas Copy-on-Write.
        combinedDF[col] = combinedDF[col].replace("None", np.nan)

        uniqueValues = np.unique(combinedDF[col].to_numpy().astype(str))
        if len(uniqueValues) == 2:
            # Exactly two distinct values (NaN counts as "nan"): propagate the
            # last seen value downwards into the missing rows.
            combinedDF[col] = combinedDF[col].ffill()

    # The original ended with `combinedDF.drop([0,0])` whose result was discarded,
    # i.e. a no-op that only cost a copy. It is removed here so the returned rows
    # stay exactly as before.
    return combinedDF

In [3]:
# Build (or reload) the thermo descriptor table for every unique InChI in the
# cleaned master dataset, then print how many compounds could not be processed.
vers = "0.4.0" # Dataset version; used for both the input and the output file names
data = pd.read_csv(f"../Data/Combined/{vers}-CleanedMaster.csv") # Load data
inchiList = np.unique(data["InChI"].values)
errors = 0 # Number of compounds for which fetchData raised
try: # Check if features have already been calculated
    # NOTE(review): this reads "{vers}-hT_ThermoDesc.csv" while the branch below
    # writes "{vers}-ThermoDesc.csv", so this cell never reloads its own output.
    # Confirm which file name is intended before aligning them.
    features = pd.read_csv(f"../Data/Combined/LargeFiles/ExtractedDescriptors/{vers}-hT_ThermoDesc.csv")
    print("Existing features found and loaded.")
except FileNotFoundError:  # If features have not been calculated, calculate them
    functions = [tabulate_constants, tabulate_gas, tabulate_liq, tabulate_solid] # List of functions to use to fetch data
    fetchedFrames = [] # One dataframe per successfully processed compound

    for inchi in tqdm(inchiList, desc="Calculating thermo features"): # Iterate through each InChI string and fetch data
        try:
            fetchedFrames.append(fetchData(functions, inchi)) # Fetch data
        except Exception:
            # Count the failure and move on; `Exception` (not a bare except)
            # lets Ctrl-C interrupt this multi-minute loop.
            errors += 1

    # Concatenate once at the end: growing a DataFrame with pd.concat inside the
    # loop re-copies all accumulated rows on every iteration (quadratic cost).
    features = pd.concat(fetchedFrames, axis=0) if fetchedFrames else pd.DataFrame()

    features["T, K"] = features["T, K"].astype(float) - 273.15 # Convert temperature from K to C
    features = features.rename(columns={"T, K": "Temperature", # Rename columns to match the rest of the data
                                        "MW, g/mol" : "MolWt"})

    features = features.reset_index(drop=True)
    features.to_csv(f"../Data/Combined/LargeFiles/ExtractedDescriptors/{vers}-ThermoDesc.csv", index=False)
print(errors)

Calculating thermo features:   9%|▉         | 1289/13640 [00:38<05:31, 37.22it/s]

[-0.8499999999999943, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']
[-25.849999999999994, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']


Calculating thermo features:  10%|█         | 1389/13640 [00:42<04:37, 44.07it/s]

[-470.2919888888889, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']
[-470.2919888888889, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']


Calculating thermo features:  15%|█▍        | 2041/13640 [01:07<09:14, 20.93it/s]

[-16.189999999999998, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']


Calculating thermo features:  61%|██████    | 8285/13640 [04:31<06:06, 14.62it/s] 

[-9.700000000000003, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']


Calculating thermo features:  61%|██████    | 8351/13640 [04:35<06:22, 13.83it/s]

[-25.849999999999994, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']


Calculating thermo features:  63%|██████▎   | 8652/13640 [04:47<04:27, 18.67it/s]

[-11.924999999999997, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']


Calculating thermo features:  64%|██████▍   | 8763/13640 [04:55<04:20, 18.69it/s]

[-14.5, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']


Calculating thermo features:  68%|██████▊   | 9210/13640 [05:18<04:26, 16.64it/s]

[-12.150000000000006, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']


Calculating thermo features:  70%|███████   | 9585/13640 [05:41<03:56, 17.12it/s]

[-22.849999999999994, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']


Calculating thermo features:  99%|█████████▉| 13483/13640 [09:07<00:11, 13.94it/s]

[-7.849999999999994, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']


Calculating thermo features:  99%|█████████▉| 13489/13640 [09:08<00:12, 12.45it/s]

[-10.650000000000006, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']


Calculating thermo features:  99%|█████████▉| 13528/13640 [09:11<00:06, 16.02it/s]

[-9.25, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']


Calculating thermo features:  99%|█████████▉| 13570/13640 [09:14<00:05, 12.96it/s]

[-31.849999999999994, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']


Calculating thermo features: 100%|█████████▉| 13590/13640 [09:16<00:03, 14.55it/s]

[-33.64, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']


Calculating thermo features: 100%|█████████▉| 13596/13640 [09:16<00:03, 11.72it/s]

[-85.99, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']
[-86.01, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']
[-81.27, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']


Calculating thermo features: 100%|█████████▉| 13614/13640 [09:18<00:02, 12.31it/s]

[-99.05, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']


Calculating thermo features: 100%|█████████▉| 13624/13640 [09:19<00:01, 11.63it/s]

[-36.85, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']


Calculating thermo features: 100%|█████████▉| 13630/13640 [09:19<00:00, 14.19it/s]

[-75.44, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']
[-45.64, 101325, 0.0, 0.0, 0.0, 0.0, 'coordinates of failure']


Calculating thermo features: 100%|██████████| 13640/13640 [09:20<00:00, 24.34it/s]


5402
