# Program to take all processed datasets and combine them

Version: 0.1.0

0.1.0: Remove CMC data, add data source col

## Importing Libraries

In [10]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
import sys
sys.path.append('../')
from MiscScripts.isomerCount import isomerCountMain
warnings.filterwarnings("ignore")

## Melting/Boiling/Decomposition Point Data

In [11]:
williams = pd.read_csv("../Data/Processed/0.0.0-Williams.csv") #Importing the data
williams = williams[["Compound", "SMILES", "InChI", "AverageValue"]] #Selecting columns
williams = williams.rename(columns={"AverageValue": "MeltingPoint/C"}) #Renaming columns

williams["MeltingPoint-dataSource"] = "A. Williams et al, 2015"

In [12]:
wiki = pd.read_csv("../Data/Processed/0.0.0-WikiBP.csv") #Importing the data
wiki = wiki[["Compound", "canonical_smiles", "InChI", "bp", "mp"]] #Selecting columns
wiki = wiki.rename(columns={"mp": "MeltingPoint/C", "bp": "BoilingPoint/C", "canonical_smiles": "SMILES"}) #Renaming columns

wiki["Melting/BoilingPoint-dataSource"] = "WikiData, 2024"

In [13]:
pyrolysis = pd.read_csv("../Data/Processed/0.0.0-pyrolysis.csv") #Importing the data
pyrolysisMP = pyrolysis[pyrolysis["QuantityType"] == "MeltingPoint"] #Selecting melting point only
pyrolysisMP = pyrolysisMP[["Compound", "SMILES", "InChI", "AverageValue"]] #Selecting columns
pyrolysisMP = pyrolysisMP.rename(columns={"AverageValue": "MeltingPoint/C"}) #Renaming columns

pyrolysisMP["MeltingPoint-dataSource"] = "A. Williams et al, 2015"

In [14]:
bioquest = pd.read_csv("../Data/Processed/0.3.1-bioquest.csv") #Importing the data
bioquest = bioquest[["Compound", "SMILES", "InChI", "MeltingPoint/C", "BoilingPoint/C"]] #Selecting columns

bioquest["Melting/BoilingPoint-dataSource"] = "BioQuest, 2024"

## Solubility

In [15]:
solubility = pd.read_csv("../Data/Processed/0.2.0-solubility.csv") #Importing the data
print(solubility.columns)
solubility = solubility[["Compound", "SMILES", "InChI", "logS", "Temperature"]] #Selecting columns

solubility["logS-dataSource"] = "AqSolDB, IUPAC SDS, DDB, 2023"

Index(['Compound', 'SMILES', 'Temperature', 'InChI', 'logS'], dtype='object')


In [16]:
lowe = pd.read_csv("../Data/Processed/0.0.0-LoweSol.csv") #Importing the data

lowe = lowe[["Compound", "SMILES", "InChI", "logS", "Temperature"]] #Selecting columns

lowe["logS-dataSource"] = "Lowe et al, 2023"

## Henry's Law

In [17]:
hl = pd.read_csv("../Data/Processed/2.5.0-HenrysLaw.csv") #Importing the data
print(hl.columns)
hl = hl[["Compound", "SMILES", "Temperature", "InChI", "Constants"]] #Selecting columns
hl = hl.rename(columns={"Constants": "HenryConstant"}) #Renaming columns

hl["HenryConstant-dataSource"] = "https://doi.org/10.5194/acp-23-10901-2023"

Index(['Compound', 'CAS', 'Constants', 'Temperature', 'SMILES', 'InChI'], dtype='object')


In [20]:
inchiList = williams["InChI"].values.tolist() + wiki["InChI"].values.tolist() + pyrolysisMP["InChI"].values.tolist() + bioquest["InChI"].values.tolist() + solubility["InChI"].values.tolist() + lowe["InChI"].values.tolist()
inchiList = list(set(inchiList)) 
print(len(inchiList)) #Number of unique inchi values

master = pd.DataFrame() #Creating an empty dataframe
for inchi in tqdm(inchiList): #Iterating over all inchi values, selecting subset with InChI and concatenating to main dataframe
    subWilliams = williams[williams["InChI"] == inchi]
    subWiki = wiki[wiki["InChI"] == inchi]
    subPyrolysisMP = pyrolysisMP[pyrolysisMP["InChI"] == inchi]
    subBioquest = bioquest[bioquest["InChI"] == inchi]
    subSolubility = solubility[solubility["InChI"] == inchi]
    subHl = hl[hl["InChI"] == inchi]
    subLowe = lowe[lowe["InChI"] == inchi]

    sub = pd.concat([subWilliams, subWiki, subPyrolysisMP, subBioquest, subSolubility, subHl], axis=0)
    master = pd.concat([master, sub], axis=0)

df = isomerCountMain(master) #Getting isomer count
master.to_csv("../Data/Combined/LargeFiles/0.3.0-Master.csv", index=False) #Saving the data to a csv file

259967


  1%|          | 2124/259967 [01:29<3:01:34, 23.67it/s]


KeyboardInterrupt: 