# Properties Extraction

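Start by setting up logging for the `pubchem_api_crawler` package, so that its `INFO`-level messages are shown while the notebook runs.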

In [1]:
import logging

# Configure the package logger: INFO level, emitted through a stream handler.
logger = logging.getLogger('pubchem_api_crawler')
logger.setLevel(logging.INFO)

# Attach the stream handler only once. Re-running this cell would otherwise stack
# a new handler each time and every message would be emitted several times.
# The handler is deliberately not bound to the name `ch`, because this notebook
# later uses `ch` for the C-H compounds dataframe.
if not any(type(existing) is logging.StreamHandler for existing in logger.handlers):
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(handler)

### Get lists of matching compounds

Create a `MolecularFormulaSearch` object to search for compounds matching our queries.

In [2]:
from pubchem_api_crawler.molecular_search import MolecularFormulaSearch
# Search client reused by every molecular-formula query in the cells below.
mf = MolecularFormulaSearch()

Search for compounds made up of `C-H-Al`, `C-H-B`, `C-H-Mg`, `C-H-Al-B`, `C-H-Al-Mg`, `C-H-B-Mg`, `C-H-B-Al-Mg`.

In [None]:
# Properties requested from PubChem for every compound returned by a search.
SEARCH_PROPERTIES = ["IUPACName", "MolecularFormula", "MolecularWeight", "CanonicalSMILES"]


def search_exact_composition(*elements):
    """Run `mf.search` for the given element symbols, 1 to 500 atoms of each.

    Every query is issued with `allow_other_elements=False` and asks for the
    properties listed in `SEARCH_PROPERTIES`.
    """
    formula_ranges = [f"{element}1-500" for element in elements]
    # Pass a fresh copy of the property list on every call, as the original
    # literal lists were distinct objects.
    return mf.search(
        formula_ranges,
        allow_other_elements=False,
        properties=list(SEARCH_PROPERTIES),
    )


chal = search_exact_composition("C", "H", "Al")
chb = search_exact_composition("C", "H", "B")
chmg = search_exact_composition("C", "H", "Mg")
chalb = search_exact_composition("C", "H", "Al", "B")
chalmg = search_exact_composition("C", "H", "Al", "Mg")
chbmg = search_exact_composition("C", "H", "B", "Mg")
chbalmg = search_exact_composition("C", "H", "B", "Al", "Mg")

For `C-H` compounds, use an async search so that all the results can be retrieved.

In [None]:
# C-H only: same query shape as the other searches, but run with `_async=True`.
ch = mf.search(
    ["C1-500", "H1-500"],
    allow_other_elements=False,
    properties=[
        "IUPACName",
        "MolecularFormula",
        "MolecularWeight",
        "CanonicalSMILES",
    ],
    _async=True,
)

Let's save our dataframes to CSV files.

In [3]:
from pathlib import Path

# Every CSV file written or read by this notebook lives in ./output,
# resolved relative to the current working directory.
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# Persist every search result so that later cells can reload the compounds from
# disk. `chbalmg` (C-H-B-Al-Mg) is searched for above but was never written out,
# so its result was lost; it is now saved alongside the others.
compound_sets = {
    "CHAl": chal,
    "CHB": chb,
    "CHMg": chmg,
    "CHAlB": chalb,
    "CHAlMg": chalmg,
    "CHBMg": chbmg,
    "CHBAlMg": chbalmg,
    "CH": ch,
}

for stem, compounds in compound_sets.items():
    # One file per composition, without the dataframe index.
    compounds.to_csv(output_dir / f"{stem}.csv", index=False)

NameError: name 'chal' is not defined

In [5]:
import pandas as pd

def load_compounds(stem):
    """Read `<stem>.csv` from `output_dir` into a dataframe."""
    return pd.read_csv(output_dir / f"{stem}.csv")


# Reload the saved search results, one dataframe per composition.
chal = load_compounds("CHAl")
chb = load_compounds("CHB")
chmg = load_compounds("CHMg")
chalb = load_compounds("CHAlB")
chalmg = load_compounds("CHAlMg")
chbmg = load_compounds("CHBMg")
ch = load_compounds("CH")

### Get the experimental properties for our matching compounds

For small result sets, we can loop through all the compounds and get their experimental properties one by one.

In [6]:
from pubchem_api_crawler.annotations import Annotations
# Annotations client used by the cells below to fetch annotation data.
an = Annotations()

In [None]:
# Fetch the annotations compound by compound for the small result sets.
fetch_annotations = an.get_compound_annotations

chbmg_props = fetch_annotations(chbmg)
chalmg_props = fetch_annotations(chalmg)
chalb_props = fetch_annotations(chalb)

PubChem doesn't have any annotations for these compounds. For the other groups, there are too many compounds to fetch their annotations one by one. Instead, we'll select some annotation headings of interest and fetch all the data PubChem has for each of them.

In [7]:
# Annotation headings whose data is fetched from PubChem.
# "Solubility" is intentionally left out (it was commented out in the original list).
HEADINGS = [
    "Boiling Point",
    "Melting Point",
    "Flash Point",
    "Density",
    "Vapor Pressure",
    "Autoignition Temperature",
    "Viscosity",
    "Heat of Combustion",
    "Heat of Vaporization",
    "Critical Temperature & Pressure",
    "Toxicity Data",
    "Other Experimental Properties",
]

In [8]:
from string import ascii_letters
import pandas as pd

def get_annotations_for_headings(headings: list[str]) -> dict[str, pd.DataFrame]:
    """Load, or fetch and cache, the annotation table of each requested heading.

    For every heading the table is read from ``output_dir / "<initials>_props.csv"``
    when that file exists; otherwise it is fetched with ``an.get_annotations(heading)``
    and written to that file. ``<initials>`` is the first character of each word
    of the heading, skipping words that start with a non-letter (such as ``&``),
    e.g. ``"Boiling Point"`` -> ``BP_props.csv``.

    NOTE: two headings with the same initials map to the same cache file.

    Args:
        headings: Annotation headings to retrieve.

    Returns:
        A dict mapping each heading to its dataframe. Every column except ``CID``
        is prefixed with the heading (words joined by underscores, plus a
        trailing underscore), so the tables of different headings can be merged
        on ``CID`` without column-name clashes.
    """
    annotations = {}
    # Iterate over the argument: the previous version looped over the global
    # HEADINGS constant and silently ignored the `headings` parameter.
    for heading in headings:
        initials = "".join(word[0] for word in heading.split() if word[0] in ascii_letters)
        fname = output_dir / f"{initials}_props.csv"
        if fname.exists():
            table = pd.read_csv(fname)
        else:
            table = an.get_annotations(heading)
            # The cache stores the raw, unprefixed column names.
            table.to_csv(fname, index=False)

        prefix = "_".join(heading.split()) + "_"
        # `rename` returns a new frame, so the fetched table is not mutated in place.
        annotations[heading] = table.rename(
            columns=lambda column: column if column == "CID" else prefix + column
        )

    return annotations

In [9]:
# Load (or fetch and cache) the annotation table of every heading of interest.
annotations = get_annotations_for_headings(HEADINGS)

In [10]:
def merge_annotations(compounds: pd.DataFrame, annotations: dict[str, pd.DataFrame]) -> pd.DataFrame:
    """Left-join every annotation table onto the compounds and keep annotated rows.

    Args:
        compounds: Compounds dataframe; must contain a ``CID`` column.
        annotations: Mapping of heading to annotation dataframe, each containing
            a ``CID`` column.

    Returns:
        The compounds merged with all annotation tables on ``CID``
        (``how="left"``), restricted to the rows that have at least one non-null
        annotation column. An empty ``annotations`` mapping yields an empty
        frame with the compounds' columns instead of raising.
    """
    annotated = compounds
    for annotation in annotations.values():
        annotated = annotated.merge(annotation, how="left", on="CID")

    # Columns that describe the compound itself. Including the compounds' own
    # columns means an extra compound column is not mistaken for an annotation.
    identity_columns = set(compounds.columns) | {
        "CID",
        "MolecularFormula",
        "MolecularWeight",
        "CanonicalSMILES",
        "IUPACName",
    }
    property_columns = [column for column in annotated.columns if column not in identity_columns]

    if not property_columns:
        # Nothing was merged in, so no row has an annotated property.
        return annotated.iloc[0:0]

    # Keep only rows with at least one non-null annotated property.
    has_annotation = annotated[property_columns].notna().any(axis=1)
    return annotated[has_annotation]

In [16]:
# Attach the annotation tables to each group of compounds.
chal_annotated = merge_annotations(compounds=chal, annotations=annotations)
chb_annotated = merge_annotations(compounds=chb, annotations=annotations)
chmg_annotated = merge_annotations(compounds=chmg, annotations=annotations)
ch_annotated = merge_annotations(compounds=ch, annotations=annotations)

In [None]:
# Lift the column limit so wide frames are rendered with all their columns.
# This is a global pandas option and stays in effect for the following cells.
pd.options.display.max_columns = None

chmg_annotated

In [15]:
# Distinct molecular formulas among the annotated C-H-Al compounds.
chal_annotated["MolecularFormula"].drop_duplicates()

0       C12H27Al
320      C6H15Al
640       C3H9Al
641     C30H63Al
644     C48H99Al
647     C36H75Al
650     C42H87Al
653    C60H123Al
657     C18H39Al
661     C24H51Al
Name: MolecularFormula, dtype: object

In [18]:
# Distinct molecular formulas among the annotated C-H compounds.
ch_annotated["MolecularFormula"].drop_duplicates()

263351     C15H24
269779     C13H12
269786     C16H24
269789     C40H58
270683     C33H68
            ...  
5558078      C2H6
5561319    C20H16
5562072    C15H18
5564076    C21H16
6127144       CH4
Name: MolecularFormula, Length: 149, dtype: object