In [1]:
# | default_exp input


# Data augmentation


We can use SMILES randomization for data augmentation. We use code by [Schwaller et al.](https://github.com/rxn4chemistry/rxn_yields).


In [1]:
# | export
import random
from typing import List

from rdkit import Chem

In [2]:
# | export
def randomize_smiles(
    smiles: str,
    random_type: str = "rotated",  #  The type (unrestricted, restricted, rotated) of randomization performed.
    isomericSmiles: bool = True,
):
    """
    From: https://github.com/undeadpixel/reinvent-randomized and https://github.com/GLambard/SMILES-X
    Returns a random SMILES given a SMILES of a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return None

    if random_type == "unrestricted":
        return Chem.MolToSmiles(
            mol, canonical=False, doRandom=True, isomericSmiles=isomericSmiles
        )
    elif random_type == "restricted":
        new_atom_order = list(range(mol.GetNumAtoms()))
        random.shuffle(new_atom_order)
        random_mol = Chem.RenumberAtoms(mol, newOrder=new_atom_order)
        return Chem.MolToSmiles(
            random_mol, canonical=False, isomericSmiles=isomericSmiles
        )
    elif random_type == "rotated":
        n_atoms = mol.GetNumAtoms()
        rotation_index = random.randint(0, n_atoms - 1)
        atoms = list(range(n_atoms))
        new_atoms_order = (
            atoms[rotation_index % len(atoms) :] + atoms[: rotation_index % len(atoms)]
        )
        rotated_mol = Chem.RenumberAtoms(mol, new_atoms_order)
        return Chem.MolToSmiles(
            rotated_mol, canonical=False, isomericSmiles=isomericSmiles
        )
    raise ValueError("Type '{}' is not valid".format(random_type))


In [3]:
randomize_smiles("C[N]1C=CC(=N1)N=NC2=CC=CC=C2")


'c1cc(N=Nc2ccn(C)n2)ccc1'

# Creating prompts/training data


In [4]:
# | export
from collections import Counter

import numpy as np
import pandas as pd


In [5]:
# | export


_DEFAULT_ENCODING_DICT = {
    "very small": 0,
    "small": 1,
    "medium": 2,
    "large": 3,
    "very large": 4,
}

_DEFAULT_DECODING_DICT = {v: k for k, v in _DEFAULT_ENCODING_DICT.items()}


def encode_categorical_value(value, encoding_dict=_DEFAULT_ENCODING_DICT):
    try:
        return encoding_dict[value]
    except KeyError:
        raise ValueError("Unknown value: %s" % value)


def decode_categorical_value(value, decoding_dict=_DEFAULT_DECODING_DICT):
    try:
        return decoding_dict[value]
    except KeyError:
        raise ValueError("Unknown value: %s" % value)


In [6]:
# | export
ONE_PROPERTY_FORWARD_PROMPT_TEMPLATE = "what is the {property} of {text}###"
ONE_PROPERTY_FORWARD_COMPLETION_TEMPLATE = " {value}@@@"


In [7]:
# | export
def create_single_property_forward_prompts(
    df: pd.DataFrame,  # input data
    target: str,  # target property
    target_rename_dict: dict,  # dict to rename target property from the column name in df to the target property name in the prompt
    encode_value: bool = True,  # whether to encode the value of the target property categorically
    encoding_dict: dict = _DEFAULT_ENCODING_DICT,  # mapping from numerical categories to string
    prompt_prefix: str = "",  # prefix to add to the prompt, e.g. "I am an expert chemist"
    representation_col: str = "string",  # name of the column to use as the representation of the compound
    smiles_augmentation: bool = False,  # whether to augment the SMILES with randomization
    smiles_augmentation_type: str = "rotated",  # the type of randomization to perform
    smiles_augmentation_rounds: int = 10,  # the number of randomizations to perform
    include_canonical_smiles: bool = False,  # whether to include the canonical SMILES when using the augmentation
):
    prompts = []

    if not smiles_augmentation:
        smiles_augmentation_rounds = 1
    for _ in range(smiles_augmentation_rounds):
        target_name = target
        for key, value in target_rename_dict.items():
            target_name = target_name.replace(key, value)

        for _, row in df.iterrows():
            if encode_value:
                value = encode_categorical_value(
                    row[target], encoding_dict=encoding_dict
                )
            else:
                value = row[target]

            repr = row[representation_col]
            if smiles_augmentation:
                repr = randomize_smiles(repr, random_type=smiles_augmentation_type)
            prompts.append(
                {
                    "prompt": prompt_prefix
                    + ONE_PROPERTY_FORWARD_PROMPT_TEMPLATE.format(
                        property=target_name, text=repr
                    ),
                    "completion": ONE_PROPERTY_FORWARD_COMPLETION_TEMPLATE.format(
                        value=value
                    ),
                    "repr": row[representation_col],
                    "this_repr": repr,
                }
            )
    if smiles_augmentation and include_canonical_smiles:
        for _, row in df.iterrows():
            if encode_value:
                value = encode_categorical_value(
                    row[target], encoding_dict=encoding_dict
                )
            else:
                value = row[target]

            repr = row[representation_col]
            prompts.append(
                {
                    "prompt": prompt_prefix
                    + ONE_PROPERTY_FORWARD_PROMPT_TEMPLATE.format(
                        property=target_name, text=repr
                    ),
                    "completion": ONE_PROPERTY_FORWARD_COMPLETION_TEMPLATE.format(
                        value=value
                    ),
                    "repr": repr,
                    "this_repr": repr,
                }
            )

    df = pd.DataFrame(prompts)
    df.dropna(subset=["prompt"], inplace=True)
    df = df.sample(frac=1).reset_index(drop=True) # shuffle
    return df


In [8]:
from gpt3forchem.data import get_polymer_data

create_single_property_forward_prompts(
    get_polymer_data(), "deltaGmin_cat", {"deltaGmin_cat": "adsorption energy"}
)


Unnamed: 0,prompt,completion,repr,this_repr
0,what is the adsorption energy of W-R-B-W-R-A-B...,1@@@,W-R-B-W-R-A-B-A-R-R-W-R-W-W-W-R-W-W-A-W-R-A-R-...,W-R-B-W-R-A-B-A-R-R-W-R-W-W-W-R-W-W-A-W-R-A-R-...
1,what is the adsorption energy of A-A-B-W-B-A-R...,0@@@,A-A-B-W-B-A-R-R-W-B-R-A-R-W-W-R-B-W-B-A-A-R-B-...,A-A-B-W-B-A-R-R-W-B-R-A-R-W-W-R-B-W-B-A-A-R-B-...
2,what is the adsorption energy of R-W-B-B-B-R-A...,2@@@,R-W-B-B-B-R-A-R-B-A-B-R-W-R-W-A-W-A-R-A-B-B-B-...,R-W-B-B-B-R-A-R-B-A-B-R-W-R-W-A-W-A-R-A-B-B-B-...
3,what is the adsorption energy of W-R-A-R-R-A-W...,0@@@,W-R-A-R-R-A-W-B-A-R-R-W-B-R-W-A-R-B-W-A-A-A-A-...,W-R-A-R-R-A-W-B-A-R-R-W-B-R-W-A-R-B-W-A-A-A-A-...
4,what is the adsorption energy of W-R-B-A-R-A-B...,2@@@,W-R-B-A-R-A-B-R-A-B-A-B-R-W-R-A-B-W-W-B-A-W-R-...,W-R-B-A-R-A-B-R-A-B-A-B-R-W-R-A-B-W-W-B-A-W-R-...
...,...,...,...,...
3120,what is the adsorption energy of W-R-W-R-B-W-B...,3@@@,W-R-W-R-B-W-B-A-B-B-W-W-B-R-W-W-A-A-B-R-B-W-B-...,W-R-W-R-B-W-B-A-B-B-W-W-B-R-W-W-A-A-B-R-B-W-B-...
3121,what is the adsorption energy of R-R-R-A-B-R-A...,0@@@,R-R-R-A-B-R-A-R-A-W-A-A-B-A-W-B-W-R-B-R-R-R-R-...,R-R-R-A-B-R-A-R-A-W-A-A-B-A-W-B-W-R-B-R-R-R-R-...
3122,what is the adsorption energy of W-R-B-R-A-W-W...,2@@@,W-R-B-R-A-W-W-R-R-A-B-R-W-B-B-W-R-B-A-R-A-W-R-...,W-R-B-R-A-W-W-R-R-A-B-R-W-B-B-W-R-B-A-R-A-W-R-...
3123,what is the adsorption energy of R-B-R-A-A-R-W...,0@@@,R-B-R-A-A-R-W-A-A-A-W-R-W-B-A-W-A-B-W-B-B-A-A-...,R-B-R-A-A-R-W-A-A-A-W-R-W-B-A-W-A-B-W-B-B-A-A-...


In [9]:
create_single_property_forward_prompts(
    get_polymer_data(),
    "deltaGmin_cat",
    {"deltaGmin_cat": "adsorption energy"},
    prompt_prefix="you are an expert chemist: ",
)


Unnamed: 0,prompt,completion,repr,this_repr
0,you are an expert chemist: what is the adsorpt...,0@@@,B-B-B-W-A-B-B-R-B-A-R-R-R-B-B-A-A-R-R-R-A-W-B-...,B-B-B-W-A-B-B-R-B-A-R-R-R-B-B-A-A-R-R-R-A-W-B-...
1,you are an expert chemist: what is the adsorpt...,1@@@,B-A-B-W-R-B-W-B-A-R-B-A-B-W-W-R-W-W-W-R-B-A-A-...,B-A-B-W-R-B-W-B-A-R-B-A-B-W-W-R-W-W-W-R-B-A-A-...
2,you are an expert chemist: what is the adsorpt...,2@@@,R-W-A-W-B-A-B-B-R-B-A-A-A-B-B-R-A-A-R-W-W-B-R-...,R-W-A-W-B-A-B-B-R-B-A-A-A-B-B-R-A-A-R-W-W-B-R-...
3,you are an expert chemist: what is the adsorpt...,2@@@,W-B-R-A-B-A-A-R-A-R-A-B-A-W-R-W-A-B-R-A-W-B-W-...,W-B-R-A-B-A-A-R-A-R-A-B-A-W-R-W-A-B-R-A-W-B-W-...
4,you are an expert chemist: what is the adsorpt...,4@@@,A-R-B-A-B-B-R-W-R-A-W-W-A-B-W-R-B-B-B-W-W-A-B-...,A-R-B-A-B-B-R-W-R-A-W-W-A-B-W-R-B-B-B-W-W-A-B-...
...,...,...,...,...
3120,you are an expert chemist: what is the adsorpt...,2@@@,B-W-B-A-A-W-R-B-B-A-A-W-B-A-W-A-R-B-R-A-A-B-A-...,B-W-B-A-A-W-R-B-B-A-A-W-B-A-W-A-R-B-R-A-A-B-A-...
3121,you are an expert chemist: what is the adsorpt...,1@@@,A-A-W-R-R-R-A-R-R-W-R-R-W-B-W-A-R-W-W-B-W-R-B-...,A-A-W-R-R-R-A-R-R-W-R-R-W-B-W-A-R-W-W-B-W-R-B-...
3122,you are an expert chemist: what is the adsorpt...,3@@@,A-W-R-A-R-R-B-R-R-R-B-W-R-W-B-A-B-B-W-W-W-B-R-...,A-W-R-A-R-R-B-R-R-R-B-W-R-W-B-A-B-B-W-W-W-B-R-...
3123,you are an expert chemist: what is the adsorpt...,0@@@,A-W-R-A-R-A-A-R-R-A-R-R-R-R-W-B-W-A-A-R-A-A-A-...,A-W-R-A-R-A-A-R-R-A-R-R-R-R-W-B-W-A-A-R-A-A-A-...


In [10]:
from gpt3forchem.data import get_photoswitch_data

frame_no_augment = create_single_property_forward_prompts(
    get_photoswitch_data(),
    "wavelength_cat",
    {"wavelength_cat": "transition wavelength"},
    prompt_prefix="you are an expert chemist: ",
    representation_col="SMILES",
)
frame_no_augment


Unnamed: 0,prompt,completion,repr,this_repr
0,you are an expert chemist: what is the transit...,0@@@,O=C(C1=CC=C(/N=N/C2=NNC=C2)C=C1)OCC,O=C(C1=CC=C(/N=N/C2=NNC=C2)C=C1)OCC
1,you are an expert chemist: what is the transit...,2@@@,OC%26=C%27N=CC=CC%27=C(/N=N/C%28=NC%29=CC=CC=C...,OC%26=C%27N=CC=CC%27=C(/N=N/C%28=NC%29=CC=CC=C...
2,you are an expert chemist: what is the transit...,2@@@,CSc1nnc(s1)N=Nc1c(c2ccccc2)n(c2c1cccc2)C,CSc1nnc(s1)N=Nc1c(c2ccccc2)n(c2c1cccc2)C
3,you are an expert chemist: what is the transit...,0@@@,CC1=C(C(C)=NN1)/N=N/C2=C(F)C=CC=C2,CC1=C(C(C)=NN1)/N=N/C2=C(F)C=CC=C2
4,you are an expert chemist: what is the transit...,1@@@,COC1=CC=C(/N=N/C2=CC=CS2)C=C1,COC1=CC=C(/N=N/C2=CC=CS2)C=C1
...,...,...,...,...
385,you are an expert chemist: what is the transit...,1@@@,CC(NC(C=C1OC)=CC(OC)=C1/N=N/C2=C(OC)C=C(NC(C)=...,CC(NC(C=C1OC)=CC(OC)=C1/N=N/C2=C(OC)C=C(NC(C)=...
386,you are an expert chemist: what is the transit...,1@@@,CC(C=C(N(CCC#N)CCO)C=C1)=C1/N=N/C2=CC(C)=CC=C2,CC(C=C(N(CCC#N)CCO)C=C1)=C1/N=N/C2=CC(C)=CC=C2
387,you are an expert chemist: what is the transit...,0@@@,BrC1=CC(/N=N/C2=CC=CC=C2)=CC=C1,BrC1=CC(/N=N/C2=CC=CC=C2)=CC=C1
388,you are an expert chemist: what is the transit...,0@@@,CC(C)C1=C(/N=N/C2=CC=CC=C2)C=NC=C1,CC(C)C1=C(/N=N/C2=CC=CC=C2)C=NC=C1


By using SMILES augmentation, we can create a much larger dataset.


In [11]:
frame_augment = create_single_property_forward_prompts(
    get_photoswitch_data(),
    "wavelength_cat",
    {"wavelength_cat": "transition wavelength"},
    prompt_prefix="you are an expert chemist: ",
    representation_col="SMILES",
    smiles_augmentation=True,
)

frame_augment


Unnamed: 0,prompt,completion,repr,this_repr
0,you are an expert chemist: what is the transit...,1@@@,CN1C(/N=N/C2=CC=CC=C2)=C(C)C=C1C,c1cccc(/N=N/c2c(C)cc(C)n2C)c1
1,you are an expert chemist: what is the transit...,0@@@,CC1=CC(/N=N/C2=CC(C)=C(C)C=C2)=CC=C1C,N(\c1cc(C)c(C)cc1)=N/c1ccc(C)c(C)c1
2,you are an expert chemist: what is the transit...,2@@@,CN1C2=CC=CC=C2N=C1/N=N/C3=NC4=CC=CC=C4N3C,c12nc(/N=N/c3nc4ccccc4n3C)n(C)c1cccc2
3,you are an expert chemist: what is the transit...,2@@@,CCN(CC)C(C=C%27)=CC=C%27/N=N/C%28=CC=C(N%29CCO...,C(N(CC)c1ccc(/N=N/c2ccc(N3CCOCC3)c(C)c2)cc1)C
4,you are an expert chemist: what is the transit...,1@@@,NC1=CC=C(/N=N/C2=CC=CC=C2)C=C1,N(=N/c1ccccc1)\c1ccc(N)cc1
...,...,...,...,...
3895,you are an expert chemist: what is the transit...,1@@@,OC%17=C(N=CC=C%18)C%18=C(/N=N/C%19=CC=C(Cl)C=C...,c1cc2c(/N=N/c3ccc(Cl)cc3)ccc(O)c2nc1
3896,you are an expert chemist: what is the transit...,3@@@,[H]C1=CC([N+]([O-])=O)=CC(Cl)=C1/N=N/C2=CC([H]...,N(=N/c1ccc(N(CC)CC)cc1)\c1ccc([N+]([O-])=O)cc1Cl
3897,you are an expert chemist: what is the transit...,0@@@,BrC1=CC(/N=N/C2=CC=CC=C2)=CC=C1,c1(/N=N/c2cccc(Br)c2)ccccc1
3898,you are an expert chemist: what is the transit...,0@@@,BrC1=CC=CC=C1/N=N/C2=CC=CC=C2,c1cccc(/N=N/c2c(Br)cccc2)c1


In [12]:
assert len(frame_no_augment) * 10 == len(frame_augment)
assert len(frame_no_augment) == len(frame_augment["repr"].unique())


In [13]:
frame_augment_w_canonical = create_single_property_forward_prompts(
    get_photoswitch_data(),
    "wavelength_cat",
    {"wavelength_cat": "transition wavelength"},
    prompt_prefix="you are an expert chemist: ",
    representation_col="SMILES",
    smiles_augmentation=True,
    include_canonical_smiles=True,
)

frame_augment_w_canonical


Unnamed: 0,prompt,completion,repr,this_repr
0,you are an expert chemist: what is the transit...,2@@@,CSc1nnc(s1)N=Nc1c(C)[nH]c2c1cccc2,c1(N=Nc2c(C)[nH]c3c2cccc3)sc(SC)nn1
1,you are an expert chemist: what is the transit...,3@@@,[H]C1=CC([N+]([O-])=O)=CC(Cl)=C1/N=N/C2=CC([H]...,c1(/N=N/c2ccc(N(CC)CC)cc2)ccc([N+]([O-])=O)cc1Cl
2,you are an expert chemist: what is the transit...,1@@@,CC(C=C1)=CC=C1/N=N/C2=CC=C(N(C)C)C=C2,N(\c1ccc(N(C)C)cc1)=N/c1ccc(C)cc1
3,you are an expert chemist: what is the transit...,0@@@,CC1=NOC(C)=C1/N=N/C2=CC(C(F)(F)F)=CC=C2,N(=N/c1cc(C(F)(F)F)ccc1)\c1c(C)noc1C
4,you are an expert chemist: what is the transit...,0@@@,C[N]1C=NC(=N1)N=NC2=CC=CC=C2,n1(C)cnc(N=Nc2ccccc2)n1
...,...,...,...,...
4285,you are an expert chemist: what is the transit...,1@@@,OC%17=C(N=CC=C%18)C%18=C(/N=N/C%19=CC=C(Cl)C=C...,Clc1ccc(/N=N/c2ccc(O)c3ncccc32)cc1
4286,you are an expert chemist: what is the transit...,3@@@,OC%20=C(N=CC=C%21)C%21=C(/N=N/C%22=CC=C(C(O)=O...,c1c(O)c2ncccc2c(/N=N/c2ccc(C(O)=O)cc2)c1
4287,you are an expert chemist: what is the transit...,2@@@,CCN(CC)C(C=C1)=CC=C1/N=N/C2=CC(C#N)=CC(C#N)=C2,N(=N/c1cc(C#N)cc(C#N)c1)\c1ccc(N(CC)CC)cc1
4288,you are an expert chemist: what is the transit...,0@@@,COC1=CC=CC=C1N=NC2=NC3=CC=C([N+]([O-])=O)C=C3S2,N(=Nc1nc2ccc([N+]([O-])=O)cc2s1)c1c(OC)cccc1


In [14]:
assert len(frame_no_augment) * 10 + len(frame_no_augment) == len(frame_augment_w_canonical)

In [15]:
# | export
def create_single_property_forward_prompts_regression(
    df,  # input data
    target,  # target property
    target_rename_dict,  # dict to rename target property from the column name in df to the target property name in the prompt
    prompt_prefix="",  # prefix to add to the prompt, e.g. "I am an expert chemist"
    num_digit=1,
):
    prompts = []

    target_name = target
    for key, value in target_rename_dict.items():
        target_name = target_name.replace(key, value)

    for _, row in df.iterrows():

        value = f"{round(row[target], num_digit)}"

        prompts.append(
            {
                "prompt": prompt_prefix
                + ONE_PROPERTY_FORWARD_PROMPT_TEMPLATE.format(
                    property=target_name, text=row["string"]
                ),
                "completion": ONE_PROPERTY_FORWARD_COMPLETION_TEMPLATE.format(
                    value=value
                ),
            }
        )

    return pd.DataFrame(prompts)


In [16]:
create_single_property_forward_prompts_regression(
    get_polymer_data(),
    "deltaGmin",
    {"deltaGmin_cat": "adsorption energy"},
    prompt_prefix="you are an expert chemist: ",
)


Unnamed: 0,prompt,completion
0,you are an expert chemist: what is the deltaGm...,-7.5@@@
1,you are an expert chemist: what is the deltaGm...,-7.3@@@
2,you are an expert chemist: what is the deltaGm...,-6.4@@@
3,you are an expert chemist: what is the deltaGm...,-6.7@@@
4,you are an expert chemist: what is the deltaGm...,-6.6@@@
...,...,...
3120,you are an expert chemist: what is the deltaGm...,-17.0@@@
3121,you are an expert chemist: what is the deltaGm...,-17.1@@@
3122,you are an expert chemist: what is the deltaGm...,-16.4@@@
3123,you are an expert chemist: what is the deltaGm...,-14.7@@@


## Polymers


Polymer specific prompt generation methods.


In [17]:
# | export
POLYMER_ONE_PROPERTY_INVERSE_PROMPT_TEMPLATE_CAT = (
    "what is a polymer with {class_name} {property}?###"
)
POLYMER_ONE_PROPERTY_INVERSE_COMPLETION_TEMPLATE_CAT = " {text}@@@"

POLYMER_ONE_PROPERTY_INVERSE_PROMPT_TEMPLATE_CAT_W_COMPOSITION = "what is a polymer with {class_name} {property} and {num_A} A, {num_B} B, {num_W} W, and {num_R} R?###"


In [18]:
# | export
def get_polymer_composition_dict(row):
    composition = Counter(row["string"].split("-"))
    comp_dict = {}
    for key in ["A", "B", "R", "W"]:
        try:
            count = composition[key]
        except KeyError:
            count = 0
        comp_dict[f"num_{key}"] = count
    return comp_dict


## Photoswitches


In [19]:
# | export

PROMPT_TEMPLATE_photoswitch_w_n_pistar = "What is a molecule with a pi-pi* transition wavelength of {} nm and n-pi* transition wavelength of {} nm###"
PROMPT_TEMPLATE_photoswitch_ = (
    "What is a molecule with a pi-pi* transition wavelength of {} nm###"
)
COMPLETION_TEMPLATE_photoswitch_ = "{}@@@"


def generate_inverse_photoswitch_prompts(data: pd.DataFrame) -> pd.DataFrame:
    prompts = []
    completions = []

    for i, row in data.iterrows():
        if np.isnan(row["E isomer n-pi* wavelength in nm"]):
            prompt = PROMPT_TEMPLATE_photoswitch_.format(
                row["E isomer pi-pi* wavelength in nm"]
            )
        else:
            prompt = PROMPT_TEMPLATE_photoswitch_w_n_pistar.format(
                row["E isomer pi-pi* wavelength in nm"],
                row["E isomer n-pi* wavelength in nm"],
            )

        completion = COMPLETION_TEMPLATE_photoswitch_.format(row["SMILES"])
        prompts.append(prompt)
        completions.append(completion)

    prompts = pd.DataFrame({"prompt": prompts, "completion": completions})

    return prompts


In [20]:
from gpt3forchem.data import get_photoswitch_data


In [21]:
photoswitch_data = get_photoswitch_data()


In [22]:
generate_inverse_photoswitch_prompts(photoswitch_data)


Unnamed: 0,prompt,completion
0,What is a molecule with a pi-pi* transition wa...,C[N]1C=CC(=N1)N=NC2=CC=CC=C2@@@
1,What is a molecule with a pi-pi* transition wa...,C[N]1C=NC(=N1)N=NC2=CC=CC=C2@@@
2,What is a molecule with a pi-pi* transition wa...,C[N]1C=C(C)C(=N1)N=NC2=CC=CC=C2@@@
3,What is a molecule with a pi-pi* transition wa...,C[N]1C=C(C=N1)N=NC2=CC=CC=C2@@@
4,What is a molecule with a pi-pi* transition wa...,C[N]1N=C(C)C(=C1C)N=NC2=CC=CC=C2@@@
...,...,...
385,What is a molecule with a pi-pi* transition wa...,OC%38=C%39N=CC=CC%39=C(/N=N/C%40=NC%41=CC(C)=C...
386,What is a molecule with a pi-pi* transition wa...,OC%42=C%43N=CC=CC%43=C(/N=N/C%44=NC%45=CC=CC=C...
387,What is a molecule with a pi-pi* transition wa...,N#CC1C(SC(/N=N/C2=NC(C=CC([N+]([O-])=O)=C3)=C3...
388,What is a molecule with a pi-pi* transition wa...,N#Cc5c(c6ccc(Cl)cc6)c(/N=N/C7=NC(C=CC([N+]([O-...


## MOFs


In [23]:
from gpt3forchem.data import get_mof_data
import pandas as pd
from sklearn.model_selection import train_test_split
from typing import List


In [24]:
data = get_mof_data()


  return HashableDataFrame(pd.read_csv(os.path.join(datadir, "mof.csv")))


In [25]:
outputs = [c for c in data.columns if "output" in c]


In [26]:
len(data) - data["outputs.CO2-henry_coefficient-mol--kg--Pa"].isna().sum()


170

In [27]:
len(data) - data["outputs.H2O-henry_coefficient-mol--kg--Pa"].isna().sum()


153

In [28]:
len(data) - data["outputs.N2-henry_coefficient-mol--kg--Pa"].isna().sum()


141

Perhaps, let's create a long frame with all the gases.


In [29]:
MOF_REPRESENTATIONS = ["info.mofid.mofid_clean", "chemical_name", "info.qmof_id"]


In [30]:
MOF_OUTPUTS = [
    "outputs.Xe-henry_coefficient-mol--kg--Pa",
    "outputs.Kr-henry_coefficient-mol--kg--Pa",
    "outputs.H2S-henry_coefficient-mol--kg--Pa",
    "outputs.H2O-henry_coefficient-mol--kg--Pa",
    "outputs.O2-henry_coefficient-mol--kg--Pa",
    "outputs.CH4-henry_coefficient-mol--kg--Pa",
    "outputs.CO2-henry_coefficient-mol--kg--Pa",
    "outputs.N2-henry_coefficient-mol--kg--Pa",
    "outputs.pbe.bandgap",
]


Let's create some tooling to create prompts with context.

In [31]:
from gpt3forchem.data import gas_features
gas_data = gas_features

We want prompts of the form 

> What is the \<name\> (\<additional data\>) Henry coefficient of <mof>?

In [32]:
# | export


def generate_property_desc(properties, gas_data, gas): 
    if properties is None: 
        return ""
    text = []
    row = gas_data[gas_data["formula"] == gas]
    for prop in properties:
        text.append(f"{prop.replace('_', ' ')} {row[prop].values[0]}")
    
    text = ", ".join(text)
    
    return f"({text})"



In [33]:
generate_property_desc(["accentric_factor", "critical_temperature"], gas_data, "CO2")

'(accentric factor 0.228, critical temperature 304.19)'

In [70]:
# | export
_GAS_CONTEXT_PROMPT_TEMPLATE = "What is the {identifier} {description} Henry cofficient of {repr}###"


def create_prompts_w_gas_context(
    df, gas_data, gases=["CO2", "Xe"], properties=None, identifier=None, regression=False, representation="info.mofid.mofid_clean"
):
    prompts = []

    identifier = "formula" if identifier is None else identifier

    for _, row in df.iterrows():
        for gas in gases:
            subset = gas_data[gas_data["formula"] == gas]
          
            name = subset[identifier].values[0].replace("_", " ")
            if not regression:
                column = subset["related_column"].values[0]
            else:
                raise NotImplementedError("Regression not implemented yet")
            if not pd.isna(row[column]) and not 'nan' in row[column]:
                if properties is None:
                    property_desc = ""
                else:
                    property_desc = generate_property_desc(properties, gas_data, gas)

                prompts.append(
                    {
                        "prompt": _GAS_CONTEXT_PROMPT_TEMPLATE.format(
                            identifier=name,
                            description=property_desc,
                            repr=row[representation],
                        ),
                        "completion": f"{row[column]}@@@",
                        "repr": row[representation],
                    }
                )

    df = pd.DataFrame(prompts)
    df.dropna(subset=["prompt"], inplace=True)
    df = df.sample(frac=1).reset_index(drop=True)  # shuffle

    return pd.DataFrame(prompts)

In [71]:
from gpt3forchem.data import get_mof_data, discretize

In [72]:
mof_data = get_mof_data()

features = [
    "outputs.Xe-henry_coefficient-mol--kg--Pa",
    "outputs.Kr-henry_coefficient-mol--kg--Pa",
    "outputs.H2O-henry_coefficient-mol--kg--Pa",
    "outputs.H2S-henry_coefficient-mol--kg--Pa",
    "outputs.CO2-henry_coefficient-mol--kg--Pa",
    "outputs.CH4-henry_coefficient-mol--kg--Pa",
    "outputs.O2-henry_coefficient-mol--kg--Pa",
]

for feature in features:
    mof_data[feature + '_log'] = np.log10(mof_data[feature] + 1e-40)

for feature in features:

    discretize(
        mof_data, f"{feature}_log", n_bins=3, labels=["low", "medium", "high"]
    )


  return HashableDataFrame(pd.read_csv(os.path.join(datadir, "mof.csv")))


In [73]:
prompts = create_prompts_w_gas_context(mof_data, gas_data, gases=["CO2", "Xe"], properties=["critical_temperature", "accentric_factor"], identifier="formula")

In [74]:
prompts

Unnamed: 0,prompt,completion,repr
0,"What is the CO2 (critical temperature 304.19, ...",low@@@,[O-]C(=O)c1cc([N][N]c2cc(cc(c2)C(=O)[O-])C(=O)...
1,"What is the CO2 (critical temperature 304.19, ...",low@@@,[O-]C(=O)c1ccc(cc1)c1c(C)[n-][nH]c1C.[Zn].dia....
2,"What is the CO2 (critical temperature 304.19, ...",low@@@,[Co].[O-]C(=O)C=Cc1ccncc1.dia.cat0
3,"What is the CO2 (critical temperature 304.19, ...",low@@@,[O-]C(=O)c1ccc(cc1)c1ccc(cc1)C(=O)[O-].[Zn].n1...
4,"What is the Xe (critical temperature 289.74, a...",medium@@@,[O-]C(=O)c1ccc(cc1)c1ccc(cc1)C(=O)[O-].[Zn].n1...
...,...,...,...
249,"What is the CO2 (critical temperature 304.19, ...",low@@@,CC1=NN=N[N]1.[Zn].dia.cat0
250,"What is the CO2 (critical temperature 304.19, ...",low@@@,[C]#N.[Cd].pts.cat0
251,"What is the CO2 (critical temperature 304.19, ...",high@@@,[Ni].n1ccncc1.sql.cat0
252,"What is the CO2 (critical temperature 304.19, ...",high@@@,[Cu].n1ccncc1.sql.cat0


In [75]:
create_prompts_w_gas_context(mof_data, gas_data, gases=["CO2", "Xe"], properties=["formula", "critical_temperature", "accentric_factor"], identifier="name")

Unnamed: 0,prompt,completion,repr
0,"What is the carbon dioxide (formula CO2, criti...",low@@@,[O-]C(=O)c1cc([N][N]c2cc(cc(c2)C(=O)[O-])C(=O)...
1,"What is the carbon dioxide (formula CO2, criti...",low@@@,[O-]C(=O)c1ccc(cc1)c1c(C)[n-][nH]c1C.[Zn].dia....
2,"What is the carbon dioxide (formula CO2, criti...",low@@@,[Co].[O-]C(=O)C=Cc1ccncc1.dia.cat0
3,"What is the carbon dioxide (formula CO2, criti...",low@@@,[O-]C(=O)c1ccc(cc1)c1ccc(cc1)C(=O)[O-].[Zn].n1...
4,"What is the xenon (formula Xe, critical temper...",medium@@@,[O-]C(=O)c1ccc(cc1)c1ccc(cc1)C(=O)[O-].[Zn].n1...
...,...,...,...
249,"What is the carbon dioxide (formula CO2, criti...",low@@@,CC1=NN=N[N]1.[Zn].dia.cat0
250,"What is the carbon dioxide (formula CO2, criti...",low@@@,[C]#N.[Cd].pts.cat0
251,"What is the carbon dioxide (formula CO2, criti...",high@@@,[Ni].n1ccncc1.sql.cat0
252,"What is the carbon dioxide (formula CO2, criti...",high@@@,[Cu].n1ccncc1.sql.cat0
