In [1]:
# | default_exp input


In [21]:
# | export
import random
from typing import List, Dict

import numpy as np
import pandas as pd
from rdkit import Chem
import EFGs

# Data augmentation


We can use SMILES randomization for data augmentation. We use code by [Schwaller et al.](https://github.com/rxn4chemistry/rxn_yields).


In [3]:
# | export
def randomize_smiles(
    smiles: str,
    random_type: str = "rotated",  #  The type (unrestricted, restricted, rotated) of randomization performed.
    isomericSmiles: bool = True,
):
    """
    From: https://github.com/undeadpixel/reinvent-randomized and https://github.com/GLambard/SMILES-X
    Returns a random SMILES given a SMILES of a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return None

    if random_type == "unrestricted":
        return Chem.MolToSmiles(
            mol, canonical=False, doRandom=True, isomericSmiles=isomericSmiles
        )
    elif random_type == "restricted":
        new_atom_order = list(range(mol.GetNumAtoms()))
        random.shuffle(new_atom_order)
        random_mol = Chem.RenumberAtoms(mol, newOrder=new_atom_order)
        return Chem.MolToSmiles(
            random_mol, canonical=False, isomericSmiles=isomericSmiles
        )
    elif random_type == "rotated":
        n_atoms = mol.GetNumAtoms()
        rotation_index = random.randint(0, n_atoms - 1)
        atoms = list(range(n_atoms))
        new_atoms_order = (
            atoms[rotation_index % len(atoms) :] + atoms[: rotation_index % len(atoms)]
        )
        rotated_mol = Chem.RenumberAtoms(mol, new_atoms_order)
        return Chem.MolToSmiles(
            rotated_mol, canonical=False, isomericSmiles=isomericSmiles
        )
    raise ValueError("Type '{}' is not valid".format(random_type))


In [4]:
randomize_smiles("C[N]1C=CC(=N1)N=NC2=CC=CC=C2")


'n1(C)ccc(N=Nc2ccccc2)n1'

# Creating prompts/training data


In [5]:
# | export
from collections import Counter

import numpy as np
import pandas as pd


In [90]:
# | export


_DEFAULT_ENCODING_DICT = {
    "very small": 0,
    "small": 1,
    "medium": 2,
    "large": 3,
    "very large": 4,
}

_DEFAULT_DECODING_DICT = {v: k for k, v in _DEFAULT_ENCODING_DICT.items()}


def encode_categorical_value(value, encoding_dict=_DEFAULT_ENCODING_DICT):
    try:
        return encoding_dict[value]
    except KeyError:
        raise ValueError("Unknown value: %s" % value)


def decode_categorical_value(value, decoding_dict=_DEFAULT_DECODING_DICT):
    try:
        return decoding_dict[value]
    except KeyError:
        raise ValueError("Unknown value: %s" % value)


In [91]:
# | export
ONE_PROPERTY_FORWARD_PROMPT_TEMPLATE = "what is the {property} of {text}###"
ONE_PROPERTY_FORWARD_COMPLETION_TEMPLATE = " {value}@@@"


In [92]:
# | export
def create_single_property_forward_prompts(
    df: pd.DataFrame,  # input data
    target: str,  # target property
    target_rename_dict: dict,  # dict to rename target property from the column name in df to the target property name in the prompt
    encode_value: bool = True,  # whether to encode the value of the target property categorically
    encoding_dict: dict = _DEFAULT_ENCODING_DICT,  # mapping from numerical categories to string
    prompt_prefix: str = "",  # prefix to add to the prompt, e.g. "I am an expert chemist"
    representation_col: str = "string",  # name of the column to use as the representation of the compound
    smiles_augmentation: bool = False,  # whether to augment the SMILES with randomization
    smiles_augmentation_type: str = "rotated",  # the type of randomization to perform
    smiles_augmentation_rounds: int = 10,  # the number of randomizations to perform
    include_canonical_smiles: bool = False,  # whether to include the canonical SMILES when using the augmentation
):
    prompts = []

    if not smiles_augmentation:
        smiles_augmentation_rounds = 1
    for _ in range(smiles_augmentation_rounds):
        target_name = target
        for key, value in target_rename_dict.items():
            target_name = target_name.replace(key, value)

        for _, row in df.iterrows():
            if encode_value:
                value = encode_categorical_value(
                    row[target], encoding_dict=encoding_dict
                )
            else:
                value = row[target]

            repr = row[representation_col]
            if smiles_augmentation:
                repr = randomize_smiles(repr, random_type=smiles_augmentation_type)
            prompts.append(
                {
                    "prompt": prompt_prefix
                    + ONE_PROPERTY_FORWARD_PROMPT_TEMPLATE.format(
                        property=target_name, text=repr
                    ),
                    "completion": ONE_PROPERTY_FORWARD_COMPLETION_TEMPLATE.format(
                        value=value
                    ),
                    "repr": row[representation_col],
                    "this_repr": repr,
                }
            )
    if smiles_augmentation and include_canonical_smiles:
        for _, row in df.iterrows():
            if encode_value:
                value = encode_categorical_value(
                    row[target], encoding_dict=encoding_dict
                )
            else:
                value = row[target]

            repr = row[representation_col]
            prompts.append(
                {
                    "prompt": prompt_prefix
                    + ONE_PROPERTY_FORWARD_PROMPT_TEMPLATE.format(
                        property=target_name, text=repr
                    ),
                    "completion": ONE_PROPERTY_FORWARD_COMPLETION_TEMPLATE.format(
                        value=value
                    ),
                    "repr": repr,
                    "this_repr": repr,
                }
            )

    df = pd.DataFrame(prompts)
    df.dropna(subset=["prompt"], inplace=True)
    df = df.sample(frac=1).reset_index(drop=True) # shuffle
    return df


In [None]:
from gpt3forchem.data import get_polymer_data

In [93]:
create_single_property_forward_prompts(
    get_polymer_data(), "deltaGmin_cat", {"deltaGmin_cat": "adsorption energy"}
)


Unnamed: 0,prompt,completion,repr,this_repr
0,what is the adsorption energy of W-W-A-R-B-W-W...,4@@@,W-W-A-R-B-W-W-R-B-B-R-W-R-W-B-B-W-B-W-B-W-B-R-...,W-W-A-R-B-W-W-R-B-B-R-W-R-W-B-B-W-B-W-B-W-B-R-...
1,what is the adsorption energy of R-B-B-B-W-W-B...,4@@@,R-B-B-B-W-W-B-A-A-W-R-A-A-W-B-B-R-R-R-W-B-B-W-...,R-B-B-B-W-W-B-A-A-W-R-A-A-W-B-B-R-R-R-W-B-B-W-...
2,what is the adsorption energy of W-W-A-B-A-W-B...,1@@@,W-W-A-B-A-W-B-A-B-R-R-A-B-A-W-W-R-R-B-R-W-A-B-...,W-W-A-B-A-W-B-A-B-R-R-A-B-A-W-W-R-R-B-R-W-A-B-...
3,what is the adsorption energy of R-B-B-W-W-R-A...,3@@@,R-B-B-W-W-R-A-B-R-R-R-W-B-R-A-B-A-R-R-B-W-W-W-A,R-B-B-W-W-R-A-B-R-R-R-W-B-R-A-B-A-R-R-B-W-W-W-A
4,what is the adsorption energy of B-A-B-R-B-B-A...,4@@@,B-A-B-R-B-B-A-B-A-W-R-R-B-A-W-A-B-A-B-A-A-R-W-W,B-A-B-R-B-B-A-B-A-W-R-R-B-A-W-A-B-A-B-A-A-R-W-W
...,...,...,...,...
3120,what is the adsorption energy of B-W-A-A-B-R-B...,1@@@,B-W-A-A-B-R-B-A-R-R-R-W-W-B-A-R-B-R-R-B-W-B-R-...,B-W-A-A-B-R-B-A-R-R-R-W-W-B-A-R-B-R-R-B-W-B-R-...
3121,what is the adsorption energy of W-W-W-W-A-W-R...,4@@@,W-W-W-W-A-W-R-A-B-W-B-R-B-B-W-R-A-R-B-A-B-R-B-...,W-W-W-W-A-W-R-A-B-W-B-R-B-B-W-R-A-R-B-A-B-R-B-...
3122,what is the adsorption energy of A-R-B-W-W-B-A...,4@@@,A-R-B-W-W-B-A-W-A-A-B-W-R-W-W-W-B-B-W-R-W-A-W-...,A-R-B-W-W-B-A-W-A-A-B-W-R-W-W-W-B-B-W-R-W-A-W-...
3123,what is the adsorption energy of B-B-A-A-W-A-W...,3@@@,B-B-A-A-W-A-W-A-W-R-B-R-B-R-A-A-B-W-A-B-A-B-B-R,B-B-A-A-W-A-W-A-W-R-B-R-B-R-A-A-B-W-A-B-A-B-B-R


In [94]:
create_single_property_forward_prompts(
    get_polymer_data(),
    "deltaGmin_cat",
    {"deltaGmin_cat": "adsorption energy"},
    prompt_prefix="you are an expert chemist: ",
)


Unnamed: 0,prompt,completion,repr,this_repr
0,you are an expert chemist: what is the adsorpt...,0@@@,A-B-B-R-R-A-R-W-A-A-A-R-R-W-R-B-B-R-R-B-A-W-B-...,A-B-B-R-R-A-R-W-A-A-A-R-R-W-R-B-B-R-R-B-A-W-B-...
1,you are an expert chemist: what is the adsorpt...,1@@@,R-B-W-R-A-R-A-R-R-A-W-R-R-A-B-R-R-W-B-R-W-R-B-R,R-B-W-R-A-R-A-R-R-A-W-R-R-A-B-R-R-W-B-R-W-R-B-R
2,you are an expert chemist: what is the adsorpt...,1@@@,B-R-B-B-B-A-B-A-W-B-W-R-A-A-R-R-B-W-A-R-W-W-B-...,B-R-B-B-B-A-B-A-W-B-W-R-A-A-R-R-B-W-A-R-W-W-B-...
3,you are an expert chemist: what is the adsorpt...,4@@@,W-W-R-B-A-W-B-A-B-W-A-W-R-B-B-B-B-B-W-A-W-A-A-...,W-W-R-B-A-W-B-A-B-W-A-W-R-B-B-B-B-B-W-A-W-A-A-...
4,you are an expert chemist: what is the adsorpt...,3@@@,R-W-R-W-A-R-W-W-W-A-W-A-R-R-R-B-R-W-W-R-W-B-A-...,R-W-R-W-A-R-W-W-W-A-W-A-R-R-R-B-R-W-W-R-W-B-A-...
...,...,...,...,...
3120,you are an expert chemist: what is the adsorpt...,4@@@,R-R-R-B-A-R-B-W-B-B-A-W-B-A-B-W-A-W-B-B-B-B,R-R-R-B-A-R-B-W-B-B-A-W-B-A-B-W-A-W-B-B-B-B
3121,you are an expert chemist: what is the adsorpt...,4@@@,A-R-A-A-B-B-W-W-R-W-W-R-W-W-A-B-B-B-B-A-W-W-R-A,A-R-A-A-B-B-W-W-R-W-W-R-W-W-A-B-B-B-B-A-W-W-R-A
3122,you are an expert chemist: what is the adsorpt...,1@@@,R-B-W-R-B-W-W-W-W-A-R-R-B-A-B-A-W-R-A-B-A-W-B-...,R-B-W-R-B-W-W-W-W-A-R-R-B-A-B-A-W-R-A-B-A-W-B-...
3123,you are an expert chemist: what is the adsorpt...,2@@@,A-A-B-R-R-W-W-R-R-W-B-B-R-B-A-A-B-W-A-B-A-A-R-...,A-A-B-R-R-W-W-R-R-W-B-B-R-B-A-A-B-W-A-B-A-A-R-...


In [97]:
from gpt3forchem.data import get_photoswitch_data

In [95]:
frame_no_augment = create_single_property_forward_prompts(
    get_photoswitch_data(),
    "wavelength_cat",
    {"wavelength_cat": "transition wavelength"},
    prompt_prefix="you are an expert chemist: ",
    representation_col="SMILES",
)
frame_no_augment


Unnamed: 0,prompt,completion,repr,this_repr
0,you are an expert chemist: what is the transit...,0@@@,CC1=C(Cl)C=CC=C1/N=N/C2=C(C)C(Cl)=CC=C2,CC1=C(Cl)C=CC=C1/N=N/C2=C(C)C(Cl)=CC=C2
1,you are an expert chemist: what is the transit...,1@@@,Sc1[nH]nc(n1)N=Nc1c(C)n(c2c1cccc2)C,Sc1[nH]nc(n1)N=Nc1c(C)n(c2c1cccc2)C
2,you are an expert chemist: what is the transit...,2@@@,CCN(CC)C(C=C%24)=CC=C%24/N=N/C%25=CC=C(N%26CCC...,CCN(CC)C(C=C%24)=CC=C%24/N=N/C%25=CC=C(N%26CCC...
3,you are an expert chemist: what is the transit...,2@@@,O=[N+]([O-])C1=CC=C(/N=N/C2=CC=C(NCCC#N)C=C2)C=C1,O=[N+]([O-])C1=CC=C(/N=N/C2=CC=C(NCCC#N)C=C2)C=C1
4,you are an expert chemist: what is the transit...,2@@@,CC(C=C(N(CCC#N)CCO)C=C1)=C1/N=N/C2=CC=C(Cl)C=C2,CC(C=C(N(CCC#N)CCO)C=C1)=C1/N=N/C2=CC=C(Cl)C=C2
...,...,...,...,...
385,you are an expert chemist: what is the transit...,2@@@,OC%11=C%12N=CC=CC%12=C(/N=N/C%13=NC(CC(OCC)=O)...,OC%11=C%12N=CC=CC%12=C(/N=N/C%13=NC(CC(OCC)=O)...
386,you are an expert chemist: what is the transit...,1@@@,C[N]1N=CC(=C1N=NC2=CC=CC=C2)C,C[N]1N=CC(=C1N=NC2=CC=CC=C2)C
387,you are an expert chemist: what is the transit...,1@@@,ClC(C=C%13)=CC=C%13N=NC%14=NC%15=CC=C([N+]([O-...,ClC(C=C%13)=CC=C%13N=NC%14=NC%15=CC=C([N+]([O-...
388,you are an expert chemist: what is the transit...,1@@@,CC1=CC(N(CCC#N)CCC#N)=CC=C1/N=N/C2=CC=CC=C2,CC1=CC(N(CCC#N)CCC#N)=CC=C1/N=N/C2=CC=CC=C2


By using SMILES augmentation, we can create a much larger dataset.


In [96]:
frame_augment = create_single_property_forward_prompts(
    get_photoswitch_data(),
    "wavelength_cat",
    {"wavelength_cat": "transition wavelength"},
    prompt_prefix="you are an expert chemist: ",
    representation_col="SMILES",
    smiles_augmentation=True,
)

frame_augment


Unnamed: 0,prompt,completion,repr,this_repr
0,you are an expert chemist: what is the transit...,1@@@,OCCN(CCC#N)C(C=C%13)=CC=C%13/N=N/C%14=CC=CC=C%14,c1(/N=N/c2ccccc2)ccc(N(CCO)CCC#N)cc1
1,you are an expert chemist: what is the transit...,1@@@,[H]N(C)C(C=C1)=CC=C1/N=N/C2=CC=CC=C2,c1ccc(/N=N/c2ccc(NC)cc2)cc1
2,you are an expert chemist: what is the transit...,0@@@,FC(F)(F)C1=CC=C(/N=N/C2=CC=C(C(F)(F)F)C=C2)C=C1,c1(/N=N/c2ccc(C(F)(F)F)cc2)ccc(C(F)(F)F)cc1
3,you are an expert chemist: what is the transit...,2@@@,N#CCCNC(C=C1)=CC=C1/N=N/C2=CC=CC=C2[N+]([O-])=O,c1c([N+]([O-])=O)c(/N=N/c2ccc(NCCC#N)cc2)ccc1
4,you are an expert chemist: what is the transit...,2@@@,CC(C=C(N(CCC#N)CCO)C=C1)=C1/N=N/C2=CC=C([N+]([...,OCCN(c1ccc(/N=N/c2ccc([N+]([O-])=O)cc2)c(C)c1)...
...,...,...,...,...
3895,you are an expert chemist: what is the transit...,2@@@,N#CCCNC(C=C1)=CC=C1/N=N/C2=CC=CC([N+]([O-])=O)=C2,c1cc(/N=N/c2cccc([N+]([O-])=O)c2)ccc1NCCC#N
3896,you are an expert chemist: what is the transit...,1@@@,CC(NC(C=C1OC)=CC(OC)=C1/N=N/C2=C(OC)C=C(NC(C)=...,N(C(C)=O)c1cc(OC)c(/N=N/c2c(OC)cc(NC(=O)C)cc2O...
3897,you are an expert chemist: what is the transit...,0@@@,CC1=C(/N=N/C2=CC=CC=C2)C=NC=C1,c1(C)c(/N=N/c2ccccc2)cncc1
3898,you are an expert chemist: what is the transit...,4@@@,[H]C1=C(C=C([H])C(/N=N/C2=C(C#N)C=C([N+]([O-])...,CCN(CC)c1ccc(/N=N/c2c(C#N)cc([N+]([O-])=O)cc2C...


In [12]:
assert len(frame_no_augment) * 10 == len(frame_augment)
assert len(frame_no_augment) == len(frame_augment["repr"].unique())


In [13]:
frame_augment_w_canonical = create_single_property_forward_prompts(
    get_photoswitch_data(),
    "wavelength_cat",
    {"wavelength_cat": "transition wavelength"},
    prompt_prefix="you are an expert chemist: ",
    representation_col="SMILES",
    smiles_augmentation=True,
    include_canonical_smiles=True,
)

frame_augment_w_canonical


Unnamed: 0,prompt,completion,repr,this_repr
0,you are an expert chemist: what is the transit...,2@@@,CSc1nnc(s1)N=Nc1c(C)[nH]c2c1cccc2,c1(N=Nc2c(C)[nH]c3c2cccc3)sc(SC)nn1
1,you are an expert chemist: what is the transit...,3@@@,[H]C1=CC([N+]([O-])=O)=CC(Cl)=C1/N=N/C2=CC([H]...,c1(/N=N/c2ccc(N(CC)CC)cc2)ccc([N+]([O-])=O)cc1Cl
2,you are an expert chemist: what is the transit...,1@@@,CC(C=C1)=CC=C1/N=N/C2=CC=C(N(C)C)C=C2,N(\c1ccc(N(C)C)cc1)=N/c1ccc(C)cc1
3,you are an expert chemist: what is the transit...,0@@@,CC1=NOC(C)=C1/N=N/C2=CC(C(F)(F)F)=CC=C2,N(=N/c1cc(C(F)(F)F)ccc1)\c1c(C)noc1C
4,you are an expert chemist: what is the transit...,0@@@,C[N]1C=NC(=N1)N=NC2=CC=CC=C2,n1(C)cnc(N=Nc2ccccc2)n1
...,...,...,...,...
4285,you are an expert chemist: what is the transit...,1@@@,OC%17=C(N=CC=C%18)C%18=C(/N=N/C%19=CC=C(Cl)C=C...,Clc1ccc(/N=N/c2ccc(O)c3ncccc32)cc1
4286,you are an expert chemist: what is the transit...,3@@@,OC%20=C(N=CC=C%21)C%21=C(/N=N/C%22=CC=C(C(O)=O...,c1c(O)c2ncccc2c(/N=N/c2ccc(C(O)=O)cc2)c1
4287,you are an expert chemist: what is the transit...,2@@@,CCN(CC)C(C=C1)=CC=C1/N=N/C2=CC(C#N)=CC(C#N)=C2,N(=N/c1cc(C#N)cc(C#N)c1)\c1ccc(N(CC)CC)cc1
4288,you are an expert chemist: what is the transit...,0@@@,COC1=CC=CC=C1N=NC2=NC3=CC=C([N+]([O-])=O)C=C3S2,N(=Nc1nc2ccc([N+]([O-])=O)cc2s1)c1c(OC)cccc1


In [14]:
assert len(frame_no_augment) * 10 + len(frame_no_augment) == len(frame_augment_w_canonical)

In [15]:
# | export
def create_single_property_forward_prompts_regression(
    df,  # input data
    target,  # target property
    target_rename_dict,  # dict to rename target property from the column name in df to the target property name in the prompt
    prompt_prefix="",  # prefix to add to the prompt, e.g. "I am an expert chemist"
    num_digit=1,
):
    prompts = []

    target_name = target
    for key, value in target_rename_dict.items():
        target_name = target_name.replace(key, value)

    for _, row in df.iterrows():

        value = f"{round(row[target], num_digit)}"

        prompts.append(
            {
                "prompt": prompt_prefix
                + ONE_PROPERTY_FORWARD_PROMPT_TEMPLATE.format(
                    property=target_name, text=row["string"]
                ),
                "completion": ONE_PROPERTY_FORWARD_COMPLETION_TEMPLATE.format(
                    value=value
                ),
            }
        )

    return pd.DataFrame(prompts)


In [16]:
create_single_property_forward_prompts_regression(
    get_polymer_data(),
    "deltaGmin",
    {"deltaGmin_cat": "adsorption energy"},
    prompt_prefix="you are an expert chemist: ",
)


Unnamed: 0,prompt,completion
0,you are an expert chemist: what is the deltaGm...,-7.5@@@
1,you are an expert chemist: what is the deltaGm...,-7.3@@@
2,you are an expert chemist: what is the deltaGm...,-6.4@@@
3,you are an expert chemist: what is the deltaGm...,-6.7@@@
4,you are an expert chemist: what is the deltaGm...,-6.6@@@
...,...,...
3120,you are an expert chemist: what is the deltaGm...,-17.0@@@
3121,you are an expert chemist: what is the deltaGm...,-17.1@@@
3122,you are an expert chemist: what is the deltaGm...,-16.4@@@
3123,you are an expert chemist: what is the deltaGm...,-14.7@@@


## Polymers


Polymer specific prompt generation methods.


In [17]:
# | export
POLYMER_ONE_PROPERTY_INVERSE_PROMPT_TEMPLATE_CAT = (
    "what is a polymer with {class_name} {property}?###"
)
POLYMER_ONE_PROPERTY_INVERSE_COMPLETION_TEMPLATE_CAT = " {text}@@@"

POLYMER_ONE_PROPERTY_INVERSE_PROMPT_TEMPLATE_CAT_W_COMPOSITION = "what is a polymer with {class_name} {property} and {num_A} A, {num_B} B, {num_W} W, and {num_R} R?###"


In [18]:
# | export
def get_polymer_composition_dict(row):
    composition = Counter(row["string"].split("-"))
    comp_dict = {}
    for key in ["A", "B", "R", "W"]:
        try:
            count = composition[key]
        except KeyError:
            count = 0
        comp_dict[f"num_{key}"] = count
    return comp_dict


In [81]:
# | export 

def create_single_property_inverse_polymer_prompts(df, target, target_rename_dict, encode_value=True, with_composition=True):
    prompts = []

    target_name = target
    for key, value in target_rename_dict.items():
        target_name = target_name.replace(key, value)

    for _, row in df.iterrows():
        if encode_value:
            value = encode_categorical_value(row[target])
        else:
            value = row[target]

        if with_composition:
            comp_dict = get_polymer_composition_dict(row)

            prompt = POLYMER_ONE_PROPERTY_INVERSE_PROMPT_TEMPLATE_CAT_W_COMPOSITION.format(
                class_name=value, property=target_name, **comp_dict
            )
        else:
            prompt = (
                POLYMER_ONE_PROPERTY_INVERSE_PROMPT_TEMPLATE_CAT.format(
                    class_name=value, property=target_name
                ),
            )
        prompts.append(
            {
                "prompt": prompt,
                "completion": POLYMER_ONE_PROPERTY_INVERSE_COMPLETION_TEMPLATE_CAT.format(
                    text=row["string"]
                ),
            }
        )

    return pd.DataFrame(prompts)


In [82]:
polymer_df = get_polymer_data()

In [84]:
create_single_property_inverse_polymer_prompts(polymer_df, "deltaGmin_cat", {"deltaGmin_cat": "adsorption energy"}, encode_value=False)

Unnamed: 0,prompt,completion
0,what is a polymer with very large adsorption e...,W-A-B-W-W-A-A-A-R-W-B-B-R-R-B-R@@@
1,what is a polymer with very large adsorption e...,R-W-W-R-R-B-B-B-A-A-A-W-W-A-R-B@@@
2,what is a polymer with very large adsorption e...,A-R-A-W-B-W-A-R-B-W-A-B-B-R-W-R@@@
3,what is a polymer with very large adsorption e...,W-A-R-A-B-B-B-W-A-W-B-R-A-W-R-R@@@
4,what is a polymer with very large adsorption e...,R-R-B-B-W-R-A-W-R-W-A-B-A-A-W-B@@@
...,...,...
3120,what is a polymer with very small adsorption e...,R-W-B-W-W-B-B-B-A-W-W-W-A-W-A-A-W-A-R-R-B-B-A...
3121,what is a polymer with very small adsorption e...,R-A-A-R-A-R-W-B-W-A-R-B-W-R-B-R-W-B-A-B-A-B-A...
3122,what is a polymer with very small adsorption e...,W-W-R-B-W-W-A-W-R-R-A-R-B-A-A-W-R-R-B-B-B-A-R...
3123,what is a polymer with small adsorption energy...,B-A-B-B-R-W-A-R-W-R-B-W-R-W-R-R-W-B-W-A-A-B-A...


## Photoswitches


In [9]:
# | export

PROMPT_TEMPLATE_photoswitch_w_n_pistar = "What is a molecule with a pi-pi* transition wavelength of {} nm and n-pi* transition wavelength of {} nm###"
PROMPT_TEMPLATE_photoswitch_ = (
    "What is a molecule with a pi-pi* transition wavelength of {} nm###"
)
COMPLETION_TEMPLATE_photoswitch_ = "{}@@@"


def generate_inverse_photoswitch_prompts(data: pd.DataFrame) -> pd.DataFrame:
    prompts = []
    completions = []

    for i, row in data.iterrows():
        if np.isnan(row["E isomer n-pi* wavelength in nm"]):
            prompt = PROMPT_TEMPLATE_photoswitch_.format(
                row["E isomer pi-pi* wavelength in nm"]
            )
        else:
            prompt = PROMPT_TEMPLATE_photoswitch_w_n_pistar.format(
                row["E isomer pi-pi* wavelength in nm"],
                row["E isomer n-pi* wavelength in nm"],
            )

        completion = COMPLETION_TEMPLATE_photoswitch_.format(row["SMILES"])
        prompts.append(prompt)
        completions.append(completion)

    prompts = pd.DataFrame({"prompt": prompts, "completion": completions})

    return prompts


In [10]:
from gpt3forchem.data import get_photoswitch_data


In [11]:
photoswitch_data = get_photoswitch_data()


In [12]:
generate_inverse_photoswitch_prompts(photoswitch_data)


Unnamed: 0,prompt,completion
0,What is a molecule with a pi-pi* transition wa...,C[N]1C=CC(=N1)N=NC2=CC=CC=C2@@@
1,What is a molecule with a pi-pi* transition wa...,C[N]1C=NC(=N1)N=NC2=CC=CC=C2@@@
2,What is a molecule with a pi-pi* transition wa...,C[N]1C=C(C)C(=N1)N=NC2=CC=CC=C2@@@
3,What is a molecule with a pi-pi* transition wa...,C[N]1C=C(C=N1)N=NC2=CC=CC=C2@@@
4,What is a molecule with a pi-pi* transition wa...,C[N]1N=C(C)C(=C1C)N=NC2=CC=CC=C2@@@
...,...,...
385,What is a molecule with a pi-pi* transition wa...,OC%38=C%39N=CC=CC%39=C(/N=N/C%40=NC%41=CC(C)=C...
386,What is a molecule with a pi-pi* transition wa...,OC%42=C%43N=CC=CC%43=C(/N=N/C%44=NC%45=CC=CC=C...
387,What is a molecule with a pi-pi* transition wa...,N#CC1C(SC(/N=N/C2=NC(C=CC([N+]([O-])=O)=C3)=C3...
388,What is a molecule with a pi-pi* transition wa...,N#Cc5c(c6ccc(Cl)cc6)c(/N=N/C7=NC(C=CC([N+]([O-...


In [25]:
# | export
FRAGMENT_PROMPT_TEMPlATE = "What is the transition wavelength of a molecule with following fragments {}?###"

def generate_fragment_prompt(data: pd.DataFrame, target: str, regression: bool) -> pd.DataFrame:
    prompts = []
    completions = []
    smiles = []
    for i, row in data.iterrows():
        fragments = EFGs.mol2frag(Chem.MolFromSmiles(row["SMILES"]))[0]

        fragment_string = ", ".join(sorted(fragments, key=lambda x: len(x), reverse=True))

        prompt = FRAGMENT_PROMPT_TEMPlATE.format(fragment_string)
        if regression: 
            value = np.round(row[target],2)
        else: 
            value = row[target]
        completion = COMPLETION_TEMPLATE_photoswitch_.format(value)
        prompts.append(prompt)
        completions.append(completion)
        smiles.append(row["SMILES"])
    prompts = pd.DataFrame({"prompt": prompts, "completion": completions, "smiles": smiles})

    return prompts

In [26]:
fragment_prompts = generate_fragment_prompt(photoswitch_data, "E isomer pi-pi* wavelength in nm", regression=True)

In [27]:
fragment_prompts.iloc[0]['prompt']

'What is the transition wavelength of a molecule with following fragments c1cn[nH]c1, c1ccccc1, N=N?###'

In [None]:
# | export

def generate_one_hot_encoded_fragment_prompt(data: pd.DataFrame, target: str, regression: bool, one_hot_mapper: Dict[str, str]) -> pd.DataFrame:
    prompts = []
    completions = []
    smiles = []

    for i, row in data.iterrows():
        fragments = EFGs.mol2frag(Chem.MolFromSmiles(row["SMILES"]))[0]

        fragment_string = ", ".join(sorted([str(one_hot_mapper[f]) for f in fragments], reverse=True))

        prompt = FRAGMENT_PROMPT_TEMPlATE.format(fragment_string)
        if regression: 
            value = np.round(row[target],2)
        else: 
            value = row[target]
        completion = COMPLETION_TEMPLATE_photoswitch_.format(value)
        prompts.append(prompt)
        completions.append(completion)
        smiles.append(row["SMILES"])

    prompts = pd.DataFrame({"prompt": prompts, "completion": completions, "smiles": smiles})

    return prompts

## MOFs


In [23]:
from gpt3forchem.data import get_mof_data
import pandas as pd
from sklearn.model_selection import train_test_split
from typing import List


In [24]:
data = get_mof_data()


  return HashableDataFrame(pd.read_csv(os.path.join(datadir, "mof.csv")))


In [25]:
outputs = [c for c in data.columns if "output" in c]


In [26]:
len(data) - data["outputs.CO2-henry_coefficient-mol--kg--Pa"].isna().sum()


170

In [27]:
len(data) - data["outputs.H2O-henry_coefficient-mol--kg--Pa"].isna().sum()


153

In [28]:
len(data) - data["outputs.N2-henry_coefficient-mol--kg--Pa"].isna().sum()


141

Perhaps, let's create a long frame with all the gases.


In [29]:
MOF_REPRESENTATIONS = ["info.mofid.mofid_clean", "chemical_name", "info.qmof_id"]


In [30]:
MOF_OUTPUTS = [
    "outputs.Xe-henry_coefficient-mol--kg--Pa",
    "outputs.Kr-henry_coefficient-mol--kg--Pa",
    "outputs.H2S-henry_coefficient-mol--kg--Pa",
    "outputs.H2O-henry_coefficient-mol--kg--Pa",
    "outputs.O2-henry_coefficient-mol--kg--Pa",
    "outputs.CH4-henry_coefficient-mol--kg--Pa",
    "outputs.CO2-henry_coefficient-mol--kg--Pa",
    "outputs.N2-henry_coefficient-mol--kg--Pa",
    "outputs.pbe.bandgap",
]


Let's create some tooling to create prompts with context.

In [98]:
from gpt3forchem.data import gas_features

In [101]:
gas_data = gas_features
gas_features

Unnamed: 0,name,formula,critical_temperature,critical_pressure,accentric_factor,radius,polar,related_column
0,carbon_dioxide,CO2,304.19,7382000,0.228,1.525,False,outputs.CO2-henry_coefficient-mol--kg--Pa_log_cat
1,xenon,Xe,289.74,5840000,0.0,1.985,False,outputs.Xe-henry_coefficient-mol--kg--Pa_log_cat
2,krypton,Kr,209.35,5502000,0.0,1.83,False,outputs.Kr-henry_coefficient-mol--kg--Pa_log_cat
3,hydrogen disulfide,H2S,373.53,8963000,0.0942,1.74,True,outputs.H2S-henry_coefficient-mol--kg--Pa_log_cat
4,water,H2O,647.16,22055000,0.3449,1.58,True,outputs.H2O-henry_coefficient-mol--kg--Pa_log_cat
5,methane,CH4,190.56,4599000,0.012,1.865,False,outputs.CH4-henry_coefficient-mol--kg--Pa_log_cat
6,oxygen,O2,154.58,5043000,0.0222,1.51,False,outputs.O2-henry_coefficient-mol--kg--Pa_log_cat
7,nitrogen,N2,126.2,3460000,0.0377,1.655,False,outputs.N2-henry_coefficient-mol--kg--Pa_log_cat


We want prompts of the form 

> What is the \<name\> (\<additional data\>) Henry coefficient of <mof>?

In [32]:
# | export


def generate_property_desc(properties, gas_data, gas): 
    if properties is None: 
        return ""
    text = []
    row = gas_data[gas_data["formula"] == gas]
    for prop in properties:
        text.append(f"{prop.replace('_', ' ')} {row[prop].values[0]}")
    
    text = ", ".join(text)
    
    return f"({text})"



In [33]:
generate_property_desc(["accentric_factor", "critical_temperature"], gas_data, "CO2")

'(accentric factor 0.228, critical temperature 304.19)'

In [70]:
# | export
_GAS_CONTEXT_PROMPT_TEMPLATE = "What is the {identifier} {description} Henry cofficient of {repr}###"


def create_prompts_w_gas_context(
    df, gas_data, gases=["CO2", "Xe"], properties=None, identifier=None, regression=False, representation="info.mofid.mofid_clean"
):
    prompts = []

    identifier = "formula" if identifier is None else identifier

    for _, row in df.iterrows():
        for gas in gases:
            subset = gas_data[gas_data["formula"] == gas]
          
            name = subset[identifier].values[0].replace("_", " ")
            if not regression:
                column = subset["related_column"].values[0]
            else:
                raise NotImplementedError("Regression not implemented yet")
            if not pd.isna(row[column]) and not 'nan' in row[column]:
                if properties is None:
                    property_desc = ""
                else:
                    property_desc = generate_property_desc(properties, gas_data, gas)

                prompts.append(
                    {
                        "prompt": _GAS_CONTEXT_PROMPT_TEMPLATE.format(
                            identifier=name,
                            description=property_desc,
                            repr=row[representation],
                        ),
                        "completion": f"{row[column]}@@@",
                        "repr": row[representation],
                    }
                )

    df = pd.DataFrame(prompts)
    df.dropna(subset=["prompt"], inplace=True)
    df = df.sample(frac=1).reset_index(drop=True)  # shuffle

    return pd.DataFrame(prompts)

In [71]:
from gpt3forchem.data import get_mof_data, discretize

In [72]:
mof_data = get_mof_data()

features = [
    "outputs.Xe-henry_coefficient-mol--kg--Pa",
    "outputs.Kr-henry_coefficient-mol--kg--Pa",
    "outputs.H2O-henry_coefficient-mol--kg--Pa",
    "outputs.H2S-henry_coefficient-mol--kg--Pa",
    "outputs.CO2-henry_coefficient-mol--kg--Pa",
    "outputs.CH4-henry_coefficient-mol--kg--Pa",
    "outputs.O2-henry_coefficient-mol--kg--Pa",
]

for feature in features:
    mof_data[feature + '_log'] = np.log10(mof_data[feature] + 1e-40)

for feature in features:

    discretize(
        mof_data, f"{feature}_log", n_bins=3, labels=["low", "medium", "high"]
    )


  return HashableDataFrame(pd.read_csv(os.path.join(datadir, "mof.csv")))


In [73]:
prompts = create_prompts_w_gas_context(mof_data, gas_data, gases=["CO2", "Xe"], properties=["critical_temperature", "accentric_factor"], identifier="formula")

In [74]:
prompts

Unnamed: 0,prompt,completion,repr
0,"What is the CO2 (critical temperature 304.19, ...",low@@@,[O-]C(=O)c1cc([N][N]c2cc(cc(c2)C(=O)[O-])C(=O)...
1,"What is the CO2 (critical temperature 304.19, ...",low@@@,[O-]C(=O)c1ccc(cc1)c1c(C)[n-][nH]c1C.[Zn].dia....
2,"What is the CO2 (critical temperature 304.19, ...",low@@@,[Co].[O-]C(=O)C=Cc1ccncc1.dia.cat0
3,"What is the CO2 (critical temperature 304.19, ...",low@@@,[O-]C(=O)c1ccc(cc1)c1ccc(cc1)C(=O)[O-].[Zn].n1...
4,"What is the Xe (critical temperature 289.74, a...",medium@@@,[O-]C(=O)c1ccc(cc1)c1ccc(cc1)C(=O)[O-].[Zn].n1...
...,...,...,...
249,"What is the CO2 (critical temperature 304.19, ...",low@@@,CC1=NN=N[N]1.[Zn].dia.cat0
250,"What is the CO2 (critical temperature 304.19, ...",low@@@,[C]#N.[Cd].pts.cat0
251,"What is the CO2 (critical temperature 304.19, ...",high@@@,[Ni].n1ccncc1.sql.cat0
252,"What is the CO2 (critical temperature 304.19, ...",high@@@,[Cu].n1ccncc1.sql.cat0


In [75]:
create_prompts_w_gas_context(mof_data, gas_data, gases=["CO2", "Xe"], properties=["formula", "critical_temperature", "accentric_factor"], identifier="name")

Unnamed: 0,prompt,completion,repr
0,"What is the carbon dioxide (formula CO2, criti...",low@@@,[O-]C(=O)c1cc([N][N]c2cc(cc(c2)C(=O)[O-])C(=O)...
1,"What is the carbon dioxide (formula CO2, criti...",low@@@,[O-]C(=O)c1ccc(cc1)c1c(C)[n-][nH]c1C.[Zn].dia....
2,"What is the carbon dioxide (formula CO2, criti...",low@@@,[Co].[O-]C(=O)C=Cc1ccncc1.dia.cat0
3,"What is the carbon dioxide (formula CO2, criti...",low@@@,[O-]C(=O)c1ccc(cc1)c1ccc(cc1)C(=O)[O-].[Zn].n1...
4,"What is the xenon (formula Xe, critical temper...",medium@@@,[O-]C(=O)c1ccc(cc1)c1ccc(cc1)C(=O)[O-].[Zn].n1...
...,...,...,...
249,"What is the carbon dioxide (formula CO2, criti...",low@@@,CC1=NN=N[N]1.[Zn].dia.cat0
250,"What is the carbon dioxide (formula CO2, criti...",low@@@,[C]#N.[Cd].pts.cat0
251,"What is the carbon dioxide (formula CO2, criti...",high@@@,[Ni].n1ccncc1.sql.cat0
252,"What is the carbon dioxide (formula CO2, criti...",high@@@,[Cu].n1ccncc1.sql.cat0


## MOF yield

In [46]:
# | export
def create_mof_yield_prompt(row): 
    linkers = row[['linker_1', 'linker_2']]
    linkers = [l for l in linkers if not pd.isna(l)]

    metals = row['core_All_Metals'].split(',')

    solvents = row[['solvent1', 'solvent2', 'solvent3', 'solvent4', 'solvent5']]
    solvents = [s for s in solvents if not pd.isna(s)]
    sol_molratio = row[['sol_molratio1', 'sol_molratio2', 'sol_molratio3', 'sol_molratio4', 'sol_molratio5']]
    sol_molratio = [s for s in sol_molratio if not pd.isna(s)]
    additives = row[['additive1', 'additive2', 'additive3', 'additive4', 'additive5']]
    additives = [a for a in additives if not pd.isna(a)]

    temperature = row['temperature_Celsius']
    time = row['time_h']

    start = 'What is the yield of the reaction of the metal' 
    if len(metals) > 1:
        start += 's '
    else: 
        start += ' '

    start += ', '.join(metals) + ' with the linker' 
    if len(linkers) > 1:
        start += 's '
    else:
        start += ' '
    
    start += ', '.join(linkers) 
    
    # for the solvents, combine their names and molratios
    solvents = [f"{np.round(r,2)} {s}" for s, r in zip(solvents, sol_molratio)]

    start += ' in the solvents ' + ', '.join(solvents) + f' at {temperature} Celsius for {time} hours'

    # add the additives
    if len(additives) > 0:
        start += ' with the additive' 
        if len(additives) > 1:
            start += 's '
        else:
            start += ' '
        start += ', '.join(additives) + '?'
    else:
        start += '?'
        
    return start
    

In [43]:
from gpt3forchem.data import get_mof_yield_data

In [44]:
create_mof_yield_prompt(get_mof_yield_data().iloc[0])

'What is the yield of the reaction of the metal Mn with the linker [O-]C(=O)c1cc([N][N]c2cc(cc(c2)C(=O)O)C(=O)O)cc(c1)C(=O)[O-] in the solvents 0.53 CN(C)C=O, 0.18 C(CO)O, 0.28 O at 55 Celsius for 48 hours with the additive Cl?'

In [45]:
create_mof_yield_prompt(get_mof_yield_data().iloc[1])

'What is the yield of the reaction of the metal Cu with the linker O=C(c1cncc(c1)C(=O)Nc1ccc2c(c1)ccc(c2)C(=O)[NH2]c1cc(cc(c1)C(=O)[O-])C(=O)[O-])Nc1ccc2c(c1)ccc(c2)C(=O)[NH2]c1cc(cc(c1)C(=O)[O-])C(=O)[O-] in the solvents 0.54 CN(C)C=O, 0.46 O at 55 Celsius for 120 hours with the additive Cl?'

In [58]:
# | export
def get_mof_yield_prompt_completions(dataframe, yield_column: str = "yield"): 
    rows = []

    for i, row in dataframe.iterrows(): 
        prompt = create_mof_yield_prompt(row)
        completion = f"{int(row[yield_column])}@@@"
        rows.append({
            'prompt': prompt,
            'completion': completion,
            'repr': row['basename']
        })
    
    return pd.DataFrame(rows)

In [59]:
get_mof_yield_prompt_completions(get_mof_yield_data().iloc[:10])

Unnamed: 0,prompt,completion,repr
0,What is the yield of the reaction of the metal...,73@@@,OFODET
1,What is the yield of the reaction of the metal...,50@@@,XAVKIR
2,What is the yield of the reaction of the metal...,68@@@,LATPIG
3,What is the yield of the reaction of the metal...,80@@@,MOYYIJ
4,What is the yield of the reaction of the metal...,61@@@,OFOCUI
5,What is the yield of the reaction of the metal...,42@@@,OPENUT
6,What is the yield of the reaction of the metal...,45@@@,INOVEN
7,What is the yield of the reaction of the metal...,76@@@,YEMJAC
8,What is the yield of the reaction of the metal...,59@@@,QUQFIS
9,What is the yield of the reaction of the metal...,57@@@,QUQFOY


## USPTO yields

In [11]:
# | export 

_WITHOUT_REACTION_SMILES_TEMPLATE = """What is the yield of the reaction with the following description: {description}###"""
_WITH_REACTION_SMILES_TEMPLATE = """What is the yield of the reaction {reaction_smiles} with the following description: {description}###"""
_ONLY_REACTION_SMILES_TEMPLATE = """What is the yield of the reaction {reaction_smiles}###"""
def create_reaction_yield_prompts(data, include_reaction_smiles: bool = False, only_reaction_smiles: bool = False): 
    prompts = []

    for i, row in data.iterrows(): 
        try:
            method = row['paragraph_without_yield_and_charac']
            yield_ = int(row['yield'])
            reaction_smiles = row['reaction_smiles']
            if include_reaction_smiles: 
                prompt = _WITH_REACTION_SMILES_TEMPLATE.format(reaction_smiles=reaction_smiles, description=method)
            elif only_reaction_smiles:
                prompt = _ONLY_REACTION_SMILES_TEMPLATE.format(reaction_smiles=reaction_smiles)
            else:
                prompt = _WITHOUT_REACTION_SMILES_TEMPLATE.format(description=method)
            prompts.append({
                'prompt': prompt,
                'completion': f"{yield_}@@@",
                'repr': row['reaction_smiles']
            })
        except Exception: 
            pass
    
    return pd.DataFrame(prompts)
        

In [12]:
from gpt3forchem.data import get_uspto_yield_data

In [13]:
data = get_uspto_yield_data()
prompts = create_reaction_yield_prompts(data, include_reaction_smiles=True)

In [8]:
prompts

Unnamed: 0,prompt,completion,repr
0,What is the yield of the reaction [C:1]([O:5...,86@@@,[C:1]([O:5][C:6](=[O:19])[NH:7][C:8]1[C:13]([F...
1,What is the yield of the reaction [NH3:1].CO...,65@@@,[NH3:1].CO.C([O:6][C:7]([C:9]1[CH:14]=[C:13]([...
2,What is the yield of the reaction [CH2:1]([C...,88@@@,[CH2:1]([C:7]1([CH2:24][CH2:25][CH2:26][CH2:27...
3,What is the yield of the reaction [CH3:1][C:...,91@@@,[CH3:1][C:2]1[CH2:3][C:4]2[C:5]([CH:19]=1)=[CH...
4,What is the yield of the reaction [CH3:1][C:...,92@@@,[CH3:1][C:2]1[CH2:3][C:4]2[C:5]([CH:45]=1)=[CH...
...,...,...,...
1550,What is the yield of the reaction [CH3:1][O:...,89@@@,[CH3:1][O:2][C:3]1[CH:8]=[CH:7][C:6]([CH2:9][C...
1551,What is the yield of the reaction [CH3:1][O:...,72@@@,[CH3:1][O:2][C:3]1[CH:8]=[CH:7][C:6]([CH2:9][C...
1552,What is the yield of the reaction [CH3:1][O:...,34@@@,[CH3:1][O:2][C:3]1[CH:8]=[CH:7][C:6]([CH2:9][C...
1553,What is the yield of the reaction [H-].[Na+]...,40@@@,[H-].[Na+].[CH3:3][O:4][C:5]1[CH:12]=[CH:11][C...


In [9]:
prompts = create_reaction_yield_prompts(data, include_reaction_smiles=False)

In [10]:
prompts

Unnamed: 0,prompt,completion,repr
0,What is the yield of the reaction with the fol...,86@@@,[C:1]([O:5][C:6](=[O:19])[NH:7][C:8]1[C:13]([F...
1,What is the yield of the reaction with the fol...,65@@@,[NH3:1].CO.C([O:6][C:7]([C:9]1[CH:14]=[C:13]([...
2,What is the yield of the reaction with the fol...,88@@@,[CH2:1]([C:7]1([CH2:24][CH2:25][CH2:26][CH2:27...
3,What is the yield of the reaction with the fol...,91@@@,[CH3:1][C:2]1[CH2:3][C:4]2[C:5]([CH:19]=1)=[CH...
4,What is the yield of the reaction with the fol...,92@@@,[CH3:1][C:2]1[CH2:3][C:4]2[C:5]([CH:45]=1)=[CH...
...,...,...,...
1550,What is the yield of the reaction with the fol...,89@@@,[CH3:1][O:2][C:3]1[CH:8]=[CH:7][C:6]([CH2:9][C...
1551,What is the yield of the reaction with the fol...,72@@@,[CH3:1][O:2][C:3]1[CH:8]=[CH:7][C:6]([CH2:9][C...
1552,What is the yield of the reaction with the fol...,34@@@,[CH3:1][O:2][C:3]1[CH:8]=[CH:7][C:6]([CH2:9][C...
1553,What is the yield of the reaction with the fol...,40@@@,[H-].[Na+].[CH3:3][O:4][C:5]1[CH:12]=[CH:11][C...


## Solubility

In [1]:
# | export
_SOLUBILITY_PROMPT_TEMPLATE = "What is the solubility of {repr}###"
_SOLUBILITY_FEATURES =['MolWt', 'MolLogP', 'MolMR', 'HeavyAtomCount',
       'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRotatableBonds',
       'NumValenceElectrons', 'NumAromaticRings', 'NumSaturatedRings',
       'NumAliphaticRings', 'RingCount', 'TPSA', 'LabuteASA', 'BalabanJ',
       'BertzCT']

def create_prompts_solubility(
    df, regression=False, representation="SMILES"
):
    prompts = []

    for _, row in df.iterrows():
        if representation=='features':
            identifier = " ".join([str(np.round(row[feature], 2)) for feature in _SOLUBILITY_FEATURES])
        else:
            identifier = row[representation]

        solubility = np.round(solubility,2) if regression else row['Solubility_cat']
        
        prompts.append(
            {
                "prompt": _SOLUBILITY_PROMPT_TEMPLATE.format(
                    repr=identifier,
                ),
                "completion": f"{solubility}@@@",
                "repr": identifier,
                'smiles': row['SMILES']
            }
        )

    df = pd.DataFrame(prompts)
    df.dropna(subset=["prompt"], inplace=True)
    df = df.sample(frac=1).reset_index(drop=True)  # shuffle

    return pd.DataFrame(prompts)

In [18]:
from  gpt3forchem.data import get_solubility_data

In [19]:
data = get_solubility_data()

In [33]:
prompts = create_prompts_solubility(data, regression=True, representation='SMILES')

In [34]:
prompts

Unnamed: 0,prompt,completion,repr,smiles
0,What is the solubility of [Br-].CCCCCCCCCCCCCC...,-3.62@@@,[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C,[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C
1,What is the solubility of O=C1Nc2cccc3cccc1c23###,-3.25@@@,O=C1Nc2cccc3cccc1c23,O=C1Nc2cccc3cccc1c23
2,What is the solubility of Clc1ccc(C=O)cc1###,-2.18@@@,Clc1ccc(C=O)cc1,Clc1ccc(C=O)cc1
3,What is the solubility of [Zn++].CC(c1ccccc1)c...,-3.92@@@,[Zn++].CC(c1ccccc1)c2cc(C(C)c3ccccc3)c(O)c(c2)...,[Zn++].CC(c1ccccc1)c2cc(C(C)c3ccccc3)c(O)c(c2)...
4,What is the solubility of C1OC1CN(CC2CO2)c3ccc...,-4.66@@@,C1OC1CN(CC2CO2)c3ccc(Cc4ccc(cc4)N(CC5CO5)CC6CO...,C1OC1CN(CC2CO2)c3ccc(Cc4ccc(cc4)N(CC5CO5)CC6CO...
...,...,...,...,...
9916,What is the solubility of C(c1ccc(cc1)NCCCC)(=...,-3.01@@@,C(c1ccc(cc1)NCCCC)(=O)OCCN(C)C,C(c1ccc(cc1)NCCCC)(=O)OCCN(C)C
9917,What is the solubility of OC1=C(C(C2=C(O)[C@@]...,-2.93@@@,OC1=C(C(C2=C(O)[C@@](C(C(C(N)=O)=C(O)[C@H]3N(C...,OC1=C(C(C2=C(O)[C@@](C(C(C(N)=O)=C(O)[C@H]3N(C...
9918,What is the solubility of c1(cc(ccc1C(C)C)C)O###,-2.19@@@,c1(cc(ccc1C(C)C)C)O,c1(cc(ccc1C(C)C)C)O
9919,What is the solubility of COc1ccc(CCN(C)CCCC(C...,-3.98@@@,COc1ccc(CCN(C)CCCC(C#N)(C(C)C)c2ccc(OC)c(OC)c2...,COc1ccc(CCN(C)CCCC(C#N)(C(C)C)c2ccc(OC)c(OC)c2...


In [35]:
create_prompts_solubility(data, regression=True, representation='InChI')

Unnamed: 0,prompt,completion,repr,smiles
0,What is the solubility of InChI=1S/C21H46N.BrH...,-3.62@@@,InChI=1S/C21H46N.BrH/c1-5-6-7-8-9-10-11-12-13-...,[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C
1,What is the solubility of InChI=1S/C11H7NO/c13...,-3.25@@@,InChI=1S/C11H7NO/c13-11-8-5-1-3-7-4-2-6-9(12-1...,O=C1Nc2cccc3cccc1c23
2,What is the solubility of InChI=1S/C7H5ClO/c8-...,-2.18@@@,InChI=1S/C7H5ClO/c8-7-3-1-6(5-9)2-4-7/h1-5H,Clc1ccc(C=O)cc1
3,What is the solubility of InChI=1S/2C23H22O3.Z...,-3.92@@@,InChI=1S/2C23H22O3.Zn/c2*1-15(17-9-5-3-6-10-17...,[Zn++].CC(c1ccccc1)c2cc(C(C)c3ccccc3)c(O)c(c2)...
4,What is the solubility of InChI=1S/C25H30N2O4/...,-4.66@@@,InChI=1S/C25H30N2O4/c1-5-20(26(10-22-14-28-22)...,C1OC1CN(CC2CO2)c3ccc(Cc4ccc(cc4)N(CC5CO5)CC6CO...
...,...,...,...,...
9916,What is the solubility of InChI=1S/C15H24N2O2/...,-3.01@@@,InChI=1S/C15H24N2O2/c1-4-5-10-16-14-8-6-13(7-9...,C(c1ccc(cc1)NCCCC)(=O)OCCN(C)C
9917,What is the solubility of InChI=1S/C22H24N2O8/...,-2.93@@@,InChI=1S/C22H24N2O8/c1-21(31)8-5-4-6-11(25)12(...,OC1=C(C(C2=C(O)[C@@](C(C(C(N)=O)=C(O)[C@H]3N(C...
9918,What is the solubility of InChI=1S/C10H14O/c1-...,-2.19@@@,InChI=1S/C10H14O/c1-7(2)9-5-4-8(3)6-10(9)11/h4...,c1(cc(ccc1C(C)C)C)O
9919,What is the solubility of InChI=1S/C27H38N2O4/...,-3.98@@@,"InChI=1S/C27H38N2O4/c1-20(2)27(19-28,22-10-12-...",COc1ccc(CCN(C)CCCC(C#N)(C(C)C)c2ccc(OC)c(OC)c2...


In [36]:
create_prompts_solubility(data, regression=True, representation='Name')

Unnamed: 0,prompt,completion,repr,smiles
0,"What is the solubility of N,N,N-trimethyloctad...",-3.62@@@,"N,N,N-trimethyloctadecan-1-aminium bromide",[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C
1,What is the solubility of Benzo[cd]indol-2(1H)...,-3.25@@@,Benzo[cd]indol-2(1H)-one,O=C1Nc2cccc3cccc1c23
2,What is the solubility of 4-chlorobenzaldehyde###,-2.18@@@,4-chlorobenzaldehyde,Clc1ccc(C=O)cc1
3,What is the solubility of zinc bis[2-hydroxy-3...,-3.92@@@,"zinc bis[2-hydroxy-3,5-bis(1-phenylethyl)benzo...",[Zn++].CC(c1ccccc1)c2cc(C(C)c3ccccc3)c(O)c(c2)...
4,What is the solubility of 4-({4-[bis(oxiran-2-...,-4.66@@@,4-({4-[bis(oxiran-2-ylmethyl)amino]phenyl}meth...,C1OC1CN(CC2CO2)c3ccc(Cc4ccc(cc4)N(CC5CO5)CC6CO...
...,...,...,...,...
9916,What is the solubility of tetracaine###,-3.01@@@,tetracaine,C(c1ccc(cc1)NCCCC)(=O)OCCN(C)C
9917,What is the solubility of tetracycline###,-2.93@@@,tetracycline,OC1=C(C(C2=C(O)[C@@](C(C(C(N)=O)=C(O)[C@H]3N(C...
9918,What is the solubility of thymol###,-2.19@@@,thymol,c1(cc(ccc1C(C)C)C)O
9919,What is the solubility of verapamil###,-3.98@@@,verapamil,COc1ccc(CCN(C)CCCC(C#N)(C(C)C)c2ccc(OC)c(OC)c2...


In [37]:
create_prompts_solubility(data, regression=True, representation='features')

Unnamed: 0,prompt,completion,repr,smiles
0,What is the solubility of 392.51 3.96 102.45 2...,-3.62@@@,392.51 3.96 102.45 23.0 0.0 0.0 2.0 17.0 142.0...,[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C
1,What is the solubility of 169.18 2.41 51.9 13....,-3.25@@@,169.18 2.41 51.9 13.0 1.0 1.0 2.0 0.0 62.0 2.0...,O=C1Nc2cccc3cccc1c23
2,What is the solubility of 140.57 2.15 36.84 9....,-2.18@@@,140.57 2.15 36.84 9.0 1.0 0.0 2.0 1.0 46.0 1.0...,Clc1ccc(C=O)cc1
3,What is the solubility of 756.23 8.12 200.71 5...,-3.92@@@,756.23 8.12 200.71 53.0 6.0 2.0 7.0 10.0 264.0...,[Zn++].CC(c1ccccc1)c2cc(C(C)c3ccccc3)c(O)c(c2)...
4,What is the solubility of 422.53 2.49 119.08 3...,-4.66@@@,422.53 2.49 119.08 31.0 6.0 0.0 6.0 12.0 164.0...,C1OC1CN(CC2CO2)c3ccc(Cc4ccc(cc4)N(CC5CO5)CC6CO...
...,...,...,...,...
9916,What is the solubility of 264.37 2.62 78.68 19...,-3.01@@@,264.37 2.62 78.68 19.0 4.0 1.0 4.0 8.0 106.0 1...,C(c1ccc(cc1)NCCCC)(=O)OCCN(C)C
9917,What is the solubility of 444.44 -0.21 109.54 ...,-2.93@@@,444.44 -0.21 109.54 32.0 9.0 6.0 10.0 2.0 170....,OC1=C(C(C2=C(O)[C@@](C(C(C(N)=O)=C(O)[C@H]3N(C...
9918,What is the solubility of 150.22 2.82 46.93 11...,-2.19@@@,150.22 2.82 46.93 11.0 1.0 1.0 1.0 1.0 60.0 1....,c1(cc(ccc1C(C)C)C)O
9919,What is the solubility of 454.61 5.09 131.66 3...,-3.98@@@,454.61 5.09 131.66 33.0 6.0 0.0 6.0 13.0 180.0...,COc1ccc(CCN(C)CCCC(C#N)(C(C)C)c2ccc(OC)c(OC)c2...
