In [None]:
# | default_exp input


# Data augmentation


We can use SMILES randomization for data augmentation. We use code by [Schwaller et al.](https://github.com/rxn4chemistry/rxn_yields).


In [None]:
# | export
import random
from typing import List

from rdkit import Chem


In [None]:
# | export
def randomize_smiles(
    smiles: str,
    random_type: str = "rotated",  #  The type (unrestricted, restricted, rotated) of randomization performed.
    isomericSmiles: bool = True,
):
    """
    From: https://github.com/undeadpixel/reinvent-randomized and https://github.com/GLambard/SMILES-X
    Returns a random SMILES given a SMILES of a molecule.

    Randomization modes (`random_type`):

    - "unrestricted": let RDKit write a random SMILES (`doRandom=True`).
    - "restricted": shuffle the atom order, then write a non-canonical SMILES.
    - "rotated": cyclically rotate the atom order by a random offset, then
      write a non-canonical SMILES.

    Returns `None` if `smiles` cannot be parsed into a molecule.
    Raises `ValueError` if `random_type` is not one of the three modes above
    (only checked after the SMILES was parsed successfully).
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return None

    if random_type == "unrestricted":
        return Chem.MolToSmiles(
            mol, canonical=False, doRandom=True, isomericSmiles=isomericSmiles
        )
    elif random_type == "restricted":
        new_atom_order = list(range(mol.GetNumAtoms()))
        random.shuffle(new_atom_order)
        random_mol = Chem.RenumberAtoms(mol, newOrder=new_atom_order)
        return Chem.MolToSmiles(
            random_mol, canonical=False, isomericSmiles=isomericSmiles
        )
    elif random_type == "rotated":
        n_atoms = mol.GetNumAtoms()
        if n_atoms == 0:
            # Nothing to rotate. Without this guard random.randint(0, -1)
            # raises "ValueError: empty range" for a molecule without atoms.
            return Chem.MolToSmiles(
                mol, canonical=False, isomericSmiles=isomericSmiles
            )
        # rotation_index is already in [0, n_atoms - 1], so no modulo is needed.
        rotation_index = random.randint(0, n_atoms - 1)
        atoms = list(range(n_atoms))
        new_atoms_order = atoms[rotation_index:] + atoms[:rotation_index]
        rotated_mol = Chem.RenumberAtoms(mol, new_atoms_order)
        return Chem.MolToSmiles(
            rotated_mol, canonical=False, isomericSmiles=isomericSmiles
        )
    raise ValueError("Type '{}' is not valid".format(random_type))


In [None]:
randomize_smiles("C[N]1C=CC(=N1)N=NC2=CC=CC=C2")


'c1ccc(N=Nc2ccn(C)n2)cc1'

# Creating prompts/training data


In [None]:
# | export
from collections import Counter

import numpy as np
import pandas as pd


In [None]:
# | export


_DEFAULT_ENCODING_DICT = {
    "very small": 0,
    "small": 1,
    "medium": 2,
    "large": 3,
    "very large": 4,
}

_DEFAULT_DECODING_DICT = {v: k for k, v in _DEFAULT_ENCODING_DICT.items()}


def encode_categorical_value(value, encoding_dict=_DEFAULT_ENCODING_DICT):
    try:
        return encoding_dict[value]
    except KeyError:
        raise ValueError("Unknown value: %s" % value)


def decode_categorical_value(value, decoding_dict=_DEFAULT_DECODING_DICT):
    try:
        return decoding_dict[value]
    except KeyError:
        raise ValueError("Unknown value: %s" % value)


In [None]:
# | export
# Prompt asking for one `{property}` of one compound `{text}`; the prompt ends in "###"
# (presumably the prompt/completion separator used for fine-tuning -- confirm).
ONE_PROPERTY_FORWARD_PROMPT_TEMPLATE = "what is the {property} of {text}###"
# Completion carrying `{value}`: note the leading space and the trailing "@@@"
# (presumably the stop sequence -- confirm).
ONE_PROPERTY_FORWARD_COMPLETION_TEMPLATE = " {value}@@@"


In [None]:
# | export
def _single_property_forward_record(
    row,  # one row of the input frame
    target,  # column holding the target value
    target_name,  # property name to show in the prompt
    encode_value,  # whether to pass the value through `encode_categorical_value`
    encoding_dict,  # mapping used when `encode_value` is True
    prompt_prefix,  # text prepended to the prompt
    representation_col,  # column holding the original representation
    this_repr,  # representation actually written into the prompt
):
    """Build one prompt/completion record for `row`, using `this_repr` as the prompt text."""
    if encode_value:
        value = encode_categorical_value(row[target], encoding_dict=encoding_dict)
    else:
        value = row[target]

    return {
        "prompt": prompt_prefix
        + ONE_PROPERTY_FORWARD_PROMPT_TEMPLATE.format(
            property=target_name, text=this_repr
        ),
        "completion": ONE_PROPERTY_FORWARD_COMPLETION_TEMPLATE.format(value=value),
        "repr": row[representation_col],
        "this_repr": this_repr,
    }


def create_single_property_forward_prompts(
    df: pd.DataFrame,  # input data
    target: str,  # target property
    target_rename_dict: dict,  # dict to rename target property from the column name in df to the target property name in the prompt
    encode_value: bool = True,  # whether to encode the value of the target property categorically
    encoding_dict: dict = _DEFAULT_ENCODING_DICT,  # mapping from numerical categories to string
    prompt_prefix: str = "",  # prefix to add to the prompt, e.g. "I am an expert chemist"
    representation_col: str = "string",  # name of the column to use as the representation of the compound
    smiles_augmentation: bool = False,  # whether to augment the SMILES with randomization
    smiles_augmentation_type: str = "rotated",  # the type of randomization to perform
    smiles_augmentation_rounds: int = 10,  # the number of randomizations to perform
    include_canonical_smiles: bool = False,  # whether to include the canonical SMILES when using the augmentation
):
    """Create shuffled prompt/completion pairs asking for `target` of each row in `df`.

    Without augmentation, one record is produced per row. With
    `smiles_augmentation=True`, each row is randomized
    `smiles_augmentation_rounds` times via `randomize_smiles`; rows whose
    representation cannot be randomized (the helper returned `None`) are
    skipped for that round. If `include_canonical_smiles` is also set, one
    extra record per row is added that uses the unmodified value of
    `representation_col`.

    Returns a frame with the columns `prompt`, `completion`, `repr` (the
    original representation) and `this_repr` (the representation used in the
    prompt), in random row order. An empty input yields an empty frame with
    these columns.
    """
    columns = ["prompt", "completion", "repr", "this_repr"]

    # The display name does not depend on the augmentation round, so compute it
    # once. This also keeps it defined when `smiles_augmentation_rounds` is 0
    # and only the un-augmented records are requested.
    target_name = target
    for key, value in target_rename_dict.items():
        target_name = target_name.replace(key, value)

    if not smiles_augmentation:
        smiles_augmentation_rounds = 1

    prompts = []
    for _ in range(smiles_augmentation_rounds):
        for _, row in df.iterrows():
            this_repr = row[representation_col]
            if smiles_augmentation:
                this_repr = randomize_smiles(
                    this_repr, random_type=smiles_augmentation_type
                )
                if this_repr is None:
                    # Randomization failed. Skip instead of emitting a prompt
                    # about the literal text "None".
                    continue
            prompts.append(
                _single_property_forward_record(
                    row,
                    target,
                    target_name,
                    encode_value,
                    encoding_dict,
                    prompt_prefix,
                    representation_col,
                    this_repr,
                )
            )

    if smiles_augmentation and include_canonical_smiles:
        for _, row in df.iterrows():
            prompts.append(
                _single_property_forward_record(
                    row,
                    target,
                    target_name,
                    encode_value,
                    encoding_dict,
                    prompt_prefix,
                    representation_col,
                    row[representation_col],
                )
            )

    if not prompts:
        return pd.DataFrame(columns=columns)

    prompt_frame = pd.DataFrame(prompts, columns=columns)
    return prompt_frame.sample(frac=1).reset_index(drop=True)  # shuffle


In [None]:
from gpt3forchem.data import get_polymer_data

create_single_property_forward_prompts(
    get_polymer_data(), "deltaGmin_cat", {"deltaGmin_cat": "adsorption energy"}
)


Unnamed: 0,prompt,completion,repr,this_repr
0,what is the adsorption energy of R-R-W-B-B-B-R...,2@@@,R-R-W-B-B-B-R-B-B-B-B-W-W-R-B-W-B-R-A-A-R-B-R-...,R-R-W-B-B-B-R-B-B-B-B-W-W-R-B-W-B-R-A-A-R-B-R-...
1,what is the adsorption energy of W-R-W-B-R-R-W...,4@@@,W-R-W-B-R-R-W-W-B-W-W-B-A-A-B-W-W-A-R-A,W-R-W-B-R-R-W-W-B-W-W-B-A-A-B-W-W-A-R-A
2,what is the adsorption energy of B-B-B-W-A-B-B...,0@@@,B-B-B-W-A-B-B-R-B-A-R-R-R-B-B-A-A-R-R-R-A-W-B-...,B-B-B-W-A-B-B-R-B-A-R-R-R-B-B-A-A-R-R-R-A-W-B-...
3,what is the adsorption energy of R-A-A-B-B-B-B...,3@@@,R-A-A-B-B-B-B-R-B-B-W-A-W-B-B-A-A-B-B-W-R-A-R-...,R-A-A-B-B-B-B-R-B-B-W-A-W-B-B-A-A-B-B-W-R-A-R-...
4,what is the adsorption energy of W-B-R-A-R-W-R...,2@@@,W-B-R-A-R-W-R-W-W-B-B-R-A-W-R-R-A-W-B-R-B-B-W-...,W-B-R-A-R-W-R-W-W-B-B-R-A-W-R-R-A-W-B-R-B-B-W-...
...,...,...,...,...
3120,what is the adsorption energy of R-W-A-R-R-A-W...,1@@@,R-W-A-R-R-A-W-W-W-R-A-B-W-B-A-R-B-R-B-A-R-A-W-...,R-W-A-R-R-A-W-W-W-R-A-B-W-B-A-R-B-R-B-A-R-A-W-...
3121,what is the adsorption energy of B-W-R-R-W-A-B...,3@@@,B-W-R-R-W-A-B-W-R-W-B-R-B-A-A-R-B-B-B-B-A-B-B-...,B-W-R-R-W-A-B-W-R-W-B-R-B-A-A-R-B-B-B-B-A-B-B-...
3122,what is the adsorption energy of B-A-B-A-R-R-W...,4@@@,B-A-B-A-R-R-W-A-W-W-B-W-W-B-W-A-A-W-W-A-W-R-W-...,B-A-B-A-R-R-W-A-W-W-B-W-W-B-W-A-A-W-W-A-W-R-W-...
3123,what is the adsorption energy of A-A-B-W-B-R-R...,2@@@,A-A-B-W-B-R-R-B-R-W-R-A-B-B-R-R-B-W-R-B-W-R-W-...,A-A-B-W-B-R-R-B-R-W-R-A-B-B-R-R-B-W-R-B-W-R-W-...


In [None]:
create_single_property_forward_prompts(
    get_polymer_data(),
    "deltaGmin_cat",
    {"deltaGmin_cat": "adsorption energy"},
    prompt_prefix="you are an expert chemist: ",
)


Unnamed: 0,prompt,completion,repr,this_repr
0,you are an expert chemist: what is the adsorpt...,3@@@,W-A-B-W-W-W-B-A-W-R-R-R-B-R-A-B-B-R-W-R-B-R-B-...,W-A-B-W-W-W-B-A-W-R-R-R-B-R-A-B-B-R-W-R-B-R-B-...
1,you are an expert chemist: what is the adsorpt...,1@@@,W-W-W-A-R-R-A-B-A-R-W-A-B-R-W-R-W-A-A-B-W-A-A-...,W-W-W-A-R-R-A-B-A-R-W-A-B-R-W-R-W-A-A-B-W-A-A-...
2,you are an expert chemist: what is the adsorpt...,2@@@,A-R-R-B-W-A-B-B-A-R-W-W-W-W-B-B-W-W-B-B-A-W-A-...,A-R-R-B-W-A-B-B-A-R-W-W-W-W-B-B-W-W-B-B-A-W-A-...
3,you are an expert chemist: what is the adsorpt...,1@@@,R-R-R-W-A-W-B-W-R-B-B-R-A-A-R-R-B-B-A-A-W-B-R-...,R-R-R-W-A-W-B-W-R-B-B-R-A-A-R-R-B-B-A-A-W-B-R-...
4,you are an expert chemist: what is the adsorpt...,1@@@,B-R-R-W-A-W-R-A-A-W-R-R-W-B-A-R-R-R-R-W-W-R-B-...,B-R-R-W-A-W-R-A-A-W-R-R-W-B-A-R-R-R-R-W-W-R-B-...
...,...,...,...,...
3120,you are an expert chemist: what is the adsorpt...,1@@@,W-R-R-B-R-B-B-A-R-B-W-W-A-W-B-W-B-W-B-B-B-A-W-...,W-R-R-B-R-B-B-A-R-B-W-W-A-W-B-W-B-W-B-B-B-A-W-...
3121,you are an expert chemist: what is the adsorpt...,1@@@,B-R-B-B-W-R-A-A-A-W-R-W-A-W-W-B-R-R-W-A-B-R-R-...,B-R-B-B-W-R-A-A-A-W-R-W-A-W-W-B-R-R-W-A-B-R-R-...
3122,you are an expert chemist: what is the adsorpt...,2@@@,W-W-A-A-B-R-W-A-B-R-A-R-R-B-R-W-W-R-B-R-W-R-R-R,W-W-A-A-B-R-W-A-B-R-A-R-R-B-R-W-W-R-B-R-W-R-R-R
3123,you are an expert chemist: what is the adsorpt...,0@@@,A-R-B-R-R-A-R-W-B-A-R-B-B-W-B-A-A-B-R-B-R-B-W-...,A-R-B-R-R-A-R-W-B-A-R-B-B-W-B-A-A-B-R-B-R-B-W-...


In [None]:
from gpt3forchem.data import get_photoswitch_data

frame_no_augment = create_single_property_forward_prompts(
    get_photoswitch_data(),
    "wavelength_cat",
    {"wavelength_cat": "transition wavelength"},
    prompt_prefix="you are an expert chemist: ",
    representation_col="SMILES",
)
frame_no_augment


Unnamed: 0,prompt,completion,repr,this_repr
0,you are an expert chemist: what is the transit...,0@@@,CN(C=N1)C=C1/N=N/C2=CC=CC=C2,CN(C=N1)C=C1/N=N/C2=CC=CC=C2
1,you are an expert chemist: what is the transit...,0@@@,CC1=C(/N=N/C2=CC=CC=C2C#N)C(C)=NO1,CC1=C(/N=N/C2=CC=CC=C2C#N)C(C)=NO1
2,you are an expert chemist: what is the transit...,0@@@,[H]C7=CC=C(N=C(N=NC8=CC=CC(CO)=C8)S9)C9=C7,[H]C7=CC=C(N=C(N=NC8=CC=CC(CO)=C8)S9)C9=C7
3,you are an expert chemist: what is the transit...,2@@@,CC1=C(/N=N/C2=NN=C(CC)S2)C3=CC=CC=C3N1,CC1=C(/N=N/C2=NN=C(CC)S2)C3=CC=CC=C3N1
4,you are an expert chemist: what is the transit...,0@@@,CC1=C(/N=N/C2=CC=C(F)C=C2F)C(C)=NO1,CC1=C(/N=N/C2=CC=C(F)C=C2F)C(C)=NO1
...,...,...,...,...
385,you are an expert chemist: what is the transit...,0@@@,CC1=NOC(C)=C1/N=N/C2=CC(Br)=CC=C2,CC1=NOC(C)=C1/N=N/C2=CC(Br)=CC=C2
386,you are an expert chemist: what is the transit...,1@@@,CC1=CC(/N=N/C2=CC=C(NCCC#N)C=C2)=CC=C1,CC1=CC(/N=N/C2=CC=C(NCCC#N)C=C2)=CC=C1
387,you are an expert chemist: what is the transit...,2@@@,CN(CCC#N)C(C=C%17)=CC=C%17/N=N/C%18=CC=C([N+](...,CN(CCC#N)C(C=C%17)=CC=C%17/N=N/C%18=CC=C([N+](...
388,you are an expert chemist: what is the transit...,1@@@,CC1=C(/N=N/C2=CC=C(N(C)C)C=C2C)C=CC(C)=C1,CC1=C(/N=N/C2=CC=C(N(C)C)C=C2C)C=CC(C)=C1


By using SMILES augmentation, we can create a much larger dataset.


In [None]:
frame_augment = create_single_property_forward_prompts(
    get_photoswitch_data(),
    "wavelength_cat",
    {"wavelength_cat": "transition wavelength"},
    prompt_prefix="you are an expert chemist: ",
    representation_col="SMILES",
    smiles_augmentation=True,
)

frame_augment


Unnamed: 0,prompt,completion,repr,this_repr
0,you are an expert chemist: what is the transit...,2@@@,CC(C=C(N(CCC#N)CCO)C=C1)=C1/N=N/C2=CC=C([N+]([...,N(=N/c1ccc([N+]([O-])=O)cc1)\c1c(C)cc(N(CCC#N)...
1,you are an expert chemist: what is the transit...,0@@@,CC1=NOC(C)=C1/N=N/C2=CC=C(C)C=C2,c1(C)ccc(/N=N/c2c(C)noc2C)cc1
2,you are an expert chemist: what is the transit...,2@@@,CN(CCC#N)C(C=C%17)=CC=C%17/N=N/C%18=CC=C([N+](...,c1cc([N+]([O-])=O)ccc1/N=N/c1ccc(N(C)CCC#N)cc1
3,you are an expert chemist: what is the transit...,2@@@,Cn1c(C)c(c2c1cccc2)N=Nc1nncs1,c1cccc2c(N=Nc3nncs3)c(C)n(C)c12
4,you are an expert chemist: what is the transit...,1@@@,OC%11=C(N=CC=C%12)C%12=C(/N=N/C%13=CC=C(NC(C)=...,N(C(C)=O)c1ccc(/N=N/c2ccc(O)c3ncccc32)cc1
...,...,...,...,...
3895,you are an expert chemist: what is the transit...,1@@@,CC1=NOC(C)=C1/N=N/C2=CC=C(NC(C)=O)C=C2,c1cc(/N=N/c2c(C)noc2C)ccc1NC(C)=O
3896,you are an expert chemist: what is the transit...,2@@@,CC(C=C(N(CCC#N)CCO)C=C1)=C1/N=N/C2=CC=C([N+]([...,c1(N(CCC#N)CCO)ccc(/N=N/c2ccc([N+]([O-])=O)cc2...
3897,you are an expert chemist: what is the transit...,1@@@,[H]N(CCC#N)C(C=C7)=CC=C7/N=N/C8=CC=CC=C8,C(CC#N)Nc1ccc(/N=N/c2ccccc2)cc1
3898,you are an expert chemist: what is the transit...,0@@@,CC1=CC=CC=C1/N=N/C2=CC=CC=C2,c1(/N=N/c2c(C)cccc2)ccccc1


In [None]:
assert len(frame_no_augment) * 10 == len(frame_augment)
assert len(frame_no_augment) == len(frame_augment["repr"].unique())


In [None]:
frame_augment_w_canonical = create_single_property_forward_prompts(
    get_photoswitch_data(),
    "wavelength_cat",
    {"wavelength_cat": "transition wavelength"},
    prompt_prefix="you are an expert chemist: ",
    representation_col="SMILES",
    smiles_augmentation=True,
    include_canonical_smiles=True,
)

frame_augment_w_canonical


Unnamed: 0,prompt,completion,repr,this_repr
0,you are an expert chemist: what is the transit...,2@@@,[H]C5=C(N6CCCCC6)C=CC(/N=N/C7=CC=C(N8CCCCC8)C(...,N(=N/c1ccc(N2CCCCC2)cc1)\c1ccc(N2CCCCC2)cc1
1,you are an expert chemist: what is the transit...,1@@@,CC1=CC=C(/N=N/C2=CC=CS2)C=C1,c1cc(/N=N/c2cccs2)ccc1C
2,you are an expert chemist: what is the transit...,2@@@,N#CCCNC(C=C1)=CC=C1/N=N/C2=CC=CC=C2[N+]([O-])=O,N#CCCNc1ccc(/N=N/c2ccccc2[N+]([O-])=O)cc1
3,you are an expert chemist: what is the transit...,0@@@,ClC1=CC=C(/N=N/C2=CC=C(Cl)C=C2)C=C1,Clc1ccc(/N=N/c2ccc(Cl)cc2)cc1
4,you are an expert chemist: what is the transit...,1@@@,CC1=C(/N=N/C2=C(C)C=C(C=C2)N(C)C)C=CC=C1,CN(c1cc(C)c(/N=N/c2ccccc2C)cc1)C
...,...,...,...,...
4285,you are an expert chemist: what is the transit...,3@@@,[H]C3=CC([N+]([O-])=O)=CC(Cl)=C3/N=N/C4=CC([H]...,c1c(OC)c(/N=N/c2ccc([N+]([O-])=O)cc2Cl)ccc1N(C...
4286,you are an expert chemist: what is the transit...,0@@@,O=C(OC)C1=CC=C(/N=N/C2=CC=CC=C2)C=C1,C(OC)(c1ccc(/N=N/c2ccccc2)cc1)=O
4287,you are an expert chemist: what is the transit...,2@@@,BrC(C=C1)=CC=C1/N=N/C2=CC=C(N(C)C)C=C2,Brc1ccc(/N=N/c2ccc(N(C)C)cc2)cc1
4288,you are an expert chemist: what is the transit...,2@@@,CN(C)C1=CC=C(/N=N/C2=CC=C(C(OC)=O)C=C2)C=C1,COC(=O)c1ccc(/N=N/c2ccc(N(C)C)cc2)cc1


In [None]:
assert len(frame_no_augment) * 10 + len(frame_no_augment) == len(frame_augment_w_canonical)

In [None]:
# | export
def create_single_property_forward_prompts_regression(
    df,  # input data
    target,  # target property
    target_rename_dict,  # dict to rename target property from the column name in df to the target property name in the prompt
    prompt_prefix="",  # prefix to add to the prompt, e.g. "I am an expert chemist"
    num_digit=1,  # number of digits the target value is rounded to before it is written into the completion
    representation_col="string",  # name of the column to use as the representation of the compound
):
    """Create prompt/completion pairs whose completion is the rounded numeric value of `target`.

    The property name shown in the prompt is `target` after applying every
    substring replacement in `target_rename_dict`. Each value is rounded with
    `round(value, num_digit)` and formatted with an f-string.

    Returns a frame with the columns `prompt` and `completion`, one row per
    input row, in input order.
    """
    target_name = target
    for key, value in target_rename_dict.items():
        target_name = target_name.replace(key, value)

    prompts = []
    for _, row in df.iterrows():
        value = f"{round(row[target], num_digit)}"

        prompts.append(
            {
                "prompt": prompt_prefix
                + ONE_PROPERTY_FORWARD_PROMPT_TEMPLATE.format(
                    property=target_name, text=row[representation_col]
                ),
                "completion": ONE_PROPERTY_FORWARD_COMPLETION_TEMPLATE.format(
                    value=value
                ),
            }
        )

    # Explicit columns keep the schema stable when `df` has no rows.
    return pd.DataFrame(prompts, columns=["prompt", "completion"])


In [None]:
create_single_property_forward_prompts_regression(
    get_polymer_data(),
    "deltaGmin",
    # The rename key must occur in the target name; "deltaGmin_cat" is not a
    # substring of "deltaGmin", so it would leave the raw column name in the prompt.
    {"deltaGmin": "adsorption energy"},
    prompt_prefix="you are an expert chemist: ",
)


Unnamed: 0,prompt,completion
0,you are an expert chemist: what is the deltaGm...,-7.5@@@
1,you are an expert chemist: what is the deltaGm...,-7.3@@@
2,you are an expert chemist: what is the deltaGm...,-6.4@@@
3,you are an expert chemist: what is the deltaGm...,-6.7@@@
4,you are an expert chemist: what is the deltaGm...,-6.6@@@
...,...,...
3120,you are an expert chemist: what is the deltaGm...,-17.0@@@
3121,you are an expert chemist: what is the deltaGm...,-17.1@@@
3122,you are an expert chemist: what is the deltaGm...,-16.4@@@
3123,you are an expert chemist: what is the deltaGm...,-14.7@@@


## Polymers


Polymer-specific prompt generation methods.


In [None]:
# | export
# Inverse prompt: ask for a polymer given a class label (`{class_name}`) and a
# property name (`{property}`); the prompt ends in "###".
POLYMER_ONE_PROPERTY_INVERSE_PROMPT_TEMPLATE_CAT = (
    "what is a polymer with {class_name} {property}?###"
)
# Completion carrying the polymer text: leading space, trailing "@@@".
POLYMER_ONE_PROPERTY_INVERSE_COMPLETION_TEMPLATE_CAT = " {text}@@@"

# Variant of the inverse prompt that also states the monomer counts. The
# `num_A`/`num_B`/`num_W`/`num_R` placeholders share their names with the keys
# returned by `get_polymer_composition_dict` (presumably its intended source -- confirm).
POLYMER_ONE_PROPERTY_INVERSE_PROMPT_TEMPLATE_CAT_W_COMPOSITION = "what is a polymer with {class_name} {property} and {num_A} A, {num_B} B, {num_W} W, and {num_R} R?###"


In [None]:
# | export
def get_polymer_composition_dict(row):
    """Count how often each monomer type occurs in `row["string"]`.

    `row["string"]` is split on "-" and the tokens "A", "B", "R" and "W" are
    counted. Returns a dict with the keys `num_A`, `num_B`, `num_R` and
    `num_W` (in that order); a monomer type that does not occur gets 0.
    """
    composition = Counter(row["string"].split("-"))
    # Counter returns 0 for missing keys, so no KeyError handling is needed.
    return {f"num_{key}": composition[key] for key in ["A", "B", "R", "W"]}


## Photoswitches


In [None]:
# | export

# Inverse prompt used when both transition wavelengths are known
# (positional placeholders: pi-pi* wavelength, then n-pi* wavelength).
PROMPT_TEMPLATE_photoswitch_w_n_pistar = "What is a molecule with a pi-pi* transition wavelength of {} nm and n-pi* transition wavelength of {} nm###"
# Inverse prompt used when only the pi-pi* wavelength is known.
PROMPT_TEMPLATE_photoswitch_ = (
    "What is a molecule with a pi-pi* transition wavelength of {} nm###"
)
# Completion carrying the SMILES, ending in "@@@" (no leading space).
COMPLETION_TEMPLATE_photoswitch_ = "{}@@@"


def generate_inverse_photoswitch_prompts(data: pd.DataFrame) -> pd.DataFrame:
    """Create inverse-design prompts (wavelengths -> SMILES) for photoswitches.

    Reads the columns "E isomer pi-pi* wavelength in nm",
    "E isomer n-pi* wavelength in nm" and "SMILES" from `data`. A row whose
    n-pi* wavelength is missing gets the pi-pi*-only prompt; all other rows
    get the prompt that states both wavelengths.

    Returns a frame with the columns `prompt` and `completion`, one row per
    input row, in input order.
    """
    prompts = []
    completions = []

    for _, row in data.iterrows():
        n_pi_star = row["E isomer n-pi* wavelength in nm"]
        # pd.isna also recognizes None / pd.NA; np.isnan raises TypeError for
        # those (e.g. when the column has object dtype).
        if pd.isna(n_pi_star):
            prompt = PROMPT_TEMPLATE_photoswitch_.format(
                row["E isomer pi-pi* wavelength in nm"]
            )
        else:
            prompt = PROMPT_TEMPLATE_photoswitch_w_n_pistar.format(
                row["E isomer pi-pi* wavelength in nm"],
                n_pi_star,
            )

        prompts.append(prompt)
        completions.append(COMPLETION_TEMPLATE_photoswitch_.format(row["SMILES"]))

    return pd.DataFrame({"prompt": prompts, "completion": completions})


In [None]:
from gpt3forchem.data import get_photoswitch_data


In [None]:
photoswitch_data = get_photoswitch_data()


In [None]:
generate_inverse_photoswitch_prompts(photoswitch_data)


Unnamed: 0,prompt,completion
0,What is a molecule with a pi-pi* transition wa...,C[N]1C=CC(=N1)N=NC2=CC=CC=C2@@@
1,What is a molecule with a pi-pi* transition wa...,C[N]1C=NC(=N1)N=NC2=CC=CC=C2@@@
2,What is a molecule with a pi-pi* transition wa...,C[N]1C=C(C)C(=N1)N=NC2=CC=CC=C2@@@
3,What is a molecule with a pi-pi* transition wa...,C[N]1C=C(C=N1)N=NC2=CC=CC=C2@@@
4,What is a molecule with a pi-pi* transition wa...,C[N]1N=C(C)C(=C1C)N=NC2=CC=CC=C2@@@
...,...,...
385,What is a molecule with a pi-pi* transition wa...,OC%38=C%39N=CC=CC%39=C(/N=N/C%40=NC%41=CC(C)=C...
386,What is a molecule with a pi-pi* transition wa...,OC%42=C%43N=CC=CC%43=C(/N=N/C%44=NC%45=CC=CC=C...
387,What is a molecule with a pi-pi* transition wa...,N#CC1C(SC(/N=N/C2=NC(C=CC([N+]([O-])=O)=C3)=C3...
388,What is a molecule with a pi-pi* transition wa...,N#Cc5c(c6ccc(Cl)cc6)c(/N=N/C7=NC(C=CC([N+]([O-...


## MOFs


In [None]:
from gpt3forchem.data import get_mof_data
import pandas as pd
from sklearn.model_selection import train_test_split
from typing import List


In [None]:
data = get_mof_data()


  return pd.read_csv(os.path.join(datadir, "mof.csv"))


In [None]:
outputs = [c for c in data.columns if "output" in c]


In [None]:
len(data) - data["outputs.CO2-henry_coefficient-mol--kg--Pa"].isna().sum()


170

In [None]:
len(data) - data["outputs.H2O-henry_coefficient-mol--kg--Pa"].isna().sum()


153

In [None]:
len(data) - data["outputs.N2-henry_coefficient-mol--kg--Pa"].isna().sum()


141

Let's create a long-format frame that covers all the gases.


In [None]:
# Names of candidate text-representation columns for a MOF
# (presumably columns of the frame returned by `get_mof_data` -- confirm).
MOF_REPRESENTATIONS = ["info.mofid.mofid_clean", "chemical_name", "info.qmof_id"]


In [None]:
# Names of the target columns of interest: eight gas Henry-coefficient columns
# plus the PBE band gap (presumably columns of the frame returned by
# `get_mof_data` -- confirm).
MOF_OUTPUTS = [
    "outputs.Xe-henry_coefficient-mol--kg--Pa",
    "outputs.Kr-henry_coefficient-mol--kg--Pa",
    "outputs.H2S-henry_coefficient-mol--kg--Pa",
    "outputs.H2O-henry_coefficient-mol--kg--Pa",
    "outputs.O2-henry_coefficient-mol--kg--Pa",
    "outputs.CH4-henry_coefficient-mol--kg--Pa",
    "outputs.CO2-henry_coefficient-mol--kg--Pa",
    "outputs.N2-henry_coefficient-mol--kg--Pa",
    "outputs.pbe.bandgap",
]


In [None]:
# | export
def create_single_property_forward_prompts_multiple_targets(
    df: pd.DataFrame,  # input data
    targets: List[str],  # target property
    target_rename_dict: dict,  # dict to rename target property from the column name in df to the target property name in the prompt
    encode_value: bool = True,  # whether to encode the value of the target property categorically
    encoding_dict: dict = _DEFAULT_ENCODING_DICT,  # mapping from numerical categories to string
    prompt_prefix: str = "",  # prefix to add to the prompt, e.g. "I am an expert chemist"
    representation_col: str = "string",  # name of the column to use as the representation of the compound
):
    """Create forward prompts for several target columns at once.

    For every column in `targets` and every row of `df` whose value in that
    column is not missing, one prompt/completion pair is emitted. The property
    name shown in the prompt is the column name after applying every substring
    replacement in `target_rename_dict`.

    Returns a frame built from the collected `prompt`/`completion` records,
    grouped by target in the order given and, within a target, in row order.
    """
    records = []

    for target in targets:
        # Derive the human-readable property name for this column.
        display_name = target
        for old, new in target_rename_dict.items():
            display_name = display_name.replace(old, new)

        for _, row in df.iterrows():
            raw_value = row[target]
            if pd.isna(raw_value):
                # No measurement for this compound/property pair.
                continue

            completion_value = (
                encode_categorical_value(raw_value, encoding_dict=encoding_dict)
                if encode_value
                else raw_value
            )
            question = ONE_PROPERTY_FORWARD_PROMPT_TEMPLATE.format(
                property=display_name, text=row[representation_col]
            )
            answer = ONE_PROPERTY_FORWARD_COMPLETION_TEMPLATE.format(
                value=completion_value
            )
            records.append({"prompt": prompt_prefix + question, "completion": answer})

    return pd.DataFrame(records)


NameError: name '_DEFAULT_ENCODING_DICT' is not defined