In [None]:
# | default_exp input

# Data augmentation

We can use SMILES randomization for data augmentation. We use code by [Schwaller et al.](https://github.com/rxn4chemistry/rxn_yields).

In [26]:
from rdkit import Chem
import random

In [None]:
def randomize_smiles(
    smiles: str, 
    random_type: str = "rotated",  #  The type (unrestricted, restricted, rotated) of randomization performed.
    isomericSmiles: bool = True
):
    """
    From: https://github.com/undeadpixel/reinvent-randomized and https://github.com/GLambard/SMILES-X
    Returns a random SMILES given a SMILES of a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return None

    if random_type == "unrestricted":
        return Chem.MolToSmiles(
            mol, canonical=False, doRandom=True, isomericSmiles=isomericSmiles
        )
    elif random_type == "restricted":
        new_atom_order = list(range(mol.GetNumAtoms()))
        random.shuffle(new_atom_order)
        random_mol = Chem.RenumberAtoms(mol, newOrder=new_atom_order)
        return Chem.MolToSmiles(
            random_mol, canonical=False, isomericSmiles=isomericSmiles
        )
    elif random_type == "rotated":
        n_atoms = mol.GetNumAtoms()
        rotation_index = random.randint(0, n_atoms - 1)
        atoms = list(range(n_atoms))
        new_atoms_order = (
            atoms[rotation_index % len(atoms) :] + atoms[: rotation_index % len(atoms)]
        )
        rotated_mol = Chem.RenumberAtoms(mol, new_atoms_order)
        return Chem.MolToSmiles(
            rotated_mol, canonical=False, isomericSmiles=isomericSmiles
        )
    raise ValueError("Type '{}' is not valid".format(random_type))


# Creating prompts/training data


In [None]:
# | export
from collections import Counter

import numpy as np
import pandas as pd

In [None]:
# | export


_DEFAULT_ENCODING_DICT = {
    "very small": 0,
    "small": 1,
    "medium": 2,
    "large": 3,
    "very large": 4,
}

_DEFAULT_DECODING_DICT = {v: k for k, v in _DEFAULT_ENCODING_DICT.items()}


def encode_categorical_value(value, encoding_dict=_DEFAULT_DECODING_DICT):
    try:
        return encoding_dict[value]
    except KeyError:
        raise ValueError("Unknown value: %s" % value)


def decode_categorical_value(value, decoding_dict=_DEFAULT_DECODING_DICT):
    try:
        return decoding_dict[value]
    except KeyError:
        raise ValueError("Unknown value: %s" % value)


In [None]:
# | export
ONE_PROPERTY_FORWARD_PROMPT_TEMPLATE = "what is the {property} of {text}###"
ONE_PROPERTY_FORWARD_COMPLETION_TEMPLATE = " {value}@@@"


In [None]:
# | export
def create_single_property_forward_prompts(
    df: pd.DataFrame, # input data
    target: str, # target property
    target_rename_dict: dict, # dict to rename target property from the column name in df to the target property name in the prompt
    encode_value: bool=True, # whether to encode the value of the target property categorically
    encoding_dict: dict=_DEFAULT_ENCODING_DICT, # mapping from numerical categories to string
    prompt_prefix: str="", # prefix to add to the prompt, e.g. "I am an expert chemist"
    representation_col: str = 'string', # name of the column to use as the representation of the compound
    smiles_augmentation: bool = False, # whether to augment the SMILES with randomization
    smiles_augmentation_type: str = "rotated", # the type of randomization to perform
    smiles_augmentation_rounds: int = 10, # the number of randomizations to perform
):
    prompts = []

    if not smiles_augmentation: 
        smiles_augmentation_rounds= 1
    for _ in range(smiles_augmentation_rounds):
        target_name = target
        for key, value in target_rename_dict.items():
            target_name = target_name.replace(key, value)

        for _, row in df.iterrows():
            if encode_value:
                value = encode_categorical_value(row[target], encoding_dict=encoding_dict)
            else:
                value = row[target]

            repr = row[representation_col]
            if smiles_augmentation:
                repr = randomize_smiles(repr, random_type=smiles_augmentation_type)
            prompts.append(
                {
                    "prompt": prompt_prefix
                    + ONE_PROPERTY_FORWARD_PROMPT_TEMPLATE.format(
                        property=target_name, text=repr
                    ),
                    "completion": ONE_PROPERTY_FORWARD_COMPLETION_TEMPLATE.format(
                        value=value
                    ),
                }
            )

    df = pd.DataFrame(prompts)
    df.dropna(subset=['prompt'], inplace=True)
    return df


In [None]:
from gpt3forchem.data import get_polymer_data

create_single_property_forward_prompts(
    get_polymer_data(), "deltaGmin_cat", {"deltaGmin_cat": "adsorption energy"}
)


Unnamed: 0,prompt,completion
0,what is the adsorption energy of W-A-B-W-W-A-A...,4@@@
1,what is the adsorption energy of R-W-W-R-R-B-B...,4@@@
2,what is the adsorption energy of A-R-A-W-B-W-A...,4@@@
3,what is the adsorption energy of W-A-R-A-B-B-B...,4@@@
4,what is the adsorption energy of R-R-B-B-W-R-A...,4@@@
...,...,...
3120,what is the adsorption energy of R-W-B-W-W-B-B...,0@@@
3121,what is the adsorption energy of R-A-A-R-A-R-W...,0@@@
3122,what is the adsorption energy of W-W-R-B-W-W-A...,0@@@
3123,what is the adsorption energy of B-A-B-B-R-W-A...,1@@@


In [None]:
create_single_property_forward_prompts(
    get_polymer_data(), "deltaGmin_cat", {"deltaGmin_cat": "adsorption energy"}, prompt_prefix='you are an expert chemist: '
)


Unnamed: 0,prompt,completion
0,you are an expert chemist: what is the adsorpt...,4@@@
1,you are an expert chemist: what is the adsorpt...,4@@@
2,you are an expert chemist: what is the adsorpt...,4@@@
3,you are an expert chemist: what is the adsorpt...,4@@@
4,you are an expert chemist: what is the adsorpt...,4@@@
...,...,...
3120,you are an expert chemist: what is the adsorpt...,0@@@
3121,you are an expert chemist: what is the adsorpt...,0@@@
3122,you are an expert chemist: what is the adsorpt...,0@@@
3123,you are an expert chemist: what is the adsorpt...,1@@@


In [None]:
from gpt3forchem.data import get_photoswitch_data

create_single_property_forward_prompts(
    get_photoswitch_data(), "wavelength_cat", {"wavelength_cat": "transition wavelength"}, prompt_prefix='you are an expert chemist: ', representation_col='SMILES'
)


Unnamed: 0,prompt,completion
0,you are an expert chemist: what is the transit...,0@@@
1,you are an expert chemist: what is the transit...,0@@@
2,you are an expert chemist: what is the transit...,0@@@
3,you are an expert chemist: what is the transit...,0@@@
4,you are an expert chemist: what is the transit...,0@@@
...,...,...
387,you are an expert chemist: what is the transit...,2@@@
388,you are an expert chemist: what is the transit...,2@@@
389,you are an expert chemist: what is the transit...,3@@@
390,you are an expert chemist: what is the transit...,3@@@


In [None]:
# | export
def create_single_property_forward_prompts_regression(
    df, # input data
    target, # target property
    target_rename_dict, # dict to rename target property from the column name in df to the target property name in the prompt
    prompt_prefix="", # prefix to add to the prompt, e.g. "I am an expert chemist"
    num_digit=1,
):
    prompts = []

    target_name = target
    for key, value in target_rename_dict.items():
        target_name = target_name.replace(key, value)

    for _, row in df.iterrows():

        value = f"{round(row[target], num_digit)}"

        prompts.append(
            {
                "prompt": prompt_prefix
                + ONE_PROPERTY_FORWARD_PROMPT_TEMPLATE.format(
                    property=target_name, text=row["string"]
                ),
                "completion": ONE_PROPERTY_FORWARD_COMPLETION_TEMPLATE.format(
                    value=value
                ),
            }
        )

    return pd.DataFrame(prompts)


In [None]:
create_single_property_forward_prompts_regression(
    get_polymer_data(), "deltaGmin", {"deltaGmin_cat": "adsorption energy"}, prompt_prefix='you are an expert chemist: '
)


Unnamed: 0,prompt,completion
0,you are an expert chemist: what is the deltaGm...,-7.5@@@
1,you are an expert chemist: what is the deltaGm...,-7.3@@@
2,you are an expert chemist: what is the deltaGm...,-6.4@@@
3,you are an expert chemist: what is the deltaGm...,-6.7@@@
4,you are an expert chemist: what is the deltaGm...,-6.6@@@
...,...,...
3120,you are an expert chemist: what is the deltaGm...,-17.0@@@
3121,you are an expert chemist: what is the deltaGm...,-17.1@@@
3122,you are an expert chemist: what is the deltaGm...,-16.4@@@
3123,you are an expert chemist: what is the deltaGm...,-14.7@@@


## Polymers


Polymer specific prompt generation methods.

In [None]:
# | export
POLYMER_ONE_PROPERTY_INVERSE_PROMPT_TEMPLATE_CAT = (
    "what is a polymer with {class_name} {property}?###"
)
POLYMER_ONE_PROPERTY_INVERSE_COMPLETION_TEMPLATE_CAT = " {text}@@@"

POLYMER_ONE_PROPERTY_INVERSE_PROMPT_TEMPLATE_CAT_W_COMPOSITION = "what is a polymer with {class_name} {property} and {num_A} A, {num_B} B, {num_W} W, and {num_R} R?###"


In [None]:
# | export
def get_polymer_composition_dict(row):
    composition = Counter(row["string"].split("-"))
    comp_dict = {}
    for key in ["A", "B", "R", "W"]:
        try:
            count = composition[key]
        except KeyError:
            count = 0
        comp_dict[f"num_{key}"] = count
    return comp_dict


## Photoswitches

In [None]:
# | export 

PROMPT_TEMPLATE_photoswitch_w_n_pistar = "What is a molecule with a pi-pi* transition wavelength of {} nm and n-pi* transition wavelength of {} nm###"
PROMPT_TEMPLATE_photoswitch_ = "What is a molecule with a pi-pi* transition wavelength of {} nm###"
COMPLETION_TEMPLATE_photoswitch_ = "{}@@@"


def generate_inverse_photoswitch_prompts(data: pd.DataFrame) -> pd.DataFrame:
    prompts = []
    completions = []

    for i, row in data.iterrows():
        if np.isnan(row["E isomer n-pi* wavelength in nm"]):
            prompt = PROMPT_TEMPLATE_photoswitch_.format(row["E isomer pi-pi* wavelength in nm"])
        else:
            prompt = PROMPT_TEMPLATE_photoswitch_w_n_pistar.format(
                row["E isomer pi-pi* wavelength in nm"],
                row["E isomer n-pi* wavelength in nm"],
            )

        completion = COMPLETION_TEMPLATE_photoswitch_.format(row["SMILES"])
        prompts.append(prompt)
        completions.append(completion)

    prompts = pd.DataFrame({"prompt": prompts, "completion": completions})

    return prompts


In [None]:
from gpt3forchem.data import get_photoswitch_data

In [None]:
photoswitch_data = get_photoswitch_data()

In [None]:
generate_inverse_photoswitch_prompts(photoswitch_data)

Unnamed: 0,prompt,completion
0,What is a molecule with a pi-pi* transition wa...,C[N]1C=CC(=N1)N=NC2=CC=CC=C2@@@
1,What is a molecule with a pi-pi* transition wa...,C[N]1C=NC(=N1)N=NC2=CC=CC=C2@@@
2,What is a molecule with a pi-pi* transition wa...,C[N]1C=C(C)C(=N1)N=NC2=CC=CC=C2@@@
3,What is a molecule with a pi-pi* transition wa...,C[N]1C=C(C=N1)N=NC2=CC=CC=C2@@@
4,What is a molecule with a pi-pi* transition wa...,C[N]1N=C(C)C(=C1C)N=NC2=CC=CC=C2@@@
...,...,...
385,What is a molecule with a pi-pi* transition wa...,OC%38=C%39N=CC=CC%39=C(/N=N/C%40=NC%41=CC(C)=C...
386,What is a molecule with a pi-pi* transition wa...,OC%42=C%43N=CC=CC%43=C(/N=N/C%44=NC%45=CC=CC=C...
387,What is a molecule with a pi-pi* transition wa...,N#CC1C(SC(/N=N/C2=NC(C=CC([N+]([O-])=O)=C3)=C3...
388,What is a molecule with a pi-pi* transition wa...,N#Cc5c(c6ccc(Cl)cc6)c(/N=N/C7=NC(C=CC([N+]([O-...


## MOFs

In [None]:
from gpt3forchem.data import get_mof_data
import pandas as pd
from sklearn.model_selection import train_test_split
from typing import List

In [None]:
data = get_mof_data()

  return pd.read_csv(os.path.join(datadir, "mof.csv"))


In [None]:
outputs = [c for c in data.columns if "output" in c]

In [None]:
len(data) - data['outputs.CO2-henry_coefficient-mol--kg--Pa'].isna().sum()

170

In [None]:
len(data) - data['outputs.H2O-henry_coefficient-mol--kg--Pa'].isna().sum()

153

In [None]:
len(data) - data['outputs.N2-henry_coefficient-mol--kg--Pa'].isna().sum()

141

Perhaps, let's create a long frame with all the gases.

In [None]:
MOF_REPRESENTATIONS = [
   'info.mofid.mofid_clean',
   'chemical_name',
   'info.qmof_id'
]

In [None]:
MOF_OUTPUTS = [
    "outputs.Xe-henry_coefficient-mol--kg--Pa",
    "outputs.Kr-henry_coefficient-mol--kg--Pa",
    "outputs.H2S-henry_coefficient-mol--kg--Pa",
    "outputs.H2O-henry_coefficient-mol--kg--Pa",
    "outputs.O2-henry_coefficient-mol--kg--Pa",
    "outputs.CH4-henry_coefficient-mol--kg--Pa",
    "outputs.CO2-henry_coefficient-mol--kg--Pa",
    "outputs.N2-henry_coefficient-mol--kg--Pa",
    "outputs.pbe.bandgap",
]


In [None]:
# | export
def create_single_property_forward_prompts_multiple_targets(
    df: pd.DataFrame, # input data
    targets: List[str], # target property
    target_rename_dict: dict, # dict to rename target property from the column name in df to the target property name in the prompt
    encode_value: bool=True, # whether to encode the value of the target property categorically
    encoding_dict: dict=_DEFAULT_ENCODING_DICT, # mapping from numerical categories to string
    prompt_prefix: str="", # prefix to add to the prompt, e.g. "I am an expert chemist"
    representation_col: str = 'string' # name of the column to use as the representation of the compound
):
    prompts = []

    for target in targets:
        target_name = target
        for key, value in target_rename_dict.items():
            target_name = target_name.replace(key, value)

        for _, row in df.iterrows():
            if encode_value:
                value = encode_categorical_value(row[target], encoding_dict=encoding_dict)
            else:
                value = row[target]

            prompts.append(
                {
                    "prompt": prompt_prefix
                    + ONE_PROPERTY_FORWARD_PROMPT_TEMPLATE.format(
                        property=target_name, text=row[representation_col]
                    ),
                    "completion": ONE_PROPERTY_FORWARD_COMPLETION_TEMPLATE.format(
                        value=value
                    ),
                }
            )

    return pd.DataFrame(prompts)


NameError: name '_DEFAULT_ENCODING_DICT' is not defined