In [1]:
# | default_exp input

# Creating prompts/training data


In [21]:
# | export
import pandas as pd
from collections import Counter

# Default mapping from category label to its integer code.
_DEFAULT_ENCODING_DICT = {
    "very small": 0,
    "small": 1,
    "medium": 2,
    "large": 3,
    "very large": 4,
}

# Inverse of the mapping above: integer code -> category label.
_DEFAULT_DECODING_DICT = {v: k for k, v in _DEFAULT_ENCODING_DICT.items()}


def encode_categorical_value(value, encoding_dict=_DEFAULT_ENCODING_DICT):
    """Return the code stored for `value` in `encoding_dict`.

    With the default dictionary, a label such as "small" is mapped to its
    integer code (1).

    Raises `ValueError` if `value` is not a key of `encoding_dict`.
    """
    # The default must be the label -> code mapping; defaulting to the decoding
    # dict would reject every label passed without an explicit dictionary.
    try:
        return encoding_dict[value]
    except KeyError:
        raise ValueError("Unknown value: %s" % value)


def decode_categorical_value(value, decoding_dict=_DEFAULT_DECODING_DICT):
    """Look up the entry stored for `value` in `decoding_dict`.

    Raises `ValueError` if `value` is not a key of `decoding_dict`.
    """
    try:
        decoded = decoding_dict[value]
    except KeyError:
        raise ValueError("Unknown value: %s" % value)
    else:
        return decoded


In [3]:
# | export
# Templates for the "forward" task: ask for one property of a given text.
# `{property}`, `{text}` and `{value}` are filled in with `str.format`.
ONE_PROPERTY_FORWARD_PROMPT_TEMPLATE = (
    "what is the {property} of {text}"
    "###"  # marker that closes every prompt
)
ONE_PROPERTY_FORWARD_COMPLETION_TEMPLATE = (
    " {value}"
    "@@@"  # marker that closes every completion
)


In [13]:
# | export
def create_single_property_forward_prompts(
    df, # input data
    target, # target property
    target_rename_dict, # dict to rename target property from the column name in df to the target property name in the prompt
    encode_value=True, # whether to encode the value of the target property categorically
    encoding_dict=_DEFAULT_ENCODING_DICT, # mapping from category label (string) to the code written in the completion
    prompt_prefix="", # prefix to add to the prompt, e.g. "I am an expert chemist"
):
    """Build one forward prompt/completion pair per row of `df`.

    The prompt asks for the (renamed) target property of the row's "string"
    entry; the completion holds the row's `target` value, passed through
    `encode_categorical_value` when `encode_value` is true.

    Returns a `pandas.DataFrame` with the columns "prompt" and "completion".
    The columns are present even when `df` has no rows.

    Raises `ValueError` (from `encode_categorical_value`) if `encode_value` is
    true and a target value is not a key of `encoding_dict`.
    """
    # Substring replacement: every occurrence of each key in the column name
    # is substituted, in the iteration order of `target_rename_dict`.
    target_name = target
    for old, new in target_rename_dict.items():
        target_name = target_name.replace(old, new)

    prompts = []
    for _, row in df.iterrows():
        if encode_value:
            value = encode_categorical_value(row[target], encoding_dict=encoding_dict)
        else:
            value = row[target]

        prompts.append(
            {
                "prompt": prompt_prefix
                + ONE_PROPERTY_FORWARD_PROMPT_TEMPLATE.format(
                    property=target_name, text=row["string"]
                ),
                "completion": ONE_PROPERTY_FORWARD_COMPLETION_TEMPLATE.format(
                    value=value
                ),
            }
        )

    # Name the columns explicitly: without this an empty `df` would produce a
    # frame with no columns at all, breaking callers that index by column name.
    return pd.DataFrame(prompts, columns=["prompt", "completion"])


In [14]:
from gpt3forchem.data import get_polymer_data


In [15]:
# Forward prompts for the polymer data; the `deltaGmin_cat` column is worded
# as "adsorption energy" in the prompt text.
create_single_property_forward_prompts(
    df=get_polymer_data(),
    target="deltaGmin_cat",
    target_rename_dict={"deltaGmin_cat": "adsorption energy"},
)


Unnamed: 0,prompt,completion
0,what is the adsorption energy of W-A-B-W-W-A-A...,4@@@
1,what is the adsorption energy of R-W-W-R-R-B-B...,4@@@
2,what is the adsorption energy of A-R-A-W-B-W-A...,4@@@
3,what is the adsorption energy of W-A-R-A-B-B-B...,4@@@
4,what is the adsorption energy of R-R-B-B-W-R-A...,4@@@
...,...,...
3120,what is the adsorption energy of R-W-B-W-W-B-B...,0@@@
3121,what is the adsorption energy of R-A-A-R-A-R-W...,0@@@
3122,what is the adsorption energy of W-W-R-B-W-W-A...,0@@@
3123,what is the adsorption energy of B-A-B-B-R-W-A...,1@@@


In [19]:
# Forward prompts for the polymer data with a fixed prefix put in front of
# every prompt.
create_single_property_forward_prompts(
    df=get_polymer_data(),
    target="deltaGmin_cat",
    target_rename_dict={"deltaGmin_cat": "adsorption energy"},
    prompt_prefix='you are an expert chemist: ',
)


Unnamed: 0,prompt,completion
0,you are an expert chemist: what is the adsorpt...,4@@@
1,you are an expert chemist: what is the adsorpt...,4@@@
2,you are an expert chemist: what is the adsorpt...,4@@@
3,you are an expert chemist: what is the adsorpt...,4@@@
4,you are an expert chemist: what is the adsorpt...,4@@@
...,...,...
3120,you are an expert chemist: what is the adsorpt...,0@@@
3121,you are an expert chemist: what is the adsorpt...,0@@@
3122,you are an expert chemist: what is the adsorpt...,0@@@
3123,you are an expert chemist: what is the adsorpt...,1@@@


## Polymers


In [7]:
# | export
# Templates for the "inverse" polymer task: ask for a polymer whose property
# falls in a given class; the completion carries the polymer text.
POLYMER_ONE_PROPERTY_INVERSE_PROMPT_TEMPLATE_CAT = "what is a polymer with {class_name} {property}?###"
POLYMER_ONE_PROPERTY_INVERSE_COMPLETION_TEMPLATE_CAT = " {text}@@@"

# Variant of the prompt that also states how many A, B, W and R units are wanted.
POLYMER_ONE_PROPERTY_INVERSE_PROMPT_TEMPLATE_CAT_W_COMPOSITION = (
    "what is a polymer with {class_name} {property} "
    "and {num_A} A, {num_B} B, {num_W} W, and {num_R} R?###"
)


In [20]:
# | export
def get_polymer_composition_dict(row):
    """Count the A, B, R and W units in a row's polymer string.

    `row["string"]` is split on "-" and the occurrences of each unit are
    counted.

    Returns a dict with the keys "num_A", "num_B", "num_R" and "num_W" (in
    that order); units that do not occur are reported as 0.
    """
    composition = Counter(row["string"].split("-"))
    # A Counter yields 0 for keys it has not seen, so no KeyError handling is
    # needed for absent units.
    return {f"num_{key}": composition[key] for key in ("A", "B", "R", "W")}
