In [1]:
from gpt3forchem.data import get_bandgap_data
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns


from sklearn.model_selection import train_test_split

from gpt3forchem.api_wrappers import query_gpt3, extract_prediction, extract_inverse_prediction, fine_tune
from gpt3forchem.output import test_inverse_bandgap

import time

from rdkit.Contrib.SA_Score.sascorer import calculateScore as calculate_sascore

from fastcore.helpers import save_pickle

import numpy as np
from rdkit import Chem 
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib as mpl
# Reset Matplotlib's rcParams to the library defaults before applying the plot style.
mpl.rcParams.update(mpl.rcParamsDefault)
# NOTE(review): 'science' and 'nature' are not among Matplotlib's built-in style names;
# presumably they are provided by the SciencePlots package, which is not imported in
# this cell — confirm it is installed and registered in the environment.
plt.style.use(['science', 'nature'])

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the bandgap dataset through the gpt3forchem helper. Later cells split this
# frame with train_test_split and read its "smiles" column.
data = get_bandgap_data()

In [3]:
PROMPT_TEMPLATE_bandgap_inverse = "What is a molecule with a bandgap of {} eV###"
COMPLETION_TEMPLATE_bandgap_inverse = "{}@@@"
# Factor applied to the raw "GFN2_HOMO_LUMO_GAP" values before they are written into
# the "... eV" prompt (presumably Hartree -> eV; confirm the unit of the raw column).
GAP_TO_EV_FACTOR = 27.2114


def generate_inverse_photoswitch_prompts(
    data: pd.DataFrame, representation: str = "smiles"
) -> pd.DataFrame:
    """Build prompt/completion pairs for the inverse bandgap task.

    NOTE: despite the "photoswitch" in its name, this function fills the
    *bandgap* templates defined above.

    Parameters
    ----------
    data
        Frame with a "GFN2_HOMO_LUMO_GAP" column, a "smiles" column and the
        column named by ``representation``.
    representation
        Name of the column whose values are used as the completion text.

    Returns
    -------
    pd.DataFrame
        One row per input row, with a fresh 0..n-1 index and the columns
        "prompt" (gap scaled by ``GAP_TO_EV_FACTOR`` and rounded to one
        decimal), "completion" (representation followed by the "@@@" stop
        marker) and "SMILES" (the input "smiles" value).
    """
    # Scale and round the whole column at once instead of row by row.
    gaps_ev = np.round(
        data["GFN2_HOMO_LUMO_GAP"].to_numpy(dtype=float) * GAP_TO_EV_FACTOR, 1
    )

    return pd.DataFrame(
        {
            "prompt": [
                PROMPT_TEMPLATE_bandgap_inverse.format(gap) for gap in gaps_ev
            ],
            "completion": [
                COMPLETION_TEMPLATE_bandgap_inverse.format(rep)
                for rep in data[representation]
            ],
            "SMILES": data["smiles"].to_list(),
        }
    )

In [4]:
def test_inverse_model(
    modelname,
    test_prompts,
    df_train,
    max_tokens: int = 250,
    temperatures=None,
    representation="SMILES",
):
    """Run ``test_inverse_bandgap`` once per sampling temperature.

    Parameters
    ----------
    modelname
        Model identifier forwarded to ``test_inverse_bandgap``.
    test_prompts
        Prompts forwarded to ``test_inverse_bandgap``.
    df_train
        Frame whose "SMILES" column is passed on as ``train_smiles``.
    max_tokens
        Forwarded to ``test_inverse_bandgap``.
    temperatures
        Iterable of temperatures to test. ``None`` or an empty iterable
        selects the default grid 0 ... 1.5 in steps of 0.25.
    representation
        Forwarded to ``test_inverse_bandgap`` and shown in progress messages.

    Returns
    -------
    list
        The return values of ``test_inverse_bandgap`` for every temperature
        that completed. Temperatures that raised are reported on stdout and
        skipped, so the list may be shorter than ``temperatures``.
    """
    default_temperatures = [0, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5]
    if temperatures is None:
        temperatures = default_temperatures
    else:
        # Materialise first: truth-testing a NumPy array directly raises
        # "truth value of an array is ambiguous"; an empty input still falls
        # back to the default grid as before.
        temperatures = list(temperatures) or default_temperatures

    train_smiles = df_train["SMILES"].to_list()
    results = []
    for temperature in temperatures:
        print(f"Testing temperature {temperature} for {representation}")
        try:
            result = test_inverse_bandgap(
                test_prompts,
                modelname,
                train_smiles=train_smiles,
                temperature=temperature,
                max_tokens=max_tokens,
                representation=representation,
            )
        except Exception as e:
            # Best effort: one failed run must not discard the runs that
            # succeeded, but say which temperature failed and with what error.
            print(f"Temperature {temperature} failed with {type(e).__name__}: {e}")
            continue

        results.append(result)

    return results

In [16]:
# Identifiers of the two fine-tuned models evaluated below: `modelname_random` is run
# together with the original training prompts, `modelname_extrapolation` together with
# the extrapolation training prompts.
modelname_random = "ada:ft-lsmoepfl-2022-11-30-15-25-12"
modelname_extrapolation = "ada:ft-lsmoepfl-2022-11-30-23-58-24"

In [None]:
# Timestamp prefix used in the names of the files written by later cells.
# NOTE(review): this cell shows no execution count, and `filename_base` is assigned
# again in the cell that builds the biased test set — that later assignment is
# probably the one that took effect; confirm and keep only one.
filename_base = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())

In [5]:
# Fixed-seed split of `data` into 1,000 training rows and 2,000 test rows.
# Only `test_set_random` is used by the cells visible in this notebook.
train_set_random, test_set_random = train_test_split(data, train_size=1_000, test_size=2000, random_state=42)

In [19]:
# Draw one target gap per test row from a normal distribution (mean 4.0, std 0.5).
# A seeded generator makes the biased prompts reproducible on re-run, and the sample
# size follows the test set instead of repeating the split size as a literal.
bandgap_rng = np.random.default_rng(42)
# Dividing by 27.2114 undoes the multiplication applied in
# generate_inverse_photoswitch_prompts, so the prompts end up carrying the drawn values.
random_bandgaps = bandgap_rng.normal(4.0, 0.5, size=len(test_set_random)) / 27.2114
test_set_random_biased = test_set_random.copy()
test_set_random_biased['GFN2_HOMO_LUMO_GAP'] = random_bandgaps
# Timestamp prefix shared by every file written from here on.
filename_base = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())

test_prompts_random_biased = generate_inverse_photoswitch_prompts(test_set_random_biased)
valid_filename_random_biased = f"run_files/{filename_base}_valid_prompts_bandgap_inverse_smiles_random_biased.jsonl"
# Persist the prompts as JSON Lines (one record per line).
test_prompts_random_biased.to_json(valid_filename_random_biased, orient="records", lines=True)

In [8]:
# Read a previously saved JSON Lines file of training prompts (presumably the prompts
# the random-split model was fine-tuned on — confirm). Its "SMILES" column is used
# below for the membership test against `data` and as the training set reference.
original_train_prompts = pd.read_json('run_files/2022-11-30-15-18-01_train_prompts_bandgap_inverse_smiles_1000.jsonl', lines=True)

In [10]:
# Boolean array: True for rows of `data` whose "smiles" value occurs in the training
# prompts. NOTE: at this point the name holds a mask, not rows; the next cell rebinds
# the same name to the selected rows of `data`.
original_train_set = data['smiles'].isin(original_train_prompts['SMILES']).values

In [13]:
# Rows of `data` whose "smiles" value occurs in the training prompts.
# The membership mask is recomputed here rather than taken from the previous value of
# `original_train_set`, so re-running this cell gives the same result instead of
# indexing `data` with an already-filtered DataFrame.
original_train_set = data[data['smiles'].isin(original_train_prompts['SMILES']).values]


In [20]:
# Evaluate the random-split model on the biased prompts over the default temperature
# grid. `original_train_prompts` is passed as `df_train`; only its "SMILES" column is
# read by test_inverse_model. Temperatures whose run raises are skipped, so the list
# can hold fewer entries than temperatures tested.
random_test_results = test_inverse_model(modelname_random, test_prompts_random_biased, original_train_prompts)

Testing temperature 0 for SMILES
Error communicating with OpenAI
Testing temperature 0.25 for SMILES


2022-12-07 16:21:40.727 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:931 - Got predictions, example: [H]c1c([H])c([H])c(C([H])([H])N([H])C(=O)c2c([H])c([H])c(N([H])C(=O)C([H])([H])[H])c([H])c2[H])c([H])c1[H]
2022-12-07 16:21:40.732 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:942 - Loaded predictions. Example: [H]c1c([H])c([H])c(C([H])([H])N([H])C(=O)c2c([H])c([H])c(N([H])C(=O)C([H])([H])[H])c([H])c2[H])c([H])c1[H]
2022-12-07 16:21:41.037 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:953 - Calculating Frechet ChemNet distance for 1000 samples
2022-12-07 16:21:41.144 | INFO     | gpt3forchem.output:_load_chemnet:218 - Saved ChemNet model to '/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ChemNet_v0.13_pretrained.h5'
2022-12-07 16:21:42.137643: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-12-07 16:21:47.690 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:962 - Computed frechet score: (35.82187032088866, 0.

Testing temperature 0.5 for SMILES


2022-12-07 16:35:08.520 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:931 - Got predictions, example: [H]c1nc([H])c2c(c1[H])N(C(=O)N([H])c1c([H])c([H])c(C([H])([H])C([H])([H])N([H])[H])c([H])c1[H])C([H])([H])C2([H])[H]
2022-12-07 16:35:08.522 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:942 - Loaded predictions. Example: [H]c1nc([H])c2c(c1[H])N(C(=O)N([H])c1c([H])c([H])c(C([H])([H])C([H])([H])N([H])[H])c([H])c1[H])C([H])([H])C2([H])[H]
2022-12-07 16:35:08.920 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:953 - Calculating Frechet ChemNet distance for 1000 samples
2022-12-07 16:35:09.040 | INFO     | gpt3forchem.output:_load_chemnet:218 - Saved ChemNet model to '/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ChemNet_v0.13_pretrained.h5'
2022-12-07 16:35:14.199 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:962 - Computed frechet score: (16.7319611477375, 0.03521115900085974)
2022-12-07 16:35:18.961 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:973 - 

Testing temperature 0.75 for SMILES
Error communicating with OpenAI
Testing temperature 1.0 for SMILES


2022-12-07 17:06:57.376 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:931 - Got predictions, example: [H]Oc1c([H])c([H])c2c(c1[H])c1c(C([H])([H])N([H])[H])oc([H])c1c1c([H])c([H])c(C([H])([H])P2C([H])([H])C([H])([H])N(C([H])([H])[H])C([H])([H])[H])c([H])n1[H]
2022-12-07 17:06:57.378 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:942 - Loaded predictions. Example: [H]Oc1c([H])c([H])c2c(c1[H])c1c(C([H])([H])N([H])[H])oc([H])c1c1c([H])c([H])c(C([H])([H])P2C([H])([H])C([H])([H])N(C([H])([H])[H])C([H])([H])[H])c([H])n1[H]
2022-12-07 17:06:57.837 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:953 - Calculating Frechet ChemNet distance for 511 samples
2022-12-07 17:06:57.935 | INFO     | gpt3forchem.output:_load_chemnet:218 - Saved ChemNet model to '/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ChemNet_v0.13_pretrained.h5'
2022-12-07 17:07:05.967 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:962 - Computed frechet score: (8.874719268047329, 0.16949296565056102)
202

Testing temperature 1.25 for SMILES
Error communicating with OpenAI
Testing temperature 1.5 for SMILES


2022-12-07 17:55:22.122 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:931 - Got predictions, example: [H]c1nc([H])c(S(=C(/oc
2022-12-07 17:55:22.125 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:942 - Loaded predictions. Example: [H]c1nc([H])c(S(=C(/oc
2022-12-07 17:55:22.417 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:953 - Calculating Frechet ChemNet distance for 70 samples
2022-12-07 17:55:22.477 | INFO     | gpt3forchem.output:_load_chemnet:218 - Saved ChemNet model to '/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ChemNet_v0.13_pretrained.h5'
2022-12-07 17:55:25.062 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:962 - Computed frechet score: (43.911545939971944, 0.00015342339237443562)
2022-12-07 17:55:25.350 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:973 - Computed KL div score: 0.13533228045952803


In [21]:
# Persist the raw per-temperature results of the random-split model.
# NOTE(review): the file name contains "extrapolation" although these are the
# random-split results — confirm the naming is intended.
save_pickle(f"run_files/{filename_base}-random_test_results_extrapolation_smiles_more.pkl", random_test_results)

In [23]:
# For every temperature run, keep the unique valid SMILES together with the positions
# of the predictions that belong to that set and the matching expected values.
smiles_random_biased_sets = []

for run in random_test_results:
    predictions = run["predictions"]
    unique_valid = set(predictions[run["valid_smiles"]])
    kept_positions = [
        position
        for position, candidate in enumerate(predictions)
        if candidate in unique_valid
    ]
    smiles_random_biased_sets.append(
        {
            "temperature": run["meta"]["temperature"],
            "smiles": unique_valid,
            "original_prediction_indices": kept_positions,
            "expected": [run["expectations"][position] for position in kept_positions],
        }
    )

# Write one text file per temperature: one SMILES per line, no trailing newline.
for entry in smiles_random_biased_sets:
    temperature = entry["temperature"]
    with open(f"for_more_xtb_opt/{filename_base}_smiles_random_biased_sets{temperature}.txt", "w") as f:
        f.write("\n".join(entry["smiles"]))

## Extrapolation

In [26]:
# Read a previously saved JSON Lines file of training prompts for the extrapolation
# setting (presumably the prompts the extrapolation model was fine-tuned on — confirm).
train_prompts_extrapolation = pd.read_json("run_files/2022-11-30-23-19-44_train_prompts_bandgap_inverse_smiles_extrapolation_1000.jsonl", lines=True)

In [27]:
# Evaluate the extrapolation model on the same biased prompts used for the
# random-split model, with the extrapolation training prompts as `df_train`.
# Temperatures whose run raises are skipped by test_inverse_model.
extrapolation_test_results = test_inverse_model(modelname_extrapolation, test_prompts_random_biased, train_prompts_extrapolation)

Testing temperature 0 for SMILES


2022-12-07 20:26:39.360 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:931 - Got predictions, example: [H]c1c([H])c([H])c(C([H])([H])N2C([H])([H])C([H])([H])C([H])([H])C2([H])[H])c([H])c1[H]
2022-12-07 20:26:39.371 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:942 - Loaded predictions. Example: [H]c1c([H])c([H])c(C([H])([H])N2C([H])([H])C([H])([H])C([H])([H])C2([H])[H])c([H])c1[H]
2022-12-07 20:26:39.691 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:953 - Calculating Frechet ChemNet distance for 1000 samples
2022-12-07 20:26:39.792 | INFO     | gpt3forchem.output:_load_chemnet:218 - Saved ChemNet model to '/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ChemNet_v0.13_pretrained.h5'
2022-12-07 20:27:18.658 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:962 - Computed frechet score: (76.38413607646115, 2.3193070034933744e-07)
2022-12-07 20:27:21.375 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:973 - Computed KL div score: nan


Testing temperature 0.25 for SMILES


2022-12-07 20:41:03.246 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:931 - Got predictions, example: [H]c1c([H])c([H])c(C([H])([H])N2C([H])([H])C([H])([H])C([H])([H])C2([H])[H])c([H])c1[H]
2022-12-07 20:41:03.248 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:942 - Loaded predictions. Example: [H]c1c([H])c([H])c(C([H])([H])N2C([H])([H])C([H])([H])C([H])([H])C2([H])[H])c([H])c1[H]
2022-12-07 20:41:03.527 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:953 - Calculating Frechet ChemNet distance for 1000 samples
2022-12-07 20:41:03.621 | INFO     | gpt3forchem.output:_load_chemnet:218 - Saved ChemNet model to '/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ChemNet_v0.13_pretrained.h5'
2022-12-07 20:41:09.059 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:962 - Computed frechet score: (36.95506460520862, 0.0006167708967771995)
2022-12-07 20:41:12.740 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:973 - Computed KL div score: 0.39533715520022045


Testing temperature 0.5 for SMILES
Error communicating with OpenAI
Testing temperature 0.75 for SMILES


2022-12-07 21:14:31.658 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:931 - Got predictions, example: [H]c1c([H])c([H])c(C([H])([H])N2C([H])([H])C([H])([H])C([H])([H])C2([H])[H])c(C([H])([H])[H])c1[H]
2022-12-07 21:14:31.661 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:942 - Loaded predictions. Example: [H]c1c([H])c([H])c(C([H])([H])N2C([H])([H])C([H])([H])C([H])([H])C2([H])[H])c(C([H])([H])[H])c1[H]
2022-12-07 21:14:32.095 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:953 - Calculating Frechet ChemNet distance for 960 samples
2022-12-07 21:14:32.151 | INFO     | gpt3forchem.output:_load_chemnet:218 - Saved ChemNet model to '/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ChemNet_v0.13_pretrained.h5'
2022-12-07 21:14:36.181 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:962 - Computed frechet score: (8.779165287285608, 0.1727632606049706)
2022-12-07 21:14:40.178 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:973 - Computed KL div score: 0.717685938896

Testing temperature 1.0 for SMILES


2022-12-07 21:31:41.467 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:931 - Got predictions, example: [H]c1c([H])c2c(c([H])c1Cl)N=C(C(=O)c1c([H])c([H])c([H])c(SC3([H])C([H])([H])C([H])([H])C([H])([H])C3([H])[H])c1[H])C2([H])[H]
2022-12-07 21:31:41.470 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:942 - Loaded predictions. Example: [H]c1c([H])c2c(c([H])c1Cl)N=C(C(=O)c1c([H])c([H])c([H])c(SC3([H])C([H])([H])C([H])([H])C([H])([H])C3([H])[H])c1[H])C2([H])[H]
2022-12-07 21:31:41.919 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:953 - Calculating Frechet ChemNet distance for 522 samples
2022-12-07 21:31:42.003 | INFO     | gpt3forchem.output:_load_chemnet:218 - Saved ChemNet model to '/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ChemNet_v0.13_pretrained.h5'
2022-12-07 21:31:44.640 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:962 - Computed frechet score: (8.59451859570079, 0.17926256234511487)
2022-12-07 21:31:46.887 | DEBUG    | gpt3forchem.output:test_inver

Testing temperature 1.25 for SMILES


2022-12-07 21:55:00.513 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:931 - Got predictions, example: [H]c1c([H])c(-n2c([H])c([H])c([H])c2OC([H])([H])c2c([H])c(OC([H])([H])[H])c([H])c([H])c2C([H])([H])C([H])([H])N(C([H])([H])C([H])([H])N1C([H])([H])[C@]2([H])C([H])([H])[C@@]([H])(O[H])C2([H])[H])C([H])([H])[C@@]1([H])C([H])([H])c1c([H])c([H])c(C([H])([H])C([H])([H])N([H])[C@]2([H])[H])c2nc1[H]
2022-12-07 21:55:00.519 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:942 - Loaded predictions. Example: [H]c1c([H])c(-n2c([H])c([H])c([H])c2OC([H])([H])c2c([H])c(OC([H])([H])[H])c([H])c([H])c2C([H])([H])C([H])([H])N(C([H])([H])C([H])([H])N1C([H])([H])[C@]2([H])C([H])([H])[C@@]([H])(O[H])C2([H])[H])C([H])([H])[C@@]1([H])C([H])([H])c1c([H])c([H])c(C([H])([H])C([H])([H])N([H])[C@]2([H])[H])c2nc1[H]
2022-12-07 21:55:00.903 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:953 - Calculating Frechet ChemNet distance for 170 samples
2022-12-07 21:55:01.020 | INFO     | gpt3forchem.outp

Testing temperature 1.5 for SMILES


2022-12-07 22:20:27.980 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:931 - Got predictions, example: [H]00
2022-12-07 22:20:27.982 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:942 - Loaded predictions. Example: [H]00
2022-12-07 22:20:28.278 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:953 - Calculating Frechet ChemNet distance for 71 samples
2022-12-07 22:20:28.389 | INFO     | gpt3forchem.output:_load_chemnet:218 - Saved ChemNet model to '/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ChemNet_v0.13_pretrained.h5'
2022-12-07 22:20:29.830 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:962 - Computed frechet score: (37.99478494754655, 0.0005009736818437066)
2022-12-07 22:20:30.097 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:973 - Computed KL div score: 0.22369364556531676


In [29]:
# Persist the raw per-temperature results of the extrapolation model.
save_pickle(f"run_files/{filename_base}-extrapolation_test_results_extrapolation_smiles_more.pkl", extrapolation_test_results)

In [30]:
# Same post-processing as for the random-split results: per temperature, the unique
# valid SMILES, the positions of the predictions in that set, and their expected values.
smiles_extrapolation_sets = []

for run in extrapolation_test_results:
    predictions = run["predictions"]
    unique_valid = set(predictions[run["valid_smiles"]])
    kept_positions = [
        position
        for position, candidate in enumerate(predictions)
        if candidate in unique_valid
    ]
    smiles_extrapolation_sets.append(
        {
            "temperature": run["meta"]["temperature"],
            "smiles": unique_valid,
            "original_prediction_indices": kept_positions,
            "expected": [run["expectations"][position] for position in kept_positions],
        }
    )

# Write one text file per temperature: one SMILES per line, no trailing newline.
for entry in smiles_extrapolation_sets:
    temperature = entry["temperature"]
    with open(f"for_more_xtb_opt/{filename_base}_smiles_extrapolation_sets{temperature}.txt", "w") as f:
        f.write("\n".join(entry["smiles"]))