# Basic development and testing of the polymer inverse design case study

In [59]:
# Development convenience: (re)load IPython's autoreload extension and use
# mode 2 so edits to imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [70]:
import time
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from fastcore.helpers import save_pickle
from loguru import logger
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from gpt3forchem.api_wrappers import extract_prediction, fine_tune, query_gpt3
from gpt3forchem.data import get_polymer_data
from gpt3forchem.input import create_single_property_inverse_polymer_prompts
from gpt3forchem.output import (
    composition_mismatch,
    get_continuos_binned_distance,
    get_inverse_polymer_metrics,
    get_polymer_prompt_data,
    get_prompt_compostion,
    polymer_string2performance,
)

plt.style.use(['nature', 'science'])

We hold out 10% of the data as a test set so that the fine-tuned model can be evaluated on independent prompts.

In [3]:
# Load the polymer dataset and hold out 10% of it as a test set.
df = get_polymer_data()
# A fixed seed makes the split identical after every kernel restart, so the
# held-out prompts stay the same from run to run. The split is stratified on
# the categorical "deltaGmin_cat" column.
train_df, test_df = train_test_split(
    df,
    train_size=0.9,
    random_state=42,
    stratify=df["deltaGmin_cat"],
)

In [4]:
# Display the training split (pandas abbreviates the rendering of long frames).
train_df

Unnamed: 0.1,Unnamed: 0,smiles,string,deltaGmin,A2_normalized,deltaGmin_cat,A2_normalized_cat,num_[W],max_[W],num_[Tr],...,[W],[W].1,[Tr],[Tr].1,[Ta],[Ta].1,[R],[R].1,rel_shannon,length
723,723,[W][R][Ta][Tr][R][Tr][W][R][R][Ta][W][W][Ta][T...,W-R-A-B-R-B-W-R-R-A-W-W-A-A-W-A-W-A-B-A-W-W-A-...,-9.672209,0.177360,large,large,0.600000,2,0.000000,...,12.0,0.375000,4.0,0.125000,10.0,0.312500,6.0,0.187500,0.376571,32
423,423,[W][W][Tr][W][R][Ta][W][Tr][W][W][Tr][W][W][Tr...,W-W-B-W-R-A-W-B-W-W-B-W-W-B-R-W-A-W-B-A-B-A-B-...,-6.085187,-0.046651,very large,small,1.000000,2,0.000000,...,12.0,0.375000,10.0,0.312500,6.0,0.187500,4.0,0.125000,0.376571,32
2503,2503,[Ta][R][Ta][R][R][R][Ta][R][R][W][R][R][Tr][Tr...,A-R-A-R-R-R-A-R-R-W-R-R-B-B-R-R-R-A-W-B-W-R-W-B,-14.507754,-0.366417,small,very small,0.000000,0,0.200000,...,4.0,0.166667,4.0,0.166667,4.0,0.166667,12.0,0.500000,0.390948,24
54,54,[W][Tr][Ta][R][R][Tr][Tr][R][Ta][Tr][W][Ta][W]...,W-B-A-R-R-B-B-R-A-B-W-A-W-A-A-A-A-A-W-R,-9.550020,0.572132,large,very large,0.000000,0,0.333333,...,4.0,0.200000,4.0,0.200000,8.0,0.400000,4.0,0.200000,0.444692,20
775,775,[R][W][Ta][Tr][R][R][Ta][Tr][Tr][Tr][Tr][Ta][T...,R-W-A-B-R-R-A-B-B-B-B-A-A-W-A-R-W-W-A-R-B-R,-10.166604,0.384454,large,very large,0.250000,2,0.250000,...,4.0,0.181818,6.0,0.272727,6.0,0.272727,6.0,0.272727,0.444188,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1893,1893,[Tr][Ta][W][W][R][W][R][R][W][W][R][R][R][W][R...,B-A-W-W-R-W-R-R-W-W-R-R-R-W-R-B-W-R-W-B-A-W-R-...,-9.389181,0.111659,large,medium,0.500000,2,0.000000,...,10.0,0.357143,4.0,0.142857,4.0,0.142857,10.0,0.357143,0.387556,28
2817,2817,[Ta][Ta][W][Ta][Tr][Tr][R][W][W][R][Ta][Tr][W]...,A-A-W-A-B-B-R-W-W-R-A-B-W-B-R-R-W-R-W-B-A-A-R-...,-14.421028,0.082798,small,medium,0.222222,3,0.222222,...,10.0,0.263158,8.0,0.210526,8.0,0.210526,12.0,0.315789,0.377003,38
1409,1409,[Tr][R][Ta][R][R][R][Ta][Tr][W][R][W][Tr][R][T...,B-R-A-R-R-R-A-B-W-R-W-B-R-A-A-W-B-R-W-B-A-B-R-...,-11.565822,-0.082278,medium,very small,0.333333,2,0.000000,...,6.0,0.230769,6.0,0.230769,6.0,0.230769,8.0,0.307692,0.422891,26
310,310,[W][W][W][Tr][W][Ta][Ta][W][Ta][Ta][Tr][Ta][R]...,W-W-W-B-W-A-A-W-A-A-B-A-R-B-A-W-R-B-B-W-A-B-B-...,-8.497218,0.246129,very large,very large,0.166667,3,0.333333,...,8.0,0.285714,8.0,0.285714,8.0,0.285714,4.0,0.142857,0.405673,28


In [5]:
# Build the inverse-design prompts for both splits with identical settings:
# the "deltaGmin_cat" column is the target property, it is named
# "adsorption energy" in the prompt text, and value encoding is switched off.
train_prompts, test_prompts = (
    create_single_property_inverse_polymer_prompts(
        split_df,
        "deltaGmin_cat",
        {"deltaGmin_cat": "adsorption energy"},
        encode_value=False,
    )
    for split_df in (train_df, test_df)
)

In [6]:
# Display the generated training prompts together with their completions.
train_prompts

Unnamed: 0,prompt,completion
0,what is a polymer with large adsorption energy...,W-R-A-B-R-B-W-R-R-A-W-W-A-A-W-A-W-A-B-A-W-W-A...
1,what is a polymer with very large adsorption e...,W-W-B-W-R-A-W-B-W-W-B-W-W-B-R-W-A-W-B-A-B-A-B...
2,what is a polymer with small adsorption energy...,A-R-A-R-R-R-A-R-R-W-R-R-B-B-R-R-R-A-W-B-W-R-W...
3,what is a polymer with large adsorption energy...,W-B-A-R-R-B-B-R-A-B-W-A-W-A-A-A-A-A-W-R@@@
4,what is a polymer with large adsorption energy...,R-W-A-B-R-R-A-B-B-B-B-A-A-W-A-R-W-W-A-R-B-R@@@
...,...,...
2807,what is a polymer with large adsorption energy...,B-A-W-W-R-W-R-R-W-W-R-R-R-W-R-B-W-R-W-B-A-W-R...
2808,what is a polymer with small adsorption energy...,A-A-W-A-B-B-R-W-W-R-A-B-W-B-R-R-W-R-W-B-A-A-R...
2809,what is a polymer with medium adsorption energ...,B-R-A-R-R-R-A-B-W-R-W-B-R-A-A-W-B-R-W-B-A-B-R...
2810,what is a polymer with very large adsorption e...,W-W-W-B-W-A-A-W-A-A-B-A-R-B-A-W-R-B-B-W-A-B-B...


In [7]:
# Timestamp the prompt files so that repeated runs do not overwrite each other.
filename_base = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
# NOTE(review): the "mof_h2o" suffix looks copied from another case study;
# these files contain the polymer prompts. Left unchanged to keep file names stable.
train_filename = f"run_files/{filename_base}_train_prompts_mof_h2o.jsonl"
valid_filename = f"run_files/{filename_base}_valid_prompts_mof_h2o.jsonl"

# DataFrame.to_json does not create missing parent directories, so make sure
# the output folder exists before writing (both files share the same folder).
Path(train_filename).parent.mkdir(parents=True, exist_ok=True)

# One JSON record per line (JSONL); the held-out prompts serve as the validation file.
train_prompts.to_json(train_filename, orient="records", lines=True)
test_prompts.to_json(valid_filename, orient="records", lines=True)


In [8]:
# Start fine-tuning with the training and validation prompt files.
# NOTE(review): the recorded output shows a traceback from the openai CLI's
# wandb sync step, followed by what appears to be the fine-tuned model name.
fine_tune(train_filename, valid_filename)   

Traceback (most recent call last):
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/bin/openai", line 8, in <module>
    sys.exit(main())
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/_openai_scripts.py", line 63, in main
    args.func(args)
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/cli.py", line 545, in sync
    resp = openai.wandb_logger.WandbLogger.sync(
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/wandb_logger.py", line 74, in sync
    fine_tune_logged = [
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/wandb_logger.py", line 75, in <listcomp>
    cls._log_fine_tune(
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/wandb_logger.py", line 125, in _log_fine_tune
    wandb_run = cls._get_wandb_run(run_path)
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/pyth

'ada:ft-lsmoepfl-2022-09-19-11-27-23'

In [11]:
# Query the fine-tuned model with the held-out prompts (up to 200 tokens per completion).
# NOTE(review): the model id is hard-coded; it matches the string echoed by the
# fine_tune cell in the recorded run and must be updated after any re-training.
completions = query_gpt3('ada:ft-lsmoepfl-2022-09-19-11-27-23', test_prompts, max_tokens=200)

In [12]:
# Collect one extracted prediction per returned choice, addressed by position;
# the choice object itself is not needed, only its index.
predictions = [
    extract_prediction(completions, idx)
    for idx, _choice in enumerate(completions["choices"])
]

In [13]:
# Inspect the extracted predictions (one monomer-sequence string per test prompt in the recorded run).
predictions

['W-R-W-R-A-W-R-A-W-R-R-W-A-R-W-A-W-R-A-B-A-W-R-B-A-W-R-B-A-W-R-B-W',
 'W-W-W-W-W-W-A-W-B-W-A-W-B-A-R-B-A-W-R-B-A-W-R-B-W-A-B-R-W',
 'W-W-B-A-W-A-W-B-A-W-R-B-W-A-B-R-A-W-B-R-A-W-R-B-A-W-R-B-W-B-A-R-W',
 'A-A-A-A-A-B-A-B-A-B-A-B-R-R-B-A-W-R-B-A-W-R-B-A-W-R-B-A-W-R-A',
 'W-R-W-R-W-R-R-W-R-W-R-W-R-B-W-R-W-B-R-W-A-B-R-W-A-B-R-W-B-A-R-B-W-A-R',
 'A-B-A-A-B-W-R-A-B-R-W-A-B-R-W-A-B-R-W-B-A-R-W-B-A-R-W-B-A-W-R-B-A-W-R-B-W-B-A-R-W-B-A-R-W',
 'A-R-A-W-R-A-R-A-R-W-R-A-W-R-A-W-R-A-B-W-A-R-W-B-A-R-W-B-A-R-W-B-A-R-W-B-R-A-W-B-R',
 'R-R-B-R-B-R-W-A-R-B-W-A-R-B-W-A-R-B-A-W-R-B-A-W-R-B-W-R-A-B-R',
 'W-W-A-W-R-A-R-W-A-R-W-A-R-W-A-R-B-W-A-R-W-B-A-R-W-B-A-W-R-B-W-R-B-A-W-R-B-A-W',
 'W-B-B-W-R-W-B-R-W-B-W-R-B-W-R-B-W-R-A-B-W-R-B-A-W-R-B-A-W-R-B-A-W',
 'A-A-R-R-W-A-R-A-W-R-A-R-W-B-A-W-R-B-A-W-R-B-W-R-B-A-T',
 'W-W-A-W-R-A-W-R-A-W-R-A-W-R-A-W-R-B-A-W-R-B-A-W-R-B-W-A-R-B-W',
 'W-R-W-R-W-R-R-W-A-R-W-B-A-R-W-B-A-R-W-B-A-R-W-B-A-R-W-B-A-R-W-B-W-R-A-B-R-W-A-B-R',
 'W-R-W-R-A-W-R-A-W-R-A-W-R-A-W-R-A-W-R-A-W-R-A-W-

In [14]:
# Run polymer_string2performance on a single monomer sequence to inspect the
# dictionary it returns (sequence, composition, SMILES-like string, prediction).
polymer_string2performance('R-R-B-A-B-R-A-B-R-A-B-R-B-A-R-B-A-R-B-A-R-B-A-W-R-B-A-W-R-B-W-R-A-B-R-A-B-W-R')

{'monomer_squence': 'R-R-B-A-B-R-A-B-R-A-B-R-B-A-R-B-A-R-B-A-R-B-A-W-R-B-A-W-R-B-W-R-A-B-R-A-B-W-R',
 'composition': {'R': 13, 'B': 12, 'A': 10, 'W': 4},
 'smiles': '[R][R][Tr][Ta][Tr][R][Ta][Tr][R][Ta][Tr][R][Tr][Ta][R][Tr][Ta][R][Tr][Ta][R][Tr][Ta][W][R][Tr][Ta][W][R][Tr][W][R][Ta][Tr][R][Ta][Tr][W][R]',
 'prediction': array([-9.120977], dtype=float32)}

In [15]:
# Development aid: show the signature of get_inverse_polymer_metrics via IPython help.
?get_inverse_polymer_metrics

[0;31mSignature:[0m
[0mget_inverse_polymer_metrics[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mcompletion_texts[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdf_test[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdf_train[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_num_train_sequences[0m[0;34m=[0m[0;36m2000[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mFile:[0m      ~/git/kjappelbaum/gpt3forchem/gpt3forchem/output.py
[0;31mType:[0m      function


In [None]:
# Show the overall statistics stored on `cm` (presumably a confusion-matrix object — TODO confirm).
# NOTE(review): `cm` is not defined anywhere in the visible notebook and this
# cell has no execution count; it will fail on a fresh top-to-bottom run.
cm.overall_stat

{'Overall ACC': 0.352,
 'Overall RACCU': 0.260694,
 'Overall RACC': 0.204088,
 'Kappa': 0.18583964056327834,
 'Gwet AC1': 0.205077201356521,
 'Bennett S': 0.18999999999999995,
 'Kappa Standard Error': 0.026835443670701672,
 'Kappa Unbiased': 0.12350231162739109,
 'Scott PI': 0.12350231162739109,
 'Kappa No Prevalence': -0.29600000000000004,
 'Kappa 95% CI': (0.13324217096870306, 0.23843711015785363),
 'Standard Error': 0.02135865164283551,
 '95% CI': (0.3101370427800424, 0.39386295721995757),
 'Chi-Squared': 'None',
 'Phi-Squared': 'None',
 'Cramer V': 'None',
 'Response Entropy': 1.3814056651434996,
 'Reference Entropy': 2.316058449955823,
 'Cross Entropy': 1.1144477741367746,
 'Joint Entropy': 3.4137961407287363,
 'Conditional Entropy': 1.0977376907729133,
 'Mutual Information': 0.28366797437058633,
 'KL Divergence': 'None',
 'Lambda B': 0.1941747572815534,
 'Lambda A': 0.1906005221932115,
 'Chi-Squared DF': 16,
 'Overall J': (0.7983837510803802, 0.15967675021607602),
 'Hamming Loss'

In [40]:
# Display the training prompts once more (same frame as shown earlier in the notebook).
train_prompts

Unnamed: 0,prompt,completion
0,what is a polymer with large adsorption energy...,W-R-A-B-R-B-W-R-R-A-W-W-A-A-W-A-W-A-B-A-W-W-A...
1,what is a polymer with very large adsorption e...,W-W-B-W-R-A-W-B-W-W-B-W-W-B-R-W-A-W-B-A-B-A-B...
2,what is a polymer with small adsorption energy...,A-R-A-R-R-R-A-R-R-W-R-R-B-B-R-R-R-A-W-B-W-R-W...
3,what is a polymer with large adsorption energy...,W-B-A-R-R-B-B-R-A-B-W-A-W-A-A-A-A-A-W-R@@@
4,what is a polymer with large adsorption energy...,R-W-A-B-R-R-A-B-B-B-B-A-A-W-A-R-W-W-A-R-B-R@@@
...,...,...
2807,what is a polymer with large adsorption energy...,B-A-W-W-R-W-R-R-W-W-R-R-R-W-R-B-W-R-W-B-A-W-R...
2808,what is a polymer with small adsorption energy...,A-A-W-A-B-B-R-W-W-R-A-B-W-B-R-R-W-R-W-B-A-A-R...
2809,what is a polymer with medium adsorption energ...,B-R-A-R-R-R-A-B-W-R-W-B-R-A-A-W-B-R-W-B-A-B-R...
2810,what is a polymer with very large adsorption e...,W-W-W-B-W-A-A-W-A-A-B-A-R-B-A-W-R-B-B-W-A-B-B...


In [61]:
# Show the prompt text of the current `row`.
# NOTE(review): `row` is only assigned by the evaluation loop in a later cell,
# so this cell fails on a fresh top-to-bottom run.
row["prompt"]

'what is a polymer with large adsorption energy and 8 A, 12 B, 12 W, and 8 R?###'

In [67]:
# Development aid: show the signature of get_polymer_prompt_data via IPython help.
?get_polymer_prompt_data

[0;31mSignature:[0m [0mget_polymer_prompt_data[0m[0;34m([0m[0mprompt[0m[0;34m,[0m [0mnumerically_encoded[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mFile:[0m      ~/git/kjappelbaum/gpt3forchem/gpt3forchem/output.py
[0;31mType:[0m      function


In [69]:
# Parse the prompt back into the requested composition and its target.
# NOTE(review): in the recorded run this raises IndexError inside get_target
# (`int(num[0])` on an empty list). The name `bin` also shadows the Python builtin.
composition, bin = get_polymer_prompt_data(row["prompt"])

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3398, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ipykernel_69760/2149671618.py", line 1, in <cell line: 1>
    composition, bin = get_polymer_prompt_data(row["prompt"])
  File "/Users/kevinmaikjablonka/git/kjappelbaum/gpt3forchem/gpt3forchem/output.py", line 191, in get_polymer_prompt_data
    return composition, get_target(prompt, numerically_encoded)
  File "/Users/kevinmaikjablonka/git/kjappelbaum/gpt3forchem/gpt3forchem/output.py", line 182, in get_target
    return int(num[0])
IndexError: list index out of range

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 1993, in show

In [71]:
# Development aid: show the signature and docstring of fastcore's save_pickle.
?save_pickle

[0;31mSignature:[0m [0msave_pickle[0m[0;34m([0m[0mfn[0m[0;34m,[0m [0mo[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m Save a pickle file, to a file name or opened file
[0;31mFile:[0m      ~/miniconda3/envs/gpt3/lib/python3.9/site-packages/fastcore/helpers.py
[0;31mType:[0m      function


In [72]:
# Persist the prompts, the raw completions and the extracted predictions in a
# single pickle so they can be analysed without querying the model again.
save_pickle(
    'test_polymer_inverse.pkl',
    {
        'train_prompts': train_prompts,
        'test_prompts': test_prompts,
        'completions': completions,
        'predictions': predictions,
    },
)

In [63]:
# Manual, cell-level version of the evaluation loop, kept for debugging: for
# every test prompt it compares the requested target/composition with what the
# predicted sequence actually delivers.
df_train = train_prompts
df_test = test_prompts
losses = []
composition_mismatches = []
# Same value as the default of get_inverse_polymer_metrics(max_num_train_sequences=2000).
max_num_train_sequences = 2000

train_sequences = [polymer_string2performance(seq)["monomer_squence"] for seq in df_train["completion"]]
print(f"Using {len(train_sequences)} training sequences")
# The reference set does not change inside the loop, so slice it only once.
reference_sequences = train_sequences[:max_num_train_sequences]

# NOTE(review): `bins` and `string_distances` are neither defined nor imported
# in the visible notebook; they must be provided before this cell can succeed.
# Rows and predictions are paired by position (predictions are produced in
# prompt order), which does not rely on the frame's index labels being 0..n-1.
for (row_label, row), prediction in tqdm(
    zip(df_test.iterrows(), predictions),
    total=min(len(df_test), len(predictions)),
):
    try:
        # `target_bin` instead of `bin` to avoid shadowing the Python builtin.
        composition, target_bin = get_polymer_prompt_data(row["prompt"], numerically_encoded=False)
        print(f"Composition: {composition}")
        completion_data = polymer_string2performance(prediction)
        print(completion_data)
        loss = get_continuos_binned_distance(completion_data["prediction"][0], target_bin, bins)
        losses.append(loss)

        mm = composition_mismatch(composition, completion_data["composition"])

        distances = string_distances(reference_sequences, completion_data["monomer_squence"])
        mm.update(completion_data)
        mm.update(distances)
        mm.update({"loss": loss})
        composition_mismatches.append(mm)
    except Exception as e:
        # Best effort: a prediction that cannot be evaluated is logged and skipped.
        logger.exception(e)


Using 2812 training sequences


  0%|          | 0/313 [00:00<?, ?it/s]2022-09-19 14:15:36.281 | ERROR    | __main__:<cell line: 8>:28 - list index out of range
Traceback (most recent call last):

  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
           │         │     └ {'__name__': '__main__', '__doc__': 'Entry point for launching an IPython kernel.\n\nThis is separate from the ipykernel pack...
           │         └ <code object <module> at 0x1048ea2f0, file "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/ipykern...
           └ <function _run_code at 0x1048e4af0>
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
         │     └ {'__name__': '__main__', '__doc__': 'Entry point for launching an IPython kernel.\n\nThis is separate from the ipykernel pack...
         └ <code object <module> at 0x1048

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ipykernel_69760/1558440684.py", line 11, in <cell line: 8>
    composition, bin = get_polymer_prompt_data(row["prompt"], numerically_encoded=False)
  File "/Users/kevinmaikjablonka/git/kjappelbaum/gpt3forchem/gpt3forchem/output.py", line 191, in get_polymer_prompt_data
    return composition, get_target(prompt, numerically_encoded)
  File "/Users/kevinmaikjablonka/git/kjappelbaum/gpt3forchem/gpt3forchem/output.py", line 182, in get_target
    return int(num[0])
IndexError: list index out of range

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3398, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ipykernel_69760/1558440684.py", line 28, in <cell 

In [42]:
# Evaluate all predictions with the library routine (test prompts first, then training prompts).
# NOTE(review): the recorded output repeats "list index out of range" for the
# evaluated rows and the returned `metrics` is empty — the prompt parsing
# appears to fail for these prompts; confirm before trusting any result.
metrics = get_inverse_polymer_metrics(predictions, test_prompts, train_prompts)  

yes
Using 2812 training sequences


100%|██████████| 313/313 [00:00<00:00, 26994.37it/s]

list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out o




In [21]:
# Inspect the returned metrics (an empty list and an empty DataFrame in the recorded run).
metrics

([],
 Empty DataFrame
 Columns: []
 Index: [])