# Basic development and testing of the polymer inverse design case study

In [1]:
%reload_ext autoreload
%autoreload 2

In [5]:
import time 
import pandas as pd 
import matplotlib.pyplot as plt 
plt.style.use(['nature', 'science'])

from gpt3forchem.data import get_polymer_data
from gpt3forchem.input import create_single_property_inverse_polymer_prompts
from gpt3forchem.output import polymer_string2performance, get_inverse_polymer_metrics

from sklearn.model_selection import train_test_split

from gpt3forchem.api_wrappers import fine_tune, query_gpt3, extract_prediction

We will keep some of the data for "testing" to have some "independent prompts"

In [6]:
df = get_polymer_data()
train_df, test_df = train_test_split(df, train_size=.9, random_state=None, stratify=df["deltaGmin_cat"])

In [7]:
train_df

Unnamed: 0.1,Unnamed: 0,smiles,string,deltaGmin,A2_normalized,deltaGmin_cat,A2_normalized_cat,num_[W],max_[W],num_[Tr],...,[W],[W].1,[Tr],[Tr].1,[Ta],[Ta].1,[R],[R].1,rel_shannon,length
1379,1379,[Ta][Tr][W][R][R][Ta][R][Tr][Tr][Tr][W][Ta][R]...,A-B-W-R-R-A-R-B-B-B-W-A-R-A-R-R-W-B-W-B-R-R,-11.432777,0.176398,medium,large,0.000000,0,0.250000,...,4.0,0.181818,6.0,0.272727,4.0,0.181818,8.0,0.363636,0.434194,22
3048,3048,[W][Ta][Tr][R][Ta][R][W][Tr][Tr][W][Tr][R][R][...,W-A-B-R-A-R-W-B-B-W-B-R-R-R-W-B-W-B-A-W-R-W-R-...,-12.959665,0.071043,small,medium,0.333333,3,0.333333,...,12.0,0.285714,12.0,0.285714,6.0,0.142857,12.0,0.285714,0.361665,42
1670,1670,[W][Tr][W][W][Ta][Tr][W][R][W][Tr][R][R][Tr][R...,W-B-W-W-A-B-W-R-W-B-R-R-B-R-W-B-W-W-B-A-A-W-B-...,-9.497693,-0.075335,large,very small,0.428571,2,0.142857,...,12.0,0.333333,10.0,0.277778,6.0,0.166667,8.0,0.222222,0.378088,36
2640,2640,[W][Tr][W][W][R][Tr][W][Tr][Ta][W][R][R][R][R]...,W-B-W-W-R-B-W-B-A-W-R-R-R-R-A-W-R-W-W-R-R-B-B-...,-11.839886,0.051848,medium,medium,0.400000,2,0.200000,...,10.0,0.312500,6.0,0.187500,4.0,0.125000,12.0,0.375000,0.376571,32
2451,2451,[R][Tr][Ta][Tr][Tr][Ta][Tr][Ta][R][Ta][Tr][R][...,R-B-A-B-B-A-B-A-R-A-B-R-B-W-B-R-A-R-A-R-B-A-B-...,-15.258745,0.133633,very small,large,0.000000,0,0.666667,...,4.0,0.111111,12.0,0.333333,10.0,0.277778,10.0,0.277778,0.368903,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
549,549,[Tr][Tr][Tr][W][W][Ta][W][R][Tr][Ta][Ta][Tr][T...,B-B-B-W-W-A-W-R-B-A-A-B-B-A-B-A-W-B-R-R-W-W-B-...,-5.295837,0.180387,very large,large,0.500000,3,0.250000,...,12.0,0.352941,12.0,0.352941,6.0,0.176471,4.0,0.117647,0.366673,34
1914,1914,[Ta][Ta][R][R][Ta][W][R][Tr][Tr][R][W][W][W][T...,A-A-R-R-A-W-R-B-B-R-W-W-W-A-A-W-A-B-W-R-W-R-B-...,-12.626890,0.241061,medium,very large,0.166667,3,0.166667,...,8.0,0.285714,4.0,0.142857,6.0,0.214286,10.0,0.357143,0.400256,28
1656,1656,[R][Tr][Ta][R][Tr][Ta][Tr][W][Ta][R][W][Tr][Tr...,R-B-A-R-B-A-B-W-A-R-W-B-B-R-B-A-W-A-W-R-W-W-A-...,-9.568424,-0.303473,large,very small,0.333333,2,0.666667,...,6.0,0.200000,10.0,0.333333,6.0,0.200000,8.0,0.266667,0.400579,30
2770,2770,[W][Ta][R][W][Ta][Tr][R][W][R][R][Tr][W][W][W]...,W-A-R-W-A-B-R-W-R-R-B-W-W-W-W-B-W-R-R-B-W-R-A-...,-11.551982,-0.029453,medium,small,0.200000,4,0.200000,...,12.0,0.333333,8.0,0.222222,4.0,0.111111,12.0,0.333333,0.365781,36


In [8]:
train_prompts = create_single_property_inverse_polymer_prompts(
    train_df,
    "deltaGmin_cat",
    {"deltaGmin_cat": "adsorption energy"},
    encode_value=False,
)


test_prompts = create_single_property_inverse_polymer_prompts(
    test_df,
    "deltaGmin_cat",
    {"deltaGmin_cat": "adsorption energy"},
    encode_value=False,
)

In [9]:
train_prompts

Unnamed: 0,prompt,completion
0,what is a polymer with medium adsorption energ...,A-B-W-R-R-A-R-B-B-B-W-A-R-A-R-R-W-B-W-B-R-R@@@
1,what is a polymer with small adsorption energy...,W-A-B-R-A-R-W-B-B-W-B-R-R-R-W-B-W-B-A-W-R-W-R...
2,what is a polymer with large adsorption energy...,W-B-W-W-A-B-W-R-W-B-R-R-B-R-W-B-W-W-B-A-A-W-B...
3,what is a polymer with medium adsorption energ...,W-B-W-W-R-B-W-B-A-W-R-R-R-R-A-W-R-W-W-R-R-B-B...
4,what is a polymer with very small adsorption e...,R-B-A-B-B-A-B-A-R-A-B-R-B-W-B-R-A-R-A-R-B-A-B...
...,...,...
2807,what is a polymer with very large adsorption e...,B-B-B-W-W-A-W-R-B-A-A-B-B-A-B-A-W-B-R-R-W-W-B...
2808,what is a polymer with medium adsorption energ...,A-A-R-R-A-W-R-B-B-R-W-W-W-A-A-W-A-B-W-R-W-R-B...
2809,what is a polymer with large adsorption energy...,R-B-A-R-B-A-B-W-A-R-W-B-B-R-B-A-W-A-W-R-W-W-A...
2810,what is a polymer with medium adsorption energ...,W-A-R-W-A-B-R-W-R-R-B-W-W-W-W-B-W-R-R-B-W-R-A...


In [10]:
filename_base = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
train_filename = f"run_files/{filename_base}_train_prompts_mof_h2o.jsonl"
valid_filename = f"run_files/{filename_base}_valid_prompts_mof_h2o.jsonl"

train_prompts.to_json(train_filename, orient="records", lines=True)
test_prompts.to_json(valid_filename, orient="records", lines=True)


In [11]:
fine_tune(train_filename, valid_filename)   

Traceback (most recent call last):
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/bin/openai", line 8, in <module>
    sys.exit(main())
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/_openai_scripts.py", line 63, in main
    args.func(args)
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/cli.py", line 545, in sync
    resp = openai.wandb_logger.WandbLogger.sync(
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/wandb_logger.py", line 74, in sync
    fine_tune_logged = [
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/wandb_logger.py", line 75, in <listcomp>
    cls._log_fine_tune(
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/wandb_logger.py", line 125, in _log_fine_tune
    wandb_run = cls._get_wandb_run(run_path)
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/pyth

'ada:ft-lsmoepfl-2022-09-15-21-21-31'

In [27]:
completions = query_gpt3('ada:ft-lsmoepfl-2022-09-14-17-43-30', test_prompts, max_tokens=200)

In [30]:
predictions = [extract_prediction(completions, i) for i, completion in enumerate(completions["choices"])]

In [31]:
predictions

['W-W-R-W-R-W-A-W-R-B-W-A-R-B-W-A-R-B-W-A-R-B-W-R-A-B-W-B-R-A-B',
 'B-B-A-B-A-B-A-B-W-R-W-A-B-R-A-B-W-R-A-W-B-R-B',
 'B-R-B-B-R-B-R-B-R-W-B-A-R-B-W-A-R-B-W-A-R-B-W-A-R-B-W-R-A-B-R-W-A-B-R',
 'B-R-R-B-R-B-R-B-W-R-B-R-W-B-R-W-A-B-R-W-B-A-R-W-B-A-R-B-W-A-R',
 'A-R-R-A-R-A-R-A-W-R-A-R-W-A-R-W-A-B-A-W-R-B-W-R-A-B-A-W-R-B-A',
 'B-B-B-W-B-W-B-W-W-B-W-B-A-R-W-B-A-R-W-B-A-R-W-B-A-R-W',
 'B-B-B-B-R-B-R-B-A-R-B-A-R-B-W-A-R-B-W-A-R-B-A-W-R-B-W-R-B-A-R',
 'R-R-R-R-R-B-R-B-A-R-B-A-R-B-A-B-W-R-A-B-W-R-A-B-R-W-R-B-A-W-R',
 'R-R-R-R-R-B-A-R-A-B-A-R-B-A-B-R-A-B-R-W-A-B-W-R-A-R-B-W-R-A-B-W-R',
 'R-R-R-A-R-A-R-A-R-A-R-B-W-A-A-R-W-B-A-R-W-B-A-R-W-B-R',
 'B-B-B-B-A-B-R-B-A-R-W-B-A-R-W-A-B-R-A-B-W-R-B-A-R-W-B',
 'W-R-W-R-W-A-W-B-R-W-A-B-W-R-A-B-W-R-A-W-R-B-A-W-R-B-A-W-R-B-W-R-B-A-W-B-R-A-R',
 'W-B-W-B-R-W-B-A-W-R-B-W-R-B-A-W-R-B-A-W-R-B-A-W-R-B-A-W-R-B-W-A-R-B-W-R-A-B-W-R-B-A-R',
 'B-R-R-B-R-B-R-B-R-A-B-W-A-R-B-W-A-B-R-W-B-R-A-W-B',
 'A-A-A-A-B-W-R-B-W-A-B-W-A-B-W-R-A-B-W-R-A-W-R-B-A-W-R-B-A-W-R-B-W-R-B-A-W-

In [4]:
polymer_string2performance('R-R-B-A-B-R-A-B-R-A-B-R-B-A-R-B-A-R-B-A-R-B-A-W-R-B-A-W-R-B-W-R-A-B-R-A-B-W-R')

{'monomer_squence': 'R-R-B-A-B-R-A-B-R-A-B-R-B-A-R-B-A-R-B-A-R-B-A-W-R-B-A-W-R-B-W-R-A-B-R-A-B-W-R',
 'composition': {'R': 13, 'B': 12, 'A': 10, 'W': 4},
 'smiles': '[R][R][Tr][Ta][Tr][R][Ta][Tr][R][Ta][Tr][R][Tr][Ta][R][Tr][Ta][R][Tr][Ta][R][Tr][Ta][W][R][Tr][Ta][W][R][Tr][W][R][Ta][Tr][R][Ta][Tr][W][R]',
 'prediction': array([-9.120977], dtype=float32)}