# Try inverse design of photoswitches

Let's still use the five bins we had for the forward task and then see if we can generate valid SMILES.

In [1]:
from gpt3forchem.data import get_photoswitch_data
from gpt3forchem.input import create_single_property_forward_prompts
from sklearn.model_selection import train_test_split
from gpt3forchem.api_wrappers import fine_tune, query_gpt3, extract_prediction, ensemble_fine_tune, multiple_query_gpt3
import time
from pycm import ConfusionMatrix
from gpt3forchem.baselines import GPRBaseline, compute_fragprints
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt

# Apply the "science" and "nature" style sheets to all matplotlib figures.
# NOTE(review): these are not built-in matplotlib styles; presumably they are
# provided by the SciencePlots package. Confirm it is installed.
plt.style.use(["science", "nature"])


  warn(
  warn(
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the photoswitch dataset; later cells read its 'SMILES' column and the
# E-isomer pi-pi* / n-pi* wavelength columns.
data = get_photoswitch_data()

We will also need to get the composition of the molecule, so that the model does not only remember the wavelength.

In [3]:
from collections import Counter

from rdkit import Chem

# Toy example of extracting a composition: parse formic acid, make its
# hydrogens explicit, then tally the element symbol of every atom.
mol = Chem.AddHs(Chem.MolFromSmiles('C(=O)O'))
c = Counter(atom.GetSymbol() for atom in mol.GetAtoms())

In [4]:
# Peek at the SMILES column (last expression of the cell, so it is displayed).
data['SMILES']

0                           C[N]1N=NC(=N1)N=NC2=CC=CC=C2
1                           C[N]1C=NC(=N1)N=NC2=CC=CC=C2
2                           C[N]1C=CC(=N1)N=NC2=CC=CC=C2
3                        C[N]1C=C(C)C(=N1)N=NC2=CC=CC=C2
4                           C[N]1C=C(C=N1)N=NC2=CC=CC=C2
                             ...                        
387    OC%38=C%39N=CC=CC%39=C(/N=N/C%40=NC%41=CC(C)=C...
388    OC%42=C%43N=CC=CC%43=C(/N=N/C%44=NC%45=CC=CC=C...
389    N#CC1C(SC(/N=N/C2=NC(C=CC([N+]([O-])=O)=C3)=C3...
390    N#Cc5c(c6ccc(Cl)cc6)c(/N=N/C7=NC(C=CC([N+]([O-...
391    N#CC9C(SC(/N=N/C%10=NC(C=CC([N+]([O-])=O)=C%11...
Name: SMILES, Length: 392, dtype: object

Perhaps the best case for "inverse" design is to use both the $\pi$-$\pi^*$ transition and the $n$-$\pi^*$ transition.
We will simply train on both and see if the model can then answer a corresponding prompt.

In [5]:
# Prompt used when both wavelengths are available: first slot is the pi-pi*
# wavelength, second slot the n-pi* wavelength. The trailing "###" ends the prompt.
PROMPT_TEMPLATE_w_n_pistar = "What is a molecule with a pi-pi* transition wavelength of {} nm and n-pi* transition wavelength of {} nm###"

In [6]:
# Fallback prompt for rows without an n-pi* wavelength: only the pi-pi*
# wavelength is filled in. The trailing "###" ends the prompt.
PROMPT_TEMPLATE = "What is a molecule with a pi-pi* transition wavelength of {} nm###"

In [7]:
# Completion is the SMILES string followed by the "@@@" end-of-completion marker.
COMPLETION_TEMPLATE = "{}@@@"

In [8]:
# Build one prompt/completion pair per dataset row. Rows with an n-pi*
# wavelength get the two-wavelength prompt; the others fall back to the
# pi-pi*-only prompt. The completion is always the row's SMILES.
prompts = []
completions = []

for _, entry in data.iterrows():
    pi_pi_star = entry['E isomer pi-pi* wavelength in nm']
    n_pi_star = entry['E isomer n-pi* wavelength in nm']

    if not np.isnan(n_pi_star):
        question = PROMPT_TEMPLATE_w_n_pistar.format(pi_pi_star, n_pi_star)
    else:
        question = PROMPT_TEMPLATE.format(pi_pi_star)

    prompts.append(question)
    completions.append(COMPLETION_TEMPLATE.format(entry['SMILES']))

In [9]:
# Pair prompts and completions column-wise for the JSONL export.
# NOTE: this rebinds `prompts` from a list to a DataFrame, so re-run the cell
# that builds the lists before re-running this one.
prompts = pd.DataFrame({'prompt': prompts, 'completion': completions})

In [14]:
# Hold out 5% of the prompt/completion pairs; the fixed random_state keeps the
# split repeatable across runs.
train_prompts, test_prompts = train_test_split(prompts, test_size=0.05, random_state=42)

In [15]:
import os

# Number of training examples; embedded in both file names to identify the run.
train_size = len(train_prompts)

# DataFrame.to_json does not create missing directories, so make sure the
# output folder exists (e.g. on a fresh checkout) before writing.
run_dir = "run_files"
os.makedirs(run_dir, exist_ok=True)

# Timestamp prefix keeps files from different runs apart.
filename_base = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
train_filename = (
    f"{run_dir}/{filename_base}_train_prompts_photoswitch_inverse_{train_size}.jsonl"
)
# The test file name also carries the *training* set size, so both files of a
# run share the same suffix.
test_filename = (
    f"{run_dir}/{filename_base}_test_prompts_photoswitch_inverse_{train_size}.jsonl"
)

# One JSON record per line (JSONL), with "prompt" and "completion" keys.
train_prompts.to_json(train_filename, orient="records", lines=True)
test_prompts.to_json(test_filename, orient="records", lines=True)

In [16]:
# Start the fine-tune on the training file; the held-out split is passed as the
# validation file.
fine_tune(train_filename, valid_file=test_filename)

Uploaded file from run_files/2022-09-01-22-40-59_train_prompts_photoswitch_inverse_372.jsonl: file-14E8ycJFulEto46gTf4pP61K
Uploaded file from run_files/2022-09-01-22-40-59_test_prompts_photoswitch_inverse_372.jsonl: file-ac3IcG3mrSZAOO2V7cHsJLC2
Created fine-tune: ft-ZH0U9kB3bF3ktLgCbn6xI51Y
Streaming events until fine-tuning is complete...

(Ctrl-C will interrupt the stream, but not cancel the fine-tune)
[2022-09-01 22:41:15] Created fine-tune: ft-ZH0U9kB3bF3ktLgCbn6xI51Y

Stream interrupted (client disconnected).
To resume the stream, run:

  openai api fine_tunes.follow -i ft-ZH0U9kB3bF3ktLgCbn6xI51Y

 
Upload progress:   0%|          | 0.00/57.8k [00:00<?, ?it/s]
Upload progress: 100%|██████████| 57.8k/57.8k [00:00<00:00, 29.1Mit/s]

Upload progress:   0%|          | 0.00/3.13k [00:00<?, ?it/s]
Upload progress: 100%|██████████| 3.13k/3.13k [00:00<00:00, 4.66Mit/s]



In [32]:
# Query the fine-tuned model on the held-out prompts (temperature=1.0, max_tokens=80).
# NOTE(review): the model id is hard-coded and its timestamp (21-01-50) is earlier
# than the fine-tune created in the cell above (22:41:15); presumably it is a
# model from a previous run. Confirm.
# NOTE(review): the stored responses end with finish_reason "length" and repeat
# the "@@@" marker, so no stop sequence appears to be applied. Check whether
# query_gpt3 supports one.
completions_1 = query_gpt3('ada:ft-lsmoepfl-2022-09-01-21-01-50', test_prompts, max_tokens=80, temperature=1.0)

In [42]:
# Show the first held-out prompt.
test_prompts.iloc[0]['prompt']

'What is a molecule pi-pi* transition wavelength of 321.0 nm and n-pi* transition wavelength of 424.0 nm###'

In [22]:
# NOTE(review): when the notebook is run top to bottom, `completions` is the list
# of training completion strings built earlier, not a model response. This
# presumably relied on a since-removed cell that queried the model. Confirm and
# restore that query before using `completions_argmax`.
completions_argmax = completions

In [70]:
# Select the dataset rows whose SMILES string matches this one exactly.
data[data['SMILES'] == 'C1=CC=C(/N=N/C2=CC=C(NCCC#N)C=C2)C=C1']

Unnamed: 0,index,SMILES,rate of thermal isomerisation from Z-E in s-1,Solvent used for thermal isomerisation rates,Z PhotoStationaryState,E PhotoStationaryState,E isomer pi-pi* wavelength in nm,Extinction,E isomer n-pi* wavelength in nm,Extinction coefficient in M-1 cm-1,...,CAM-B3LYP/6-31G** DFT E isomer n-pi* wavelength in nm,CAM-B3LYP/6-31G** DFT Z isomer pi-pi* wavelength in nm,CAM-B3LYP/6-31G** DFT Z isomer n-pi* wavelength in nm,BHLYP/6-31G* DFT E isomer pi-pi* wavelength in nm,BHLYP/6-31G* DFT E isomer n-pi* wavelength in nm,BHLYP/6-31G* Z isomer pi-pi* wavelength in nm,BHLYP/6-31G* DFT Z isomer n-pi* wavelength in nm,name,selfies,wavelength_cat


In [33]:
# Display the raw response stored in `completions_1`.
completions_1

{'choices': [<OpenAIObject at 0x2a1eb1180> JSON: {
    "finish_reason": "length",
    "index": 0,
    "logprobs": null,
    "text": "CC1=C(/N=N/C2=CC(F)=CC=C2)C(C)=NO1@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
  },
  <OpenAIObject at 0x2a1ea7d60> JSON: {
    "finish_reason": "length",
    "index": 1,
    "logprobs": null,
    "text": "[H]C6=CC=C(N=C(N=NC7=CC=CC=C7S8S9)T8)C8=C6@@@N#N=C7@@@N#C[H]C=C8@@@C8@@@C@@@C9@@@C@@@C@@@C@@@"
  },
  <OpenAIObject at 0x2a1ea7a90> JSON: {
    "finish_reason": "length",
    "index": 2,
    "logprobs": null,
    "text": "CCN(CCC#N)C(C=C%22)=CC=C%22/N=N/C%23=CC(OC)=CC=C%23@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
  },
  <OpenAIObject at 0x2a1ea75e0> JSON: {
    "finish_reason": "length",
    "index": 3,
    "logprobs": null,
    "text": "FC1=CC=CC(F)=C1/N=N/C2=CC=CC=C2@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
  },
  <OpenAIObject at 0x2a1e42f40> JSON: 

## Let's train models that can predict the wavelengths.

In [43]:
# Featurize every molecule in the dataset once; both wavelength models below
# are fitted on these features.
fragprints = compute_fragprints(data['SMILES'].values)

In [45]:
# Fit the baseline regressor for the E-isomer pi-pi* wavelength on all molecules.
pi_pi_star_targets = data['E isomer pi-pi* wavelength in nm'].values

pi_pi_star_model = GPRBaseline()
pi_pi_star_model.fit(fragprints, pi_pi_star_targets)

  warn(
2022-09-02 07:56:44.010873: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


╒═════════════════════════╤═══════════╤══════════════════╤═════════╤═════════════╤═════════╤═════════╤══════════╕
│ name                    │ class     │ transform        │ prior   │ trainable   │ shape   │ dtype   │    value │
╞═════════════════════════╪═══════════╪══════════════════╪═════════╪═════════════╪═════════╪═════════╪══════════╡
│ GPR.mean_function.c     │ Parameter │ Identity         │         │ True        │ ()      │ float64 │ -0.03669 │
├─────────────────────────┼───────────┼──────────────────┼─────────┼─────────────┼─────────┼─────────┼──────────┤
│ GPR.kernel.variance     │ Parameter │ Softplus         │         │ True        │ ()      │ float64 │ 39.794   │
├─────────────────────────┼───────────┼──────────────────┼─────────┼─────────────┼─────────┼─────────┼──────────┤
│ GPR.likelihood.variance │ Parameter │ Softplus + Shift │         │ True        │ ()      │ float64 │  0.02808 │
╘═════════════════════════╧═══════════╧══════════════════╧═════════╧═════════════╧══════

In [52]:
# Fit the baseline regressor for the E-isomer n-pi* wavelength, using only the
# rows where that wavelength is present (not NaN).
# A boolean mask selects rows by position, so features and targets stay aligned
# even if `data` does not have a default 0..n-1 index.
has_n_pi_star = data['E isomer n-pi* wavelength in nm'].notna().values

n_pi_star_model = GPRBaseline()
n_pi_star_model.fit(
    fragprints[has_n_pi_star],
    data['E isomer n-pi* wavelength in nm'].values[has_n_pi_star],
)

╒═════════════════════════╤═══════════╤══════════════════╤═════════╤═════════════╤═════════╤═════════╤══════════╕
│ name                    │ class     │ transform        │ prior   │ trainable   │ shape   │ dtype   │    value │
╞═════════════════════════╪═══════════╪══════════════════╪═════════╪═════════════╪═════════╪═════════╪══════════╡
│ GPR.mean_function.c     │ Parameter │ Identity         │         │ True        │ ()      │ float64 │  1.87032 │
├─────────────────────────┼───────────┼──────────────────┼─────────┼─────────────┼─────────┼─────────┼──────────┤
│ GPR.kernel.variance     │ Parameter │ Softplus         │         │ True        │ ()      │ float64 │ 35.2361  │
├─────────────────────────┼───────────┼──────────────────┼─────────┼─────────────┼─────────┼─────────┼──────────┤
│ GPR.likelihood.variance │ Parameter │ Softplus + Shift │         │ True        │ ()      │ float64 │  0.11178 │
╘═════════════════════════╧═══════════╧══════════════════╧═════════╧═════════════╧══════

In [54]:
def predict(smiles):
    """Predict both E-isomer transition wavelengths for one SMILES string.

    The molecule is featurized with ``compute_fragprints`` and passed to the
    two fitted models.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        Tuple ``(pi_pi_star, n_pi_star)``, where each entry is the first
        element of the corresponding model's ``predict`` output.
    """
    features = compute_fragprints([smiles])
    pi_pi_star_prediction = pi_pi_star_model.predict(features)[0]
    n_pi_star_prediction = n_pi_star_model.predict(features)[0]
    return pi_pi_star_prediction, n_pi_star_prediction

In [71]:
# Spot-check the two wavelength models on a single molecule; the result is a
# (pi-pi*, n-pi*) pair.
predict('C1=CC=C(/N=N/C2=CC=C(NCCC#N)C=C2)C=C1')

(array([390.91004025]), array([446.54990223]))

In [72]:
# Show the second-to-last held-out prompt.
test_prompts['prompt'].values[-2]

'What is a molecule pi-pi* transition wavelength of 404.0 nm###'