# Try inverse design of photoswitches

Let's still use the five bins we had for the forward task and then see if we can generate valid SMILES.

In [1]:
from gpt3forchem.data import get_photoswitch_data
from gpt3forchem.input import create_single_property_forward_prompts
from sklearn.model_selection import train_test_split
from gpt3forchem.api_wrappers import fine_tune, query_gpt3, extract_prediction, ensemble_fine_tune, multiple_query_gpt3
import time
from pycm import ConfusionMatrix
from gpt3forchem.baselines import GPRBaseline, compute_fragprints
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt

# Apply the "science" and "nature" matplotlib style sheets to every figure in this notebook.
# NOTE(review): these style names presumably come from the SciencePlots package — confirm it is installed.
plt.style.use(["science", "nature"])


  warn(
  warn(
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the photoswitch dataset through the project helper.
# The cells below read its 'SMILES', 'E isomer pi-pi* wavelength in nm'
# and 'E isomer n-pi* wavelength in nm' columns.
data = get_photoswitch_data()

We will also need the composition of the molecule, so that the model does not simply memorize the wavelength.

In [3]:
from collections import Counter

from rdkit import Chem

# Worked example on formic acid: add explicit hydrogens so that they are
# counted too, then tally the atoms by element symbol.
mol = Chem.AddHs(Chem.MolFromSmiles('C(=O)O'))
c = Counter([atom.GetSymbol() for atom in mol.GetAtoms()])

In [4]:
# Display the SMILES column (pandas truncates the repr) as a quick sanity check of the loaded data.
data['SMILES']

0                           C[N]1N=NC(=N1)N=NC2=CC=CC=C2
1                           C[N]1C=NC(=N1)N=NC2=CC=CC=C2
2                           C[N]1C=CC(=N1)N=NC2=CC=CC=C2
3                        C[N]1C=C(C)C(=N1)N=NC2=CC=CC=C2
4                           C[N]1C=C(C=N1)N=NC2=CC=CC=C2
                             ...                        
387    OC%38=C%39N=CC=CC%39=C(/N=N/C%40=NC%41=CC(C)=C...
388    OC%42=C%43N=CC=CC%43=C(/N=N/C%44=NC%45=CC=CC=C...
389    N#CC1C(SC(/N=N/C2=NC(C=CC([N+]([O-])=O)=C3)=C3...
390    N#Cc5c(c6ccc(Cl)cc6)c(/N=N/C7=NC(C=CC([N+]([O-...
391    N#CC9C(SC(/N=N/C%10=NC(C=CC([N+]([O-])=O)=C%11...
Name: SMILES, Length: 392, dtype: object

Perhaps the best case for "inverse" design is to use both the $\pi$-$\pi^*$ transition and the $n$-$\pi^*$ transition.
We will simply train on both and see whether the model can then answer a corresponding prompt.

In [5]:
# Prompt used when both wavelengths are known; the two `{}` slots take the
# pi-pi* and the n-pi* wavelength, in that order.
# NOTE(review): the wording reads as if "with a" is missing after "molecule". The string is
# part of the fine-tuning data, so do not change it without retraining.
# The trailing "###" presumably acts as the prompt/completion separator — confirm.
PROMPT_TEMPLATE_w_n_pistar = "What is a molecule pi-pi* transition wavelength of {} nm and n-pi* transition wavelength of {} nm###"

In [6]:
# Prompt used when only the pi-pi* wavelength is available (the n-pi* value is NaN);
# the single `{}` slot takes the pi-pi* wavelength.
# The string is part of the fine-tuning data, so do not change it without retraining.
PROMPT_TEMPLATE = "What is a molecule pi-pi* transition wavelength of {} nm###"

In [7]:
# Completion format: the `{}` slot takes the SMILES string.
# The "@@@" suffix presumably serves as the stop sequence at query time — confirm.
COMPLETION_TEMPLATE = "{}@@@"

In [8]:
# Build one (prompt, completion) pair per row of `data`. The prompt mentions
# the n-pi* wavelength only when that value is present for the molecule;
# the completion is always the molecule's SMILES string.
pi_pi_star_column = 'E isomer pi-pi* wavelength in nm'
n_pi_star_column = 'E isomer n-pi* wavelength in nm'

prompts = []
completions = []

for row_index, row in data.iterrows():
    pi_pi_star = row[pi_pi_star_column]
    n_pi_star = row[n_pi_star_column]
    has_n_pi_star = not np.isnan(n_pi_star)

    prompt = (
        PROMPT_TEMPLATE_w_n_pistar.format(pi_pi_star, n_pi_star)
        if has_n_pi_star
        else PROMPT_TEMPLATE.format(pi_pi_star)
    )
    completion = COMPLETION_TEMPLATE.format(row['SMILES'])

    prompts.append(prompt)
    completions.append(completion)

In [9]:
# Collect the two parallel lists into a two-column frame.
# NOTE: this rebinds `prompts` from a list to a DataFrame, so the cell is not
# idempotent; re-run the cell that builds the lists before re-running this one.
prompt_columns = {'prompt': prompts, 'completion': completions}
prompts = pd.DataFrame(prompt_columns)

In [14]:
# Hold out 5% of the prompt/completion pairs; the fixed seed keeps the split reproducible.
split_kwargs = {"test_size": 0.05, "random_state": 42}
train_prompts, test_prompts = train_test_split(prompts, **split_kwargs)

In [15]:
import os

# Number of training examples; embedded in BOTH file names so that the
# train and validation files of one run share the same suffix.
train_size = len(train_prompts)

# A timestamp prefix keeps the files of different runs apart.
filename_base = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
train_filename = (
    f"run_files/{filename_base}_train_prompts_photoswitch_inverse_{train_size}.jsonl"
)
test_filename = (
    f"run_files/{filename_base}_test_prompts_photoswitch_inverse_{train_size}.jsonl"
)

# `DataFrame.to_json` does not create missing parent directories, so make
# sure the output folder exists before writing (no-op if it already does).
os.makedirs("run_files", exist_ok=True)

# One JSON object per line (JSONL), each holding the frame's columns for one row.
train_prompts.to_json(train_filename, orient="records", lines=True)
test_prompts.to_json(test_filename, orient="records", lines=True)

In [16]:
# Submit the fine-tuning job, passing the held-out prompts as the validation file.
# Re-running this cell uploads both files again and creates a new fine-tune job (see the log below).
# NOTE(review): the return value is not assigned; presumably `fine_tune` returns the
# fine-tuned model name — confirm, and capture it if later cells need it.
fine_tune(train_filename, valid_file=test_filename)

Uploaded file from run_files/2022-09-01-22-40-59_train_prompts_photoswitch_inverse_372.jsonl: file-14E8ycJFulEto46gTf4pP61K
Uploaded file from run_files/2022-09-01-22-40-59_test_prompts_photoswitch_inverse_372.jsonl: file-ac3IcG3mrSZAOO2V7cHsJLC2
Created fine-tune: ft-ZH0U9kB3bF3ktLgCbn6xI51Y
Streaming events until fine-tuning is complete...

(Ctrl-C will interrupt the stream, but not cancel the fine-tune)
[2022-09-01 22:41:15] Created fine-tune: ft-ZH0U9kB3bF3ktLgCbn6xI51Y

Stream interrupted (client disconnected).
To resume the stream, run:

  openai api fine_tunes.follow -i ft-ZH0U9kB3bF3ktLgCbn6xI51Y

 
Upload progress:   0%|          | 0.00/57.8k [00:00<?, ?it/s]
Upload progress: 100%|██████████| 57.8k/57.8k [00:00<00:00, 29.1Mit/s]

Upload progress:   0%|          | 0.00/3.13k [00:00<?, ?it/s]
Upload progress: 100%|██████████| 3.13k/3.13k [00:00<00:00, 4.66Mit/s]

