In [1]:
%load_ext autoreload
%autoreload 2

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split

from gptchem.data import get_matbench_is_metal, get_photoswitch_data
from gptchem.extractor import (
    FewShotClassificationExtractor,
    FewShotRegressionExtractor,
    RegressionExtractor,
)
from gptchem.formatter import FewShotFormatter
from gptchem.querier import Querier

In [3]:
PROMPT_TEMPLATE = """Does {smiles} have a large transition wavelength?

Examples:
---------
{examples}
"""

In [4]:
data = get_photoswitch_data()

In [6]:
data = data.dropna(subset=["E isomer pi-pi* wavelength in nm", "SMILES"])

In [9]:
data["transition_wavelength"] = pd.qcut(
    data["E isomer pi-pi* wavelength in nm"], q=2, labels=[0, 1]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['transition_wavelength'] = pd.qcut(


In [12]:
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [13]:
def create_example_string(
    data,
    num_examples: int = 10,
    representation_col: str = "SMILES",
    value_col: str = "transition_wavelength",
):
    examples = []
    for i, row in data.sample(num_examples).iterrows():
        examples.append(f"- {row[representation_col]}: {row[value_col]}")
    return "\n".join(examples)

In [14]:
print(create_example_string(data))

- CCN(CC)C1=CC=C(/N=N/C2=CC(C#N)=CC=C2)C=C1: 1
- OC%14=C%15N=CC=CC%15=C(/N=N/C%16=NC(C%17=CC=CC=C%17)=CS%16)C=C%14: 1
- CC(C=C(N(CCC#N)CCO)C=C1)=C1/N=N/C2=CC(F)=CC=C2: 1
- CC1=CC(/N=N/C2=CC=CC=C2)=CC=C1: 0
- CC1=NOC(C)=C1/N=N/C2=CC(NC(C)=O)=CC=C2: 0
- FC1=CC=C(/N=N/C2=CC=C(F)C=C2)C=C1: 0
- NC1=CC=C(/N=N/C2=CC=C([N+]([O-])=O)C=C2)C=C1: 1
- BrC1=CC=CC(/N=N/C2=CC=CC(Br)=C2)=C1: 0
- OC1=CC=C(/N=N/C2=CC=CC=C2)C=C1: 0
- CN(C)C(C=C1)=CC=C1/N=N/C2=CC=C(N(C)C)C=C2: 1


In [16]:
prompt = PROMPT_TEMPLATE.format(
    smiles=test["SMILES"].iloc[2], examples=create_example_string(train, 50)
)

In [18]:
print(prompt)

Does FC1=C(F)C=C(F)C(F)=C1/N=N/C2=C(F)C(F)=CC(F)=C2F have a large transition wavelength?

Examples:
---------
- CC(C=C(N(CCC#N)CCO)C=C1)=C1/N=N/C2=CC=CC=C2: 1
- CC9=CC(NCCC#N)=CC=C9/N=N/C%10=CC=CC=C%10: 1
- CC%15=CC(N(CCO)CCC#N)=CC=C%15/N=N/C%16=CC=CC=C%16: 1
- [H]N(C)C(C=C1)=CC=C1/N=N/C2=CC=C([N+]([O-])=O)C=C2: 1
- O=C(C)NC(C=C%31)=CC=C%31N=NC%32=NC%33=CC=C([N+]([O-])=O)C=C%33S%32: 1
- [H]C7=CC([N+]([O-])=O)=CC(C#N)=C7/N=N/C8=CC(OC)=C(C=C8)N(CC)CC: 1
- [H]C1=CC([N+]([O-])=O)=CC(C#N)=C1/N=N/C2=CC([H])=C(C=C2[H])N(CC)CC: 1
- CN(C)C1=CC=C(/N=N/C2=CC=CC=C2)C=C1: 1
- Sc1[nH]nc(n1)N=Nc1c(C)n(c2c1cccc2)C: 1
- CC1=CC=C(/N=N/C2=CC=CS2)C=C1: 0
- ClC%11=CC([N+]([O-])=O)=CC(C#N)=C%11/N=N/C%12=CC(OC)=C(C=C%12)N(CC)CC: 1
- O=[N+]([O-])C1=CC=C(/N=N/C2=CC=C(NCCC#N)C=C2)C=C1: 1
- CC1=C(C(C)=NN1)/N=N/C2=CC(C(O)=O)=CC=C2: 0
- CC(C=C(N(CCC#N)CCO)C=C1)=C1/N=N/C2=C(C(F)(F)F)C=CC=C2: 1
- NC1=CC=C(/N=N/C2=CC=C(N(CC)CC)C=C2)C=C1: 1
- OC1=C(/N=N/C2=CC=C(NC(C)=O)C=C2)C=C(C)C=C1: 0
- CSC(C=C%10)=CC=C%10N=NC%11=N

In [19]:
querier = Querier("ada", max_tokens=600)

In [21]:
formatter = FewShotFormatter(
    train.sample(10),
    "transition wavelengths of photoswitch molecules",
    "SMILES",
    "transition_wavelength",
)

In [23]:
formatter

gptchem.formatter.FewShotFormatter(representation_column='SMILES', label_column='transition_wavelength', property_name='transition wavelengths of photoswitch molecules')

In [24]:
formatter(train)

Unnamed: 0,prompt,completion,label,representation
0,I am a highly intelligent question answering b...,1,1,CCN(CCC#N)C(C=C%19)=CC=C%19/N=N/C%20=CC=CC=C%20
1,I am a highly intelligent question answering b...,0,0,CC1=C(C(C)=NN1)/N=N/C2=C(OC)C=CC=C2
2,I am a highly intelligent question answering b...,1,1,CC(C=C(N(CCC#N)CCO)C=C1)=C1/N=N/C2=CC=C([N+]([...
3,I am a highly intelligent question answering b...,0,0,ClC(C=C%13)=CC=C%13N=NC%14=NC%15=CC=C([N+]([O-...
4,I am a highly intelligent question answering b...,1,1,[H]C1=CC([N+]([O-])=O)=CC(C#N)=C1/N=N/C2=CC([H...
...,...,...,...,...
307,I am a highly intelligent question answering b...,0,0,CC1=C(C(C)=NN1)/N=N/C2=CC(NC(C)=O)=CC=C2
308,I am a highly intelligent question answering b...,0,0,C1(/N=N/C2=CC=CC=C2)=CC=CC=C1
309,I am a highly intelligent question answering b...,1,1,N#CCCNC(C=C1)=CC=C1/N=N/C2=CC=CC=C2C#N
310,I am a highly intelligent question answering b...,1,1,[H]C%25=CC=C(N=C(N=NC%26=CC=C(SC)C=C%26)S%27)C...


In [26]:
prompt = formatter(test.sample(10))

In [27]:
prompt

Unnamed: 0,prompt,completion,label,representation
0,I am a highly intelligent question answering b...,1,1,O=[N+]([O-])C(C=C1)=CC=C1/N=N/C2=CC=C(NC3=CC=C...
1,I am a highly intelligent question answering b...,1,1,CCc1nnc(s1)N=Nc1c2ccccc2n(c1C)C
2,I am a highly intelligent question answering b...,1,1,CC1=C(/N=N/C2=NN=C(CC)S2)C3=CC=CC=C3N1
3,I am a highly intelligent question answering b...,0,0,ClC1=CC=CC(/N=N/C2=CC=CC(Cl)=C2)=C1
4,I am a highly intelligent question answering b...,1,1,ClC1=CC=C(/N=N/C2=CC=C(NCCC#N)C=C2)C=C1
5,I am a highly intelligent question answering b...,1,1,[H]C5=CC([N+]([O-])=O)=CC([N+]([O-])=O)=C5/N=N...
6,I am a highly intelligent question answering b...,1,1,FC1=CC=C(/N=N/C2=CC=C(NCCC#N)C=C2)C=C1
7,I am a highly intelligent question answering b...,1,1,O=C(C)NC1=CC=C(/N=N/C2=CC=C(NCCC#N)C=C2)C=C1
8,I am a highly intelligent question answering b...,0,0,BrC1=CC=C(/N=N/C2=CC=C(Br)C=C2)C=C1
9,I am a highly intelligent question answering b...,0,0,CC1=C(C(C)=NN1)/N=N/C2=CC(Br)=CC=C2


In [28]:
print(prompt.iloc[0]["prompt"])

I am a highly intelligent question answering bot that answers questions about transition wavelengths of photoswitch molecules.

Q: [H]C7=CC([N+]([O-])=O)=CC(C#N)=C7/N=N/C8=CC(OC)=C(C=C8)N(CC)CC
A: 1

Q: [H]N(CC)C(C=C3)=CC=C3/N=N/C4=CC=CC=C4
A: 1

Q: CC(C=C(N(CCC#N)CCO)C=C1)=C1/N=N/C2=CC=C([N+]([O-])=O)C=C2
A: 1

Q: CC(C=C(N(CCC#N)CCO)C=C1)=C1/N=N/C2=CC=C(C(C)=O)C=C2
A: 1

Q: CCN(CC)C(C=C1)=CC=C1/N=N/C2=C(C#N)C=C(C#N)C=C2
A: 1

Q: CC1=NOC(C)=C1/N=N/C2=CC(C(F)(F)F)=CC=C2
A: 0

Q: CC(NC1=CC=C(/N=N/C2=CC=CC=C2)C=C1)=O
A: 0

Q: CC1=CC=CC=C1/N=N/C2=CC=C(NCCC#N)C=C2
A: 1

Q: CC1=NOC(C)=C1/N=N/C2=CC=C(OC)C=C2
A: 0

Q: CC1=C(C(C)=NN1)/N=N/C2=CC(F)=CC=C2
A: 0

Q: O=[N+]([O-])C(C=C1)=CC=C1/N=N/C2=CC=C(NC3=CC=CC=C3)C=C2


In [33]:
import openai

In [41]:
querier = Querier("gpt-4", max_tokens=100)
completions = querier(prompt)

Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)

In [37]:
completions["choices"][0]

['\nA: 0\n\nQ: CC1=CC=CC=C1/N=N/C2=CC=C(NC3=CC=CC=C3)C=C2\nA: 0\n\nQ: CC1=CC=CC=C1/N=N/C2=CC=C(NC3=CC=CC=C3)C=C1\nA: 0\n\nQ: CC1=CC=CC',
 '=C2\nA: 1\n\nQ: CC1=C(C=C1/N=N/C2=CC=C(C#N)C=C2)=C1/N=N/C2=CC=C(C#N)C=C2\nA: 1\n\nQ: CC1=C(C=C1/N=N/C2=CC=C(C#N)C=C2',
 '=CC=C3/N=N/C4=CC=CC=C4\nA: 1\n\nQ: CC1=C(/N=N/C2=NN=C(CC)S2)C3=CC=CC=C3N1=CC=C3/N=N/C4=CC=CC=C4\nA: 1\n\nQ: CC1=C(/N=N/C2=',
 '/N=N/C2=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=CC=',
 ')=O\nA: 0\n\nQ: CC(C=C(N(CCC#N)CCO)C=C1)=C1/N=N/C2=CC=C(C(C)=O)C=C2\nA: 1\n\nQ: CC(C=C(N(CCC#N)CCO)C=C1)=C1/N=N/C2=CC=C(C(C)=',
 'O\nA: 1\n\nQ: CC(C=C(N(CCC#N)CCO)C=C1)=C1/N=N/C2=CC=C(C(C)=O)C=C2\nA: 1\n\nQ: CC(C=C(N(CCC#N)CCO)C=C1)=C1/N=N/C2=CC=C(C(C)=O',
 '\nA: 0\n\nQ: CC(C=C(N(CCC#N)CCO)C=C1)=C1/N=N/C2=CC=C(C(C)=O)C=C2\nA: 1\n\nQ: CC(C=C(N(CCC#N)CCO)C=C1)=C1/N=N/C2=CC=C(C(C)=O)',
 ')=O\nA: 0\n\nQ: CC(C=C(N(CCC#N)CCO)C=C1)=C1/N=N/C2=CC=C(C(C)=O)C=C2\nA: 1\

In [38]:
test[test["SMILES"] == "O=[N+]([O-])C(C=C1)=CC=C1/N=N/C2=CC=C(NC3=CC=CC=C3)C=C2"]

Unnamed: 0,SMILES,rate of thermal isomerisation from Z-E in s-1,Solvent used for thermal isomerisation rates,Z PhotoStationaryState,E PhotoStationaryState,E isomer pi-pi* wavelength in nm,Extinction,E isomer n-pi* wavelength in nm,Extinction coefficient in M-1 cm-1,Z isomer pi-pi* wavelength in nm,...,CAM-B3LYP/6-31G** DFT Z isomer n-pi* wavelength in nm,BHLYP/6-31G* DFT E isomer pi-pi* wavelength in nm,BHLYP/6-31G* DFT E isomer n-pi* wavelength in nm,BHLYP/6-31G* Z isomer pi-pi* wavelength in nm,BHLYP/6-31G* DFT Z isomer n-pi* wavelength in nm,name,selfies,wavelength_cat,inchi,transition_wavelength
194,O=[N+]([O-])C(C=C1)=CC=C1/N=N/C2=CC=C(NC3=CC=C...,,,,,483.0,,,,,...,,,,,,4-(4-nitrophenyl)diazenyl-N-phenylaniline,[O][=N+1][Branch1][C][O-1][C][Branch1][Ring1][...,large,InChI=1S/C18H14N4O2/c23-22(24)18-12-10-17(11-1...,1
