In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.model_selection import train_test_split

from gptchem.data import get_mof_solvent_data
from gptchem.formatter import MOFSolventRecommenderFormatter
from gptchem.querier import Querier
from gptchem.tuner import Tuner

In [3]:
data = get_mof_solvent_data()

In [4]:
solvent_columns = ["solvent1", "solvent2", "solvent3", "solvent4", "solvent5"]
solvent_molrations = [
    "sol_molratio1",
    "sol_molratio2",
    "sol_molratio3",
    "sol_molratio4",
    "sol_molratio5",
]
metal = ["core_All_Metals"]
counter_ions = ["counterions1"]
linkers = ["linker_1", "linker_2"]

In [5]:
recommender = MOFSolventRecommenderFormatter(
    linker_columns=["linker_1", "linker_2"],
    node_columns=["core_All_Metals"],
    counter_ion_columns=["counterions1"],
    solvent_columns=["solvent1", "solvent2", "solvent3", "solvent4", "solvent5"],
    solvent_mol_ratio_columns=[
        "sol_molratio1",
        "sol_molratio2",
        "sol_molratio3",
        "sol_molratio4",
        "sol_molratio5",
    ],
)

In [6]:
formatted = recommender(data)

In [7]:
formatted

Unnamed: 0,prompt,completion,label,representation,solvents,solvent_mol_ratios
0,What solvent shall I use to make a metal-organ...,0.53 CN(C)C=O and 0.18 C(CO)O and 0.28 O@@@,0.53 CN(C)C=O and 0.18 C(CO)O and 0.28 O,[[[O-]C(=O)c1cc([N][N]c2cc(cc(c2)C(=O)O)C(=O)O...,"[CN(C)C=O, C(CO)O, O, nan, nan]","[0.532, 0.184, 0.284, nan, nan]"
1,What solvent shall I use to make a metal-organ...,0.54 CN(C)C=O and 0.46 O@@@,0.54 CN(C)C=O and 0.46 O,[[O=C(c1cncc(c1)C(=O)Nc1ccc2c(c1)ccc(c2)C(=O)[...,"[CN(C)C=O, O, nan, nan, nan]","[0.539, 0.461, nan, nan, nan]"
2,What solvent shall I use to make a metal-organ...,1.0 O@@@,1.0 O,"[[[N]1C=NN=C1, nan], [Zn], [F], [O, nan, nan, ...","[O, nan, nan, nan, nan]","[1.0, nan, nan, nan, nan]"
3,What solvent shall I use to make a metal-organ...,0.56 CN(C)C=O and 0.34 O and 0.1 CCO@@@,0.56 CN(C)C=O and 0.34 O and 0.1 CCO,"[[[O-]C(=O)c1ccc(cc1)c1cccc(c1)C(=O)[O-], nan]...","[CN(C)C=O, O, CCO, nan, nan]","[0.556, 0.339, 0.105, nan, nan]"
4,What solvent shall I use to make a metal-organ...,0.38 CN(C)C=O and 0.3 CCO and 0.32 O@@@,0.38 CN(C)C=O and 0.3 CCO and 0.32 O,[[[O-]C(=O)c1cc([N][N]c2cc(cc(c2)C(=O)[O-])C(=...,"[CN(C)C=O, CCO, O, nan, nan]","[0.377, 0.3, 0.323, nan, nan]"
...,...,...,...,...,...,...
431,What solvent shall I use to make a metal-organ...,1.0 CO@@@,1.0 CO,[[Cc1c(c2c(C)c(c3ccncc3)c(c(c2C)c2ccncc2)C)c(C...,"[CO, nan, nan, nan, nan]","[1.0, nan, nan, nan, nan]"
432,What solvent shall I use to make a metal-organ...,1.0 O@@@,1.0 O,[[[O-]C(=O)C1C(C(=O)[O-])C(C1C(=O)[O-])C(=O)[O...,"[O, nan, nan, nan, nan]","[1.0, nan, nan, nan, nan]"
433,What solvent shall I use to make a metal-organ...,1.0 O@@@,1.0 O,"[[[O]P(=O)(C(P(=O)([O])[O])(Cc1cccnc1)O)[O], n...","[O, nan, nan, nan, nan]","[1.0, nan, nan, nan, nan]"
434,What solvent shall I use to make a metal-organ...,1.0 O@@@,1.0 O,"[[[O-]C(=O)c1cccc(c1)C(=O)[O-], c1ccc(nc1)c1cc...","[O, nan, nan, nan, nan]","[1.0, nan, nan, nan, nan]"


In [8]:
train, test = train_test_split(formatted, train_size=100, test_size=50, random_state=42)

In [9]:
train

Unnamed: 0,prompt,completion,label,representation,solvents,solvent_mol_ratios
382,What solvent shall I use to make a metal-organ...,0.82 CN(C)C=O and 0.18 O@@@,0.82 CN(C)C=O and 0.18 O,[[[O-]C(=O)c1cc(C#Cc2ccc(cc2)C#Cc2cc(cc(c2)C(=...,"[CN(C)C=O, O, nan, nan, nan]","[0.824, 0.176, nan, nan, nan]"
15,What solvent shall I use to make a metal-organ...,0.34 CN(C)C=O and 0.66 CO@@@,0.34 CN(C)C=O and 0.66 CO,[[[O-]C(=O)c1cc(cc(c1)C(=O)[O-])N(Cc1ccc(cc1)C...,"[CN(C)C=O, CO, nan, nan, nan]","[0.345, 0.655, nan, nan, nan]"
404,What solvent shall I use to make a metal-organ...,1.0 C1CCOC1@@@,1.0 C1CCOC1,"[[[C]#N, nan], [Cu], [CN], [C1CCOC1, nan, nan,...","[C1CCOC1, nan, nan, nan, nan]","[1.0, nan, nan, nan, nan]"
126,What solvent shall I use to make a metal-organ...,0.6 CN(C)C=O and 0.4 CCO@@@,0.6 CN(C)C=O and 0.4 CCO,"[[[O-]C(=O)c1ccncc1, nan], [Cu,Mn], [I], [CN(C...","[CN(C)C=O, CCO, nan, nan, nan]","[0.601, 0.399, nan, nan, nan]"
0,What solvent shall I use to make a metal-organ...,0.53 CN(C)C=O and 0.18 C(CO)O and 0.28 O@@@,0.53 CN(C)C=O and 0.18 C(CO)O and 0.28 O,[[[O-]C(=O)c1cc([N][N]c2cc(cc(c2)C(=O)O)C(=O)O...,"[CN(C)C=O, C(CO)O, O, nan, nan]","[0.532, 0.184, 0.284, nan, nan]"
...,...,...,...,...,...,...
109,What solvent shall I use to make a metal-organ...,0.6 CN(C)C=O and 0.4 CCO@@@,0.6 CN(C)C=O and 0.4 CCO,[[[O-]C(=O)c1ccc(cc1)c1cc(C)c(cc1C)c1ccc(cc1)C...,"[CN(C)C=O, CCO, nan, nan, nan]","[0.601, 0.399, nan, nan, nan]"
430,What solvent shall I use to make a metal-organ...,1.0 CN(C)C=O@@@,1.0 CN(C)C=O,[[[O-]C(=O)c1cc(cc(c1)C(=O)[O-])n1nnc(c1)c1ccc...,"[CN(C)C=O, nan, nan, nan, nan]","[1.0, nan, nan, nan, nan]"
145,What solvent shall I use to make a metal-organ...,0.34 CN(C)C=O and 0.66 CO@@@,0.34 CN(C)C=O and 0.66 CO,"[[[O-]C(=O)c1ccc2c(c1)ccc(c2)C(=O)[O-], nan], ...","[CN(C)C=O, CO, nan, nan, nan]","[0.345, 0.655, nan, nan, nan]"
237,What solvent shall I use to make a metal-organ...,0.68 O and 0.32 CN(C)C=O@@@,0.68 O and 0.32 CN(C)C=O,[[[O-]C(=O)c1ccc(cc1)[Si](c1ccc(cc1)C(=O)[O-])...,"[O, CN(C)C=O, nan, nan, nan]","[0.681, 0.319, nan, nan, nan]"


In [10]:
tuner = Tuner(n_epochs=8, learning_rate_multiplier=0.02, wandb_sync=False)
tuner(train)

Upload progress: 100%|██████████| 47.9k/47.9k [00:00<00:00, 29.5Mit/s]


Uploaded file from /Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/03_classification/mof_solvent/out/20230129_170452/train.jsonl: file-s37DfGBk4NmAsfFqhHeOCKvj


ValueError: Fine tuning failed. Result: {
  "created_at": 1675008295,
  "events": [
    {
      "created_at": 1675008295,
      "level": "info",
      "message": "Created fine-tune: ft-M6P04P1O1cIWZkQrJTFQj5Ng",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "learning_rate_multiplier": 0.02,
    "n_epochs": 8,
    "prompt_loss_weight": 0.01
  },
  "id": "ft-M6P04P1O1cIWZkQrJTFQj5Ng",
  "model": "ada",
  "object": "fine-tune",
  "organization_id": "org-TFRJXw3PPQocOWbu71eI2t9U",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 47892,
      "created_at": 1675008294,
      "filename": "/Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/03_classification/mof_solvent/out/20230129_170452/train.jsonl",
      "id": "file-s37DfGBk4NmAsfFqhHeOCKvj",
      "object": "file",
      "purpose": "fine-tune",
      "status": "uploaded",
      "status_details": null
    }
  ],
  "updated_at": 1675008295,
  "validation_files": []
}.

In [None]:
querier = Querier.from_preset(tune_res["model_name"])
completions = querier(test)

NameError: name 'tune_res' is not defined

In [None]:
import re

In [None]:
s = "0.53 CN(C)C=O and 0.18 C(CO)O and 0.28 O@@@"

In [None]:
find_solvents = re.compile(r"(\d+\.\d+)(\s[\w\(\)=\@]+)")

In [18]:
find_solvents.findall(s)

[('0.53', ' CN(C)C=O'), ('0.18', ' C(CO)O'), ('0.28', ' O@@@')]