In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to imported source are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.model_selection import train_test_split

from gptchem.data import get_mof_solvent_data
from gptchem.formatter import MOFSolventRecommenderFormatter
from gptchem.querier import Querier
from gptchem.tuner import Tuner

In [3]:
# Load the MOF solvent dataset through gptchem's data helper.
data = get_mof_solvent_data()

In [4]:
# Column names of the dataset, grouped by the role each column plays.
# The solvent and mole-ratio columns are numbered 1 through 5.
solvent_columns = [f"solvent{slot}" for slot in range(1, 6)]
solvent_molrations = [f"sol_molratio{slot}" for slot in range(1, 6)]  # (sic) name kept as-is

# Single-column groups for the metal node and the counter ion.
metal = ["core_All_Metals"]
counter_ions = ["counterions1"]

# Two linker columns.
linkers = [f"linker_{slot}" for slot in (1, 2)]

In [5]:
# Build the prompt/completion formatter from the column groups defined in the
# previous cell rather than repeating the same literals here, so the two
# definitions cannot drift apart.
recommender = MOFSolventRecommenderFormatter(
    linker_columns=linkers,
    node_columns=metal,
    counter_ion_columns=counter_ions,
    solvent_columns=solvent_columns,
    solvent_mol_ratio_columns=solvent_molrations,
)

In [12]:
# Run the formatter over the raw data. The result (displayed in the next cell)
# carries prompt, completion, label, representation, solvents and
# solvent_mol_ratios columns.
formatted = recommender(data)

In [13]:
# Display the formatted frame to sanity-check prompts and completions.
formatted

Unnamed: 0,prompt,completion,label,representation,solvents,solvent_mol_ratios
0,In what solvent will [O-]C(=O)c1cc([N][N]c2cc(...,0.53 CN(C)C=O and 0.18 C(CO)O and 0.28 O@@@,0.53 CN(C)C=O and 0.18 C(CO)O and 0.28 O,[[[O-]C(=O)c1cc([N][N]c2cc(cc(c2)C(=O)O)C(=O)O...,"[CN(C)C=O, C(CO)O, O, nan, nan]","[0.532, 0.184, 0.284, nan, nan]"
1,In what solvent will O=C(c1cncc(c1)C(=O)Nc1ccc...,0.54 CN(C)C=O and 0.46 O@@@,0.54 CN(C)C=O and 0.46 O,[[O=C(c1cncc(c1)C(=O)Nc1ccc2c(c1)ccc(c2)C(=O)[...,"[CN(C)C=O, O, nan, nan, nan]","[0.539, 0.461, nan, nan, nan]"
2,In what solvent will [N]1C=NN=C1 and ZnF react...,1.0 O@@@,1.0 O,"[[[N]1C=NN=C1, nan], [Zn], [F], [O, nan, nan, ...","[O, nan, nan, nan, nan]","[1.0, nan, nan, nan, nan]"
3,In what solvent will [O-]C(=O)c1ccc(cc1)c1cccc...,0.56 CN(C)C=O and 0.34 O and 0.1 CCO@@@,0.56 CN(C)C=O and 0.34 O and 0.1 CCO,"[[[O-]C(=O)c1ccc(cc1)c1cccc(c1)C(=O)[O-], nan]...","[CN(C)C=O, O, CCO, nan, nan]","[0.556, 0.339, 0.105, nan, nan]"
4,In what solvent will [O-]C(=O)c1cc([N][N]c2cc(...,0.38 CN(C)C=O and 0.3 CCO and 0.32 O@@@,0.38 CN(C)C=O and 0.3 CCO and 0.32 O,[[[O-]C(=O)c1cc([N][N]c2cc(cc(c2)C(=O)[O-])C(=...,"[CN(C)C=O, CCO, O, nan, nan]","[0.377, 0.3, 0.323, nan, nan]"
...,...,...,...,...,...,...
431,In what solvent will Cc1c(c2c(C)c(c3ccncc3)c(c...,1.0 CO@@@,1.0 CO,[[Cc1c(c2c(C)c(c3ccncc3)c(c(c2C)c2ccncc2)C)c(C...,"[CO, nan, nan, nan, nan]","[1.0, nan, nan, nan, nan]"
432,In what solvent will [O-]C(=O)C1C(C(=O)[O-])C(...,1.0 O@@@,1.0 O,[[[O-]C(=O)C1C(C(=O)[O-])C(C1C(=O)[O-])C(=O)[O...,"[O, nan, nan, nan, nan]","[1.0, nan, nan, nan, nan]"
433,In what solvent will [O]P(=O)(C(P(=O)([O])[O])...,1.0 O@@@,1.0 O,"[[[O]P(=O)(C(P(=O)([O])[O])(Cc1cccnc1)O)[O], n...","[O, nan, nan, nan, nan]","[1.0, nan, nan, nan, nan]"
434,In what solvent will [O-]C(=O)c1cccc(c1)C(=O)[...,1.0 O@@@,1.0 O,"[[[O-]C(=O)c1cccc(c1)C(=O)[O-], c1ccc(nc1)c1cc...","[O, nan, nan, nan, nan]","[1.0, nan, nan, nan, nan]"


In [14]:
# Fixed-size split: 100 rows for fine-tuning and 50 rows for evaluation,
# with a fixed random_state so the same rows are drawn on every run.
train, test = train_test_split(formatted, train_size=100, test_size=50, random_state=546)

In [15]:
# Display the training subset selected by the split.
train

Unnamed: 0,prompt,completion,label,representation,solvents,solvent_mol_ratios
67,In what solvent will Nn1c(nnc1c1cccc(c1)c1ccnc...,0.4 CO and 0.6 O@@@,0.4 CO and 0.6 O,[[Nn1c(nnc1c1cccc(c1)c1ccncc1)c1cccc(c1)c1ccnc...,"[CO, O, nan, nan, nan]","[0.4, 0.6, nan, nan, nan]"
433,In what solvent will [O]P(=O)(C(P(=O)([O])[O])...,1.0 O@@@,1.0 O,"[[[O]P(=O)(C(P(=O)([O])[O])(Cc1cccnc1)O)[O], n...","[O, nan, nan, nan, nan]","[1.0, nan, nan, nan, nan]"
329,In what solvent will C(Cc1ccncc1)Cc1ccncc1%2C ...,1.0 O@@@,1.0 O,"[[C(Cc1ccncc1)Cc1ccncc1, Oc1cc(cc(c1)C(=O)[O-]...","[O, nan, nan, nan, nan]","[1.0, nan, nan, nan, nan]"
301,In what solvent will [O-]C(=O)c1cc(cc(c1)n1cnc...,0.45 O and 0.55 CCO@@@,0.45 O and 0.55 CCO,"[[[O-]C(=O)c1cc(cc(c1)n1cncc1)n1cncc1, nan], [...","[O, CCO, nan, nan, nan]","[0.446, 0.554, nan, nan, nan]"
218,In what solvent will C1=N[CH]N([N]1)Cc1ccc(cc1...,1.0 O@@@,1.0 O,[[C1=N[CH]N([N]1)Cc1ccc(cc1)c1ccc(cc1)Cn1cncn1...,"[O, nan, nan, nan, nan]","[1.0, nan, nan, nan, nan]"
...,...,...,...,...,...,...
17,In what solvent will Cl[Mn]123(Cl)[N]4=C5C=CC4...,0.27 CCN(CC)C=O and 0.73 CO@@@,0.27 CCN(CC)C=O and 0.73 CO,[[Cl[Mn]123(Cl)[N]4=C5C=CC4=C(c4n2c(C(=C2[N]1=...,"[CCN(CC)C=O, CO, nan, nan, nan]","[0.268, 0.732, nan, nan, nan]"
394,In what solvent will O=C(c1ccncc1)Nc1cc(cc(c1)...,1.0 CN(C)C=O@@@,1.0 CN(C)C=O,[[O=C(c1ccncc1)Nc1cc(cc(c1)C(=O)[O-])C(=O)[O-]...,"[CN(C)C=O, nan, nan, nan, nan]","[1.0, nan, nan, nan, nan]"
165,In what solvent will [O-]C(=O)CN(C1=N[C](=O)=[...,1.0 O@@@,1.0 O,[[[O-]C(=O)CN(C1=N[C](=O)=[N]=C([N]1)N(CP(=O)(...,"[O, nan, nan, nan, nan]","[1.0, nan, nan, nan, nan]"
189,In what solvent will [O-]C(=O)c1ccc(cc1)C(=O)[...,1.0 O@@@,1.0 O,"[[[O-]C(=O)c1ccc(cc1)C(=O)[O-], n1ccc(cc1)CCc1...","[O, nan, nan, nan, nan]","[1.0, nan, nan, nan, nan]"


In [16]:
# Show the full prompt text of the first training row (the frame display truncates it).
train.iloc[0]["prompt"]

'In what solvent will Nn1c(nnc1c1cccc(c1)c1ccncc1)c1cccc(c1)c1ccncc1 and CdBF4 react?###'

In [17]:
# Configure the fine-tuning job: 8 epochs, learning-rate multiplier 0.02,
# and wandb syncing switched off.
tuner = Tuner(n_epochs=8, learning_rate_multiplier=0.02, wandb_sync=False)

In [18]:
# Keep the tuner's return value: the querying cell further down reads
# tune_res["model_name"] to select the fine-tuned model, and fails with a
# NameError if the result is discarded here.
tune_res = tuner(train)

Upload progress: 100%|██████████| 43.4k/43.4k [00:00<00:00, 34.0Mit/s]


Uploaded file from /Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/03_classification/mof_solvent/out/20230130_104520/train.jsonl: file-c98F86tR5PzXUjwcxhOgMEi1


ValueError: Fine tuning failed. Result: {
  "created_at": 1675071926,
  "events": [
    {
      "created_at": 1675071926,
      "level": "info",
      "message": "Created fine-tune: ft-5ZXxmaNT8bEtrjgifRvFwX2H",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "learning_rate_multiplier": 0.02,
    "n_epochs": 8,
    "prompt_loss_weight": 0.01
  },
  "id": "ft-5ZXxmaNT8bEtrjgifRvFwX2H",
  "model": "ada",
  "object": "fine-tune",
  "organization_id": "org-TFRJXw3PPQocOWbu71eI2t9U",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 43437,
      "created_at": 1675071924,
      "filename": "/Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/03_classification/mof_solvent/out/20230130_104520/train.jsonl",
      "id": "file-c98F86tR5PzXUjwcxhOgMEi1",
      "object": "file",
      "purpose": "fine-tune",
      "status": "uploaded",
      "status_details": null
    }
  ],
  "updated_at": 1675071926,
  "validation_files": []
}.

In [None]:
# Build a querier for the model named in tune_res["model_name"] and run it on
# the held-out test rows.
# Requires `tune_res` to exist in the kernel and to provide a "model_name" entry.
querier = Querier.from_preset(tune_res["model_name"])
completions = querier(test)

NameError: name 'tune_res' is not defined

In [None]:
import re

In [None]:
# Sample completion string (three ratio/solvent pairs ending in "@@@") used
# to try out the parsing regex.
s = "0.53 CN(C)C=O and 0.18 C(CO)O and 0.28 O@@@"

In [None]:
# Pattern for "<ratio> <solvent>" pairs in a completion string:
#   group 1: a decimal number (digits, a dot, digits)
#   group 2: one whitespace character followed by a run of word characters,
#            parentheses, "=" or "@"
# Group 2 therefore keeps its leading space, and because "@" is in the
# character class a trailing "@@@" is captured as part of the last solvent.
# NOTE(review): strip both if clean solvent strings are needed downstream.
find_solvents = re.compile(r"(\d+\.\d+)(\s[\w\(\)=\@]+)")

In [18]:
# Apply the pattern to the sample string; yields one (ratio, solvent) tuple per match.
find_solvents.findall(s)

[('0.53', ' CN(C)C=O'), ('0.18', ' C(CO)O'), ('0.28', ' O@@@')]