# Prepare parameter influence experiments


In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [30]:
from typing import Optional

from pathlib import Path

import pandas as pd
from fastcore.xtras import save_pickle
from sklearn.model_selection import train_test_split

from gptchem.data import get_photoswitch_data
from gptchem.evaluator import evaluate_classification
from gptchem.extractor import ClassificationExtractor
from gptchem.formatter import ClassificationFormatter
from gptchem.querier import Querier
from gptchem.tuner import Tuner

## Prepare data


In [3]:
# Load the photoswitch dataset; the formatter configured below reads its
# "SMILES" and "E isomer pi-pi* wavelength in nm" columns.
df = get_photoswitch_data()

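For now, try a binary classification: the E isomer pi-pi* transition wavelength is split into two classes (`num_classes=2`, `qcut=True`).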


In [4]:
# Binary classification setup: SMILES strings are the model input and the
# E isomer pi-pi* wavelength column is binned into two classes.
formatter = ClassificationFormatter(
    # What is predicted, and how it is named in the prompt.
    property_name="transition wavelength",
    label_column="E isomer pi-pi* wavelength in nm",
    # What the molecule is represented by.
    representation_column="SMILES",
    # How the continuous label is turned into classes.
    qcut=True,
    num_classes=2,
)

In [5]:
# Display the formatter's repr to double-check the configuration.
formatter

gptchem.formatter.ClassificationFormatter(representation_column='SMILES', label_column='E isomer pi-pi* wavelength in nm', property_name='transition wavelength', num_classes=2, qcut=True)

In [6]:
# Apply the formatter to the raw frame. The result carries the "label" column
# that train_test_evaluate stratifies on and evaluates against.
formatted = formatter(df)

Now, let's write a simple function that fine-tunes a model, queries it on a held-out test set, and evaluates the predictions.
We first run it with very little data so that we can at least sanity-check the pipeline before launching it on the full dataset.


In [44]:
def train_test_evaluate(
    formatted: pd.DataFrame,
    train_size: int = 10,
    test_size: int = 10,
    basemodel: str = "ada",
    n_epochs: int = 4,
    learning_rate_multiplier: Optional[float] = None,
    random_state: Optional[int] = None,
) -> dict:
    """Fine-tune a model on a stratified split, query it, and evaluate it.

    The pipeline is: stratified train/test split on ``formatted["label"]`` ->
    fine-tuning with ``Tuner`` -> querying the tuned model with ``Querier``
    (``"classification"`` preset) on the test set -> label extraction with
    ``ClassificationExtractor`` -> scoring with ``evaluate_classification``.
    The resulting summary is also pickled to ``<outdir>/summary.pkl``, where
    ``outdir`` is taken from the tuner's summary.

    Args:
        formatted: Formatted dataset; must contain a ``"label"`` column, which
            is used both for stratification and as ground truth.
        train_size: Forwarded to ``train_test_split`` as ``train_size``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        basemodel: Name of the base model handed to ``Tuner`` as ``base_model``.
        n_epochs: Number of fine-tuning epochs handed to ``Tuner``.
        learning_rate_multiplier: Learning-rate multiplier handed to ``Tuner``;
            ``None`` is passed through unchanged.
        random_state: Seed forwarded to ``train_test_split``. Pass an integer
            to make the split reproducible (e.g. to compare base models on the
            same split); ``None`` keeps the previous unseeded behaviour.

    Returns:
        A dict merging the tuner summary and the evaluation results, plus the
        keys ``"completions"``, ``"train_size"``, ``"test_size"`` and
        ``"random_state"``.

    Raises:
        AssertionError: If the tuner summary's ``"model_name"`` is not a
            string (presumably because the fine-tune did not finish
            successfully -- confirm against ``Tuner``).
    """
    train, test = train_test_split(
        formatted,
        train_size=train_size,
        test_size=test_size,
        stratify=formatted["label"],
        random_state=random_state,
    )

    tuner = Tuner(
        base_model=basemodel,
        n_epochs=n_epochs,
        learning_rate_multiplier=learning_rate_multiplier,
        wandb_sync=False,
    )

    tune_summary = tuner(train)

    # Without a usable model name there is nothing to query; fail here with
    # context instead of deeper inside the querier.
    assert isinstance(tune_summary["model_name"], str), (
        f"Fine-tuning did not return a model name (got {tune_summary['model_name']!r})"
    )

    querier = Querier.from_preset(tune_summary["model_name"], preset="classification")

    completions = querier(test, logprobs=2)

    extractor = ClassificationExtractor()

    extracted = extractor(completions)

    res = evaluate_classification(test["label"], extracted)

    summary = {
        **tune_summary,
        **res,
        "completions": completions,
        "train_size": train_size,
        "test_size": test_size,
        # Recorded so that a pickled summary documents how its split was drawn.
        "random_state": random_state,
    }

    save_pickle(Path(tune_summary["outdir"]) / "summary.pkl", summary)

    return summary

In [45]:
# Sanity-check run: 10 training and 10 test points with the function's
# defaults (basemodel="ada", n_epochs=4).
test_summary = train_test_evaluate(formatted, train_size=10, test_size=10)

Upload progress: 100%|██████████| 2.00k/2.00k [00:00<00:00, 3.86Mit/s]


Uploaded file from /Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/01_tuning_parameter_influence/out/20230108_170415/train.jsonl: file-JkpD3rGz5UwbBpe3GcOMeMxW


2023-01-08 17:04:17.482 | DEBUG    | gptchem.tuner:tune:184 - Requested fine tuning. {
  "created_at": 1673193857,
  "events": [
    {
      "created_at": 1673193857,
      "level": "info",
      "message": "Created fine-tune: ft-84MDXso4AF1nebi0FNF08xBC",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "learning_rate_multiplier": null,
    "n_epochs": 4,
    "prompt_loss_weight": 0.01
  },
  "id": "ft-84MDXso4AF1nebi0FNF08xBC",
  "model": "ada",
  "object": "fine-tune",
  "organization_id": "org-TFRJXw3PPQocOWbu71eI2t9U",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 2002,
      "created_at": 1673193857,
      "filename": "/Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/01_tuning_parameter_influence/out/20230108_170415/train.jsonl",
      "id": "file-JkpD3rGz5UwbBpe3GcOMeMxW",
      "object": "file",
      "purpose": "fine-tune",
      "status": "uploaded",
  

In [46]:
# Inspect the merged tuning/evaluation summary of the sanity-check run.
test_summary

{'base_model': 'ada',
 'batch_size': None,
 'n_epochs': 4,
 'learning_rate_multiplier': None,
 'run_name': None,
 'wandb_sync': False,
 'outdir': '/Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/01_tuning_parameter_influence/out/20230108_170415',
 'train_filename': '/Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/01_tuning_parameter_influence/out/20230108_170415/train.jsonl',
 'valid_filename': 'None',
 'model_name': 'ada:ft-lsmoepfl-2023-01-08-16-05-39',
 'ft_id': 'ft-84MDXso4AF1nebi0FNF08xBC',
 'date': '20230108_170618',
 'train_file_id': 'file-JkpD3rGz5UwbBpe3GcOMeMxW',
 'valid_file_id': None,
 'accuracy': 0.8,
 'acc_macro': 0.8,
 'racc': 0.5,
 'kappa': 0.6000000000000001,
 'confusion_matrix': pycm.ConfusionMatrix(classes: [0, 1]),
 'f1_macro': 0.7916666666666667,
 'f1_micro': 0.8,
 'frac_valid': 1.0,
 'all_y_true': (#10) [1,0,0,0,1,1,0,1,0,1],
 'all_y_pred': (#10) [0,0,0,0,0,1,0,1,0,1],
 'valid_indices': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 'might_have_rounded

In [51]:
# Base models to compare; each entry is passed to train_test_evaluate as
# `basemodel`.
base_models = ["ada", "babbage", "curie", "davinci"]

In [52]:
# One fine-tune/evaluate run per base model, each on its own 10/10 split;
# `results` ends up in the same order as `base_models`.
# A plain loop (rather than a comprehension) keeps the summaries collected so
# far in `results` if a later run raises.
results = []
for model in base_models:
    results.append(train_test_evaluate(formatted, train_size=10, test_size=10, basemodel=model))

Upload progress: 100%|██████████| 2.09k/2.09k [00:00<00:00, 2.96Mit/s]


Uploaded file from /Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/01_tuning_parameter_influence/out/20230108_171919/train.jsonl: file-2NHwCUVOs7UeBuzTZHrSY3xS


2023-01-08 17:19:20.977 | DEBUG    | gptchem.tuner:tune:184 - Requested fine tuning. {
  "created_at": 1673194760,
  "events": [
    {
      "created_at": 1673194760,
      "level": "info",
      "message": "Created fine-tune: ft-vxHzECazaCxdMFRsHKOPd2Ew",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "learning_rate_multiplier": null,
    "n_epochs": 4,
    "prompt_loss_weight": 0.01
  },
  "id": "ft-vxHzECazaCxdMFRsHKOPd2Ew",
  "model": "ada",
  "object": "fine-tune",
  "organization_id": "org-TFRJXw3PPQocOWbu71eI2t9U",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 2090,
      "created_at": 1673194760,
      "filename": "/Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/01_tuning_parameter_influence/out/20230108_171919/train.jsonl",
      "id": "file-2NHwCUVOs7UeBuzTZHrSY3xS",
      "object": "file",
      "purpose": "fine-tune",
      "status": "uploaded",
  

Uploaded file from /Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/01_tuning_parameter_influence/out/20230108_172124/train.jsonl: file-O4shwdijLTqFzU8CcQqSORIn


2023-01-08 17:21:25.922 | DEBUG    | gptchem.tuner:tune:184 - Requested fine tuning. {
  "created_at": 1673194885,
  "events": [
    {
      "created_at": 1673194885,
      "level": "info",
      "message": "Created fine-tune: ft-qIPt4hAntK5iVBIR3Rw4ygfM",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "learning_rate_multiplier": null,
    "n_epochs": 4,
    "prompt_loss_weight": 0.01
  },
  "id": "ft-qIPt4hAntK5iVBIR3Rw4ygfM",
  "model": "babbage",
  "object": "fine-tune",
  "organization_id": "org-TFRJXw3PPQocOWbu71eI2t9U",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 1956,
      "created_at": 1673194885,
      "filename": "/Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/01_tuning_parameter_influence/out/20230108_172124/train.jsonl",
      "id": "file-O4shwdijLTqFzU8CcQqSORIn",
      "object": "file",
      "purpose": "fine-tune",
      "status": "uploaded"

Uploaded file from /Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/01_tuning_parameter_influence/out/20230108_172328/train.jsonl: file-Ns3YupuwDJn8pUl5towxnUR0


2023-01-08 17:23:31.309 | DEBUG    | gptchem.tuner:tune:184 - Requested fine tuning. {
  "created_at": 1673195011,
  "events": [
    {
      "created_at": 1673195011,
      "level": "info",
      "message": "Created fine-tune: ft-yFQPauAeN1hoUzd9EWg0xCEx",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "learning_rate_multiplier": null,
    "n_epochs": 4,
    "prompt_loss_weight": 0.01
  },
  "id": "ft-yFQPauAeN1hoUzd9EWg0xCEx",
  "model": "curie",
  "object": "fine-tune",
  "organization_id": "org-TFRJXw3PPQocOWbu71eI2t9U",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 2108,
      "created_at": 1673195010,
      "filename": "/Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/01_tuning_parameter_influence/out/20230108_172328/train.jsonl",
      "id": "file-Ns3YupuwDJn8pUl5towxnUR0",
      "object": "file",
      "purpose": "fine-tune",
      "status": "uploaded",


Uploaded file from /Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/01_tuning_parameter_influence/out/20230108_172736/train.jsonl: file-YgsoNx4ACYejLaAHzPrW4OxT


2023-01-08 17:27:37.628 | DEBUG    | gptchem.tuner:tune:184 - Requested fine tuning. {
  "created_at": 1673195257,
  "events": [
    {
      "created_at": 1673195257,
      "level": "info",
      "message": "Created fine-tune: ft-qzrpvfLsEIT0UnHGitxZqB4v",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "learning_rate_multiplier": null,
    "n_epochs": 4,
    "prompt_loss_weight": 0.01
  },
  "id": "ft-qzrpvfLsEIT0UnHGitxZqB4v",
  "model": "davinci",
  "object": "fine-tune",
  "organization_id": "org-TFRJXw3PPQocOWbu71eI2t9U",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 1982,
      "created_at": 1673195257,
      "filename": "/Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/01_tuning_parameter_influence/out/20230108_172736/train.jsonl",
      "id": "file-YgsoNx4ACYejLaAHzPrW4OxT",
      "object": "file",
      "purpose": "fine-tune",
      "status": "uploaded"

In [53]:
# Inspect the per-model summaries collected above.
results

[{'base_model': 'ada',
  'batch_size': None,
  'n_epochs': 4,
  'learning_rate_multiplier': None,
  'run_name': None,
  'wandb_sync': False,
  'outdir': '/Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/01_tuning_parameter_influence/out/20230108_171919',
  'train_filename': '/Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/01_tuning_parameter_influence/out/20230108_171919/train.jsonl',
  'valid_filename': 'None',
  'model_name': 'ada:ft-lsmoepfl-2023-01-08-16-21-00',
  'ft_id': 'ft-vxHzECazaCxdMFRsHKOPd2Ew',
  'date': '20230108_172123',
  'train_file_id': 'file-2NHwCUVOs7UeBuzTZHrSY3xS',
  'valid_file_id': None,
  'accuracy': 0.5,
  'acc_macro': 0.5,
  'racc': 0.5,
  'kappa': 0.0,
  'confusion_matrix': pycm.ConfusionMatrix(classes: [0, 1]),
  'f1_macro': 0.3333333333333333,
  'f1_micro': 0.5,
  'frac_valid': 1.0,
  'all_y_true': (#10) [0,1,0,1,0,0,1,1,1,0],
  'all_y_pred': (#10) [0,0,0,0,0,0,0,0,0,0],
  'valid_indices': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
  'might_h