In [1]:
%load_ext autoreload
%autoreload 2

In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from gptchem.data import get_photoswitch_data
from gptchem.evaluator import evaluate_classification
from gptchem.gpt_classifier import DifficultNGramClassifier, NGramGPTClassifier
from gptchem.tuner import Tuner

In [3]:
# Load the photoswitch dataset and discard every row that is missing either
# the E-isomer pi-pi* wavelength or the molecule name.
data = get_photoswitch_data().dropna(
    subset=["E isomer pi-pi* wavelength in nm", "name"]
)

In [6]:
# Binarise the E-isomer pi-pi* wavelength at its median: `pd.qcut` with two
# quantile bins labels the lower half 0 and the upper half 1.
# `assign` returns a new frame rather than writing a column into the frame
# produced by `dropna`, which sidesteps pandas' SettingWithCopyWarning and
# keeps the cell free of in-place mutation.
data = data.assign(
    binned=pd.qcut(data["E isomer pi-pi* wavelength in nm"], 2, labels=np.arange(2))
)

In [7]:
# Tuner configured for 8 epochs with a 0.02 learning-rate multiplier and
# wandb_sync disabled; it is handed to the n-gram GPT classifier below.
ngram_tuner = Tuner(
    wandb_sync=False,
    learning_rate_multiplier=0.02,
    n_epochs=8,
)
classifier = NGramGPTClassifier(
    tuner=ngram_tuner,
    property_name="transition wavelength",
)

In [16]:
# Stratified split on the binned label: 50 rows go to training, the rest to
# testing. The fixed random_state keeps the split reproducible.
train, test = train_test_split(
    data,
    stratify=data["binned"],
    train_size=50,
    random_state=0,
)

In [17]:
# Fit the n-gram classifier on the training SMILES strings and their bins.
# `.values` is kept deliberately so the labels are passed exactly as before.
train_smiles = train["SMILES"].values
train_labels = train["binned"].values
classifier.fit(train_smiles, train_labels)

Upload progress: 100%|██████████| 11.9k/11.9k [00:00<00:00, 21.3Mit/s]


Uploaded file from /Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/13_gpt_ngram/out/20230331_154659/train.jsonl: file-PV5sorRxXHzm7XeMbJ25miIv


In [18]:
# Predict the wavelength bin for every SMILES string in the test split.
test_smiles = test["SMILES"].values
predictions = classifier.predict(test_smiles)

In [19]:
# Score the predictions against the true bins of the test split.
true_bins = test["binned"].values
cm = evaluate_classification(true_bins, predictions)

In [20]:
# Display the result of evaluate_classification for `classifier`.
# Despite its name, `cm` is the full metrics dict (see the recorded output),
# not only a confusion matrix.
cm

{'accuracy': 0.7368421052631579,
 'acc_macro': 0.7368421052631579,
 'racc': 0.5,
 'kappa': 0.4736842105263157,
 'confusion_matrix': pycm.ConfusionMatrix(classes: [0, 1]),
 'f1_macro': 0.7368421052631579,
 'f1_micro': 0.7368421052631579,
 'frac_valid': 1.0,
 'all_y_true': (#38) [1,1,1,0,0,0,0,0,0,1...],
 'all_y_pred': (#38) [1,1,1,0,0,0,0,1,0,1...],
 'valid_indices': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37],
 'might_have_rounded_floats': False}

In [23]:
# Second model: DifficultNGramClassifier with the same Tuner settings as the
# first classifier (8 epochs, 0.02 learning-rate multiplier, wandb_sync off).
difficult_tuner = Tuner(
    wandb_sync=False,
    learning_rate_multiplier=0.02,
    n_epochs=8,
)
classifier_2 = DifficultNGramClassifier(
    tuner=difficult_tuner,
    property_name="transition wavelength",
)

In [24]:
# Fit the second classifier on the same training split as the first one.
# `.values` is kept deliberately so the labels are passed exactly as before.
train_smiles = train["SMILES"].values
train_labels = train["binned"].values
classifier_2.fit(train_smiles, train_labels)

Upload progress: 100%|██████████| 10.2k/10.2k [00:00<00:00, 13.6Mit/s]


Uploaded file from /Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/13_gpt_ngram/out/20230331_163745/train.jsonl: file-Ym6LW2mJNK3joNGWrV8Sr6o9


In [25]:
# Predict the wavelength bin for the test SMILES with the second classifier.
test_smiles = test["SMILES"].values
predictions_2 = classifier_2.predict(test_smiles)

In [26]:
# Score the second classifier's predictions against the true test bins.
true_bins = test["binned"].values
cm_2 = evaluate_classification(true_bins, predictions_2)

In [27]:
# Display the result of evaluate_classification for `classifier_2`.
# As with `cm`, this is the full metrics dict (see the recorded output).
cm_2

{'accuracy': 0.7105263157894737,
 'acc_macro': 0.7105263157894737,
 'racc': 0.5,
 'kappa': 0.42105263157894735,
 'confusion_matrix': pycm.ConfusionMatrix(classes: [0, 1]),
 'f1_macro': 0.7054263565891472,
 'f1_micro': 0.7105263157894737,
 'frac_valid': 1.0,
 'all_y_true': (#38) [1,1,1,0,0,0,0,0,0,1...],
 'all_y_pred': (#38) [1,1,1,0,0,0,1,1,0,1...],
 'valid_indices': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37],
 'might_have_rounded_floats': False}