In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [31]:
from deepchem.molnet import load_tox21
import numpy as np
from gptchem.tuner import Tuner
from gptchem.gpt_classifier import GPTClassifier

from imblearn.under_sampling import RandomUnderSampler
from pubchempy import get_compounds

In [32]:
# get the name for a SMILES using pubchempy

def get_name(smiles):
    """Return the IUPAC name that PubChem reports for a SMILES string.

    The SMILES is looked up with ``get_compounds(smiles, 'smiles')`` and the
    ``iupac_name`` attribute of the first hit is returned as-is. The lookup is
    best-effort: if the query raises or yields no hits, the input SMILES is
    returned unchanged so callers always get a usable value back.

    Args:
        smiles: SMILES string to look up.

    Returns:
        The ``iupac_name`` of the first hit, or ``smiles`` itself when the
        lookup fails.
    """
    try:
        return get_compounds(smiles, 'smiles')[0].iupac_name
    except Exception:
        # Deliberately ``Exception`` rather than a bare ``except``: this
        # function is called once per molecule in long loops, and a bare
        # ``except`` would also swallow KeyboardInterrupt / SystemExit,
        # making those loops impossible to interrupt.
        return smiles

In [33]:
# Load the Tox21 benchmark through DeepChem's MolNet loader. The call returns
# three values; the second one is unpacked into train/valid/test datasets in
# the following cell.
# NOTE(review): `reload=False` presumably bypasses any cached featurized copy
# and `seed=21` presumably controls the split — confirm against the DeepChem
# version in use.
tox21_tasks, tox21_datasets, transformers = load_tox21(seed=21, reload=False)

In [39]:
train_dataset, valid_dataset, test_dataset = tox21_datasets

# Model inputs are the dataset ids; targets are the last column of each
# dataset's label matrix (a single task out of the multi-task labels).
# NOTE(review): the per-sample weights (`.w`) are not consulted here; if the
# loader encodes missing labels as 0 with weight 0, those rows are treated as
# negatives — confirm.
X_train = train_dataset.ids
y_train = train_dataset.y[:, -1]
X_test = test_dataset.ids
y_test = test_dataset.y[:, -1]


In [34]:
# Resolve every train/test entry to a name via get_name (one lookup per entry,
# so this cell is slow on the full splits).
train_names = list(map(get_name, X_train))
test_names = list(map(get_name, X_test))

In [26]:
# Sanity check: display the first training input.
X_train[0]

'C=C(C)C(=O)OCCCCCCCCCCCCC'

In [35]:
# Draw a random subset of 6000 training rows without replacement.
# A freshly seeded Generator is used (instead of the unseeded global
# np.random state) so that Restart & Run All selects the same rows every
# time and the downstream class counts / model inputs are reproducible.
train_indices = np.random.default_rng(42).choice(len(X_train), size=6000, replace=False)

In [36]:
# Sum of the labels in the sampled subset (the number of positives if the
# labels are 0/1 — the recorded output suggests they are; confirm).
y_train[train_indices].sum()

261.0

In [89]:
# Balance the sampled training subset by randomly undersampling.
#
# The inputs are read from `train_dataset` rather than from X_train/y_train:
# this cell overwrites X_train/y_train with the resampled arrays, so indexing
# those names here would make a second execution of the cell apply the old
# `train_indices` to the already-shrunk arrays. Reading from the dataset keeps
# the cell idempotent while producing the same result on a first run.
sampler = RandomUnderSampler(random_state=42)
X_train, y_train = sampler.fit_resample(
    # The sampler needs a 2-D feature matrix, hence the single-column reshape.
    train_dataset.ids[train_indices].reshape(-1, 1),
    # Targets are passed 1-D, as the sampler's API documents.
    train_dataset.y[:, -1][train_indices],
)

In [41]:
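# Disabled alternative: use the looked-up names (train_names) as model inputs
# instead of the raw ids, without undersampling.
# X_train = np.array(train_names)[train_indices]
# y_train = y_train[train_indices]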

In [45]:
import pandas as pd

In [92]:
# Assemble (input, label) frames for training and testing. The 'names' column
# holds the model inputs; rows with a missing input are discarded.
df_train = pd.DataFrame(
    {'names': X_train.flatten(), 'label': y_train.flatten()}
).dropna(subset=['names'])

df_test = pd.DataFrame(
    {'names': X_test, 'label': y_test}
).dropna(subset=['names'])

In [93]:
# Number of training rows left after undersampling and dropping missing inputs.
len(df_train)

521

In [94]:
# Build the GPT-based classifier together with its fine-tuning configuration.
# NOTE(review): the first positional argument ('activity') is presumably the
# property name used when formatting prompts — confirm against GPTClassifier.
classifier = GPTClassifier(
    'activity',
    tuner=Tuner(
        n_epochs=4,
        learning_rate_multiplier=0.02,
    ),
)

In [95]:
# Collapse X_train to 1-D. Note that df_train was already built from a
# flattened copy, and the fit call below reads from df_train, not X_train.
X_train = X_train.flatten()

In [96]:
# Fine-tune on the balanced training frame: inputs from 'names', labels cast
# to integers.
classifier.fit(
    df_train['names'].values,
    df_train['label'].astype(int).values,
)

Upload progress: 100%|██████████| 58.7k/58.7k [00:00<00:00, 102Mit/s]


Uploaded file from /Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/03_classification/tox21/out/20231006_083128/train.jsonl: file-7o5T63H0g3B6BcItMCTp23eH


ValueError: Fine tuning failed. Result: {
  "object": "fine-tune",
  "id": "ft-3SuVEwP4iTSBcdm0i0FBWe0l",
  "hyperparams": {
    "n_epochs": 4,
    "batch_size": null,
    "prompt_loss_weight": 0.01,
    "learning_rate_multiplier": 0.02
  },
  "organization_id": "org-eRwftHUvPVthUUPGPm4T2j0U",
  "model": "ada",
  "training_files": [
    {
      "object": "file",
      "id": "file-7o5T63H0g3B6BcItMCTp23eH",
      "purpose": "fine-tune",
      "filename": "/Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/03_classification/tox21/out/20231006_083128/train.jsonl",
      "bytes": 58712,
      "created_at": 1696573895,
      "status": "uploaded",
      "status_details": null
    }
  ],
  "validation_files": [],
  "result_files": [],
  "created_at": 1696573895,
  "updated_at": 1696573895,
  "status": "pending",
  "fine_tuned_model": null,
  "events": [
    {
      "object": "fine-tune-event",
      "level": "info",
      "message": "Created fine-tune: ft-3SuVEwP4iTSBcdm0i0FBWe0l",
      "created_at": 1696573895
    }
  ]
}.

In [76]:
# Display the classifier object (default repr only; useful just to confirm it exists).
classifier

<gptchem.gpt_classifier.GPTClassifier at 0x179919580>

In [85]:
# Draw 200 random evaluation rows without replacement.
# - Seeded Generator: the evaluation subset (and hence the reported metrics)
#   is the same on every Restart & Run All.
# - Bounds come from len(df_test), because these indices are used to index
#   df_test's columns; df_test may be shorter than X_test after its dropna.
test_indices = np.random.default_rng(42).choice(len(df_test), size=200, replace=False)

In [86]:
# Predict labels for the sampled evaluation rows (positional selection).
preds = classifier.predict(df_test['names'].iloc[test_indices].values)

In [87]:
from pycm import ConfusionMatrix

In [88]:
# Compare the true labels of the sampled evaluation rows with the predictions
# and print pycm's full report (confusion matrix plus overall statistics).
print(
    ConfusionMatrix(
        actual_vector=df_test['label'].values[test_indices].astype(int),
        predict_vector=list(preds),
    )
)

Predict   0         1         
Actual
0         180       0         

1         20        0         





Overall Statistics : 

95% CI                                                            (0.85842,0.94158)
ACC Macro                                                         0.9
ARI                                                               0.0
AUNP                                                              0.5
AUNU                                                              0.5
Bangdiwala B                                                      0.9
Bennett S                                                         0.8
CBA                                                               0.45
CSI                                                               None
Chi-Squared                                                       None
Chi-Squared DF                                                    1
Conditional Entropy                                               -0.0
Cramer V       