In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd 
from gptchem.gpt_classifier import GPTClassifier
from pycm import ConfusionMatrix
from utils import scaffold_split

In [4]:
# Load the Tox21 table from tox21.csv (path is relative to the working directory).
df = pd.read_csv('tox21.csv')

In [5]:
# Preview the first rows: assay label columns (with missing values), mol_id and smiles.
df.head()

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles
0,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O
2,,,,,,,,0.0,,0.0,,,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O


In [7]:
# Keep only the rows that carry an SR-ATAD5 label and renumber them from 0,
# so that positional indices can be used against this frame afterwards.
subset = df[df['SR-ATAD5'].notna()].reset_index(drop=True)

In [13]:
# Show the SMILES column of the labelled subset as a NumPy array.
subset['smiles'].values

array(['CCOc1ccc2nc(S(N)(=O)=O)sc2c1', 'CCN1C(=O)NC(c2ccccc2)C1=O',
       'CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C', ...,
       'C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C@@H]1CC[C@@H]2O',
       'C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C@H]2O',
       'COc1ccc2c(c1OC)CN1CCc3cc4c(cc3C1C2)OCO4'], dtype=object)

In [14]:
# Split the labelled molecules with the project helper utils.scaffold_split.
# NOTE(review): the two 0.2 arguments are presumably the validation and test
# fractions -- confirm against the scaffold_split signature.
# The result is indexed as splits[0], splits[1], splits[2] in the next cell.
splits = scaffold_split(subset['smiles'].values, 0.2, 0.2)

7072
About to generate scaffolds
Generating scaffold 0/7072
Generating scaffold 1000/7072
Generating scaffold 2000/7072
Generating scaffold 3000/7072
Generating scaffold 4000/7072
Generating scaffold 5000/7072
Generating scaffold 6000/7072
Generating scaffold 7000/7072
About to sort in scaffold sets


In [17]:
# Materialise the three partitions by positional lookup into `subset`:
# splits[0] -> train, splits[1] -> valid, splits[2] -> test.
train, valid, test = (subset.iloc[splits[part]] for part in range(3))

In [18]:
# Build a GPTClassifier for a generic 'activity' property with explicit
# per-class weights (0.497 for class 0, 0.503 for class 1).
# NOTE(review): within this notebook `classifier` is rebound in the training
# cell further down before any fit/predict call, so this instance looks unused.
classifier = GPTClassifier(
    property_name='activity', 
    class_weights={0: 0.497, 1: 1 - 0.497}
)

In [20]:
from dotenv import load_dotenv
from deepchem.molnet import load_tox21
import numpy as np
from gptchem.tuner import Tuner
from gptchem.gpt_classifier import GPTClassifier
from gptchem.evaluator import evaluate_classification
from fastcore.xtras import save_pickle
from imblearn.under_sampling import RandomUnderSampler
import time
import os
import openai

In [61]:
# --- Experiment configuration ------------------------------------------------

# Seed forwarded to load_tox21 and RandomUnderSampler in the training cell.
seed = 42

# Human-readable description of every Tox21 task, keyed by its column name.
# The three SR-HSE / SR-MMP / SR-p53 entries were missing, so not every task
# in the dataset could be looked up here.
# NOTE(review): the "5e" in the SR-ATAD5 description looks like a typo for "5";
# it is left untouched in case existing prompts or models rely on the exact text.
name_mapping = {
    "NR-AR": "activity in the Androgen receptor, full length assay",
    "NR-AR-LBD": "activity in the Androgen receptor, ligand binding domain assay",
    "NR-AhR": "activity in the Aryl hydrocarbon receptor assay",
    "NR-Aromatase": "activity in the Aromatase assay",
    "NR-ER": "activity in the Estrogen receptor alpha, full length assay",
    "NR-ER-LBD": "activity in the Estrogen receptor alpha, LBD assay",
    "NR-PPAR-gamma": "activity in the PPAR-gamma receptor assay",
    "SR-ARE": "activity in the antioxidant responsive element assay",
    "SR-ATAD5": "activity in the ATPase Family AAA Domain Containing 5e assay",
    "SR-HSE": "activity in the heat shock factor response element assay",
    "SR-MMP": "activity in the mitochondrial membrane potential assay",
    "SR-p53": "activity in the p53 signaling pathway assay",
}

# Task names in label-column order. This is the order of the assay columns in
# tox21.csv; the training cell uses the resulting index to pick a column of the
# DeepChem label matrix.
# NOTE(review): presumably identical to the `tox21_tasks` list returned by
# load_tox21 -- confirm before relying on the last three entries.
task_order = (
    "NR-AR",
    "NR-AR-LBD",
    "NR-AhR",
    "NR-Aromatase",
    "NR-ER",
    "NR-ER-LBD",
    "NR-PPAR-gamma",
    "SR-ARE",
    "SR-ATAD5",
    "SR-HSE",
    "SR-MMP",
    "SR-p53",
)

# Task name -> column index, derived from the single ordered tuple above so the
# names and indices cannot drift apart.
target_number_mapping = {task: column for column, task in enumerate(task_order)}

# Task to model; must be a key of target_number_mapping.
target = 'SR-ATAD5'
# Number of training rows to subsample; the training cell also accepts "max".
num_train_points = 1000
# Number of test rows to subsample.
num_test_points = 500
# When True, the training cell balances classes with RandomUnderSampler.
random_undersample = False
# Number of fine-tuning epochs handed to the Tuner.
n_epochs = 8

In [62]:
# Fine-tune a GPTClassifier on a random subsample of the DeepChem Tox21 split.

# load_tox21 yields (task names, (train, valid, test) datasets, transformers);
# only the train and test datasets are used below.
tox21_tasks, tox21_datasets, transformers = load_tox21(seed=seed, reload=False)
train_dataset, valid_dataset, test_dataset = tox21_datasets

# Dataset ids are the model inputs (presumably SMILES strings -- confirm);
# the label column for `target` is selected through target_number_mapping.
target_column = target_number_mapping[target]
X_train, y_train = train_dataset.ids, train_dataset.y[:, target_column]
X_test, y_test = test_dataset.ids, test_dataset.y[:, target_column]

if random_undersample:
    sampler = RandomUnderSampler(random_state=seed)
    # The sampler needs a 2-D feature matrix; flatten the result again so that
    # X_train has the same 1-D shape as on the non-undersampled path.
    X_train, y_train = sampler.fit_resample(X_train.reshape(-1, 1), y_train)
    X_train = X_train.ravel()

# Resolve "max" only after the optional undersampling: undersampling shrinks
# X_train, and sampling without replacement cannot draw more rows than exist.
if num_train_points == "max":
    num_train_points = len(X_train)

# Draw both subsamples from a generator seeded with `seed`; the previous
# unseeded np.random.choice calls selected different rows on every run.
rng = np.random.default_rng(seed)
train_ids = rng.choice(len(X_train), num_train_points, replace=False)
test_ids = rng.choice(len(X_test), num_test_points, replace=False)

X_train = X_train[train_ids]
y_train = y_train[train_ids].astype(int)

X_test = X_test[test_ids]
y_test = y_test[test_ids].astype(int)

class_weights = {0: 0.497, 1: 1 - 0.497}
tuner = Tuner(n_epochs=n_epochs, learning_rate_multiplier=0.02, wandb_sync=False)
# NOTE(review): the raw task id (e.g. 'SR-ATAD5') is passed as the property
# name while name_mapping is never used -- confirm whether name_mapping[target]
# was intended here.
classifier = GPTClassifier(
    target,
    tuner=tuner,
    save_valid_file=True,
    querier_settings={"max_tokens": 10},
    class_weights=class_weights,
)

classifier.fit(X_train, y_train)

Upload progress: 100%|██████████| 92.9k/92.9k [00:00<00:00, 23.4Mit/s]


Uploaded file from /Users/kevinmaikjablonka/git/kjappelbaum/gptchem/experiments/03_classification/tox21/out/20231010_005612/train.jsonl: file-zaJ1F2rWNb2w0HTJ2fepOfOa


In [57]:
# Predict a class for every sampled test input with the fitted classifier.
# NOTE(review): this cell's execution count (In [57]) is lower than that of the
# training cell (In [62]), so any stored results may stem from an earlier
# classifier/test sample -- restart the kernel and run all cells to confirm.
predictions = classifier.predict(X_test)

In [58]:
# Build a pycm confusion matrix of true labels against predictions;
# list(...) hands pycm a plain Python list of the predictions.
# NOTE(review): the printed matrix contains a "None" class, so some predictions
# are evidently None -- presumably completions that could not be mapped to a
# label; confirm and decide how they should be scored.
cm = ConfusionMatrix(actual_vector=y_test, predict_vector=list(predictions))

In [60]:
# Print the confusion matrix followed by pycm's overall statistics.
print(cm)

Predict    0          1          None       
Actual
0          439        33         5          

1          20         1          2          

None       0          0          0          





Overall Statistics : 

95% CI                                                            (0.85152,0.90848)
ACC Macro                                                         0.92
ARI                                                               0.03539
AUNP                                                              None
AUNU                                                              None
Bangdiwala B                                                      0.87711
Bennett S                                                         0.82
CBA                                                               0.31658
CSI                                                               None
Chi-Squared                                                       None
Chi-Squared DF                                     