Script for fine-tuning and evaluating ChemBERTa. Before running, set `data_path` to the directory that contains the benchmark data and `dataset` to the name of the dataset to evaluate.

In [1]:
from transformers import TrainingArguments, Trainer
import numpy as np
import torch
from sklearn import metrics
from transformers import EarlyStoppingCallback
import random
from transformers import set_seed
# Single seed value reused by every random-number generator seeded in this notebook.
seed = 14

In [None]:
# Directory that holds the benchmark split files; replace the placeholder with a real path.
data_path = ".../pr/ChemLM/data/benchmark"
# Name of the dataset to run on; it is interpolated into the split file names.
dataset = "clintox"

In [2]:
# Seed the Python, NumPy and PyTorch (CPU and CUDA) generators with the shared seed.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Request deterministic cuDNN behaviour and turn off its benchmark mode.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Finally let transformers apply the same seed through its own helper.
set_seed(seed)

In [3]:
def compute_metrics(eval_pred):
    """Compute binary-classification metrics from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Pair unpacked as ``(logits, labels)``. ``logits`` is indexed
            as a 2-D array with one column per class (columns 0 and 1 are
            used); ``labels`` holds the true class of each sample.

    Returns:
        dict with ``accuracy``, macro-averaged ``precision``, ``recall`` and
        ``f1``, ``f1_pos`` (F1 of class 1 only) and ``auc`` (ROC AUC of the
        class-1 score).
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)

    # ROC AUC is a ranking metric, so it must be fed a continuous score rather
    # than the thresholded predictions: hard 0/1 labels reduce the ROC curve to
    # a single operating point. The logit difference is a monotonic function of
    # the softmax probability of class 1, so it ranks samples identically.
    positive_scores = logits[:, 1] - logits[:, 0]
    auc = metrics.roc_auc_score(labels, positive_scores)

    acc = metrics.accuracy_score(labels, predictions)
    precision = metrics.precision_score(labels, predictions, average='macro')
    recall = metrics.recall_score(labels, predictions, average='macro')
    f1 = metrics.f1_score(labels, predictions, average='macro')

    # F1 restricted to the positive class (label 1).
    f1_binary = metrics.f1_score(labels, predictions, pos_label=1, average='binary')

    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'f1_pos': f1_binary,
        'auc': auc
    }

In [None]:
# Load model directly
from transformers import AutoTokenizer, RobertaForSequenceClassification

# One checkpoint name feeds both the tokenizer and the classifier, so switching
# checkpoints is a single-line change and the two can never get out of sync.
# Alternative checkpoint: "seyonec/ChemBERTa-zinc-base-v1"
model_name = "seyonec/PubChem10M_SMILES_BPE_180k"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Two output labels: the metrics in this notebook treat the task as binary.
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [None]:
# Pick the CUDA device when one is available, otherwise fall back to the CPU,
# and move the model there.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

In [None]:
import pandas as pd
def load_data(path):
    """Read the tab-separated file at *path*, print the table, and return it."""
    frame = pd.read_csv(path, sep='\t')
    print(frame)
    return frame

# Load the train / validation / test splits for the selected dataset.
if dataset != 'bace':
    # General case: the dataset name is interpolated into the file names.
    train_data = load_data(f'{data_path}/train_{dataset}_stratified_0.2_clf.csv')
    valid_data = load_data(f'{data_path}/validation_{dataset}_stratified_0.2_clf.csv')
    test_data = load_data(f'{data_path}/test_{dataset}_stratified_0.2_clf.csv')
else:
    # The BACE files carry an extra "_clf" after the dataset name.
    train_data = load_data(f'{data_path}/train_bace_clf_stratified_0.2_clf.csv')
    valid_data = load_data(f'{data_path}/validation_bace_clf_stratified_0.2_clf.csv')
    test_data = load_data(f'{data_path}/test_bace_clf_stratified_0.2_clf.csv')



In [None]:
class FinetuneDataset(torch.utils.data.Dataset):
    """Tokenized SMILES strings, optionally paired with integer labels.

    Args:
        df: DataFrame with a ``smiles`` column. Labels, when used, are read
            from the second column (position 1).
        tokenizer: Callable invoked on the list of SMILES strings with
            ``truncation=True, padding=True``. It must return a mapping from
            field name to per-sample sequences that includes ``input_ids``.
        include_labels: When false, items carry no ``labels`` entry and the
            frame is allowed to have no label column at all.
    """

    def __init__(self, df, tokenizer, include_labels=True):
        self.encodings = tokenizer(df["smiles"].tolist(), truncation=True, padding=True)
        self.include_labels = include_labels
        # Read the label column whenever it is requested or simply present.
        # Only an unlabeled frame combined with include_labels=False skips it,
        # which previously raised IndexError on the column lookup.
        if include_labels or df.shape[1] > 1:
            self.labels = df.iloc[:, 1].values
        else:
            self.labels = None

    def __getitem__(self, idx):
        """Return the encoded fields of sample *idx* as tensors (plus ``labels`` if enabled)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.include_labels and self.labels is not None:
            item["labels"] = torch.tensor(int(self.labels[idx]))
        return item

    def check(self):
        """Return the first sample, for a quick sanity check of the encoding."""
        return self[0]

    def __len__(self):
        """Return the number of encoded samples."""
        return len(self.encodings["input_ids"])

# Wrap each split in a FinetuneDataset, tokenizing it with the shared tokenizer.
train_dataset, valid_dataset, test_dataset = (
    FinetuneDataset(split_frame, tokenizer)
    for split_frame in (train_data, valid_data, test_data)
)

In [9]:
# NOTE(review): the same three datasets are already built at the end of the
# previous cell; this cell tokenizes every split a second time.
train_dataset, valid_dataset, test_dataset = [
    FinetuneDataset(frame, tokenizer)
    for frame in (train_data, valid_data, test_data)
]

In [None]:
training_args = TrainingArguments(
    # Checkpoint location; existing contents may be overwritten.
    output_dir='/content/',
    overwrite_output_dir=True,
    # Evaluate and save once per epoch so the best checkpoint can be reloaded
    # when training ends.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # NOTE(review): eval_steps presumably has no effect while the evaluation
    # strategy is 'epoch' -- confirm against the installed transformers version.
    eval_steps=1000,
    logging_steps=100,
    # Optimisation settings.
    learning_rate=5e-4,
    num_train_epochs=100,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    fp16=torch.cuda.is_available(),  # fp16 only works on CUDA devices
    report_to="none",  # Disables W&B logging
)


In [None]:
# Assemble the Trainer: fine-tune on the training split, evaluate on the
# validation split, and stop early after 3 evaluations without improvement.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [None]:
# Switch Weights & Biases off before training: once through its CLI (IPython
# shell escape, so this cell only runs inside a notebook) and once through the
# WANDB_MODE environment variable.
!wandb disabled
import os
os.environ["WANDB_MODE"] = "disabled"

# Fine-tune the model, then save it under a dataset-specific name.
trainer.train()
trainer.save_model(f'chemberta_{dataset}_f_check')
# Evaluate on the test set
test_results = trainer.predict(test_dataset)
# The metrics dict is read again by the next cell to print the result row.
print(test_results.metrics)

In [None]:
# Print one "&"-separated result row: F1, AUC, precision, recall and accuracy
# on the test set, each rounded to two decimals.
print(
    'ChemBERTa & {} & {} & {} & {} & {}'.format(
        *(
            round(test_results.metrics[metric_key], 2)
            for metric_key in (
                'test_f1',
                'test_auc',
                'test_precision',
                'test_recall',
                'test_accuracy',
            )
        )
    )
)