In [1]:
from sklearn.metrics import average_precision_score, roc_auc_score
import wandb

from datasets import load_dataset
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, IntervalStrategy

import pandas as pd
import numpy as np

import torch
from torch.utils.data import DataLoader
from torch import nn
import torch.nn.functional as F


Loading the metrics we'll use to evaluate our model's training

In [2]:
auc = evaluate.load("roc_auc")
accuracy = evaluate.load("accuracy")
metric = evaluate.load("accuracy")
f1 = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")

Here we decide with evaluation to use and with dataset to test on

In addition, we load the pre-trained model here

In [3]:
split_type = 'db_agree_no_dups'
dataset_name = 'DrugBank'
pretrained_path = "seyonec/PubChem10M_SMILES_BPE_450k"

here we load the dataset, we use train2 since it's the train file that doesn't contain the validation set insode of it.

this ``load_dataset`` method, automatically loads all the files in csv format and creates an HuggingFace's dataset object that is easy to use when fine-tuning models

In [4]:
dataset = load_dataset('csv', data_files={'train': f'split/{split_type}/{dataset_name}/train2.csv',
                                          'validation': f'split/{split_type}/{dataset_name}/val.csv',
                                          'test': f'split/{split_type}/{dataset_name}/test.csv'})

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

removing uncessencary columns from the dataset

In [5]:
dataset = dataset.rename_column('withdrawn_class', 'labels').\
            remove_columns(['Unnamed: 0', 'index', 'length', 'inchikey', 'groups', 'source']).\
            with_format('torch')

here we load our model and tokenizer, we use the ``AutoModel`` and ``AutoTokenizer`` classes as they provide a generic way to load every model in HuggingFace

In [6]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_path)
model = AutoModelForSequenceClassification.from_pretrained(pretrained_path, num_labels=2,
                                                           id2label={0: 'Not Withdrawn', 1:'Withdrawn'},
                                                           label2id={'Not Withdrawn': 0, 'Withdrawn': 1})

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [8]:
def tokenize_function(examples):
    """this methods tokenize the smiles into ids which are then fed into the transforemr model
    we set the max length of the toknizer to be the longest SMILES in our dataset and pad the rest to this length"""
    return tokenizer(examples["smiles"], padding="max_length", truncation=True, max_length=300)

In [9]:
dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3198 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/1982 [00:00<?, ? examples/s]

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['smiles', 'name', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 3198
    })
    validation: Dataset({
        features: ['smiles', 'name', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 800
    })
    test: Dataset({
        features: ['smiles', 'name', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1982
    })
})

a method to compute all the metrics we are using to evaluate our models

In [11]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy_score = accuracy.compute(predictions=predictions, references=labels)
    auc_score = auc.compute(prediction_scores=logits[:, 1], references=labels)
    f1_score = f1.compute(predictions=predictions, references=labels)
    aupr = average_precision_score(y_score=logits[:, 1], y_true=labels)
    precision_score = precision.compute(predictions=predictions, references=labels)
    recall_score = recall.compute(predictions=predictions, references=labels)
    f1_score['F1'] = f1_score.pop('f1')
    return {**f1_score , **{'PR-AUC': aupr}, **accuracy_score, **auc_score, **precision_score, **recall_score}

here we define our entire training arguments
this is a simple HuggingFace object that will contain all the parameters we are using in our training

In [12]:
training_args = TrainingArguments(
    output_dir=f"./results/{split_type}/{dataset_name}/{pretrained_path}",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy=IntervalStrategy.STEPS,
    save_strategy=IntervalStrategy.STEPS,
    report_to='wandb',
    run_name=f'{pretrained_path} {split_type} {dataset_name}',
    logging_steps=50,
    save_steps=50,
)

training our model

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset={'Validation': dataset["validation"], 'Test': dataset["test"]},
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [14]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112387612875965, max=1.0…

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Validation F1,Validation Pr-auc,Validation Accuracy,Validation Roc Auc,Validation Precision,Validation Recall,Test Loss,Test F1,Test Pr-auc,Test Accuracy,Test Roc Auc,Test Precision,Test Recall
50,0.609,No log,0.55666,0.711729,0.72125,0.744522,0.703518,0.460526,1.336469,0.199472,0.149737,0.388496,0.687427,0.112018,0.909639
100,0.5596,No log,0.541935,0.735696,0.73375,0.774671,0.782609,0.414474,1.702036,0.203136,0.203492,0.410192,0.699077,0.114527,0.89759
150,0.5121,No log,0.556034,0.74872,0.7425,0.781655,0.80625,0.424342,1.840887,0.212251,0.253607,0.441978,0.717339,0.120355,0.89759
200,0.5179,No log,0.574949,0.75762,0.74125,0.792737,0.765027,0.460526,1.916616,0.211731,0.265939,0.443996,0.721366,0.12013,0.891566
250,0.4804,No log,0.575053,0.7624,0.74875,0.796616,0.804734,0.447368,2.045796,0.206848,0.26005,0.427346,0.716476,0.116996,0.891566
300,0.4648,No log,0.581673,0.763162,0.7375,0.798182,0.737374,0.480263,2.118418,0.203114,0.267199,0.406155,0.71979,0.114416,0.903614




TrainOutput(global_step=300, training_loss=0.5239772542317709, metrics={'train_runtime': 151.6637, 'train_samples_per_second': 63.258, 'train_steps_per_second': 1.978, 'total_flos': 744663411741600.0, 'train_loss': 0.5239772542317709, 'epoch': 3.0})