# Baselines

Using the Nucleotide Transformer pre-trained model and fine-tuning on the GUE dataset for promoter prediction.

In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F

In [2]:
df_train = pd.read_csv("./promoter_detection/train.csv", header=0)
df_val = pd.read_csv("./promoter_detection/dev.csv", header=0)
df_test = pd.read_csv("./promoter_detection/test.csv", header=0)

In [3]:
df_train_slice = df_train[:100]
df_test_slice = df_test[:100]

In [5]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification
import torch

# Import the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")
model = AutoModelForSequenceClassification.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref", num_labels=2)


Some weights of the model checkpoint at InstaDeepAI/nucleotide-transformer-500m-human-ref were not used when initializing EsmForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing EsmForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EsmForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at InstaDeepAI/nucleotide-transformer-500m-human-ref and are newly initialized: ['classifier.dense.weight', 'classifier.

Defining the metrics:

In [4]:
from transformers import EvalPrediction
import evaluate

def make_metrics_func(*dataset_load_args):
    def compute_metrics(eval_pred: EvalPrediction):
        accuracy = evaluate.load("accuracy")
        logits, labels = eval_pred
        pred_class = np.argmax(logits, axis=-1)  # take the max-scoring logit as the predicted class ID
        return accuracy.compute(predictions=pred_class,
                                references=labels)
    return compute_metrics

compute_metrics = make_metrics_func()

  from .autonotebook import tqdm as notebook_tqdm
2023-07-09 15:33:43.168607: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-09 15:33:43.197066: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Creating the dataset objects for the HuggingFace Transformers API:

In [5]:
from datasets import load_dataset

data_files = dict(
    train="./promoter_detection/train.csv",
    test="./promoter_detection/test.csv",
    val="./promoter_detection/dev.csv"
)

promoter_dataset = load_dataset("csv", data_files=data_files)

promoter_dataset

Found cached dataset csv (/home/chris/.cache/huggingface/datasets/csv/default-68212c3a0ebc43dc/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)
100%|██████████| 3/3 [00:00<00:00, 1547.33it/s]


DatasetDict({
    train: Dataset({
        features: ['sequence', 'label'],
        num_rows: 47356
    })
    test: Dataset({
        features: ['sequence', 'label'],
        num_rows: 5920
    })
    val: Dataset({
        features: ['sequence', 'label'],
        num_rows: 5920
    })
})

In [6]:
train_dataset = promoter_dataset["train"]
val_dataset = promoter_dataset["val"]
test_dataset = promoter_dataset["test"]

Fine-tuning the Nucleotide Transformer model:

In [14]:
from transformers import TrainingArguments, Trainer
from datasets import Dataset
from pathlib import Path

model_save_dir = Path("results/model_nucleotide_transformer")
training_args = TrainingArguments(model_save_dir,
                                    evaluation_strategy="epoch")


def tokenize(dataset: Dataset):
    return tokenizer(dataset["sequence"], padding=True)


tokenized_train_dataset = train_dataset.map(tokenize, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize, batched=True)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Loading cached processed dataset at /home/chris/.cache/huggingface/datasets/csv/default-68212c3a0ebc43dc/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-c03578642010e454.arrow
Loading cached processed dataset at /home/chris/.cache/huggingface/datasets/csv/default-68212c3a0ebc43dc/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-0b114df6c15db2ec.arrow


Epoch,Training Loss,Validation Loss


TrainOutput(global_step=17760, training_loss=0.10641782818076846, metrics={'train_runtime': 2045.449, 'train_samples_per_second': 69.456, 'train_steps_per_second': 8.683, 'total_flos': 2.0601930777201144e+16, 'train_loss': 0.10641782818076846, 'epoch': 3.0})

Save the fine-tuned model:

In [21]:
trainer.save_model("nucleotide_transformer_prom300")

**At this point you might need to restart the kernel to clear CUDA memory and load the fine-tuned model back again**

In [7]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification
import torch

# Import the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")
model = AutoModelForSequenceClassification.from_pretrained("./nucleotide_transformer_prom300", num_labels=2)


Send the model to the GPU:

In [8]:
model = model.to("cuda")

Check how much memory you have left on the GPU after loading the model:

In [9]:
!nvidia-smi --query-gpu=memory.used --format=csv

/bin/bash: /home/chris/miniconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
memory.used [MiB]
2504 MiB


The Nucleotide Transformer model is about 2.3 GB.

Evaluate the test dataset and collect metrics:

In [27]:
import evaluate
from typing import Callable, Tuple, Union
import numpy as np
from sklearn.metrics import accuracy_score

tokens_ids = tokenizer(test_dataset["sequence"], return_tensors="pt")["input_ids"]

# Splitting the test set into batches to avoid OOM errors with my lovely RTX 4080:
# 5920 / 16 = 370

split_into = 4

Metric = Callable[[torch.Tensor | np.ndarray, torch.Tensor | np.ndarray], torch.Tensor | np.ndarray]

def _eval(test_batch: torch.Tensor, attention_mask: torch.Tensor, output_hidden_states: bool):
    with torch.no_grad():
        torch_outs = model(
            test_batch,
            attention_mask=attention_mask,
            output_hidden_states=output_hidden_states
        )
    return torch_outs


def test_and_calculate_metrics(tokens_ids: torch.Tensor,
                               labels: torch.Tensor,
                               model: nn.Module,
                               metric: Metric,
                               split_into: int = 4,
                               output_hidden_states: bool = False):
    metric_vals = []

    slice_size = tokens_ids.shape[0] // split_into
    for test_batch, batch_labels in zip(tokens_ids.split(slice_size),
                                        torch.tensor(labels).split(slice_size)):

        # Compute the embeddings:
        attention_mask = test_batch != tokenizer.pad_token_id

        # Send tokens and attention mask to the GPU:
        test_batch = test_batch.to("cuda")
        attention_mask = attention_mask.to("cuda")

        # Model outputs:
        torch_outs = _eval(test_batch,
                           attention_mask,
                           output_hidden_states)
        
        y_hat_prob = nn.Sigmoid()(torch_outs.logits)
        y_hat = torch.argmax(y_hat_prob, axis=-1)
        metric_value = metric(batch_labels.to("cpu").detach().numpy(), y_hat.to("cpu").detach().numpy())
        metric_vals.append(metric_value)

    return metric_vals

metric = accuracy_score

accs = test_and_calculate_metrics(tokens_ids,
                                  test_dataset["label"],
                                  model,
                                  metric,
                                  split_into)
avg_acc = np.mean(accs)

print(f"Accuracies per batch: {accs}")
print(f"Average accuracy: {avg_acc}")

Accuracies per batch: [0.8716216216216216, 0.9128378378378378, 0.9175675675675675, 0.9101351351351351]
Average accuracy: 0.9030405405405405


## DNABERT

We now attempt to fine-tune DNABERT on the same task with the same dataset. **You might need to reload your kernel before you start** because your GPU memory could be overloaded.

In [28]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset
from pathlib import Path

tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True)

model_save_dir = Path("results/model_dnabert")
training_args = TrainingArguments(model_save_dir,
                                    evaluation_strategy="epoch")


def tokenize(dataset: Dataset):
    return tokenizer(dataset["sequence"], padding=True)


tokenized_train_dataset = train_dataset.map(tokenize, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize, batched=True)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of the model checkpoint at zhihan1996/DNA_bert_6 were not used when initializing DNABertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DNABertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DNABertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DNABertForSequenceClassification were not initialized from the model checkpoint at zhihan1996/

TrainOutput(global_step=17760, training_loss=0.7012527543145257, metrics={'train_runtime': 546.339, 'train_samples_per_second': 260.036, 'train_steps_per_second': 32.507, 'total_flos': 219021453591120.0, 'train_loss': 0.7012527543145257, 'epoch': 3.0})

In [29]:
trainer.save_model("dnabert_prom300")

**You might want to restart the kernel and load the model again to save GPU memory before proceeding**. Loading the model back from the saved state:

In [30]:
tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained("dnabert_prom300", trust_remote_code=True)

Send model to the GPU:

In [32]:
model = model.to("cuda")

In [33]:
tokens_ids = tokenizer(test_dataset["sequence"], return_tensors="pt")["input_ids"]

# Splitting the test set into batches to avoid OOM errors with my lovely RTX 4080:
# 5920 / 16 = 370

split_into = 4

metric = accuracy_score

accs = test_and_calculate_metrics(tokens_ids,
                                  test_dataset["label"],
                                  model,
                                  metric,
                                  split_into)
avg_acc = np.mean(accs)

print(f"Accuracies per batch: {accs}")
print(f"Average accuracy: {avg_acc}")

Accuracies per batch: [0.5047297297297297, 0.4918918918918919, 0.5162162162162162, 0.5006756756756757]
Average accuracy: 0.5033783783783784


In [34]:
metric = accuracy_score

accs = test_and_calculate_metrics(tokens_ids,
                                  test_dataset["label"],
                                  model,
                                  metric,
                                  split_into)
avg_acc = np.mean(accs)

print(f"Accuracies per batch: {accs}")
print(f"Average accuracy: {avg_acc}")

Accuracies per batch: [0.5047297297297297, 0.4918918918918919, 0.5162162162162162, 0.5006756756756757]
Average accuracy: 0.5033783783783784
