# Preparing the data for DNA+P

In [8]:
import pandas as pd
import numpy as np

In [9]:
from datasets import load_dataset

data_files = dict(
    train="./promoter_detection/train.csv",
    test="./promoter_detection/test.csv",
    val="./promoter_detection/dev.csv"
)

promoter_dataset = load_dataset("csv", data_files=data_files)

promoter_dataset

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset csv (/home/chris/.cache/huggingface/datasets/csv/default-68212c3a0ebc43dc/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)
100%|██████████| 3/3 [00:00<00:00, 760.34it/s]


DatasetDict({
    train: Dataset({
        features: ['sequence', 'label'],
        num_rows: 47356
    })
    test: Dataset({
        features: ['sequence', 'label'],
        num_rows: 5920
    })
    val: Dataset({
        features: ['sequence', 'label'],
        num_rows: 5920
    })
})

In [10]:
train_dataset = promoter_dataset["train"]
val_dataset = promoter_dataset["val"]
test_dataset = promoter_dataset["test"]

In [11]:
seq = train_dataset['sequence'][0]

In [12]:
from Bio.Seq import Seq

def dnap_tokenize(seq: str, stop_symbol: str = "*"):
    # find boundaries of the coding sequence
    start = seq.lower().find("atg")
    stop = seq.lower().find("tga")
    print(f"start: {start}")
    print(f"stop: {stop}")
    protein_seq = str(Seq(seq[start:stop]).translate())
    return seq[0:start-1] + protein_seq + stop_symbol + seq[stop+3:]

In [13]:
dnap_tokenize(seq)

start: 16
stop: 17




'TATAATAATAACGAA*GACGACAGTCGACAAGAAAAGCACCAGCTGTCCCCGCCACATACAAGTATATGAGAAGGGGACGCGGGAGAGCGCCGCGGGGGACCGACGCGCTATTGAGGGGGATGGGTACAAGCGGGGCGGGGAGGCCGGAGCTTTATCCAGGCCAATGAATGGCCACTTGCGATGCCCAATTGCACCAAGCTTGGAGCGCACACTCAACCCCTTCCCCAGCGGTATGCCAAAATTCACCGTCTGAATGGCGTTGGTGCAGGTCGGTACAGAGCTCTCCTGCGCCGAG'

A naive tokenizer like this does not account for ORF-sensitivity and will simply find the first occurence of the stop codon, even if it follows the start codon right away. All sequences such as `atga` would then be truncated to a stop codon with Biopython translation API. So we need something smarter that chunks up the sequences into codons once the ORF is found. This is a good time to develop or pick up an ORF finder.

In [14]:
!pip install orffinder

8518.86s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


/bin/bash: /home/chris/miniconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [15]:
from Bio.SeqIO import SeqRecord
from orffinder import orffinder

orfs = orffinder.getORFs(SeqRecord(Seq(train_dataset['sequence'][0])), minimum_length=3)
print(orfs)

[{'start': 17, 'end': 173, 'frame': 2, 'sense': '+', 'length': 156, 'trailing': False, 'index': 1}, {'start': 173, 'end': 300, 'frame': 2, 'sense': '+', 'length': 127, 'trailing': True, 'index': 2}, {'start': 186, 'end': 300, 'frame': 3, 'sense': '+', 'length': 114, 'trailing': True, 'index': 3}, {'start': 169, 'end': 259, 'frame': 1, 'sense': '+', 'length': 90, 'trailing': False, 'index': 4}, {'start': 63, 'end': 1, 'frame': 2, 'sense': '-', 'length': 61, 'trailing': True, 'index': 5}, {'start': 259, 'end': 300, 'frame': 1, 'sense': '+', 'length': 41, 'trailing': True, 'index': 6}]


In [16]:
orf = orfs[0]

In [17]:
orf

{'start': 17,
 'end': 173,
 'frame': 2,
 'sense': '+',
 'length': 156,
 'trailing': False,
 'index': 1}

In [18]:
from typing import Dict, Any

def dnap_tokenize(seq: str, orf_data: Dict[str, Any], stop_symbol: str = "*"):
    start = orf_data['start']
    stop = orf_data['end']
    protein_seq = str(Seq(seq[start:stop]).translate())
    return seq[0:start-1] + protein_seq + stop_symbol + seq[stop+3:]

In [19]:
dnap_tokenize(seq, orf)

'TATAATAATAACGAAG*DDSRQEKHQLSPPHTSI*EGDAGERRGGPTRY*GGWVQAGRGGRSFIQANE*CCACTTGCGATGCCCAATTGCACCAAGCTTGGAGCGCACACTCAACCCCTTCCCCAGCGGTATGCCAAAATTCACCGTCTGAATGGCGTTGGTGCAGGTCGGTACAGAGCTCTCCTGCGCCGAG'

That's cool and all but these are all promoter sequences, so within those 300 bp here we would not expect any actual genes to be found.

What we could do instead is append those sequences to existing gene sequences and encode specifically those gene sequences only using a DNA+P representation. We would be using synthetic data but it would be much easier to prepare than looking for specific genes in databases that are prepended by those promoters that Prom300 contains.

In [20]:
seq

'TATAATAATAACGAAGATGAGACGACAGTCGACAAGAAAAGCACCAGCTGTCCCCGCCACATACAAGTATATGAGAAGGGGACGCGGGAGAGCGCCGCGGGGGACCGACGCGCTATTGAGGGGGATGGGTACAAGCGGGGCGGGGAGGCCGGAGCTTTATCCAGGCCAATGAATGGCCACTTGCGATGCCCAATTGCACCAAGCTTGGAGCGCACACTCAACCCCTTCCCCAGCGGTATGCCAAAATTCACCGTCTGAATGGCGTTGGTGCAGGTCGGTACAGAGCTCTCCTGCGCCGAG'

Let's develop a tiny function that will append randomly generated gene sequences to the sequences in the dataset that are marked as promoters:

In [21]:
from datasets import DatasetDict
from typing import Dict, Any, List
import Bio.Data.CodonTable as CodonTable


def append_gene_to_promoter(seq: str,
                            gene_length: int,
                            is_promoter_seq: bool = True,
                            seed: int = 42,
                            codon_table: CodonTable.CodonTable = CodonTable.standard_dna_table):
    np.random.seed(seed)
    num_codons = gene_length - 6 // 3  # -6 because we account for 1 start and 1 stop codon per gene appended
    actual_gene_length = num_codons * 3  # recover if `gene_length` was not divisible by 3
    if is_promoter_seq:
        # 1. Generate a random in-ORF gene of length `gene_length`:
        start = codon_table.start_codons[0]  # ATG
        stop = np.random.choice(codon_table.stop_codons)  # randomly choose one
        codon_seq = np.random.choice(list(codon_table.forward_table.keys()),
                                     size=num_codons,
                                     replace=True)
        codon_seq = "".join(list(codon_seq))
        # 2. Append the gene to the promoter sequence if the sequence is marked as a promoter:
        return seq + start + codon_seq + stop
    else:
        # 1. Generate a random DNA sequence that does not start with an `ATG` codon
        nucleotides = ["A", "T", "G", "C"]
        rand_nucl_seq = np.random.choice(nucleotides,
                                         size=actual_gene_length,
                                         replace=True)
        # 2. Append the padding sequence to the non-promoter sequence
        rand_nucl_seq = "".join(list(rand_nucl_seq))
        return seq + rand_nucl_seq


def append_gene_to_dataset_record(seq: str, label: str, gene_length: int, seed: int = 42):
    match label:
        case 0:
            return dict(sequence=append_gene_to_promoter(seq, gene_length, False, seed), label=label)
        case 1:
            return dict(sequence=append_gene_to_promoter(seq, gene_length, True, seed), label=label)
        case _:
            raise ValueError(f"{label} is not a valid label for a binary classification task")


def append_gene_to_dataset_batch(batch: Dict[str, List[Any]], gene_length: int, seed: int = 42):
    seqs = batch["sequence"]
    labels = batch["label"]
    out = dict(sequence=[], label=[])
    for seq, label in zip(seqs, labels):
        updated_record = append_gene_to_dataset_record(seq, label, gene_length, seed)
        out["sequence"].append(updated_record["sequence"])
        out["label"].append(updated_record["label"])
    return out


def append_genes_to_dataset(dataset_collection: DatasetDict, gene_length: int, seed: int = 42):
    new_dataset_collection = DatasetDict()
    for k in dataset_collection.keys():
        # `batched=True` speeds things up by processing sequences in batches
        new_dataset_collection[k] = dataset_collection[k].map(lambda batch: append_gene_to_dataset_batch(batch,
                                                                                                         gene_length,
                                                                                                         seed),
                                                              batched=True)
    return new_dataset_collection


append_gene_to_promoter(seq, 30, True)
append_gene_to_promoter(seq, 30, False)

'TATAATAATAACGAAGATGAGACGACAGTCGACAAGAAAAGCACCAGCTGTCCCCGCCACATACAAGTATATGAGAAGGGGACGCGGGAGAGCGCCGCGGGGGACCGACGCGCTATTGAGGGGGATGGGTACAAGCGGGGCGGGGAGGCCGGAGCTTTATCCAGGCCAATGAATGGCCACTTGCGATGCCCAATTGCACCAAGCTTGGAGCGCACACTCAACCCCTTCCCCAGCGGTATGCCAAAATTCACCGTCTGAATGGCGTTGGTGCAGGTCGGTACAGAGCTCTCCTGCGCCGAGGCAGGCAAGTGGGGCACCCGTATCCTTTCCAACTTACAAGGGTCCCCGTTGTGCGCCAGAGGAAGTCACTTTATATCCGCGCAC'

In [22]:
promoter_dataset_with_genes_appended = append_genes_to_dataset(promoter_dataset, 30)

Loading cached processed dataset at /home/chris/.cache/huggingface/datasets/csv/default-68212c3a0ebc43dc/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-2e209b8db1408bd3.arrow
Loading cached processed dataset at /home/chris/.cache/huggingface/datasets/csv/default-68212c3a0ebc43dc/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-14fa5a68aaa0ef51.arrow
Loading cached processed dataset at /home/chris/.cache/huggingface/datasets/csv/default-68212c3a0ebc43dc/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-286709a65c82b55a.arrow


In [23]:
promoter_dataset_with_genes_appended

DatasetDict({
    train: Dataset({
        features: ['sequence', 'label'],
        num_rows: 47356
    })
    test: Dataset({
        features: ['sequence', 'label'],
        num_rows: 5920
    })
    val: Dataset({
        features: ['sequence', 'label'],
        num_rows: 5920
    })
})

Note: because each sequence in the Prom300 dataset was exactly 300bp-long, we now know that ORFs for those samples that _were_ marked as actual promoters are in fact fake genes that we appended. We can **prepare a tokenizer for DNA+P representation with this in mind** considering only the protein-encoding part (over the 300bp boundary) as our starting point for the ORF. So we need to consider two things for the tokenizer:
1. Start translating the ORF at 301bp mark (that will be string idx `300`).
2. Only the sequences labeled as containing promoters should have the tokenizer applied to them.

**Very important**: for the test set the DNA+P tokenizer **must not** convert the sequences to DNA+P representation. They should stay in their original form so that we can check whether _the model_ learned to pick up on those promoter sequences better than the baseline.

In [52]:
from copy import deepcopy
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
    NormalizedString,
    PreTokenizedString
)
from itertools import accumulate, islice
from dataclasses import dataclass
from typing import Iterable, Literal, Optional
from Bio.Data.CodonTable import standard_dna_table

protein_alphabet_map = {
    "A": "Ala",
    "C": "Cys",
    "D": "Asp",
    "E": "Glu",
    "F": "Phe",
    "G": "Gly",
    "H": "His",
    "I": "Ile",
    "K": "Lys",
    "L": "Leu",
    "M": "Met",
    "N": "Asp",
    "P": "Pro",
    "Q": "Gln",
    "R": "Arg",
    "S": "Ser",
    "T": "Thr",
    "V": "Val",
    "W": "Trp",
    "Y": "Tyr",
    "*": "*"
}


class Uppercase:
    def __init__(self) -> None:
        super().__init__()

    def normalize(self, normalized: NormalizedString):
        return normalized.uppercase()

    def normalize_str(self, sequence: str):
        return sequence.upper()


def take_n(iterable: Iterable, n: int):
    iterator = iter(iterable)
    while True:
        chunk = "".join(list(islice(iterator, n)))
        if chunk is None:
            break
        yield chunk

@dataclass
class DNAPConfig:
    start_translating_from_idx: int
    _length: int = -1

    @property
    def length(self):
        # if self._length == -1:
            # return -1
        return self._length


    def translate(self, seq: str):
        return dnap_tokenize(seq, dict(start=self.start_translating_from_idx,
                                       end=self.start_translating_from_idx + self.length))


def _wrap_in_angle_brackets(seq: str, wrap_tokens: bool = True):
    return seq if not wrap_tokens else "<" + seq + ">"


def _pre_tokenize(sequence: str,
                  k: int,
                  include_cls: bool,
                  split_on_unknown_base: bool,
                  wrap_tokens: bool,
                  include_ranges: bool,
                  representation: Literal["dna", "dnap"],
                  dnap_config: Optional[DNAPConfig] = None):
    
    def _with_ranges(seq: str, idx_start: int, idx_end: int):
        subslice = seq
        if include_ranges:
            subslice = (seq, idx_start, idx_end)
        return subslice

    def _dnap_chunk_tokenize(seq: str, stop_symbol: str = "*"):
        out = []
        n = 3
        chunks = [seq[i:i+n] for i in range(0, len(seq), n)]
        lookup_table = deepcopy(standard_dna_table.forward_table)
        lookup_table.update({k: stop_symbol for k in standard_dna_table.stop_codons})
        # Replace single-letter identifiers with the 3-letter ones:
        three_letter_lookup_table = {k: protein_alphabet_map[v] for k, v in lookup_table.items()}
        for codon in chunks:
            try:
                aa = three_letter_lookup_table[codon]
                out.append(aa)
            except KeyError:
                out.append(codon)
        return "".join(out)

    if dnap_config is None and representation == "dnap":
        raise ValueError("Missing DNAP representation config!")

    i = 0
    j = k
    n = len(str(sequence))
    slices = [] if not include_cls else ["<CLS>"]
    # for seq_slice in take_n(sequence, k):
    while j <= n:
        seq_slice = sequence[i:j]
        if "N" in seq_slice and split_on_unknown_base:
            idx_of_n = seq_slice.find("N")
            l = 0
            subslices = []
            while l != idx_of_n:
                subslice = _with_ranges(_wrap_in_angle_brackets(seq_slice[l], wrap_tokens), l, l + 1)
                subslices.append(subslice)  # split preceding into single bases
                l += 1
            subslice = _with_ranges(_wrap_in_angle_brackets(seq_slice[idx_of_n], wrap_tokens), idx_of_n, idx_of_n + 1)
            subslices.append(subslice)  # add `N` itself
            slices.extend(subslices)  # add the split-up slices to the main list
            i = i + k - idx_of_n - 1
            j = j + k - idx_of_n - 1
        else:
            if representation == "dnap" and i >= dnap_config.start_translating_from_idx:
                slice_ = _with_ranges(_wrap_in_angle_brackets(_dnap_chunk_tokenize(seq_slice), wrap_tokens), i, j)
            else:
                slice_ = _with_ranges(_wrap_in_angle_brackets(seq_slice, wrap_tokens), i, j)
            slices.append(slice_)
            i += k
            j += k
    return slices


# We use identical tokenization strategy to Nucleotide Transformer
class KMerDNAPreTokenizer:

    def __init__(self,
                 k: int = 6,
                 include_cls: bool = True,
                 wrap_tokens: bool = True,
                 split_on_unknown_base: bool = True,
                 include_ranges: bool = True) -> None:
        """Args:
            `k` (int): how many bases per token
            `include_cls` (bool): whether to include the starting `<CLS>` token or not
            `wrap_tokens` (bool): whether to wrap each token in `<>` angle brackets
            `split_on_unknown_base` (bool): if `True` then each `N` in the sequence will
                                            be treated as a separate token and all preceding
                                            bases that don't align to `k` are split up as
                                            separate tokens
            `include_ranges` (bool): whether token slice ranges should be returned in tuples
        """
        self.k = k
        self.include_cls = include_cls
        self.wrap_tokens = wrap_tokens
        self.split_on_unknown_base = split_on_unknown_base
        self.include_ranges = include_ranges
        super().__init__()


    def pre_tokenize(self, pretok: PreTokenizedString):
        # Note: `pre_tokenize` acts in-place on `pretok`, so we should not return anything here
        pretok.split(lambda idx, x: _pre_tokenize(x,
                                                  self.k,
                                                  self.include_cls,
                                                  self.split_on_unknown_base,
                                                  self.wrap_tokens,
                                                  self.include_ranges,
                                                  representation="dna"))

    def pre_tokenize_str(self, sequence: str):
        return _pre_tokenize(sequence,
                             self.k,
                             self.include_cls,
                             self.split_on_unknown_base,
                             self.wrap_tokens,
                             self.include_ranges,
                             representation="dna")


class KMerDNAPPreTokenizer(KMerDNAPreTokenizer):

    def __init__(self,
                 k: int = 6,
                 include_cls: bool = True,
                 wrap_tokens: bool = True,
                 split_on_unknown_base: bool = True,
                 include_ranges: bool = True,
                 start_idx: int = 301) -> None:
        self.start_idx = start_idx
        super().__init__(k,
                         include_cls,
                         wrap_tokens,
                         split_on_unknown_base,
                         include_ranges)

    def pre_tokenize(self, pretok: PreTokenizedString):
        pretok.split(lambda idx, x: _pre_tokenize(x,
                                                  self.k,
                                                  self.include_cls,
                                                  self.split_on_unknown_base,
                                                  self.wrap_tokens,
                                                  self.include_ranges,
                                                  representation="dnap",
                                                  dnap_config=DNAPConfig(self.start_idx)))

    def pre_tokenize_str(self, sequence: str):
        return _pre_tokenize(sequence,
                             self.k,
                             self.include_cls,
                             self.split_on_unknown_base,
                             self.wrap_tokens,
                             self.include_ranges,
                             representation="dnap",
                             dnap_config=DNAPConfig(self.start_idx))


# normalizer = normalizers.Lowercase()
# normalizer = normalizers.Normalizer.custom(Uppercase())
# pre_tokenizer = pre_tokenizers.PreTokenizer.custom(KMerDNAPreTokenizer(6))
# tokenizer = Tokenizer()
# normalizer = Uppercase()
pre_tokenizer = KMerDNAPreTokenizer(6, include_ranges=False)

# we're using the same example as the Nucleotide Transformer to test:
original = "ACGTGTACNTGCACGGANCGACTAGTCTGA"
actual = pre_tokenizer.pre_tokenize_str(original)
expected = ["<CLS>","<ACGTGT>","<A>","<C>","<N>","<TGCACG>","<G>","<A>","<N>","<CGACTA>","<GTCTGA>"]
print(actual)
print(expected)

# compare the two lists:
print(all(accumulate(map(lambda x: x[0] == x[1], zip(actual, expected, strict=True)), lambda x, y: x and y)))

# then instantiate the other pre-tokenizer:
pre_tokenizer = KMerDNAPPreTokenizer(6, start_idx=30)
original = "ACGTGTACNTGCACGGANCGACTAGTCTGAATGGATGATGATGATTGA"
actual = pre_tokenizer.pre_tokenize_str(original)
expected = ['<CLS>', ('<ACGTGT>', 0, 6), ('<A>', 0, 1), ('<C>', 1, 2), ('<N>', 2, 3), ('<TGCACG>', 9, 15), ('<G>', 0, 1), ('<A>', 1, 2), ('<N>', 2, 3), ('<CGACTA>', 18, 24), ('<GTCTGA>', 24, 30), ('<MD>', 30, 36), ('<DD>', 36, 42), ('<D*>', 42, 48)]
print(actual)
print(expected)
print(all(accumulate(map(lambda x: x[0] == x[1], zip(actual, expected, strict=True)), lambda x, y: x and y)))

['<CLS>', '<ACGTGT>', '<A>', '<C>', '<N>', '<TGCACG>', '<G>', '<A>', '<N>', '<CGACTA>', '<GTCTGA>']
['<CLS>', '<ACGTGT>', '<A>', '<C>', '<N>', '<TGCACG>', '<G>', '<A>', '<N>', '<CGACTA>', '<GTCTGA>']
True
['<CLS>', ('<ACGTGT>', 0, 6), ('<A>', 0, 1), ('<C>', 1, 2), ('<N>', 2, 3), ('<TGCACG>', 9, 15), ('<G>', 0, 1), ('<A>', 1, 2), ('<N>', 2, 3), ('<CGACTA>', 18, 24), ('<GTCTGA>', 24, 30), ('<MetAsp>', 30, 36), ('<AspAsp>', 36, 42), ('<Asp*>', 42, 48)]
['<CLS>', ('<ACGTGT>', 0, 6), ('<A>', 0, 1), ('<C>', 1, 2), ('<N>', 2, 3), ('<TGCACG>', 9, 15), ('<G>', 0, 1), ('<A>', 1, 2), ('<N>', 2, 3), ('<CGACTA>', 18, 24), ('<GTCTGA>', 24, 30), ('<MD>', 30, 36), ('<DD>', 36, 42), ('<D*>', 42, 48)]
False


In [53]:
len(original)

48

**Very important**: for the test set the DNA+P tokenizer **must not** convert the sequences to DNA+P representation. They should stay in their original form so that we can check whether _the model_ learned to pick up on those promoter sequences better than the baseline.

In [54]:
train_dataset = promoter_dataset_with_genes_appended["train"]
val_dataset = promoter_dataset_with_genes_appended["val"]
test_dataset = promoter_dataset_with_genes_appended["test"]

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments
from tokenizers.models import Unigram
from pathlib import Path

vocab_ = ["A", "T", "G", "C"]
vocab_ += list(protein_alphabet_map.values())
vocab = list(zip(vocab_, [1 for i in range(len(vocab_))]))

pre_tokenizer = pre_tokenizers.PreTokenizer.custom(KMerDNAPreTokenizer(6))
tokenizer = Tokenizer(Unigram(vocab=vocab))
tokenizer.pre_tokenizer = pre_tokenizer

tokenizer.encode(original)

In [None]:
model_loader = lambda: AutoModelForSequenceClassification.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref", num_labels=2)
model_save_dir = Path("results/model_nucleotide_transformer")
training_args = TrainingArguments(model_save_dir,
                                  evaluation_strategy="epoch")

In [48]:
from transformers import TrainingArguments, Trainer
from datasets import Dataset
from pathlib import Path
from transformers import EvalPrediction, PreTrainedTokenizerFast
from tokenizers.pre_tokenizers import PreTokenizer
import evaluate

def make_metrics_func(*dataset_load_args):
    def compute_metrics(eval_pred: EvalPrediction):
        accuracy = evaluate.load("accuracy")
        logits, labels = eval_pred
        pred_class = np.argmax(logits, axis=-1)  # take the max-scoring logit as the predicted class ID
        return accuracy.compute(predictions=pred_class,
                                references=labels)
    return compute_metrics

compute_metrics = make_metrics_func()

model_save_dir = Path("results/model_nucleotide_transformer")
training_args = TrainingArguments(model_save_dir,
                                    evaluation_strategy="epoch")


def tokenize(dataset: Dataset):
    fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
    fast_tokenizer._tokenizer.pre_tokenizer=PreTokenizer.custom(KMerDNAPreTokenizer(6))
    fast_tokenizer.save("tok.json")
    fast_tokenizer = PreTrainedTokenizerFast.from_file("tok.json")
    return fast_tokenizer(dataset["sequence"], padding=True)


def training_pipeline(tokenizer,
                      model,
                      training_args,
                      train_dataset,
                      val_dataset):

    tokenized_train_dataset = train_dataset.map(tokenize, batched=True)
    tokenized_val_dataset = val_dataset.map(tokenize, batched=True)

    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    return trainer

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [49]:
trainer = training_pipeline(tokenizer, model_loader(), training_args, train_dataset, val_dataset)
trainer.train()

loading configuration file config.json from cache at /home/chris/.cache/huggingface/hub/models--InstaDeepAI--nucleotide-transformer-500m-human-ref/snapshots/789373eb1adf0dda569e7fe55e4fc2adb15593bc/config.json
Model config EsmConfig {
  "_name_or_path": "InstaDeepAI/nucleotide-transformer-500m-human-ref",
  "architectures": [
    "EsmForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.0,
  "emb_layer_norm_before": false,
  "esmfold_config": null,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1280,
  "initializer_range": 0.02,
  "intermediate_size": 5120,
  "is_folding_model": false,
  "layer_norm_eps": 1e-12,
  "mask_token_id": 2,
  "max_position_embeddings": 1002,
  "model_type": "esm",
  "num_attention_heads": 20,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tie_word_embeddings": false,
  "token_dropout": true,
  "torch_dtype": "float32",
  "transformers_version": "4.30.2",
  "use_cache": false,
  "vocab_list": null,
  "vocab_size": 4

Exception: Error while attempting to pickle Tokenizer: Custom PreTokenizer cannot be serialized