In [None]:
!pip install datasets --quiet
!pip install evaluate --quiet
!pip install seqeval --quiet
! pip install -U accelerate --quiet
! pip install -U transformers --quiet

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import evaluate
from itertools import chain

from transformers import pipeline,AutoTokenizer,AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification

from sklearn.model_selection import train_test_split

import torch
import datasets
from datasets import load_dataset

import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('TalkFile_ner_2.csv.csv')
df.shape

(47959, 4)

In [None]:
df.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, I-geo..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, O, O,..."


In [None]:
df['Tag'] = df['Tag'].apply(lambda x: eval(x))
list_all_tag = df['Tag'].to_list()

In [None]:
list_labels = ['O'] + [i for i in list(set(chain.from_iterable(list_all_tag))) if i !='O']
label2ind = {}
ind2label = {}
for ind,i in enumerate(list_labels):
    label2ind[i]=ind
    ind2label[ind]=i

In [None]:
print(label2ind)
print(ind2label)

{'O': 0, 'B-art': 1, 'B-gpe': 2, 'I-org': 3, 'I-geo': 4, 'B-per': 5, 'B-tim': 6, 'I-gpe': 7, 'I-per': 8, 'I-art': 9, 'I-tim': 10, 'B-geo': 11, 'I-eve': 12, 'B-org': 13, 'I-nat': 14, 'B-eve': 15, 'B-nat': 16}
{0: 'O', 1: 'B-art', 2: 'B-gpe', 3: 'I-org', 4: 'I-geo', 5: 'B-per', 6: 'B-tim', 7: 'I-gpe', 8: 'I-per', 9: 'I-art', 10: 'I-tim', 11: 'B-geo', 12: 'I-eve', 13: 'B-org', 14: 'I-nat', 15: 'B-eve', 16: 'B-nat'}


In [None]:
labels_ind_list = df['Tag'].apply(lambda x:
                [label2ind[i] for i in x]
               ).to_list()

text_list = df['Sentence'].apply(lambda x:x.split(' ')).to_list()

data_dict = {'id':list(range(len(text_list))),'tokens':text_list,'ner_tags':labels_ind_list}

In [None]:
new_df = pd.DataFrame(data_dict)
new_df.head()

Unnamed: 0,id,tokens,ner_tags
0,0,"[Thousands, of, demonstrators, have, marched, ...","[0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 11, 0, 0..."
1,1,"[Families, of, soldiers, killed, in, the, conf...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,"[They, marched, from, the, Houses, of, Parliam...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 4, 0]"
3,3,"[Police, put, the, number, of, marchers, at, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,4,"[The, protest, comes, on, the, eve, of, the, a...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 13..."


In [None]:
model = AutoModelForTokenClassification.from_pretrained('distilbert/distilbert-base-uncased',
                                                        num_labels=len(label2ind),
                                                        id2label=ind2label,
                                                        label2id=label2ind)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
for name, param in model.named_parameters():
    #print(name)
    if name.startswith("distilbert.embeddings"):
        param.requires_grad = False
        print(name)

distilbert.embeddings.word_embeddings.weight
distilbert.embeddings.position_embeddings.weight
distilbert.embeddings.LayerNorm.weight
distilbert.embeddings.LayerNorm.bias


In [None]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.

                label_ids.append(label[word_idx])

            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert/distilbert-base-uncased', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
train_df, test_df = train_test_split(new_df, test_size=0.2, random_state=42)

In [None]:
dataset_dict = datasets.DatasetDict()
dataset_dict['train'] = datasets.Dataset.from_pandas(train_df)
dataset_dict['test'] = datasets.Dataset.from_pandas(test_df)

tokenized_datasets = dataset_dict.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/38367 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/9592 [00:00<?, ? examples/s]

In [None]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', '__index_level_0__'],
        num_rows: 38367
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', '__index_level_0__'],
        num_rows: 9592
    })
})

In [None]:
example = tokenized_datasets['train'][0]
print(example)

{'id': 7707, 'tokens': ['The', '58-year-old', 'former', 'analyst', 'says', 'he', 'provided', 'information', 'to', 'an', 'official', 'at', 'the', 'Israeli', 'embassy', 'and', 'to', 'two', 'members', 'of', 'a', 'lobbying', 'group', 'called', 'the', 'American', 'Israel', 'Public', 'Affairs', 'Committee', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 13, 3, 3, 0], '__index_level_0__': 7707, 'input_ids': [101, 1996, 5388, 1011, 2095, 1011, 2214, 2280, 12941, 2758, 2002, 3024, 2592, 2000, 2019, 2880, 2012, 1996, 5611, 8408, 1998, 2000, 2048, 2372, 1997, 1037, 19670, 2177, 2170, 1996, 2137, 3956, 2270, 3821, 2837, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 0, 0, -100, -100, -100, -100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 13, 3, 3, 0, -100]}


In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
seqeval = evaluate.load('seqeval')
labels = [ind2label[i] for i in example[f"ner_tags"]]

In [None]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [ind2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [ind2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
training_args = TrainingArguments(
    output_dir=".",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1185,0.112733,0.797282,0.815018,0.806053,0.96507
2,0.0939,0.103964,0.824702,0.815197,0.819922,0.968119
3,0.0797,0.101116,0.816273,0.826409,0.82131,0.968607
4,0.0655,0.106772,0.820382,0.823863,0.822119,0.968377
5,0.0562,0.111727,0.826212,0.83101,0.828605,0.969378
6,0.0472,0.11809,0.821411,0.830474,0.825918,0.968942
7,0.0394,0.123766,0.818118,0.831457,0.824734,0.968655
8,0.0338,0.12989,0.816288,0.831457,0.823803,0.968631
9,0.0288,0.137471,0.82019,0.832574,0.826336,0.968789
10,0.0254,0.138434,0.819557,0.832261,0.825861,0.968688


TrainOutput(global_step=23980, training_loss=0.06230978241953082, metrics={'train_runtime': 2041.2342, 'train_samples_per_second': 187.96, 'train_steps_per_second': 11.748, 'total_flos': 4381322689087914.0, 'train_loss': 0.06230978241953082, 'epoch': 10.0})

In [None]:
def tokenize_and_align_labels2(examples):
    tokenized_inputs = tokenizer2(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.

                label_ids.append(label[word_idx])

            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
dataset = load_dataset("wikitext", name="wikitext-2-raw-v1", split="train")
text = [example['text'] for example in dataset]
tokenizer2 = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
tokenizer2.train_new_from_iterator(text,vocab_size=tokenizer.vocab_size)

DistilBertTokenizerFast(name_or_path='distilbert/distilbert-base-uncased', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
tokenizer2.save_pretrained('custom_bert_tokenizer')

('custom_bert_tokenizer/tokenizer_config.json',
 'custom_bert_tokenizer/special_tokens_map.json',
 'custom_bert_tokenizer/vocab.txt',
 'custom_bert_tokenizer/added_tokens.json',
 'custom_bert_tokenizer/tokenizer.json')

In [None]:
tokenizer2 = AutoTokenizer.from_pretrained('custom_bert_tokenizer')
data_collator2 = DataCollatorForTokenClassification(tokenizer=tokenizer2)

In [None]:
training_args = TrainingArguments(
    output_dir=".",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer2,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0581,0.110079,0.823618,0.828107,0.825856,0.968971
2,0.0444,0.122983,0.818547,0.82842,0.823454,0.968334
3,0.0345,0.130208,0.806774,0.829983,0.818214,0.967253
4,0.0355,0.133629,0.819328,0.830162,0.824709,0.968401
5,0.0299,0.145208,0.825599,0.831278,0.828429,0.969124
6,0.0249,0.150145,0.816886,0.826811,0.821819,0.968071
7,0.0199,0.158884,0.821724,0.828956,0.825324,0.968521
8,0.0164,0.165582,0.814782,0.830251,0.822444,0.968167
9,0.0142,0.172355,0.819207,0.834137,0.826605,0.968765
10,0.0115,0.17331,0.819886,0.832485,0.826137,0.968626


TrainOutput(global_step=23980, training_loss=0.02881605040539892, metrics={'train_runtime': 2047.4621, 'train_samples_per_second': 187.388, 'train_steps_per_second': 11.712, 'total_flos': 4381322689087914.0, 'train_loss': 0.02881605040539892, 'epoch': 10.0})

## MLM

In [None]:
from torch.utils.data import Dataset

class PretrainingDataset(Dataset):
    def __init__(self, texts, tokenizer, texts_pair=None, max_length=512):
        super().__init__()

        self.texts = texts
        self.texts_pair = texts_pair
        self.tokenizer = tokenizer
        self.max_length = max_length

        if self.texts_pair is not None:
            assert len(self.texts) == len(self.texts_pair)

    def __len__(self):
        return len(self.texts)

    def tokenize(self, text, text_pair=None):
        return self.tokenizer(
            text=text,
            text_pair=text_pair,
            max_length=self.max_length,
            truncation=True,
            padding=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_special_tokens_mask=True,
            return_token_type_ids=False,
            return_offsets_mapping=False,
            return_tensors=None,
        )

    def __getitem__(self, index):
        text = self.texts[index]

        text_pair = None
        if self.texts_pair is not None:
            text_pair = self.texts_pair[index]

        tokenized = self.tokenize(text)

        return tokenized

In [None]:
text = df['Sentence'].values
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_length = 512

In [None]:
dataset = PretrainingDataset(
    texts=text,
    tokenizer=tokenizer,
    max_length = max_length
)

In [None]:
dataset[2]

{'input_ids': [101, 2027, 9847, 2013, 1996, 3506, 1997, 3323, 2000, 1037, 8320, 1999, 11804, 2380, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'special_tokens_mask': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]}

In [None]:
from transformers import AutoModelForMaskedLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

In [None]:
model = AutoModelForMaskedLM.from_pretrained(model_name)

In [None]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [None]:
training_args = TrainingArguments(
    output_dir=".",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

----------------------------------------------------------

In [None]:
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling,DataCollatorForWholeWordMask

In [None]:
class TokenizedSentencesDataset:
  def __init__(self, sentences, tokenizer, max_length, cache_tokenization=False):
      self.tokenizer = tokenizer
      self.sentences = sentences
      self.max_length = max_length
      self.cache_tokenization = cache_tokenization

  def __getitem__(self, item):
      if not self.cache_tokenization:
          return self.tokenizer(self.sentences[item], add_special_tokens=True, truncation=True, max_length=self.max_length, return_special_tokens_mask=True)

      if isinstance(self.sentences[item], str):
          self.sentences[item] = self.tokenizer(self.sentences[item], add_special_tokens=True, truncation=True, max_length=self.max_length, return_special_tokens_mask=True)
      return self.sentences[item]

  def __len__(self):
      return len(self.sentences)

In [None]:
max_length = 100
mlm_prob=0.15
train_dataset = TokenizedSentencesDataset(df['Sentence'].to_list()[:260], tokenizer2, max_length)
dev_dataset = TokenizedSentencesDataset(df['Sentence'].to_list()[260:], tokenizer2, max_length, cache_tokenization=True) if len(df['Sentence'].to_list()[:260]) > 0 else None


In [None]:
do_whole_word_mask = True
if do_whole_word_mask:
  data_collator = DataCollatorForWholeWordMask(tokenizer=tokenizer2, mlm=True, mlm_probability=mlm_prob)
else:
  data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer2, mlm=True, mlm_probability=mlm_prob)

In [None]:
model3 = AutoModelForMaskedLM.from_pretrained("distilbert/distilbert-base-uncased")

training_args = TrainingArguments(
    output_dir= ".",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_gpu_train_batch_size= 16,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model3,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset
)

trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss


TrainOutput(global_step=85, training_loss=2.1811097986557906, metrics={'train_runtime': 9.6943, 'train_samples_per_second': 134.099, 'train_steps_per_second': 8.768, 'total_flos': 14372529096096.0, 'train_loss': 2.1811097986557906, 'epoch': 5.0})

In [None]:
model3.save_pretrained('./saved_model3')

In [None]:
model4 = AutoModelForTokenClassification.from_pretrained(
    'saved_model3', num_labels=17, id2label=ind2label, label2id=label2ind
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at saved_model3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
for name, param in model4.named_parameters():
    if name.startswith("distilbert.embeddings"):
        param.requires_grad = False

dataset_dict = datasets.DatasetDict()
dataset_dict['train'] = datasets.Dataset.from_pandas(train_df)
dataset_dict['test'] = datasets.Dataset.from_pandas(test_df)

tokenized_dataset = dataset_dict.map(tokenize_and_align_labels2, batched=True)

training_args = TrainingArguments(
    output_dir=".",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model4,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer2,
    data_collator=data_collator2,
    compute_metrics=compute_metrics,
)

trainer.train()

Map:   0%|          | 0/38367 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/9592 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1188,0.113265,0.793459,0.813946,0.803572,0.964902
2,0.0937,0.103346,0.819606,0.815688,0.817642,0.968186
3,0.0802,0.101042,0.81407,0.82498,0.819489,0.968368
4,0.0662,0.106284,0.819839,0.825918,0.822867,0.968741
5,0.0569,0.111659,0.822402,0.827839,0.825111,0.968913
6,0.0477,0.117609,0.823522,0.829224,0.826363,0.969224
7,0.0401,0.121716,0.822582,0.830519,0.826532,0.969033
8,0.0344,0.128697,0.817051,0.830117,0.823532,0.968703
9,0.0298,0.134961,0.819896,0.833155,0.826472,0.968918
10,0.0261,0.136259,0.820802,0.833378,0.827042,0.969043


TrainOutput(global_step=23980, training_loss=0.06316071264538992, metrics={'train_runtime': 2032.8716, 'train_samples_per_second': 188.733, 'train_steps_per_second': 11.796, 'total_flos': 4381322689087914.0, 'train_loss': 0.06316071264538992, 'epoch': 10.0})