In [None]:
from google.colab import drive 
# Mount Google Drive at /mntDrive; files then live under /mntDrive/MyDrive/.
drive.mount('/mntDrive')

In [None]:
!pip install transformers datasets

In [None]:
# List the contents of the mounted MyDrive folder (quick visual check of what is there).
!ls /mntDrive/MyDrive/

In [None]:
from datasets import load_from_disk

# Load a dataset that was previously saved to disk on the mounted Drive.
# NOTE(review): other cells index the result with 'train' and 'val', so it is
# presumably a DatasetDict with those splits -- confirm.
icdar_dataset = load_from_disk('/mntDrive/MyDrive/icdar-0.3')

In [None]:
# Bare last expression: show the dataset's repr as the cell output.
icdar_dataset

In [None]:
from transformers import AutoTokenizer

# Hub checkpoint identifier; kept in a variable so the same name can be reused
# when loading anything else that must match this checkpoint.
model_name = 'bert-base-multilingual-cased'

# Tokenizer belonging to the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Source: https://huggingface.co/docs/transformers/custom_datasets#token-classification-with-wnut-emerging-entities
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level tags to sub-word tokens.

    Relies on the module-level ``tokenizer``.

    Args:
        examples: A batch with a ``"tokens"`` entry (one list of words per
            example) and a ``"tags"`` entry (one label id per word, indexed
            by word position).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry:
        one list per example with one value per produced token. The first
        token of each word carries that word's tag; special tokens and the
        remaining sub-word pieces of a word carry ``-100`` (the index that
        PyTorch's cross-entropy loss ignores by default).
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special token (no source word): excluded from the loss.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First token of a new word: use the word's tag.
                label_ids.append(label[word_idx])
            else:
                # Continuation piece of the same word: excluded from the loss.
                label_ids.append(-100)
            # Track the current word so continuation pieces can be detected.
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels

    return tokenized_inputs

In [None]:
# Run the tokenize/align function over the dataset; batched=True hands the
# function a batch of examples per call instead of one example at a time.
tokenized_icdar = icdar_dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
from transformers import DataCollatorForTokenClassification

# Token-classification collator built from the tokenizer.
# NOTE(review): expected to pad inputs and labels per batch -- see the transformers docs.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments, set_seed

# Fine-tuning configuration. Evaluation and checkpointing both run once per
# epoch; `load_best_model_at_end` requires the two strategies to match.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` and `Trainer(tokenizer=...)` to `processing_class` -- the
# names below must match the installed version.
training_args = TrainingArguments(
    output_dir='/mntDrive/MyDrive/results-0.3',          # output directory
    evaluation_strategy="epoch",
    num_train_epochs=3,
    load_best_model_at_end=True,
    save_strategy='epoch',
    per_device_train_batch_size=16
)

# Seed the RNGs *before* building the model: the classification head is newly
# initialised by `from_pretrained`, and the Trainer only applies its seed after
# the model already exists. Without this, runs are not reproducible.
set_seed(training_args.seed)

# NOTE(review): num_labels=3 must equal the number of distinct tag ids in the
# dataset's "tags" column -- confirm against the dataset.
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=3)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=tokenized_icdar['train'],         # training dataset
    eval_dataset=tokenized_icdar['val'],            # evaluation dataset
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

In [None]:
# Persist the trained model. No path is passed, so the Trainer's default
# location is used (per the Trainer API, the `output_dir` of its TrainingArguments).
trainer.save_model()