[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/jvdzwaan/ocrpostcorrection/blob/main/colab/icdar-task1-hf-train.ipynb)

In [None]:
# Mount Google Drive at /mntDrive; the dataset is read from, and the training
# results are written to, paths under /mntDrive/MyDrive in the cells below.
from google.colab import drive
drive.mount('/mntDrive')

In [None]:
!git clone https://github.com/jvdzwaan/ocrpostcorrection.git

In [None]:
!pip install ./ocrpostcorrection

In [None]:
!pip install datasets

In [None]:
# List the top level of the mounted Drive folder (presumably a quick check that
# the mount succeeded and the dataset directory is present).
!ls /mntDrive/MyDrive/

In [None]:
from datasets import load_from_disk

# Location of the saved DatasetDict on the mounted Drive.
# NOTE(review): the "seq_len-150" suffix presumably refers to the sequence
# length used when the dataset was created — confirm.
dataset_dir = '/mntDrive/MyDrive/icdar-seq_len-150'

# Load the saved dataset (train / val / test splits) from disk.
icdar_dataset = load_from_disk(dataset_dir)

In [None]:
# Display the loaded DatasetDict: its splits, feature names and row counts.
icdar_dataset

DatasetDict({
    train: Dataset({
        features: ['key', 'start_token_id', 'score', 'tokens', 'tags', 'language'],
        num_rows: 23796
    })
    val: Dataset({
        features: ['key', 'start_token_id', 'score', 'tokens', 'tags', 'language'],
        num_rows: 2651
    })
    test: Dataset({
        features: ['key', 'start_token_id', 'score', 'tokens', 'tags', 'language'],
        num_rows: 7010
    })
})

In [None]:
# Filter the dataset: keep only examples whose 'score' is strictly below 0.3.
# The filter runs on every split of the DatasetDict.
# NOTE(review): 'score' is presumably an OCR error measure — confirm.
def _has_low_score(example):
    """Return True when the example's 'score' value is under 0.3."""
    return example['score'] < 0.3


icdar_dataset = icdar_dataset.filter(_has_low_score)

100%|██████████| 24/24 [00:13<00:00,  1.77ba/s]
100%|██████████| 3/3 [00:01<00:00,  2.02ba/s]
100%|██████████| 8/8 [00:03<00:00,  2.11ba/s]


In [None]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint; used here for the tokenizer and again
# further down when the token-classification model is instantiated.
model_name = 'bert-base-multilingual-cased'

# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
from ocrpostcorrection.token_classification import tokenize_and_align_labels

# Build the preprocessing callable once, bound to the tokenizer.
# NOTE(review): presumably it sub-word tokenizes 'tokens' and aligns 'tags' to
# the resulting pieces — confirm in ocrpostcorrection.
align_labels_fn = tokenize_and_align_labels(tokenizer)

# Apply it to every split, passing examples to the callable in batches.
tokenized_icdar = icdar_dataset.map(align_labels_fn, batched=True)

100%|██████████| 24/24 [01:42<00:00,  4.28s/ba]
100%|██████████| 3/3 [00:15<00:00,  5.21s/ba]
100%|██████████| 8/8 [00:43<00:00,  5.42s/ba]


In [None]:
from transformers import DataCollatorForTokenClassification

# Collator handed to the Trainer below to assemble batches.
# NOTE(review): presumably pads inputs and labels to a common length per
# batch — see the transformers documentation.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
from transformers import (
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Training configuration. Evaluation and checkpointing both happen once per
# epoch, and the best checkpoint is reloaded when training finishes.
# NOTE(review): no metric_for_best_model is configured, so "best" is presumably
# judged on evaluation loss — confirm.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='/mntDrive/MyDrive/results-0.3',          # output directory (on Drive)
    evaluation_strategy="epoch",
    num_train_epochs=3,
    load_best_model_at_end=True,
    save_strategy='epoch',
    per_device_train_batch_size=16
)

# Seed the RNGs BEFORE building the model: the token-classification head added
# on top of the pretrained encoder is randomly initialised, and the Trainer
# only seeds once it is constructed, which is too late for this initialisation.
set_seed(training_args.seed)

# Pretrained encoder plus a fresh 3-class token-classification head.
# NOTE(review): 3 presumably matches the number of distinct values in the
# dataset's 'tags' column — confirm.
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=3)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=tokenized_icdar['train'],         # training dataset
    eval_dataset=tokenized_icdar['val'],            # evaluation dataset
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

In [None]:
# Persist the model currently held by the trainer.
# NOTE(review): with no argument this presumably writes to the output_dir from
# TrainingArguments ('/mntDrive/MyDrive/results-0.3') — confirm.
trainer.save_model()