[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/jvdzwaan/ocrpostcorrection/blob/main/colab/icdar-task1-hf-evaluation.ipynb)

In [None]:
# Make Google Drive available to this Colab runtime. The dataset, the model
# directory and the results directory used further down all live under this
# mount point.
from google.colab import drive

mount_point = '/mntDrive'
drive.mount(mount_point)

In [None]:
# Fetch the ocrpostcorrection sources into ./ocrpostcorrection; the package is
# installed from that directory in the next cell.
!git clone https://github.com/jvdzwaan/ocrpostcorrection.git

In [None]:
!pip install ./ocrpostcorrection

In [None]:
!pip install datasets

In [None]:
from datasets import load_from_disk

# ICDAR dataset stored on the mounted Drive.
# When running locally, point this at
# '../../data/ocrpostcorrection/icdar-seq_len-150' instead.
dataset_path = '/mntDrive/MyDrive/icdar-seq_len-150'
icdar_dataset = load_from_disk(dataset_path)

In [None]:
# Keep only the first few examples of every split so the notebook runs quickly
# end to end. Raise this number (or skip this cell) for a full evaluation run.
max_examples_per_split = 5

for split in icdar_dataset.keys():
    # Clamp to the split size: selecting indices beyond the end of a split
    # raises an error, which would break the cell on very small splits.
    n_keep = min(max_examples_per_split, len(icdar_dataset[split]))
    icdar_dataset[split] = icdar_dataset[split].select(range(n_keep))

In [None]:
# Name of the base model; the tokenizer is loaded by this name.
model_name = "bert-base-multilingual-cased"

# Directory on the mounted Drive from which the model weights are loaded.
# When running locally, use '/Users/janneke/models/results-seq_len-150-0.3'
# instead.
model_dir = "/mntDrive/MyDrive/results-seq_len-150-0.3"

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer identified by `model_name`; the model weights themselves
# are loaded separately from `model_dir`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
from ocrpostcorrection.token_classification import tokenize_and_align_labels

# `tokenize_and_align_labels(tokenizer)` produces the callable that `map`
# applies to the dataset; with `batched=True` it is called on batches of
# examples rather than on single examples.
tokenized_icdar = icdar_dataset.map(tokenize_and_align_labels(tokenizer), batched=True)

Loading cached processed dataset at ../../data/ocrpostcorrection/icdar-seq_len-150/train/cache-545442fc28fa86aa.arrow
Loading cached processed dataset at ../../data/ocrpostcorrection/icdar-seq_len-150/val/cache-062ede93cdb3e997.arrow
Loading cached processed dataset at ../../data/ocrpostcorrection/icdar-seq_len-150/test/cache-fcd3d83d9a6f0814.arrow


In [None]:
from transformers import DataCollatorForTokenClassification

# Data collator built from the tokenizer; it is handed to the Trainer as
# `data_collator` when the Trainer is created.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments

# Arguments required to construct a Trainer. In the cells shown in this
# notebook the trainer is only used for `predict`.
training_args = TrainingArguments(
    output_dir=model_dir,
    evaluation_strategy="epoch",
    num_train_epochs=3,
)

# Token-classification model with 3 output labels, loaded from `model_dir`.
model = AutoModelForTokenClassification.from_pretrained(model_dir, num_labels=3)

# Collect the Trainer inputs in one place before instantiating it.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_icdar['train'],
    eval_dataset=tokenized_icdar['val'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer = Trainer(**trainer_kwargs)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

In [None]:
# Run the model on the tokenized test split; `pred.predictions` is written to
# disk further down.
pred = trainer.predict(tokenized_icdar['test'])

The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, score, start_token_id, tags, language, key.
***** Running Prediction *****
  Num examples = 5
  Batch size = 8
  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
from pathlib import Path

# Directory on the mounted Drive that receives the predictions and the
# evaluation results.
out_dir = Path('/mntDrive/MyDrive/results/icdar-seq_len-150')

# Create it together with any missing parents; an existing directory is fine.
out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Save the raw model outputs so the evaluation cells can run without the model
import numpy as np

# Spell out the `.npy` suffix instead of relying on `np.save` appending it
# implicitly: `out_file` then names the file that is actually written, and it
# matches the path the "Load predictions" cell reads.
out_file = out_dir/'predictions.npy'

np.save(out_file, pred.predictions)

# The cells below can be run locally

In [None]:
# Read the saved model outputs back from disk
import numpy as np

in_file = out_dir / 'predictions.npy'
predictions = np.load(in_file)

In [None]:
from ocrpostcorrection.icdar_data import generate_data

# Local, relative path to the competition data (the directory name suggests
# the ICDAR 2019 evaluation set without Finnish).
# NOTE: `Path` is not imported in this cell; it comes from the earlier cell
# that creates `out_dir`.
in_dir = Path('../../data/ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_evaluation_4M_without_Finnish')
# `data_test` is passed to `predictions2icdar_output` below and `in_dir` is
# reused by `runEvaluation`; `X_test` is not used in the cells shown here.
data_test, X_test = generate_data(in_dir)

11it [00:30,  2.74s/it]


In [None]:
from ocrpostcorrection.utils import predictions2icdar_output, predictions_to_labels

# First turn the raw model outputs into labels, then combine them with the
# tokenized test split, the tokenizer and the test data to build the output
# that is written to JSON below.
predicted_labels = predictions_to_labels(predictions)
output = predictions2icdar_output(
    tokenized_icdar['test'],
    predicted_labels,
    tokenizer,
    data_test,
)

In [None]:
import json

# Write the task 1 output as JSON into the results directory.
out_file = out_dir / 'results_task1_unfiltered_test_data.json'

with out_file.open('w') as json_handle:
    json.dump(output, json_handle)

In [None]:
# Same path as `out_file`, with its suffix replaced by `.csv`.
csv_file = out_file.with_suffix('.csv')

In [None]:
from ocrpostcorrection.utils import runEvaluation

# Run the evaluation on the JSON results in `out_file`, using the data in `in_dir`.
# NOTE(review): presumably the per-file scores are written to `csv_file`
# (it is read back by `aggregate_results` in the next cell) — confirm.
runEvaluation(in_dir, out_file, csv_file)

File	NbTokens	NbErroneousTokens	NbSymbolsConsidered	T1_Precision	T1_Recall	T1_Fmesure	T2_AvgLVDistOriginal	T2_AvgLVDistCorrected
SL/SL1/29.txt	2	2	533	1.00	0.74	0.85	0.99	0.99
SL/SL1/15.txt	83	28	447	0.45	0.90	0.60	0.14	0.14
SL/SL1/14.txt	115	50	721	0.45	0.84	0.59	0.13	0.13
SL/SL1/28.txt	273	21	1453	0.12	0.94	0.21	0.03	0.03


In [None]:
from ocrpostcorrection.utils import aggregate_results

# Aggregate the scores stored in `csv_file` and print them as a Markdown
# table, so the text can be copied as-is.
results = aggregate_results(csv_file)
markdown_table = results.to_markdown()
print(markdown_table)

| language   |   T1_Precision |   T1_Recall |   T1_Fmesure |
|:-----------|---------------:|------------:|-------------:|
| SL         |          0.505 |       0.855 |       0.5625 |
