In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path

from tqdm.notebook import tqdm

In [None]:
import os
# Set loguru's log level through the environment.
# NOTE(review): presumably this has to run before loguru is imported in the next cell — confirm.
os.environ['LOGURU_LEVEL'] = 'INFO'

In [None]:
import logging

from loguru import logger

class InterceptHandler(logging.Handler):
    """Standard-library logging handler that forwards every record to loguru.

    Installed on the root logger so that messages emitted through ``logging``
    are re-emitted through loguru's ``logger``.
    """

    def emit(self, record):
        """Re-emit ``record`` through loguru.

        Args:
            record: the ``logging.LogRecord`` to forward.
        """
        # Get corresponding Loguru level if it exists
        try:
            level = logger.level(record.levelname).name
        except ValueError:
            # Level name unknown to loguru: fall back to the numeric level.
            level = record.levelno

        # Find caller from where originated the logged message
        frame, depth = logging.currentframe(), 2
        # `f_back` is None at the bottom of the stack; stop there instead of
        # raising AttributeError on `None.f_code` inside the log handler.
        while frame is not None and frame.f_code.co_filename == logging.__file__:
            frame = frame.f_back
            depth += 1

        logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage())

# Install InterceptHandler as the only root handler; level=0 lets records of every level through to it.
logging.basicConfig(handlers=[InterceptHandler()], level=0)

In [None]:
from datasets import load_from_disk

# Load the dataset stored in the 'icdar-0.3' directory (relative to the working directory).
# Later cells use its 'train', 'val' and 'test' splits.
icdar_dataset = load_from_disk('icdar-0.3')
icdar_dataset

In [None]:
import os

# Directory the token-classification model is loaded from. The default is the
# original author's local path; set the ICDAR_MODEL_DIR environment variable to
# point at your own copy instead of editing the notebook.
model_dir = os.environ.get('ICDAR_MODEL_DIR', '/Users/janneke/models/results-0.3')
# Name of the base model; used below to load the matching tokenizer.
model_name = 'bert-base-multilingual-cased'

In [None]:
from transformers import AutoTokenizer

# Tokenizer is loaded by base-model name (`model_name`); the model weights are loaded from `model_dir` in a later cell.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Source: https://huggingface.co/docs/transformers/custom_datasets#token-classification-with-wnut-emerging-entities
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and attach per-token label ids.

    Uses the notebook-level ``tokenizer``. Unlike the Hugging Face example this
    was adapted from, sub-tokens after the first one are NOT masked: every
    sub-token of a word receives that word's tag. Only tokens without a word id
    (special tokens) are set to -100.

    Args:
        examples: batch mapping with a "tokens" column (list of word lists) and
            a "tags" column (list of per-word tag lists).

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        label ids per example.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        # Special tokens (word id None) -> -100; every other token -> tag of its word.
        labels.append([-100 if word_idx is None else label[word_idx] for word_idx in word_ids])

    tokenized_inputs["labels"] = labels

    return tokenized_inputs

In [None]:
# Tokenize every split and add the aligned "labels" column; batched=True hands the function whole batches of examples.
tokenized_icdar = icdar_dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
from transformers import DataCollatorForTokenClassification

# Batch collator for token classification, built on the tokenizer above and passed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments

# NOTE(review): no cell visible here calls trainer.train(); the Trainer is only used for predict().
training_args = TrainingArguments(
    output_dir='./results-0.3',          # output directory
    evaluation_strategy="epoch",
    num_train_epochs=3,
)

# Load the model from the local directory; 3 labels, matching the tags 0/1/2 handled in the cells below.
model = AutoModelForTokenClassification.from_pretrained(model_dir, num_labels=3)

trainer = Trainer(
    model=model,                         # the model loaded from model_dir
    args=training_args,                  # training arguments, defined above
    train_dataset=tokenized_icdar['train'],         # training dataset
    eval_dataset=tokenized_icdar['val'],            # evaluation dataset
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Run the model on the tokenized test split; `pred.predictions` is consumed by convert_predictions below.
pred = trainer.predict(tokenized_icdar['test'])

In [None]:
# Inspect the raw prediction array (convert_predictions takes the argmax over its axis 2).
pred.predictions

In [None]:
from datautils import generate_data

# Evaluation data directory; the path is relative to the notebook's working directory.
in_dir = Path('../../data/ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_evaluation_4M_without_Finnish')
# data_test is looked up by text key further down (data_test[key].input_tokens);
# X_test is passed on to generate_sentences in the next cell.
data_test, X_test = generate_data(in_dir)

In [None]:
from datautils import generate_sentences

# NOTE(review): size=35 / step=30 presumably mean windows of 35 tokens with a stride of 30 — confirm in datautils.
# NOTE(review): test_data is not referenced by any later cell visible here.
test_data = generate_sentences(X_test, data_test, size=35, step=30)

In [None]:
from collections import defaultdict, Counter

def convert_predictions(samples, pred):
    """Collapse sub-token predictions into one tag per word.

    The samples are re-tokenized with the notebook-level ``tokenizer`` to
    recover which word each sub-token belongs to. The predicted labels of all
    sub-tokens of a word are then reduced to a single tag:

    * 1 if label 1 was predicted at least once and at least as often as 2,
    * 2 if label 2 was predicted at least once and more often than 1,
    * 0 otherwise (label 0 never outvotes a 1 or 2).

    Args:
        samples: dataset split; ``samples["tokens"]`` must give the word lists
            and iterating must yield rows with 'key' and 'start_token_id'.
        pred: object whose ``predictions`` attribute is an array indexed as
            (sample, token, label); the argmax over axis 2 is the label id.

    Returns:
        ``defaultdict(dict)`` mapping ``key -> {start_token_id: [tag, ...]}``
        with one tag per word that has at least one sub-token.
    """
    tokenized_samples = tokenizer(samples["tokens"], truncation=True, is_split_into_words=True)

    # convert predictions to labels (label_ids)
    predicted_ids = np.argmax(pred.predictions, axis=2)

    converted = defaultdict(dict)

    for i, (sample, token_preds) in enumerate(zip(samples, predicted_ids)):
        word_ids = tokenized_samples.word_ids(batch_index=i)  # Map tokens to their respective word.

        # Group predicted labels per word; tokens without a word id are skipped.
        # zip() stops at the shorter sequence, so extra positions in
        # token_preds beyond len(word_ids) are ignored.
        preds_per_word = defaultdict(list)
        for word_idx, p_label in zip(word_ids, token_preds):
            if word_idx is not None:
                preds_per_word[word_idx].append(p_label)

        new_tags = []
        for word_preds in preds_per_word.values():
            counts = Counter(word_preds)
            new_tag = 0
            if counts[1] > 0 and counts[1] >= counts[2]:
                new_tag = 1
            elif counts[2] > 0 and counts[2] >= counts[1]:
                new_tag = 2

            new_tags.append(new_tag)

        converted[sample['key']][sample['start_token_id']] = new_tags

    return converted


# One tag per word for the test split, grouped as result[key][start_token_id] -> list of tags.
result = convert_predictions(icdar_dataset['test'], pred)

In [None]:
# Display the converted predictions (the whole mapping is rendered).
result

In [None]:
import json

# Save the per-word predictions to disk.
# Note: json.dump writes dictionary keys as strings, so the start-token ids come back
# as strings on reload; the aggregation loop further down converts them with int().
with open('condensed_predictions_task1.json', 'w') as f:
    json.dump(result, f)

In [None]:
import json

# Reload the saved predictions; this replaces the in-memory `result` with the JSON version.
with open('condensed_predictions_task1.json', 'r') as f:
    result = json.load(f)

In [None]:
import re

def extract_icdar_output(label_str, input_tokens):
    """Turn a per-token label string into error spans.

    ``label_str`` holds one character per token: '0', '1' or '2'. A span is a
    '1' followed by any number of '2's. A run of '2's that does not follow a
    '1' is also reported as a span, treating its first '2' as a '1'. That
    covers runs after a '0' and runs at the very start of the string.

    Args:
        label_str: string of '0'/'1'/'2' characters, one per token.
        input_tokens: sequence indexable by token position; each item must
            expose a ``start`` attribute, which is used as the span position.

    Returns:
        dict mapping ``'<start>:<number of tokens>'`` to an empty dict, one
        entry per span.
    """
    text_output = {}

    # Correct use of 2 (always following a 1)
    for match in re.finditer(r'12*', label_str):
        num_tokens = len(match.group())
        idx = input_tokens[match.start()].start
        text_output[f'{idx}:{num_tokens}'] = {}

    # Incorrect use of 2 (run not preceded by a 1) -> interpret first 2 as 1.
    # The negative lookbehind makes the match start on the first 2 of the run,
    # and unlike r'02+' it also catches a run at position 0 of the string.
    for match in re.finditer(r'(?<![12])2+', label_str):
        num_tokens = len(match.group())
        idx = input_tokens[match.start()].start
        text_output[f'{idx}:{num_tokens}'] = {}

    return text_output

#label_str = '12200010011120020222'
#output = extract_icdar_output(label_str, data['DE/DE3/1988.txt'].input_tokens)
#output

In [None]:
from collections import defaultdict

# Merge the per-window predictions of every text into one label per token and
# extract the error spans. output[key] -> {'<start>:<num_tokens>': {}}
output = {}

for key, preds in result.items():
    # Only the lookup is guarded: a KeyError raised further down would be a
    # real bug and must not be reported as "No data found".
    try:
        text = data_test[key]
    except KeyError:
        logger.warning(f'No data found for text {key}')
        continue

    # Collect every label predicted for each absolute token position; a token
    # can appear under more than one start offset. int() is needed because the
    # start offsets are strings after the JSON round trip.
    labels = defaultdict(list)
    for start, lbls in preds.items():
        for i, label in enumerate(lbls):
            labels[int(start)+i].append(label)

    # One character per token, with priority 2 > 1 > 0 among the collected labels.
    label_str = []
    for i, token in enumerate(text.input_tokens):
        if 2 in labels[i]:
            label_str.append('2')
        elif 1 in labels[i]:
            label_str.append('1')
        else:
            label_str.append('0')
    label_str = ''.join(label_str)

    output[key] = extract_icdar_output(label_str, text.input_tokens)



In [None]:
# Display the extracted spans per text (the whole mapping is rendered).
output

In [None]:
import json

# Write the spans per text to results_task1.json, the file passed to the evaluation script in the next cell.
with open('results_task1.json', 'w') as f:
    json.dump(output, f)

In [None]:
# Run the evaluation script on the results file written above.
# NOTE(review): arguments look like <data dir> <results json> <output csv> — inferred from the names, confirm against the script.
!python evalTool_ICDAR2017.py ../../data/ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_evaluation_4M_without_Finnish results_task1.json results_task1.csv