This notebook was developed in Sagemaker Studio for exploration only. The code resulting from this notebook is already integrated to the infra.

In [2]:
!pip install -q --upgrade pip
!pip install -q transformers["torch"]
!pip install -q datasets['s3']
!pip install -q evaluate
!pip install -q seqeval

[0m

In [3]:
import logging

from datasets import load_from_disk
from transformers import (
    LayoutLMForTokenClassification, 
    LayoutLMTokenizerFast,
    Trainer, 
    TrainingArguments, 
    set_seed
)
import numpy as np
import evaluate
import torch

from invoice_reader.model.code.preprocess import preprocess, batched_map_fn

logging.basicConfig(level=logging.getLevelName("INFO"))
LOGGER = logging.getLogger(__name__)

  from .autonotebook import tqdm as notebook_tqdm


# Training

In [8]:
# Seqeval only accepts BIO format
label_list = ["O", "B-S-Name", "B-S-Adress", "B-B-name", "B-B-adress", "B-Total-Net", "B-Total-Gross"]

In [9]:
metric = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
    ]
    results = metric.compute(
        predictions=true_predictions, 
        references=true_labels, 
        mode="strict" 
    )
    print(results)
    return results

In [10]:
def train(
    labels,
    pretrained_model_name,
    dataset_dir,
    output_dir,
    lr,
    epochs,
    per_device_train_batch_size,
    per_device_eval_batch_size,
):
    """"""
    LOGGER.info("Start training script.")

    id2label = {v: k for v, k in enumerate(labels)}
    label2id = {k: v for v, k in enumerate(labels)}

    LOGGER.info(f"Load model and tokenizer from {pretrained_model_name}, with labels {id2label} ")
    tokenizer = LayoutLMTokenizerFast.from_pretrained(pretrained_model_name)
    model = LayoutLMForTokenClassification.from_pretrained(
        pretrained_model_name,
        num_labels=len(labels), 
        label2id=label2id, 
        id2label=id2label
    )

    LOGGER.info("Start preprocessing.")
    preprocessed_dataset = preprocess(
        dataset=load_from_disk(dataset_path=dataset_dir, keep_in_memory=False),
        tokenizer=tokenizer,
        labels_ref=labels
    )

    LOGGER.info(f"Preprocessed dataset: {preprocessed_dataset}")

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        # fp16=args.fp16,
        learning_rate=lr,
        # logging & evaluation strategies
        logging_dir=f"{output_dir}/logs",
        logging_strategy="epoch",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="overall_f1"
    )

    # Create Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=preprocessed_dataset["train"],
        eval_dataset=preprocessed_dataset["test"],
        compute_metrics=compute_metrics
    )

    LOGGER.info("Start training:")
    trainer.train()
    
    # Save
    LOGGER.info("Start saving tokenizer and model into model artifact.")
    tokenizer.save_pretrained(output_dir)
    trainer.model.save_pretrained(output_dir, safe_serialization=True)
    
    LOGGER.info("Training script over.")

In [11]:
train(
    labels = ["O", "S-name", "S-adress", "B-name", "B-adress", "Total net", "Total gross"],
    pretrained_model_name = "microsoft/layoutlm-base-uncased",
    dataset_dir = "s3://invoice-reader-project/data/training/datasets/dataset_ocr_v1/",
    output_dir = "opt/model",
    lr = 3e-5,
    epochs = 0.1,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4
)
    

INFO:__main__:Start training script.
INFO:__main__:Load model and tokenizer from microsoft/layoutlm-base-uncased, with labels {0: 'O', 1: 'S-name', 2: 'S-adress', 3: 'B-name', 4: 'B-adress', 5: 'Total net', 6: 'Total gross'} 
Some weights of LayoutLMForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlm-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'layoutlm.embeddings.word_embeddings.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:__main__:Start preprocessing.
INFO:invoice_reader.model.code.preprocess:Dataset info: DatasetDict({
    train: Dataset({
        features: ['doc_id', 'bboxes', 'words', 'labels', 'original_width', 'original_height'],
        num_rows: 48
    })
    test: Dataset({
        features: ['doc_id', 'bboxes', 'words', 'labels', 'original_width', 'original_height'],
        num_rows: 21
    })
})
Map: 100%|██████████| 4

Epoch,Training Loss,Validation Loss,B-adress,B-name,S-adress,S-name,Total-gross,Total-et,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
0,1.6791,1.044717,"{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 159}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 49}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 153}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 54}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 44}","{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 50}",0.0,0.0,0.0,0.831905


{'B-adress': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 159}, 'B-name': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 49}, 'S-Adress': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 153}, 'S-Name': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 54}, 'Total-Gross': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 44}, 'Total-et': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 50}, 'overall_precision': 0.0, 'overall_recall': 0.0, 'overall_f1': 0.0, 'overall_accuracy': 0.8319050758075148}


  _warn_prf(average, modifier, msg_start, len(result))
INFO:__main__:Start saving tokenizer and model into model artifact.
INFO:__main__:Training script over.


# Inference

In [4]:
trained_model_path = "opt/model/"
dataset_uri ="s3://invoice-reader-project/data/training/datasets/dataset_ocr_v1/"

tokenizer = LayoutLMTokenizerFast.from_pretrained(trained_model_path)
model = LayoutLMForTokenClassification.from_pretrained(trained_model_path)
dataset = load_from_disk(dataset_uri, keep_in_memory=False)

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



In [5]:
labels = ["O", "S-name", "S-adress", "B-name", "B-adress", "Total net", "Total gross"]
inputs = dataset["train"][:2]
encoding, word_ids = batched_map_fn(inputs, tokenizer, labels, inference_mode=True)
encoding.convert_to_tensors("pt")["input_ids"].shape
encoding["input_ids"]

tensor([[ 101, 1999, 6767,  ...,    0,    0,    0],
        [ 101, 1999, 6767,  ...,    0,    0,    0]])

In [6]:
output = model(**encoding)
logits = output.logits
logits.shape

torch.Size([2, 512, 7])

In [7]:
pred_tags = [[model.config.id2label[t.item()] for t in logits_batch] for logits_batch in output.logits.argmax(-1)]
pred_tags

[['S-name',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
 

In [8]:
tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])

['[CLS]',
 'in',
 '##vo',
 '##ice',
 'no',
 ':',
 '62',
 '##7',
 '##8',
 '##80',
 '##12',
 'date',
 'of',
 'issue',
 ':',
 'seller',
 ':',
 'barker',
 '-',
 'fletcher',
 '46',
 '##8',
 '##9',
 'pamela',
 'court',
 'suite',
 '38',
 '##4',
 'courtney',
 '##chester',
 ',',
 'ut',
 '248',
 '##53',
 'tax',
 'id',
 ':',
 '97',
 '##7',
 '-',
 '94',
 '-',
 '1841',
 'ib',
 '##an',
 ':',
 'gb',
 '##44',
 '##tf',
 '##da',
 '##45',
 '##6',
 '##32',
 '##9',
 '##8',
 '##7',
 '##32',
 '##29',
 '##48',
 'items',
 'no',
 '.',
 'description',
 '1',
 '.',
 'vintage',
 'antique',
 'brass',
 'table',
 'top',
 'wine',
 'bottle',
 'opener',
 '/',
 'wood',
 'handle',
 '2',
 '.',
 'wall',
 'mounted',
 'upside',
 'down',
 'wine',
 'bottle',
 'rack',
 'go',
 '##ble',
 '##t',
 'glass',
 'holder',
 'storage',
 'organizer',
 '3',
 '}',
 ',',
 'wine',
 'spoon',
 'summary',
 'total',
 'q',
 '##ty',
 '2',
 ',',
 '00',
 '5',
 ',',
 '00',
 '1',
 ',',
 '00',
 'va',
 '##t',
 '[',
 '%',
 ']',
 '10',
 '%',
 '03',
 '/',
 '12

In [9]:
encoding.word_ids()

[None,
 0,
 0,
 0,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 3,
 4,
 5,
 5,
 6,
 6,
 7,
 7,
 7,
 8,
 8,
 8,
 9,
 10,
 11,
 12,
 12,
 13,
 13,
 13,
 14,
 15,
 15,
 16,
 17,
 17,
 18,
 18,
 18,
 18,
 18,
 18,
 19,
 19,
 19,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 21,
 22,
 22,
 23,
 24,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 33,
 34,
 35,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 43,
 43,
 44,
 45,
 46,
 47,
 48,
 48,
 48,
 49,
 50,
 51,
 52,
 53,
 53,
 54,
 54,
 54,
 55,
 55,
 55,
 56,
 56,
 56,
 57,
 57,
 58,
 58,
 58,
 59,
 59,
 60,
 60,
 60,
 60,
 60,
 61,
 62,
 63,
 64,
 65,
 65,
 66,
 66,
 67,
 68,
 69,
 70,
 70,
 71,
 72,
 73,
 74,
 74,
 75,
 76,
 76,
 77,
 78,
 78,
 79,
 79,
 79,
 79,
 79,
 79,
 80,
 81,
 82,
 82,
 82,
 83,
 83,
 83,
 84,
 84,
 84,
 85,
 86,
 87,
 87,
 87,
 88,
 89,
 89,
 89,
 90,
 91,
 92,
 92,
 92,
 93,
 93,
 93,
 94,
 94,
 94,
 95,
 95,
 96,
 96,
 96,
 97,
 97,
 98,
 98,
 99,
 99,
 100,
 100,
 101,
 101,
 101,
 102,
 103,
 103,
 10

In [10]:
scores, _ = torch.max(logits.softmax(axis=-1), dim=-1)
scores = scores.tolist()

In [11]:
bboxes = encoding["bbox"].tolist()
bboxes

[[[0, 0, 0, 0],
  [82, 32, 161, 43],
  [82, 32, 161, 43],
  [82, 32, 161, 43],
  [170, 35, 203, 43],
  [170, 35, 203, 43],
  [213, 33, 322, 43],
  [213, 33, 322, 43],
  [213, 33, 322, 43],
  [213, 33, 322, 43],
  [213, 33, 322, 43],
  [82, 62, 124, 72],
  [131, 61, 148, 72],
  [155, 61, 205, 72],
  [155, 61, 205, 72],
  [82, 190, 151, 202],
  [82, 190, 151, 202],
  [87, 218, 215, 227],
  [87, 218, 215, 227],
  [87, 218, 215, 227],
  [86, 233, 127, 242],
  [86, 233, 127, 242],
  [86, 233, 127, 242],
  [135, 233, 194, 242],
  [201, 233, 246, 242],
  [253, 233, 294, 242],
  [301, 233, 331, 242],
  [301, 233, 331, 242],
  [86, 248, 228, 260],
  [86, 248, 228, 260],
  [86, 248, 228, 260],
  [237, 249, 258, 257],
  [265, 248, 316, 257],
  [265, 248, 316, 257],
  [85, 279, 115, 288],
  [122, 279, 140, 288],
  [122, 279, 140, 288],
  [149, 279, 254, 288],
  [149, 279, 254, 288],
  [149, 279, 254, 288],
  [149, 279, 254, 288],
  [149, 279, 254, 288],
  [149, 279, 254, 288],
  [87, 294, 129, 303

In [12]:
# Result DICT for all tokens
batched_tags = []
for batch_idx, input_ids in enumerate(encoding["input_ids"]):
    tags = []
    previous_word_idx = 0
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    word_ids = encoding.word_ids(batch_index=batch_idx)
    word = ""
    for i, word_idx in enumerate(word_ids):
        if word_idx is None:
            continue
        elif previous_word_idx != word_idx:
            # +1 because of [CLS] token
            tags.append({
                "word": word, 
                "label": pred_tags[batch_idx][previous_word_idx + 1],
                "score": scores[batch_idx][previous_word_idx + 1],
                "bbox": bboxes[batch_idx][previous_word_idx + 1]
            })
            word = tokens[i]
        else:
            word += tokens[i] 
        previous_word_idx = word_idx
    for tag in tags:
        tag["word"] = tag["word"].replace("##", "")
    batched_tags.append(tags)

batched_tags

[[{'word': 'invoice',
   'label': 'O',
   'score': 0.40164637565612793,
   'bbox': [82, 32, 161, 43]},
  {'word': 'no:',
   'label': 'O',
   'score': 0.35825634002685547,
   'bbox': [82, 32, 161, 43]},
  {'word': '62788012',
   'label': 'O',
   'score': 0.37732383608818054,
   'bbox': [82, 32, 161, 43]},
  {'word': 'date',
   'label': 'O',
   'score': 0.3978097438812256,
   'bbox': [170, 35, 203, 43]},
  {'word': 'of',
   'label': 'O',
   'score': 0.3641010820865631,
   'bbox': [170, 35, 203, 43]},
  {'word': 'issue:',
   'label': 'O',
   'score': 0.39645540714263916,
   'bbox': [213, 33, 322, 43]},
  {'word': 'seller:',
   'label': 'O',
   'score': 0.3103733956813812,
   'bbox': [213, 33, 322, 43]},
  {'word': 'barker-fletcher',
   'label': 'O',
   'score': 0.3559877574443817,
   'bbox': [213, 33, 322, 43]},
  {'word': '4689',
   'label': 'O',
   'score': 0.32526642084121704,
   'bbox': [213, 33, 322, 43]},
  {'word': 'pamela',
   'label': 'O',
   'score': 0.3951438367366791,
   'bbox

In [15]:
# Result DICT for labels only
# for tags in batched_tags:
#     for tag in tags:
#         if tag["label"] != "O":

result = [[tag for tag in tags] for tags in batched_tags if tag["label"] == "O"]
result

[[{'word': 'invoice',
   'label': 'O',
   'score': 0.40164637565612793,
   'bbox': [82, 32, 161, 43]},
  {'word': 'no:',
   'label': 'O',
   'score': 0.35825634002685547,
   'bbox': [82, 32, 161, 43]},
  {'word': '62788012',
   'label': 'O',
   'score': 0.37732383608818054,
   'bbox': [82, 32, 161, 43]},
  {'word': 'date',
   'label': 'O',
   'score': 0.3978097438812256,
   'bbox': [170, 35, 203, 43]},
  {'word': 'of',
   'label': 'O',
   'score': 0.3641010820865631,
   'bbox': [170, 35, 203, 43]},
  {'word': 'issue:',
   'label': 'O',
   'score': 0.39645540714263916,
   'bbox': [213, 33, 322, 43]},
  {'word': 'seller:',
   'label': 'O',
   'score': 0.3103733956813812,
   'bbox': [213, 33, 322, 43]},
  {'word': 'barker-fletcher',
   'label': 'O',
   'score': 0.3559877574443817,
   'bbox': [213, 33, 322, 43]},
  {'word': '4689',
   'label': 'O',
   'score': 0.32526642084121704,
   'bbox': [213, 33, 322, 43]},
  {'word': 'pamela',
   'label': 'O',
   'score': 0.3951438367366791,
   'bbox