In [1]:
%%capture
!pip install opendatasets
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install seqeval

In [2]:
import opendatasets as od

od.download("https://www.kaggle.com/datasets/urbikn/sroie-datasetv2")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: a
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/urbikn/sroie-datasetv2
Downloading sroie-datasetv2.zip to ./sroie-datasetv2


100%|██████████| 834M/834M [00:11<00:00, 78.2MB/s]





In [3]:
import os
from pathlib import Path

len(os.listdir("sroie-datasetv2/SROIE2019/train/img")), len(os.listdir("sroie-datasetv2/SROIE2019/test/img"))

(626, 347)

In [46]:
from PIL import Image
import pandas as pd
import json

labels_list = ['O', 'B-company', 'I-company', 'B-date', 'I-date', 'B-address', 'I-address', 'B-total', 'I-total']
ids2labels = {k: v for k, v in enumerate(labels_list)}
labels2ids = {v: k for k, v in enumerate(labels_list)}

def create_dataframe(base_path):
  box_path = base_path + "/box"
  img_path = base_path + "/img"
  entities_path = base_path + "/entities"

  image_path_list = []
  words_list = []
  bboxes_list = []
  ner_tags_list = []

  for file in os.listdir(img_path):
    id = Path(file).stem
    box_file_path = box_path + "/" + id + ".txt"
    entities_file_path = entities_path + "/" + id + ".txt"
    bboxes = []
    words = []
    ner_tags = []

    tags_file = open(entities_file_path)
    tags = json.load(tags_file)

    try:
      width, height = Image.open(img_path + "/" + file).size
      with open(box_file_path, "r") as f:
        for line in f:
          line = line.replace('\n', '')
          split = line.split(',')
          try:
            bboxes.append([int(split[0]) / width * 1000, int(split[1]) / height * 1000, int(split[6]) / width * 1000, int(split[7]) / height * 1000])
            words.append(','.join(split[8:]).replace('\n', ''))
          except:
            pass

      for word in words:
        if tags.get('company', '').startswith(word):
          ner_tags.append(labels2ids['B-company'])
        elif word in tags.get('company', ''):
          ner_tags.append(labels2ids['I-company'])
        elif tags.get('date', '').startswith(word):
          ner_tags.append(labels2ids['B-date'])
        elif word in tags.get('date', ''):
          ner_tags.append(labels2ids['I-date'])
        elif tags.get('address', '').startswith(word):
          ner_tags.append(labels2ids['B-address'])
        elif word in tags.get('address', ''):
          ner_tags.append(labels2ids['I-address'])
        elif tags.get('total', '').startswith(word):
          ner_tags.append(labels2ids['B-total'])
        elif word in tags.get('total', ''):
          ner_tags.append(labels2ids['I-total'])
        else:
          ner_tags.append(labels2ids['O'])
      bboxes_list.append(bboxes)
      words_list.append(words)
      ner_tags_list.append(ner_tags)
      image_path_list.append(img_path + "/" + file)
    except:
      pass

  return pd.DataFrame({'image_path': image_path_list, 'words': words_list, 'bboxes': bboxes_list, 'ner_tags': ner_tags_list})

In [47]:
train = create_dataframe("sroie-datasetv2/SROIE2019/train")
test = create_dataframe("sroie-datasetv2/SROIE2019/test")

In [48]:
len(train), len(test)

(626, 346)

In [49]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)

train_dataset, test_dataset

(Dataset({
     features: ['image_path', 'words', 'bboxes', 'ner_tags'],
     num_rows: 626
 }),
 Dataset({
     features: ['image_path', 'words', 'bboxes', 'ner_tags'],
     num_rows: 346
 }))

In [50]:
train_dataset.save_to_disk("train")
test_dataset.save_to_disk("test")

Saving the dataset (0/1 shards):   0%|          | 0/626 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/346 [00:00<?, ? examples/s]

In [51]:
from transformers import AutoProcessor

model_id = "microsoft/layoutlmv3-base"

processor = AutoProcessor.from_pretrained(model_id, apply_ocr=False)



In [52]:
def prepare_examples(examples):
  images = [Image.open(path).convert('RGB') for path in examples["image_path"]]
  words = examples["words"]
  bboxes = examples["bboxes"]
  ner_tags = examples["ner_tags"]
  encoding = processor(images, words, boxes=bboxes, word_labels=ner_tags, padding="max_length", truncation=True)
  return encoding

In [53]:
from datasets import Features, Sequence, Value, Array2D, Array3D

features = Features({
    'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'labels': Sequence(feature=Value(dtype='int64')),
})

train_dataset = train_dataset.map(prepare_examples, batched=True, remove_columns=train_dataset.column_names, batch_size=32, features=features)
test_dataset = test_dataset.map(prepare_examples, batched=True, remove_columns=test_dataset.column_names, batch_size=32, features=features)

Map:   0%|          | 0/626 [00:00<?, ? examples/s]

Map:   0%|          | 0/346 [00:00<?, ? examples/s]

In [54]:
train_dataset.save_to_disk("processed_train")
test_dataset.save_to_disk("processed_test")

Saving the dataset (0/1 shards):   0%|          | 0/626 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/346 [00:00<?, ? examples/s]

In [55]:
train_dataset.set_format("torch")

In [56]:
import evaluate

seqeval = evaluate.load("seqeval")

In [58]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [labels_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [labels_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [59]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    model_id,
    id2label=ids2labels,
    label2id=labels2ids,
)

Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
training_args = TrainingArguments(output_dir="checkpoints",
                                  num_train_epochs=3,
                                  eval_steps=100,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="f1",
                                  seed=42,
                                  data_seed=42)



In [61]:
from transformers.data.data_collator import default_data_collator

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=processor,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)

max_steps is given, it will override any value given in num_train_epochs


In [62]:
trainer.train()



Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
100,No log,0.305529,0.627102,0.524834,0.571429,0.904954
200,No log,0.227159,0.693698,0.65149,0.671932,0.927655
300,No log,0.210203,0.694689,0.752483,0.722432,0.932754
400,No log,0.17395,0.761805,0.754553,0.758162,0.94499
500,0.291400,0.166084,0.76292,0.769868,0.766378,0.946117
600,0.291400,0.1545,0.763454,0.75745,0.76044,0.942682
700,0.291400,0.152775,0.769857,0.782285,0.776021,0.94719
800,0.291400,0.145886,0.780771,0.779801,0.780286,0.946063
900,0.291400,0.147509,0.777002,0.783113,0.780045,0.948586
1000,0.133500,0.143281,0.795503,0.776076,0.785669,0.95084




TrainOutput(global_step=1000, training_loss=0.212427734375, metrics={'train_runtime': 853.1292, 'train_samples_per_second': 2.344, 'train_steps_per_second': 1.172, 'total_flos': 527220135936000.0, 'train_loss': 0.212427734375, 'epoch': 3.194888178913738})

In [None]:
trainer.evaluate()