In [1]:
# Mount Google Drive inside the Colab VM at /content/drive.
# force_remount=True requests a fresh mount even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [1]:
# Remove any previous checkout so the `git clone` in the next cell starts from a clean directory.
!rm -rf tmp-master/

In [2]:
# Fetch the project sources into ./tmp-master.
# NOTE(review): CODE_PATH (set a few cells below) points at "../trainer", not at this clone —
# confirm which copy of the code is meant to be imported.
!git clone https://github.com/israelcamp/tmp-master.git

Cloning into 'tmp-master'...
remote: Enumerating objects: 188, done.[K
remote: Counting objects: 100% (188/188), done.[K
remote: Compressing objects: 100% (140/140), done.[K
remote: Total 188 (delta 85), reused 148 (delta 45), pack-reused 0[K
Receiving objects: 100% (188/188), 5.07 MiB | 18.21 MiB/s, done.
Resolving deltas: 100% (85/85), done.


In [3]:
# Install the third-party dependencies into the kernel's environment (-q keeps pip quiet).
# TODO(review): no versions are pinned, so a fresh run may pull releases that differ from
# the ones this notebook was developed against.
%pip install -q transformers imagecorruptions pytorch-ignite neptune sentencepiece evaluate jiwer

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m104.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m97.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.8/266.8 kB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m448.1/448.1 kB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m90.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m120.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
# Directory that is appended to sys.path below and that also holds the tokenizer files.
# The commented-out value is the Google Drive location of the same code.
# CODE_PATH = "/content/drive/MyDrive/Mestrado/Dev/devcodes/deberta"
CODE_PATH = "../trainer"

In [2]:
import sys
# Make the project modules under CODE_PATH (datamodule, model, ctc, igmetrics) importable.
sys.path.append(CODE_PATH)

In [3]:
import os

import torch
import torchvision as tv
from transformers import AutoTokenizer, T5Tokenizer

from ignite.engine import (
    Engine,
    Events,
)
from ignite.handlers import ModelCheckpoint, Checkpoint
from ignite.contrib.handlers import global_step_from_engine
from ignite.contrib.handlers import ProgressBar
from ignite.contrib.handlers.neptune_logger import NeptuneLogger

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from datamodule import SROIETask2DataModule
from model import CNNSmall, OCRModel, AbstractCNN, ImageFeatureExtractor
from model.roberta import RobertaEncoderSmall
from ctc import GreedyDecoder
from igmetrics import ExactMatch, WordF1

In [5]:
# Load the tokenizer stored next to the project code; the commented lines are
# alternative tokenizers that were tried.
tokenizer = AutoTokenizer.from_pretrained(f"{CODE_PATH}/sroie-tokenizers/tokenizer-pad0")
# tokenizer = AutoTokenizer.from_pretrained(f"{CODE_PATH}/tokenizer-t5")
# tokenizer = AutoTokenizer.from_pretrained("t5-small")
# tokenizer = T5Tokenizer("/content/drive/MyDrive/Mestrado/Dev/devcodes/nm_spm.model")
# Greedy decoder built with the pad token id — the same id the CTC loss below uses as its blank.
decoder = GreedyDecoder(tokenizer.pad_token_id)

In [6]:
# Inspect the special-token ids and the vocabulary size of the loaded tokenizer.
tokenizer.eos_token_id, tokenizer.pad_token_id, tokenizer.vocab_size

(1, 0, 77)

In [7]:
# Some tokenizers define no end-of-sequence token; in that case reuse the separator token for it.
if tokenizer.eos_token_id is None:
    tokenizer.eos_token = tokenizer.sep_token
    tokenizer.eos_token_id = tokenizer.sep_token_id

In [8]:
# Confirm which token (and id) ends up being used as end-of-sequence.
tokenizer.eos_token, tokenizer.eos_token_id

('</s>', 1)

# Loader

In [10]:
# Folder containing the SROIE Task 2 label files (data.json, testdata.json) and archives.
DATA_PATH = "../data/SROIETask2"
# Pooling mode forwarded to both datamodules; the alternative value is kept commented below.
# NOTE(review): what "mine" vs "crnn" do is defined in the datamodule code, not shown here.
POOLER_MODE = "mine" 
# POOLER_MODE = "crnn"

In [11]:
# !unzip {DATA_PATH}/data.zip

In [24]:
# Datamodule that provides the training and validation dataloaders used by the Ignite engines.
# NOTE(review): root_dir repeats the DATA_PATH prefix as a literal; keep the two in sync.
# NOTE(review): argument meanings (height, val_pct, do_pool, ...) are defined by
# SROIETask2DataModule, which is not shown here — presumably height is the resize height of
# the crops and val_pct the validation fraction; confirm in the datamodule.
dm = SROIETask2DataModule(
    root_dir="../data/SROIETask2/data",
    label_file=os.path.join(DATA_PATH, "data.json"),
    tokenizer=tokenizer,
    height=32,
    num_workers=4,
    train_bs=4,
    valid_bs=4,
    val_pct=0.1,
    max_width=None,
    do_pool=True,
    pooler_mode=POOLER_MODE
)

In [25]:
# Run the datamodule's setup for the "fit" stage before any dataloader is requested.
dm.setup("fit")

# Model

In [26]:
from torch import nn
from model.cnn import Feature2Embedding

class CNNSmallDropout(AbstractCNN):
    """
    Small CNN feature extractor with 2-D dropout after every convolutional stage.

    ``image_embeddings`` chains four ``self.block`` stages (3 -> 64 -> 128 ->
    256 -> 512), each followed by ``nn.Dropout2d(0.15)``, and ends with
    ``Feature2Embedding``. ``lm_head`` is a linear layer from 512 features to
    ``self.vocab_size`` outputs.

    Args:
        vocab_size: forwarded to ``AbstractCNN``; also the output size of ``lm_head``.
    """

    def __init__(
        self,
        vocab_size: int = 100,
    ):
        super().__init__(vocab_size=vocab_size)

        # Positional arguments and `st` value handed to self.block for each stage
        # (presumably in/out channels and stride — self.block lives in AbstractCNN, not shown).
        stage_specs = (
            (3, 64, (2, 2)),
            (64, 128, (2, 2)),
            (128, 256, (2, 1)),
            (256, 512, (4, 1)),
        )

        layers = []
        for first, second, st in stage_specs:
            layers += [self.block(first, second, st=st), nn.Dropout2d(0.15)]
        layers.append(Feature2Embedding())

        self.image_embeddings = nn.Sequential(*layers)
        self.lm_head = nn.Linear(512, self.vocab_size)

In [27]:
from transformers import RobertaConfig, RobertaForTokenClassification
from transformers import DebertaV2ForTokenClassification, DebertaV2Config

class RobertaEncoderSmall(torch.nn.Module):
    """
    Small, randomly initialised RoBERTa token classifier fed with image embeddings.

    The encoder has 3 layers, 8 heads and hidden size 512 and emits ``vocab_size``
    logits per position. The pad/eos token ids are read from the notebook-level
    ``tokenizer``.

    Note: this definition rebinds the ``RobertaEncoderSmall`` name that was imported
    from ``model.roberta`` earlier in the notebook.
    """

    def __init__(self, vocab_size=100):
        super().__init__()

        roberta_config = RobertaConfig(
            architectures=["RobertaForTokenClassification"],
            num_labels=vocab_size,
            attention_probs_dropout_prob=0.15,
            bos_token_id=0,
            eos_token_id=tokenizer.eos_token_id,
            hidden_act="gelu",
            hidden_dropout_prob=0.15,
            hidden_size=512,
            initializer_range=0.02,
            intermediate_size=768,
            layer_norm_eps=1e-05,
            max_position_embeddings=514,
            model_type="roberta",
            num_attention_heads=8,
            num_hidden_layers=3,
            pad_token_id=tokenizer.pad_token_id,
            type_vocab_size=1,
            vocab_size=vocab_size,
        )
        self.encoder = RobertaForTokenClassification(roberta_config)

    def forward(self, image_embeddings, attention_mask=None):
        """Feed the embeddings as ``inputs_embeds`` and return the per-position logits."""
        return self.encoder(
            inputs_embeds=image_embeddings, attention_mask=attention_mask
        ).logits

class AbstractTransformersEncoder(torch.nn.Module):
    """
    Randomly initialised DeBERTa-v2 token classifier fed with image embeddings.

    Args:
        vocab_size: number of output labels (and the config's ``vocab_size``).
        config_dict: optional overrides merged on top of the base configuration
            built by ``_get_config_dict``. ``None`` (the default) or an empty
            dict means "no overrides".

    The pad/eos token ids are read from the notebook-level ``tokenizer``.
    """

    def __init__(self, vocab_size: int = 100, config_dict: dict = None):
        super().__init__()
        self.vocab_size = vocab_size
        config_dict = self._get_config_dict(config_dict)
        config = DebertaV2Config(**config_dict)
        self.encoder = DebertaV2ForTokenClassification(config)

    def _get_config_dict(self, config_dict):
        """Return the base DeBERTa-v2 settings with ``config_dict`` applied on top.

        A fresh dict is built on every call, so the caller's ``config_dict`` is
        never modified. ``config_dict`` may be ``None``.
        """
        base_config_dict = {
            "model_type": "deberta-v2",
            "architectures": ["DebertaV2ForTokenClassification"],
            "num_labels": self.vocab_size,
            "attention_probs_dropout_prob": 0.15,
            "hidden_act": "gelu",
            "hidden_dropout_prob": 0.15,
            "hidden_size": 512,
            "initializer_range": 0.02,
            "intermediate_size": 768,  # 3072,
            "max_position_embeddings": 512,
            "relative_attention": True,
            "position_buckets": 64,  # TODO: Maybe less?
            "norm_rel_ebd": "layer_norm",
            "share_att_key": True,
            "pos_att_type": "p2c|c2p",
            "layer_norm_eps": 1e-7,
            "max_relative_positions": -1,
            "position_biased_input": True,
            "num_attention_heads": 8,
            "num_hidden_layers": 3,
            "type_vocab_size": 0,
            "pad_token_id": tokenizer.pad_token_id,
            "eos_token_id": tokenizer.eos_token_id,
            "vocab_size": self.vocab_size,
        }
        # None / {} both mean "use the base settings unchanged".
        if config_dict:
            base_config_dict.update(config_dict)
        return base_config_dict

    def forward(self, image_embeddings, attention_mask=None):
        """Feed the embeddings as ``inputs_embeds`` and return the per-position logits."""
        outputs = self.encoder(
            inputs_embeds=image_embeddings, attention_mask=attention_mask
        )
        return outputs.logits

In [28]:
# Assemble the OCR model from the CNN feature extractor and the DeBERTa-v2 token classifier.
# NOTE(review): vocab_size is hard-coded to 100 although the tokenizer cell above reports a
# vocabulary of 77; the commented-out tokenizer.vocab_size is the alternative.
vis_model = CNNSmallDropout(vocab_size=100)#tokenizer.vocab_size)
rec_model = AbstractTransformersEncoder(vocab_size=100)#tokenizer.vocab_size)
model = OCRModel(vis_model, rec_model)

In [29]:
# Smoke test on a single training batch, without gradients: run the visual model, then both
# the transformer head and the CNN LM head on its output, and compare the logits' shapes.
batch = next(iter(dm.train_dataloader()))
with torch.no_grad():
    images, labels, attention_mask, attention_image = batch
    images_embedding = model.visual_model(images)
    logits = model.rec_model(images_embedding, attention_mask=attention_image)
    cnn_lm_logits = model.cnn_lm(images_embedding)
logits.shape, cnn_lm_logits.shape

(torch.Size([4, 86, 100]), torch.Size([4, 86, 100]))

In [43]:
# Decode the sample batch with the current weights; returns (predictions, references).
# NOTE(review): `val_step` and `device` are defined further down (Ignite section), so this
# cell fails on a fresh top-to-bottom run — it only works after those cells were executed.
val_step(None, batch)

(['21.1', '1.1', 'TA  AT', 'TA )'],
 ['21.20', '14.80', "CUSTOMER'S COPY", 'TOTAL AMT INCL GST @ 6% :'])

# Ignite

In [30]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [31]:
# Move the model to the selected device; assigning to `_` keeps the cell from echoing the model.
_ = model.to(device)

In [32]:
# Dataloaders consumed by the validation evaluator and the trainer engine respectively.
val_loader = dm.val_dataloader()
train_loader = dm.train_dataloader()

In [33]:
# Number of passes over the training loader.
MAX_EPOCHS=30
# Total number of training iterations; used as the horizon of the cosine LR schedule,
# which is stepped once per iteration in train_step.
STEPS = len(train_loader) * MAX_EPOCHS
STEPS

255570

In [34]:
# Adam without weight decay.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=1e-4,
    weight_decay=0,
)
# Cosine decay of the learning rate down to 1e-6 over all STEPS training iterations.
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=STEPS,
    eta_min=1e-6,
)
# CTC loss whose blank symbol is the tokenizer's pad id; infinite losses are zeroed out.
criterion = torch.nn.CTCLoss(
    blank=tokenizer.pad_token_id,
    zero_infinity=True,
)

In [35]:
def get_preds_from_logits(logits, attention_image, labels):
    """Greedy-decode ``logits`` and return ``(predicted_texts, reference_texts)``.

    Args:
        logits: class scores with the classes on the last axis, normally
            ``(batch, time, classes)``. Unbatched ``(time, classes)`` logits are
            accepted as a batch of one, and a singleton axis in front of
            ``(batch, time, classes)`` is dropped.
        attention_image: per-sample mask; each row is passed to the notebook-level
            ``decoder`` together with that sample's arg-max ids.
        labels: reference token ids, decoded with the notebook-level ``tokenizer``.

    Returns:
        Tuple ``(y_pred, y)`` of two lists of strings, special tokens removed.
    """
    decoded_ids = logits.argmax(-1)
    if decoded_ids.ndim > 2:
        # Keep the historical behaviour of dropping a singleton leading axis.
        decoded_ids = decoded_ids.squeeze(0)
    elif decoded_ids.ndim == 1:
        # Unbatched input: add the batch axis. (Squeezing first, as the previous version did,
        # turned single-frame inputs into a 0-d tensor that cannot be iterated.)
        decoded_ids = decoded_ids.unsqueeze(0)

    decoded = [
        decoder(dec, att) for dec, att in zip(decoded_ids, attention_image)
    ]
    y_pred = tokenizer.batch_decode(decoded, skip_special_tokens=True)
    y = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return y_pred, y

In [36]:
def train_step(engine, batch):
    """Run one optimisation step on ``batch`` and return the CTC loss as a float.

    ``batch`` yields (images, labels, attention_mask, attention_image). Uses the
    notebook-level ``model``, ``optimizer``, ``lr_scheduler``, ``criterion`` and
    ``device``. ``engine`` is supplied by Ignite and is not used.
    """
    model.train()
    optimizer.zero_grad()

    images, labels, attention_mask, attention_image = (
        tensor.to(device) for tensor in batch
    )

    features = model.visual_model(images)
    scores = model.rec_model(features, attention_mask=attention_image)

    # CTCLoss takes time-major log-probabilities, so swap the first two axes
    # before normalising over the class axis.
    log_probs = scores.permute(1, 0, 2).log_softmax(2)

    # Valid lengths are the number of set entries in each mask row.
    frames_per_sample = attention_image.sum(-1)
    tokens_per_sample = attention_mask.sum(-1)

    loss = criterion(log_probs, labels, frames_per_sample, tokens_per_sample)
    loss.backward()

    # An auxiliary CTC loss on model.cnn_lm(features), weighted by 0.5, was tried
    # at this point and is currently disabled.

    # Clip the global gradient norm to 1.0 before updating the weights.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    optimizer.step()
    # The LR schedule advances once per iteration, not once per epoch.
    lr_scheduler.step()
    return loss.item()

In [37]:
def val_step(engine, batch):
    """Evaluate one batch and return ``(predicted_texts, reference_texts)``.

    ``batch`` yields (images, labels, attention_mask, attention_image). The model
    is switched to eval mode and run without gradient tracking; decoding is
    delegated to ``get_preds_from_logits``. ``engine`` is supplied by Ignite and
    is not used.
    """
    model.eval()
    images, labels, attention_mask, attention_image = (
        tensor.to(device) for tensor in batch
    )

    with torch.no_grad():
        logits = model(images, attention_image)

    predictions, references = get_preds_from_logits(logits, attention_image, labels)
    return predictions, references
    

In [38]:
def log_validation_results(engine):
    """Run the validation evaluator over ``val_loader`` and print its metrics.

    Meant to be attached to the trainer; ``engine.state.epoch`` is only used to
    label the printed line. Reads the "accuracy" and "f1" metrics of the
    notebook-level ``validation_evaluator``.
    """
    validation_evaluator.run(val_loader)
    results = validation_evaluator.state.metrics
    print(
        f"Validation Results - Epoch: {engine.state.epoch}  "
        f"Avg accuracy: {results['accuracy']:.3f} Avg F1: {results['f1']:.3f}"
    )

In [39]:
# One engine drives optimisation; two evaluation engines share val_step.
# NOTE(review): train_evaluator gets metrics attached below but is never run in the cells
# shown in this notebook.
trainer = Engine(train_step)
train_evaluator = Engine(val_step)
validation_evaluator = Engine(val_step)

In [40]:
# Validate (and print the metrics) at the end of every training epoch.
trainer.add_event_handler(Events.EPOCH_COMPLETED, log_validation_results)

# Attach exact-match accuracy and word-level F1 to both evaluators; every
# evaluator gets its own metric instance.
for metric_cls, metric_name in ((ExactMatch, "accuracy"), (WordF1, "f1")):
    for evaluator in (train_evaluator, validation_evaluator):
        metric_cls().attach(evaluator, metric_name)

In [71]:
# Start from an empty checkpoint directory. WARNING: this deletes all previously saved checkpoints.
!rm -rf roberta-checkpoint-models

In [72]:
# Full training-state checkpoint, written at the end of every epoch with n_saved=1.
def gst(*_):
    """Return the trainer's current epoch; used as the checkpoint's global step."""
    return trainer.state.epoch


to_save = {
    'model': model,
    'optimizer': optimizer,
    'lr_scheduler': lr_scheduler,
    'trainer': trainer,
}
handler = Checkpoint(
    to_save,
    'roberta-checkpoint-models',
    n_saved=1,
    global_step_transform=gst,
)
trainer.add_event_handler(Events.EPOCH_COMPLETED, handler)

<ignite.engine.events.RemovableEventHandle at 0x7f3d581f6bf0>

In [73]:
# Model-only checkpoint scored by the validation "accuracy" metric; files are prefixed
# "best" and tagged with the trainer's step via global_step_from_engine(trainer).
# NOTE: this rebinds `to_save` and `handler`; from here on `handler` refers to this
# best-model checkpoint, which is what the test section later reads `last_checkpoint` from.
to_save = {'model': model}
handler = Checkpoint(
    to_save, 
    "roberta-checkpoint-models",
    n_saved=1, 
    filename_prefix='best',
    score_name="accuracy",
    global_step_transform=global_step_from_engine(trainer)
)
# Triggered every time a validation run completes.
validation_evaluator.add_event_handler(Events.COMPLETED, handler)

<ignite.engine.events.RemovableEventHandle at 0x7f3d582d6350>

In [74]:
# Experiment tracking with Neptune.
# The API token is a secret and is read from the NEPTUNE_API_TOKEN environment variable
# instead of being written into the notebook (a missing variable fails loudly with KeyError).
# SECURITY: the token that used to be hard-coded in this cell has been exposed and should be
# revoked in the Neptune account settings.
neptune_logger = NeptuneLogger(
    project="i155825/OCRMsc",
    api_token=os.environ["NEPTUNE_API_TOKEN"],
)

# Log the training loss after every iteration.
neptune_logger.attach_output_handler(
    trainer,
    event_name=Events.ITERATION_COMPLETED,
    tag="training",
    output_transform=lambda loss: {"loss": loss},
)

# Log the validation metrics after each validation run, indexed by the trainer's step.
neptune_logger.attach_output_handler(
    validation_evaluator,
    event_name=Events.EPOCH_COMPLETED,
    tag="validation",
    metric_names=["f1", "accuracy"],
    global_step_transform=global_step_from_engine(trainer),
)

# Upload the top-level project sources alongside the run.
neptune_logger["code"].upload_files([f"{CODE_PATH}/*.py"])

https://app.neptune.ai/i155825/OCRMsc/e/OC-56


In [41]:
# Show a progress bar for the trainer, displaying each iteration's output as "loss".
pbar = ProgressBar()
pbar.attach(trainer, output_transform=lambda x: {'loss': x})

In [42]:
# Start training for MAX_EPOCHS epochs; validation, checkpointing and logging run via the
# handlers attached above.
trainer.run(train_loader, max_epochs=MAX_EPOCHS)

Epoch [1/30]: [437/8519]   5%|▌         , loss=1.87 [00:33<09:47]Engine run is terminating due to exception: 


KeyboardInterrupt: 

Epoch [1/30]: [438/8519]   5%|▌         , loss=1.87 [00:50<09:47]

# Test

In [39]:
import collections
import os

from tqdm.auto import tqdm

from datamodule import TestSROIETask2DataModule

In [40]:
# Re-select the device for the test section.
# NOTE: this rebinds `device` to a plain string; the training section used a torch.device.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [41]:
# Make sure the model lives on the device selected for testing.
_ = model.to(device)

In [42]:
# Extract the test crops into ./testdata (relative to the current working directory).
# The output is very long; consider `unzip -q` to keep the notebook readable.
!unzip {DATA_PATH}/testdata.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: testdata/X51005684949.jpg__21.png  
  inflating: __MACOSX/testdata/._X51005684949.jpg__21.png  
  inflating: testdata/X51005568889.jpg__42.png  
  inflating: __MACOSX/testdata/._X51005568889.jpg__42.png  
  inflating: testdata/X51008042787.jpg__31.png  
  inflating: __MACOSX/testdata/._X51008042787.jpg__31.png  
  inflating: testdata/X51006502540.jpg__21.png  
  inflating: __MACOSX/testdata/._X51006502540.jpg__21.png  
  inflating: testdata/X51005724611.jpg__47.png  
  inflating: __MACOSX/testdata/._X51005724611.jpg__47.png  
  inflating: testdata/X00016469670.jpg__17.png  
  inflating: __MACOSX/testdata/._X00016469670.jpg__17.png  
  inflating: testdata/X51006466070.jpg__7.png  
  inflating: __MACOSX/testdata/._X51006466070.jpg__7.png  
  inflating: testdata/X51005724628.jpg__64.png  
  inflating: __MACOSX/testdata/._X51005724628.jpg__64.png  
  inflating: testdata/X51006555819.jpg__55.png  
  inflating: __M

In [43]:
# Datamodule for the test crops extracted above.
# NOTE: this rebinds `dm`, so the training datamodule is no longer reachable under that name.
# NOTE(review): train_bs / val_pct are passed although only the test dataloader is requested
# below — presumably required by the constructor; confirm in TestSROIETask2DataModule.
dm = TestSROIETask2DataModule(
    root_dir="testdata",
    label_file=os.path.join(DATA_PATH, "testdata.json"),
    tokenizer=tokenizer,
    height=32,
    num_workers=4,
    train_bs=16,
    valid_bs=64,
    val_pct=0.1,
    max_width=None,
    do_pool=True,
    pooler_mode=POOLER_MODE
)

In [44]:
# NOTE(review): the test datamodule is set up with the "fit" stage — confirm this is the
# stage TestSROIETask2DataModule expects before test_dataloader() is called.
dm.setup("fit")

In [45]:
# Dataloader over the test crops; the loop below reads each batch's last element as the crop file names.
test_loader = dm.test_dataloader()

In [46]:
# Path of the latest file written by the best-accuracy checkpoint handler.
# NOTE: relies on `handler` from the training session still being alive in this kernel.
handler.last_checkpoint

PosixPath('roberta-checkpoint-models/best_model_21_accuracy=0.9416.pt')

In [47]:
# ckpt_path = "/content/drive/MyDrive/Mestrado/Dev/devcodes/deberta/roberta-checkpoint-models/best_model_17_accuracy=0.8460.pt"
# Restore the best-accuracy weights. map_location remaps the stored tensors onto the current
# device, so a checkpoint written on a GPU also loads on a CPU-only runtime.
state_dict = torch.load(handler.last_checkpoint, map_location=device)
# state_dict = torch.load(ckpt_path, map_location="cpu")
model.load_state_dict(state_dict)

<All keys matched successfully>

In [48]:
# Convenience handles on the datamodule.
# NOTE(review): neither name is referenced by any later cell visible in this notebook.
batch_size = dm.valid_bs
img2label = dm.img2label

In [49]:
# Run the model over the test set and gather, per source image, the predicted words of all
# of its crops in the order the loader yields them.
test_results = collections.defaultdict(list)
for batch in tqdm(test_loader, total=len(test_loader)):
    # The last batch element holds the crop file names; everything before it is model input.
    names = batch[-1]
    y_pred, _ = val_step(None, batch[:-1])

    for prediction, name in zip(y_pred, names):
        # Results are keyed by the file name up to its first dot.
        image_id = name.split(".")[0]
        test_results[image_id] += prediction.split()

  0%|          | 0/303 [00:00<?, ?it/s]

In [None]:
# model.eval()
# test_results = collections.defaultdict(list)
# for i, batch in tqdm(enumerate(test_loader), total=len(test_loader)):

#     images, labels, attention_mask, attention_image = [x.to(device) for x in batch[:-1]]
#     with torch.no_grad():
#         logits = model(images, attention_image)
#     decoded_ids = logits.argmax(-1).squeeze(0)
#     if len(decoded_ids.shape) == 1:
#         decoded_ids = decoded_ids.unsqueeze(0)
#     decoded = [
#         decoder(dec, att) for dec, att in zip(decoded_ids, attention_image)
#     ]
#     decoded = [
#         [d for d in dd if d.item() > 1] for dd in decoded
#     ]
#     y_pred = tokenizer.batch_decode(decoded, skip_special_tokens=True)
#     y = tokenizer.batch_decode(labels, skip_special_tokens=True)

#     names = batch[-1]

#     for p, n in zip(y_pred, names):
#         n = n.split(".")[0]
#         test_results[n].extend(p.strip().split())

    # break

In [50]:
# Write one text file per image id into `dir_path`, one predicted word per line.
dir_path = "testsroie"
os.makedirs(dir_path, exist_ok=True)
for image_id, words in test_results.items():
    out_path = f"{dir_path}/{image_id}.txt"
    with open(out_path, "w") as out_file:
        out_file.write("\n".join(words))

In [51]:
# Bundle all prediction files into sub.zip and move the archive next to the output folder.
! cd {dir_path} && zip -r sub.zip *.txt && mv sub.zip ../

  adding: X00016469670.txt (deflated 39%)
  adding: X00016469671.txt (deflated 37%)
  adding: X51005200931.txt (deflated 42%)
  adding: X51005230605.txt (deflated 36%)
  adding: X51005230616.txt (deflated 38%)
  adding: X51005230621.txt (deflated 40%)
  adding: X51005230648.txt (deflated 39%)
  adding: X51005230657.txt (deflated 41%)
  adding: X51005230659.txt (deflated 34%)
  adding: X51005268275.txt (deflated 42%)
  adding: X51005268408.txt (deflated 34%)
  adding: X51005288570.txt (deflated 32%)
  adding: X51005301666.txt (deflated 43%)
  adding: X51005337867.txt (deflated 38%)
  adding: X51005337877.txt (deflated 39%)
  adding: X51005361906.txt (deflated 42%)
  adding: X51005361908.txt (deflated 41%)
  adding: X51005361912.txt (deflated 40%)
  adding: X51005361923.txt (deflated 33%)
  adding: X51005365187.txt (deflated 42%)
  adding: X51005433518.txt (deflated 38%)
  adding: X51005433543.txt (deflated 38%)
  adding: X51005433548.txt (deflated 39%)
  adding: X51005433556.txt (deflat