In [1]:
import sys
sys.path.append("../trainer")

In [2]:
import os

import torch
import torchvision as tv
from transformers import AutoTokenizer

from ignite.engine import (
    Engine,
    Events,
)
from ignite.handlers import ModelCheckpoint
from ignite.contrib.handlers import TensorboardLogger, global_step_from_engine
from ignite.contrib.handlers import ProgressBar
from ignite.contrib.handlers.neptune_logger import NeptuneLogger

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from datamodule import SROIETask2DataModule
from model import TransformersEncoderSmall, CNNSmall, OCRModel
from ctc import GreedyDecoder
from igmetrics import ExactMatch, WordF1

In [4]:
tokenizer = AutoTokenizer.from_pretrained("../trainer/tokenizer")
decoder = GreedyDecoder(0)

# Loader

In [5]:
DATA_PATH = "/Users/israelcampiotti/Documents/Github/msc/tmp-master/SROIETask2"
dm = SROIETask2DataModule(
    root_dir=os.path.join(DATA_PATH, "data"),
    label_file=os.path.join(DATA_PATH, "data.json"),
    tokenizer=tokenizer,
    height=32,
    num_workers=4,
    train_bs=2,
    valid_bs=2,
    val_pct=0.001,
    max_width=None,
    do_pool=True,
)

In [6]:
dm.setup("fit")

# Model

In [7]:
vis_model = CNNSmall(vocab_size=tokenizer.vocab_size)
rec_model = TransformersEncoderSmall(vocab_size=tokenizer.vocab_size)
model = OCRModel(vis_model, rec_model)

# Ignite

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [9]:
_ = model.to(device)

In [10]:
val_loader =  dm.val_dataloader()
train_loader = val_loader # dm.train_dataloader()

In [11]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = torch.nn.CTCLoss(blank=0, zero_infinity=True)

In [12]:
def train_step(engine, batch):
    model.train()
    optimizer.zero_grad()

    images, labels, attention_mask, attention_image = batch

    logits = model(images, attention_image)

    input_length = attention_image.sum(-1)
    target_length = attention_mask.sum(-1)

    logits = logits.permute(1, 0, 2)
    logits = logits.log_softmax(2)

    loss = criterion(logits, labels, input_length, target_length)

    loss.backward()
    optimizer.step()
    return loss.item()

trainer = Engine(train_step)

In [13]:
def val_step(engine, batch):
    model.eval()
    images, labels, attention_mask, attention_image = batch
    with torch.no_grad():
        logits = model(images, attention_image)

    decoded_ids = logits.argmax(-1).squeeze(0)
    if len(decoded_ids.shape) == 1:
        decoded_ids = decoded_ids.unsqueeze(0)
    decoded = [
        decoder(dec, att) for dec, att in zip(decoded_ids, attention_image)
    ]
    y_pred = tokenizer.batch_decode(decoded, skip_special_tokens=True)
    y = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return y_pred, y
    

In [14]:
train_evaluator = Engine(val_step)
validation_evaluator = Engine(val_step)

In [15]:
ExactMatch().attach(train_evaluator, "accuracy")
ExactMatch().attach(validation_evaluator, "accuracy")
WordF1().attach(train_evaluator, "f1")
WordF1().attach(validation_evaluator, "f1")

In [18]:
def log_validation_results(engine):
    validation_evaluator.run(val_loader)
    metrics = validation_evaluator.state.metrics
    avg_accuracy = metrics['accuracy']
    print(f"Validation Results - Epoch: {engine.state.epoch}  Avg accuracy: {avg_accuracy:.3f}")

trainer.add_event_handler(Events.EPOCH_COMPLETED, log_validation_results)

<ignite.engine.events.RemovableEventHandle at 0x28ef440d0>

In [19]:
checkpointer = ModelCheckpoint(dirname='models', filename_prefix='deberta-ocr', n_saved=2, create_dir=True, require_empty=False)
trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpointer, {'model': model})

<ignite.engine.events.RemovableEventHandle at 0x147354310>

In [20]:
neptune_logger = NeptuneLogger(
    project="i155825/OCRMsc",
    api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiJhOGUyY2VlOS1hZTU5LTQ2NGQtYTY5Zi04OGJmZWM2M2NlMDAifQ==",
)


  import neptune.new as neptune
  run = neptune.init_run(


https://app.neptune.ai/i155825/OCRMsc/e/OC-15


In [21]:
neptune_logger.attach_output_handler(
    trainer,
    event_name=Events.ITERATION_COMPLETED,
    tag="training",
    output_transform=lambda loss: {"loss": loss},
)

neptune_logger.attach_output_handler(
    validation_evaluator,
    event_name=Events.EPOCH_COMPLETED,
    tag="validation",
    metric_names=["f1", "accuracy"],
    global_step_transform=global_step_from_engine(trainer),  
)

neptune_logger["code"].upload_files(["../trainer/*.py"])

In [22]:
pbar = ProgressBar()
pbar.attach(trainer, output_transform=lambda x: {'loss': x})

In [23]:
trainer.run(train_loader, max_epochs=10)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch [1/10]: [22/22] 100%|██████████, loss=4.93 [00:04<00:00]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Engine run is terminating due to exception: 
Engine run is terminating due to exception: 


KeyboardInterrupt: 