<a href="https://colab.research.google.com/github/joooser/COMPRA-VENTA/blob/cedula-OCR/Copy_of_Final_Finetune_donut_(for_your_own_dataset).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers datasets sentencepiece
!pip install -q pytorch-lightning wandb

In [2]:
from google.colab import drive
import os
from datasets import load_dataset
drive.mount('/content/drive')
# can now access folder with: os.listdir("gdrive/MyDrive/preparedFinetuneData/test")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install transformers==4.25.1



In [4]:
!pip install sconf
!pip install donut-python
!pip install pytorch-lightning #MAKE SURE VERSION IS 2.0.7
!pip install timm==0.5.4
#!pip install transformers
#!pip install pytorch-lightning==1.6.4
#!pip install transformers==4.11.3



In [5]:
# @title
#required functions to train
"""
Donut
Copyright (c) 2022-present NAVER Corp.
MIT License
"""
import math
import random
import re
from pathlib import Path

import numpy as np
import pytorch_lightning as pl
import torch
from nltk import edit_distance
from pytorch_lightning.utilities import rank_zero_only
from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader

from donut import DonutConfig, DonutModel


class DonutModelPLModule(pl.LightningModule):
    def __init__(self, config):
        super().__init__()
        self.config = config

        if self.config.get("pretrained_model_name_or_path", False):
            self.model = DonutModel.from_pretrained(
                self.config.pretrained_model_name_or_path,
                input_size=self.config.input_size,
                max_length=self.config.max_length,
                align_long_axis=self.config.align_long_axis,
                ignore_mismatched_sizes=True,
            )
        else:
            self.model = DonutModel(
                config=DonutConfig(
                    input_size=self.config.input_size,
                    max_length=self.config.max_length,
                    align_long_axis=self.config.align_long_axis,
                    # with DonutConfig, the architecture customization is available, e.g.,
                    # encoder_layer=[2,2,14,2], decoder_layer=4, ...
                )
            )
        self.pytorch_lightning_version_is_1 = int(pl.__version__[0]) < 2
        self.num_of_loaders = len(self.config.dataset_name_or_paths)

    def training_step(self, batch, batch_idx):
        image_tensors, decoder_input_ids, decoder_labels = list(), list(), list()
        for batch_data in batch:
            image_tensors.append(batch_data[0])
            decoder_input_ids.append(batch_data[1][:, :-1])
            decoder_labels.append(batch_data[2][:, 1:])
        image_tensors = torch.cat(image_tensors)
        decoder_input_ids = torch.cat(decoder_input_ids)
        decoder_labels = torch.cat(decoder_labels)
        loss = self.model(image_tensors, decoder_input_ids, decoder_labels)[0]
        self.log_dict({"train_loss": loss}, sync_dist=True)
        if not self.pytorch_lightning_version_is_1:
            self.log('loss', loss, prog_bar=True)
        return loss

    def on_validation_epoch_start(self) -> None:
        super().on_validation_epoch_start()
        self.validation_step_outputs = [[] for _ in range(self.num_of_loaders)]
        return

    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        image_tensors, decoder_input_ids, prompt_end_idxs, answers = batch
        decoder_prompts = pad_sequence(
            [input_id[: end_idx + 1] for input_id, end_idx in zip(decoder_input_ids, prompt_end_idxs)],
            batch_first=True,
        )

        preds = self.model.inference(
            image_tensors=image_tensors,
            prompt_tensors=decoder_prompts,
            return_json=False,
            return_attentions=False,
        )["predictions"]

        scores = list()
        for pred, answer in zip(preds, answers):
            pred = re.sub(r"(?:(?<=>) | (?=</s_))", "", pred)
            answer = re.sub(r"<.*?>", "", answer, count=1)
            answer = answer.replace(self.model.decoder.tokenizer.eos_token, "")
            scores.append(edit_distance(pred, answer) / max(len(pred), len(answer)))

            if self.config.get("verbose", False) and len(scores) == 1:
                self.print(f"Prediction: {pred}")
                self.print(f"    Answer: {answer}")
                self.print(f" Normed ED: {scores[0]}")

        self.validation_step_outputs[dataloader_idx].append(scores)

        return scores

    def on_validation_epoch_end(self):
        assert len(self.validation_step_outputs) == self.num_of_loaders
        cnt = [0] * self.num_of_loaders
        total_metric = [0] * self.num_of_loaders
        val_metric = [0] * self.num_of_loaders
        for i, results in enumerate(self.validation_step_outputs):
            for scores in results:
                cnt[i] += len(scores)
                total_metric[i] += np.sum(scores)
            val_metric[i] = total_metric[i] / cnt[i]
            val_metric_name = f"val_metric_{i}th_dataset"
            self.log_dict({val_metric_name: val_metric[i]}, sync_dist=True)
        self.log_dict({"val_metric": np.sum(total_metric) / np.sum(cnt)}, sync_dist=True)

    def configure_optimizers(self):

        max_iter = None

        if int(self.config.get("max_epochs", -1)) > 0:
            assert len(self.config.train_batch_sizes) == 1, "Set max_epochs only if the number of datasets is 1"
            max_iter = (self.config.max_epochs * self.config.num_training_samples_per_epoch) / (
                self.config.train_batch_sizes[0] * torch.cuda.device_count() * self.config.get("num_nodes", 1)
            )

        if int(self.config.get("max_steps", -1)) > 0:
            max_iter = min(self.config.max_steps, max_iter) if max_iter is not None else self.config.max_steps

        assert max_iter is not None
        optimizer = torch.optim.Adam(self.parameters(), lr=self.config.lr)
        scheduler = {
            "scheduler": self.cosine_scheduler(optimizer, max_iter, self.config.warmup_steps),
            "name": "learning_rate",
            "interval": "step",
        }
        return [optimizer], [scheduler]

    @staticmethod
    def cosine_scheduler(optimizer, training_steps, warmup_steps):
        def lr_lambda(current_step):
            if current_step < warmup_steps:
                return current_step / max(1, warmup_steps)
            progress = current_step - warmup_steps
            progress /= max(1, training_steps - warmup_steps)
            return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))

        return LambdaLR(optimizer, lr_lambda)

    @rank_zero_only
    def on_save_checkpoint(self, checkpoint):
        save_path = Path(self.config.result_path) / self.config.exp_name / self.config.exp_version
        self.model.save_pretrained(save_path)
        self.model.decoder.tokenizer.save_pretrained(save_path)


class DonutDataPLModule(pl.LightningDataModule):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.train_batch_sizes = self.config.train_batch_sizes
        self.val_batch_sizes = self.config.val_batch_sizes
        self.train_datasets = []
        self.val_datasets = []
        self.g = torch.Generator()
        self.g.manual_seed(self.config.seed)

    def train_dataloader(self):
        loaders = list()
        for train_dataset, batch_size in zip(self.train_datasets, self.train_batch_sizes):
            loaders.append(
                DataLoader(
                    train_dataset,
                    batch_size=batch_size,
                    num_workers=self.config.num_workers,
                    pin_memory=True,
                    worker_init_fn=self.seed_worker,
                    generator=self.g,
                    shuffle=True,
                )
            )
        return loaders

    def val_dataloader(self):
        loaders = list()
        for val_dataset, batch_size in zip(self.val_datasets, self.val_batch_sizes):
            loaders.append(
                DataLoader(
                    val_dataset,
                    batch_size=batch_size,
                    pin_memory=True,
                    shuffle=False,
                )
            )
        return loaders

    @staticmethod
    def seed_worker(wordker_id):
        worker_seed = torch.initial_seed() % 2 ** 32
        np.random.seed(worker_seed)
        random.seed(worker_seed)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
CONFIG_NAME = "train_cord_finetune.yaml"
CONFIG_PATH = f"/content/drive/MyDrive/preparedFinetuneData/train_cord_finetune.yaml" #file uploaded to google drive


In [8]:
#train.py file from Donut
"""
Donut
Copyright (c) 2022-present NAVER Corp.
MIT License
"""
import argparse
import datetime
import json
import os
import random
from io import BytesIO
from os.path import basename
from pathlib import Path
import torch.nn as nn
import numpy as np
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.loggers.tensorboard import TensorBoardLogger
from pytorch_lightning.plugins import CheckpointIO
from pytorch_lightning.utilities import rank_zero_only
from sconf import Config

from donut import DonutDataset
#from lightning_module import DonutDataPLModule, DonutModelPLModule #these are now declared above


class CustomCheckpointIO(CheckpointIO):
    def save_checkpoint(self, checkpoint, path, storage_options=None):
        if (checkpoint.get("state_dict", None) is not None):
          del checkpoint["state_dict"]
        torch.save(checkpoint, path)

    def load_checkpoint(self, path, storage_options=None):
        checkpoint = torch.load(path + "artifacts.ckpt")
        state_dict = torch.load(path + "pytorch_model.bin")
        checkpoint["state_dict"] = {"model." + key: value for key, value in state_dict.items()}
        return checkpoint

    def remove_checkpoint(self, path) -> None:
        return super().remove_checkpoint(path)


@rank_zero_only
def save_config_file(config, path):
    if not Path(path).exists():
        os.makedirs(path)
    save_path = Path(path) / "config.yaml"
    print(config.dumps())
    with open(save_path, "w") as f:
        f.write(config.dumps(modified_color=None, quote_str=True))
        print(f"Config is saved at {save_path}")


class ProgressBar(pl.callbacks.TQDMProgressBar):
    def __init__(self, config):
        super().__init__()
        self.enable = True
        self.config = config

    def disable(self):
        self.enable = False

    def get_metrics(self, trainer, model):
        items = super().get_metrics(trainer, model)
        items.pop("v_num", None)
        items["exp_name"] = f"{self.config.get('exp_name', '')}"
        items["exp_version"] = f"{self.config.get('exp_version', '')}"
        return items


def set_seed(seed):
    pytorch_lightning_version = int(pl.__version__[0])
    if pytorch_lightning_version < 2:
        pl.utilities.seed.seed_everything(seed, workers=True)
    else:
        import lightning_fabric
        lightning_fabric.utilities.seed.seed_everything(seed, workers=True)


def train(config):
    set_seed(config.get("seed", 42))

    model_module = DonutModelPLModule(config)
    data_module = DonutDataPLModule(config)

    # add datasets to data_module
    datasets = {"train": [], "validation": []}
    for i, dataset_name_or_path in enumerate(config.dataset_name_or_paths):
        task_name = os.path.basename(dataset_name_or_path)  # e.g., cord-v2, docvqa, rvlcdip, ...

        # add categorical special tokens (optional)
        if task_name == "rvlcdip":
            model_module.model.decoder.add_special_tokens([
                "<advertisement/>", "<budget/>", "<email/>", "<file_folder/>",
                "<form/>", "<handwritten/>", "<invoice/>", "<letter/>",
                "<memo/>", "<news_article/>", "<presentation/>", "<questionnaire/>",
                "<resume/>", "<scientific_publication/>", "<scientific_report/>", "<specification/>"
            ])
        if task_name == "docvqa":
            model_module.model.decoder.add_special_tokens(["<yes/>", "<no/>"])

        for split in ["train", "validation"]:
            datasets[split].append(
                DonutDataset(
                    dataset_name_or_path= f"/content/drive/MyDrive/preparedFinetuneData",
                    donut_model=model_module.model,
                    max_length=config.max_length,
                    split=split,
                    task_start_token=config.task_start_tokens[i]
                    if config.get("task_start_tokens", None)
                    else f"<s_{task_name}>",
                    prompt_end_token="<s_answer>" if "docvqa" in dataset_name_or_path else f"<s_{task_name}>",
                    sort_json_key=config.sort_json_key,
                )
            )
            # prompt_end_token is used for ignoring a given prompt in a loss function
            # for docvqa task, i.e., {"question": {used as a prompt}, "answer": {prediction target}},
            # set prompt_end_token to "<s_answer>"

    data_module.train_datasets = datasets["train"]
    data_module.val_datasets = datasets["validation"]

    logger = TensorBoardLogger(
        save_dir=config.result_path,
        name=config.exp_name,
        version=config.exp_version,
        default_hp_metric=False,
    )

    lr_callback = LearningRateMonitor(logging_interval="step")

    checkpoint_callback = ModelCheckpoint(
        monitor="val_metric",
        dirpath=Path(config.result_path) / config.exp_name / config.exp_version,
        filename="artifacts",
        save_top_k=1,
        save_last=False,
        mode="min",
    )

    bar = ProgressBar(config)

    custom_ckpt = CustomCheckpointIO()
    trainer = pl.Trainer(
        num_nodes=config.get("num_nodes", 1),
        devices=torch.cuda.device_count(),
        #strategy="ddp", #NOTE must comment out strategy, if not you get error
        accelerator="gpu",
        plugins=custom_ckpt,
        max_epochs=config.max_epochs,
        max_steps=config.max_steps,
        val_check_interval=config.val_check_interval,
        check_val_every_n_epoch=config.check_val_every_n_epoch,
        gradient_clip_val=config.gradient_clip_val,


        precision=16,
        num_sanity_val_steps=0,
        logger=logger,
        callbacks=[lr_callback, checkpoint_callback, bar],
    )

    trainer.fit(model_module, data_module, ckpt_path=config.get("resume_from_checkpoint_path", None))
    trainer.save_checkpoint(f"{Path(config.result_path)}/{config.exp_name}/{config.exp_version}/model_checkpoint.ckpt")


if __name__ == "__main__":

    config = Config(CONFIG_PATH)
    config.argv_update([])

    config.exp_name = basename(CONFIG_NAME).split(".")[0]
    config.exp_version = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    config.dataset_name_or_paths = ["drive/MyDrive/preparedFinetuneData"]
    config.train_batch_sizes = [1]
    config.check_val_every_n_epoch = 1
    config.max_steps = 500 # infinite, since max_epochs is specified
    config.num_workers = 1 #recommended max
    config.num_nodes = 1
    config.pretrained_model_name_or_path= "naver-clova-ix/donut-base-finetuned-cord-v2" #donut finetuned on Cord dataset
    config.warmup_steps = 0
    print("config path:" , Path(config.result_path),config.exp_name, config.exp_version)
    save_config_file(config, Path(config.result_path) / config.exp_name / config.exp_version)
    train(config)


INFO:lightning_fabric.utilities.seed:Seed set to 2022


config path: result train_cord_finetune 20240415_011111
resume_from_checkpoint_path: None
result_path: ./result
pretrained_model_name_or_path: naver-clova-ix/donut-base-finetuned-cord-v2
dataset_name_or_paths: 
  - drive/MyDrive/preparedFinetuneData
sort_json_key: False
train_batch_sizes: 
  - 1
val_batch_sizes: 
  - 1
input_size: 
  - 1280
  - 960
max_length: 768
align_long_axis: False
num_nodes: 1
seed: 2022
lr: 3e-05
warmup_steps: 0
num_training_samples_per_epoch: 800
max_epochs: 3
max_steps: 500
num_workers: 1
val_check_interval: 1.0
check_val_every_n_epoch: 1
gradient_clip_val: 1.0
verbose: True
exp_name: train_cord_finetune
exp_version: 20240415_011111
Config is saved at result/train_cord_finetune/20240415_011111/config.yaml


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


Resolving data files:   0%|          | 0/115 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/68 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/115 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/68 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/lightning_fabric/connector.py:563: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /content/result/train_cord_finetune/20240415_011111 exists and is not empty.
INFO:pytorch_lightning.accelerators.cuda:LOCA

Training: |          | 0/? [00:00<?, ?it/s]



Validation: |          | 0/? [00:00<?, ?it/s]

Prediction: <s_cedula>V 9.913.692</s_cedula><s_apellidos>ROSA CARRASQUEL</s_apellidos><s_nombres>HENRY ADOLFO</s_nombres><s_f_nacimiento>15/4/1969</s_f_nacimiento><s_edo_civil>SOLTERO</s_edo_civil><s_f_expedicion>25/3/2009</s_f_expedicion><s_f_vencimiento>mar-19</s_f_vencimiento><s_nacionalidad>VENEZOLANO</s_nacionalidad>
    Answer: <s_cedula>V 9.913.692</s_cedula><s_apellidos>ROSA CARRASQUEL</s_apellidos><s_nombres>HENRY ADOLFO</s_nombres><s_f_nacimiento>15/4/1969</s_f_nacimiento><s_edo_civil>SOLTERO</s_edo_civil><s_f_expedicion>25/3/2009</s_f_expedicion><s_f_vencimiento>mar-19</s_f_vencimiento><s_nacionalidad>VENEZOLANO</s_nacionalidad>
 Normed ED: 0.0
Prediction: <s_cedula>V 15.913.122</s_cedula><s_apellidos>TORRES BLANCO</s_apellidos><s_nombres>JOSE ALFREDO</s_nombres><s_f_nacimiento>17/12/1982</s_f_nacimiento><s_edo_civil>SOLTERO</s_edo_civil><s_f_expedicion>8/8/2012</s_f_expedicion><s_f_vencimiento>ago-22</s_f_vencimiento><s_nacionalidad>VENEZOLANO</s_nacionalidad>
    Answer: <

Validation: |          | 0/? [00:00<?, ?it/s]

Prediction: <s_cedula>V 9.913.692</s_cedula><s_apellidos>ROSA CARRASQUEL</s_apellidos><s_nombres>HENRY ADOLFO</s_nombres><s_f_nacimiento>15/4/1969</s_f_nacimiento><s_edo_civil>SOLTERO</s_edo_civil><s_f_expedicion>25/3/2009</s_f_expedicion><s_f_vencimiento>mar-19</s_f_vencimiento><s_nacionalidad>VENEZOLANO</s_nacionalidad>
    Answer: <s_cedula>V 9.913.692</s_cedula><s_apellidos>ROSA CARRASQUEL</s_apellidos><s_nombres>HENRY ADOLFO</s_nombres><s_f_nacimiento>15/4/1969</s_f_nacimiento><s_edo_civil>SOLTERO</s_edo_civil><s_f_expedicion>25/3/2009</s_f_expedicion><s_f_vencimiento>mar-19</s_f_vencimiento><s_nacionalidad>VENEZOLANO</s_nacionalidad>
 Normed ED: 0.0
Prediction: <s_cedula>V 15.913.122</s_cedula><s_apellidos>TORRES BLANCO</s_apellidos><s_nombres>JOSE ALFREDO</s_nombres><s_f_nacimiento>17/12/1982</s_f_nacimiento><s_edo_civil>SOLTERO</s_edo_civil><s_f_expedicion>8/8/2012</s_f_expedicion><s_f_vencimiento>ago-22</s_f_vencimiento><s_nacionalidad>VENEZOLANO</s_nacionalidad>
    Answer: <

Validation: |          | 0/? [00:00<?, ?it/s]

Prediction: <s_cedula>V 9.913.692</s_cedula><s_apellidos>ROSA CARRASQUEL</s_apellidos><s_nombres>HENRY ADOLFO</s_nombres><s_f_nacimiento>15/4/1969</s_f_nacimiento><s_edo_civil>SOLTERO</s_edo_civil><s_f_expedicion>25/3/2009</s_f_expedicion><s_f_vencimiento>mar-19</s_f_vencimiento><s_nacionalidad>VENEZOLANO</s_nacionalidad>
    Answer: <s_cedula>V 9.913.692</s_cedula><s_apellidos>ROSA CARRASQUEL</s_apellidos><s_nombres>HENRY ADOLFO</s_nombres><s_f_nacimiento>15/4/1969</s_f_nacimiento><s_edo_civil>SOLTERO</s_edo_civil><s_f_expedicion>25/3/2009</s_f_expedicion><s_f_vencimiento>mar-19</s_f_vencimiento><s_nacionalidad>VENEZOLANO</s_nacionalidad>
 Normed ED: 0.0
Prediction: <s_cedula>V 15.913.122</s_cedula><s_apellidos>TORRES BLANCO</s_apellidos><s_nombres>JOSE ALFREDO</s_nombres><s_f_nacimiento>17/12/1982</s_f_nacimiento><s_edo_civil>SOLTERO</s_edo_civil><s_f_expedicion>8/8/2012</s_f_expedicion><s_f_vencimiento>ago-22</s_f_vencimiento><s_nacionalidad>VENEZOLANO</s_nacionalidad>
    Answer: <

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


inferencia


In [26]:
from PIL import Image
# Change the path here:
model = DonutModel.from_pretrained("/content/result/train_cord_finetune/20240415_011111")
if torch.cuda.is_available():
    model.half()
    device = torch.device("cuda")
    model.to(device)
else:
    model.encoder.to(torch.bfloat16)

model.eval()

image = Image.open("/content/drive/MyDrive/CI-PRUEBA/4.jpeg") # Change here
with torch.no_grad():
  prompt_tensors = torch.tensor([1, 2, 3])
  output = model.inference(image=image, prompt_tensors=prompt_tensors)

output

{'predictions': [{'apellidos': 'DA SILVA MINANO',
   'nombres': 'PABLO ARTURO',
   'f_nacimiento': '29/6/1990',
   'edo_civil': 'SOLTERO',
   'f_expedicion': '13/7/2017',
   'f_vencimiento': 'jul-27',
   'nacionalidad': 'VENEZOLANO'}]}