In [2]:
!pip install -U datasets transformers pyarrow polyleven -q

[0m

In [2]:
import os
import re
import json
from collections import Counter
from itertools import chain
from pathlib import Path
from typing import List, Dict, Union, Tuple, Any

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from PIL import Image
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoProcessor,
    Pix2StructConfig,
    Pix2StructForConditionalGeneration,
    get_linear_schedule_with_warmup,
)
import albumentations as A
from albumentations.pytorch import ToTensorV2
from datasets import Image as ds_img
from polyleven import levenshtein # a faster version of levenshtein
import cv2
from torch.cuda.amp import GradScaler, autocast

In [5]:
data_dir = Path("/kaggle/input/benetech-making-graphs-accessible/train")
images_path = data_dir / "images"
train_json_files = list((data_dir / "annotations").glob("*.json"))

class CFG:

    # General
    debug = False
    num_proc = 2
    num_workers = 2
    gpus = 1
    max_patch = 1024
    # Data
    max_length = 512
    image_height = 256
    image_width = 256

    # Training
    epochs = 2
    val_check_interval = 1.0  # how many times we want to validate during an epoch
    check_val_every_n_epoch = 1
    gradient_clip_val = 1.0
    lr = 3e-5
    lr_scheduler_type = "cosine"
    num_warmup_steps = 100
    seed = 42
    warmup_steps = 300  
    output_path = "/content/output"
    log_steps = 200
    batch_size = 2
    use_wandb = False

In [6]:
PROMPT_TOKEN = "<|BOS|>"
X_START = "<x_start>"
X_END = "<x_end>"
Y_START = "<y_start>"
Y_END = "<y_end>"
# PROMPT_END_TOKEN = "</|PROMPT|>"

SEPARATOR_TOKENS = [
    PROMPT_TOKEN,
    X_START,
    X_END,
    Y_START,
    Y_END,
#     PROMPT_END_TOKEN
]

LINE_TOKEN =  "<line>" 
VERTICAL_BAR_TOKEN = "<vertical_bar>"
# HORIZONTAL_BAR_TOKEN = "<horizontal_bar>"
# SCATTER_TOKEN = "<scatter>"
DOT_TOKEN = "<dot>"

CHART_TYPE_TOKENS = [
    LINE_TOKEN,
    VERTICAL_BAR_TOKEN,
    # HORIZONTAL_BAR_TOKEN,
    # SCATTER_TOKEN,
    DOT_TOKEN
]

new_tokens = SEPARATOR_TOKENS + CHART_TYPE_TOKENS

In [7]:
new_tokens

['<|BOS|>',
 '<x_start>',
 '<x_end>',
 '<y_start>',
 '<y_end>',
 '<line>',
 '<vertical_bar>',
 '<dot>']

In [8]:
def is_nan(value: Union[int, float, str]) -> bool:
     return isinstance(value, float) and str(value) == "nan"

def round_float(value: Union[int, float, str]) -> Union[str, float]:
    if isinstance(value, float):
        value = str(value)

        if "." in value:
            integer, decimal = value.split(".")
            if abs(float(integer)) > 1:
                decimal = decimal[:1]
            else:
                decimal = decimal[:4]

            value = integer + "." + decimal
    return value

def get_gt_string_and_xy(filepath: Union[str, os.PathLike]) -> Dict[str, str]:
    filepath = Path(filepath)

    with open(filepath) as fp:
        data = json.load(fp)

    data_series = data["data-series"]

    all_x, all_y = [], []

    for d in data_series:
        x = d["x"]
        y = d["y"]

        x = round_float(x)
        y = round_float(y)

        # Ignore nan values
        if is_nan(x) or is_nan(y):
            continue

        all_x.append(x)
        all_y.append(y)
        
    
    if data['chart-type'] in ['horizontal_bar','scatter']:
       return None
    
    chart_type = f"<{data['chart-type']}>"
    x_str = X_START + ";".join(list(map(str, all_x))) + X_END
    y_str = Y_START + ";".join(list(map(str, all_y))) + Y_END
    
    gt_string = PROMPT_TOKEN + chart_type + x_str + y_str

    return {
        "ground_truth": gt_string,
        "x": json.dumps(all_x),
        "y": json.dumps(all_y),
        "chart-type": data["chart-type"],
        "id": filepath.stem,
        "source": data["source"],
    }

In [9]:
processor = AutoProcessor.from_pretrained('hoangphu7122002ai/pix2struct_v0',is_vqa=False)
model = Pix2StructForConditionalGeneration.from_pretrained('hoangphu7122002ai/pix2struct_v0')
processor.image_processor.size = {
    "height": CFG.image_height,
    "width": CFG.image_width,
}


Downloading (…)rocessor_config.json:   0%|          | 0.00/303 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/3.27M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/4.91k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

In [10]:
processor.tokenizer.add_tokens(new_tokens)
model.resize_token_embeddings(len(processor.tokenizer))

Embedding(50352, 768)

In [11]:
model.config.text_config.is_decoder=True

In [12]:
from tqdm import tqdm

ds = []
for f in tqdm(train_json_files):
    res = get_gt_string_and_xy(f)
    if res is None:
       continue
    row = {
        **res,
        "image_path": str(images_path / f"{f.stem}.jpg"),
    }
    ds.append(row)

100%|██████████| 60578/60578 [03:52<00:00, 260.22it/s]


In [13]:
ds[0]

{'ground_truth': '<|BOS|><line><x_start>10%;20%;30%;40%;50%;60%;70%<x_end><y_start>1.5481;11.5;14.0;12.2;7.9;1.7140;5.7<y_end>',
 'x': '["10%", "20%", "30%", "40%", "50%", "60%", "70%"]',
 'y': '["1.5481", "11.5", "14.0", "12.2", "7.9", "1.7140", "5.7"]',
 'chart-type': 'line',
 'id': 'cc68f19b708c',
 'source': 'extracted',
 'image_path': '/kaggle/input/benetech-making-graphs-accessible/train/images/cc68f19b708c.jpg'}

In [14]:
type(ds)

list

In [15]:
split = 0.90
train_samples = int(len(ds) * split)
train_ds = ds[:train_samples+1]
valid_ds = ds[train_samples:]

In [16]:
type(train_ds)

list

In [17]:
def augments():
    return A.Compose([
        A.Resize(width=CFG.image_width, height=CFG.image_height),
        A.Normalize(
            mean=[0, 0, 0],
            std=[1, 1, 1],
            max_pixel_value=255,
        ),
        ToTensorV2(),
    ])

In [18]:
class BeneTechDataset(Dataset):
    def __init__(self, dataset, processor, augments=None):
        self.dataset = dataset
        self.processor = processor
        self.augments = augments

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        image = cv2.imread(item['image_path'])
        if self.augments:
            image = self.augments(image=image)['image']
        encoding = self.processor(
            images=image,
            return_tensors="pt", 
            add_special_tokens=True, 
            max_patches=CFG.max_patch
        )
        
        encoding = {k:v.squeeze() for k,v in encoding.items()}
        encoding["text"] = item["ground_truth"]
        return encoding

In [19]:
def collator(batch):
    new_batch = {"flattened_patches":[], "attention_mask":[]}
    texts = [item["text"] for item in batch]
    text_inputs = processor(
        text=texts, 
        padding="max_length", 
        truncation=True, 
        return_tensors="pt", 
        add_special_tokens=True, 
        max_length=CFG.max_length
    )
    new_batch["labels"] = text_inputs.input_ids
    for item in batch:
        new_batch["flattened_patches"].append(item["flattened_patches"])
        new_batch["attention_mask"].append(item["attention_mask"])
    new_batch["flattened_patches"] = torch.stack(new_batch["flattened_patches"])
    new_batch["attention_mask"] = torch.stack(new_batch["attention_mask"])

    return new_batch

In [20]:
train_dataset = BeneTechDataset(train_ds, processor, augments=augments())
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=CFG.batch_size, collate_fn=collator)

valid_dataset = BeneTechDataset(valid_ds, processor, augments=augments())
valid_dataloader = DataLoader(valid_dataset, shuffle=False, batch_size=CFG.batch_size, collate_fn=collator)

In [21]:
train_dataset[0]

{'flattened_patches': tensor([[ 1.0000,  1.0000,  0.2292,  ...,  0.2292,  0.2292,  0.2292],
         [ 1.0000,  2.0000,  0.2292,  ...,  0.2292,  0.2292,  0.2292],
         [ 1.0000,  3.0000,  0.2292,  ...,  0.1643,  0.1643,  0.1643],
         ...,
         [32.0000, 30.0000,  0.2292,  ...,  0.2292,  0.2292,  0.2292],
         [32.0000, 31.0000, -1.0661,  ...,  0.2292,  0.2292,  0.2292],
         [32.0000, 32.0000, -0.3547,  ...,  0.2292,  0.2292,  0.2292]]),
 'attention_mask': tensor([1., 1., 1.,  ..., 1., 1., 1.]),
 'text': '<|BOS|><line><x_start>10%;20%;30%;40%;50%;60%;70%<x_end><y_start>1.5481;11.5;14.0;12.2;7.9;1.7140;5.7<y_end>'}

In [22]:
for batch in train_dataloader:
    print(batch)
    break

{'flattened_patches': tensor([[[ 1.0000,  1.0000,  0.1421,  ...,  0.1421,  0.1421,  0.1421],
         [ 1.0000,  2.0000,  0.1421,  ...,  0.1421,  0.1421,  0.1421],
         [ 1.0000,  3.0000,  0.1421,  ...,  0.1421,  0.1421,  0.1421],
         ...,
         [32.0000, 30.0000,  0.1421,  ...,  0.1421,  0.1421,  0.1421],
         [32.0000, 31.0000,  0.1421,  ...,  0.1421,  0.1421,  0.1421],
         [32.0000, 32.0000,  0.1421,  ...,  0.1421,  0.1421,  0.1421]],

        [[ 1.0000,  1.0000, -1.0359,  ..., -0.9233, -0.9233, -0.9233],
         [ 1.0000,  2.0000, -0.9796,  ..., -0.8880, -0.8880, -0.8880],
         [ 1.0000,  3.0000, -0.9233,  ..., -0.8387, -0.8387, -0.8387],
         ...,
         [32.0000, 30.0000, -0.8387,  ..., -0.9233, -0.9233, -0.9233],
         [32.0000, 31.0000, -0.8880,  ..., -0.9796, -0.9796, -0.9796],
         [32.0000, 32.0000, -0.9514,  ..., -1.0078, -1.0078, -1.0078]]]), 'attention_mask': tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 

**Model**

**Training**

In [23]:
def train_one_epoch(model, processor, train_loader, optimizer, scaler):
    """
    Trains the model on all batches for one epoch with NVIDIA's AMP
    """
    model.train()
    avg_loss = 0
    with autocast():
        prog_bar = tqdm(enumerate(train_loader), total=len(train_loader))
        for idx, batch in prog_bar:
            labels = batch.pop("labels").to('cuda')
            flattened_patches = batch.pop("flattened_patches").to('cuda')
            attention_mask = batch.pop("attention_mask").to('cuda')

            outputs = model(
                flattened_patches=flattened_patches,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            prog_bar.set_description(f"loss: {loss.item():.4f}")
            wandb_log(train_step_loss=loss.item())
            avg_loss += loss.item()
            
    avg_loss = avg_loss / len(train_loader)
    print(f"Average training loss: {avg_loss:.4f}")
    wandb_log(train_loss=avg_loss)
    return avg_loss

@torch.no_grad()
def valid_one_epoch(model, processor, valid_loader):
    """
    Validates the model on all batches (in val set) for one epoch
    """
    model.eval()
    avg_loss = 0
    prog_bar = tqdm(enumerate(valid_loader), total=len(valid_loader))
    for idx, batch in prog_bar:
        labels = batch.pop("labels").to('cuda')
        flattened_patches = batch.pop("flattened_patches").to('cuda')
        attention_mask = batch.pop("attention_mask").to('cuda')
        
        outputs = model(
            flattened_patches=flattened_patches,
            attention_mask=attention_mask,
            labels=labels
        )
        
        loss = outputs.loss
        prog_bar.set_description(f"loss: {loss.item():.4f}")
        wandb_log(val_step_loss=loss.item())
        avg_loss += loss.item()
        
    avg_loss = avg_loss / len(valid_loader)
    print(f"Average validation loss: {avg_loss:.4f}")
    wandb_log(val_loss=avg_loss)
    return avg_loss

In [24]:
def fit(model, processor, train_loader, valid_loader, optimizer, scaler):
    """
    A nice function that binds it all together and reminds me of Keras days from 2018 :)
    """
    best_val_loss = int(1e+5)
    for epoch in range(CFG.epochs):
        print(f"{'='*20} Epoch: {epoch+1} / {CFG.epochs} {'='*20}")
        _ = train_one_epoch(model, processor, train_loader, optimizer, scaler)
        val_avg_loss = valid_one_epoch(model, processor, valid_loader)
        
        if val_avg_loss < best_val_loss:
            best_val_loss = val_avg_loss
            print(f"Saving best model so far with loss: {best_val_loss:.4f}")
            processor.push_to_hub("hoangphu7122002ai/pix2struct_v0",
                                    commit_message=f"valid best lost")
            model.push_to_hub("hoangphu7122002ai/pix2struct_v0",
                                        commit_message=f"valid best lost")
    print(f"Best model with val_loss: {best_val_loss:.4f}")

In [26]:
!python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('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')"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
import wandb

wandb.finish()

def wandb_log(**kwargs):
    for k, v in kwargs.items():
        wandb.log({k: v})

# Start W&B logging
# W&B Login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
wb_key = user_secrets.get_secret("donut")

wandb.login(key=wb_key)

run = wandb.init(
    project='pytorch',
    group='multi_modal',
    job_type='train',
)

model.to('cuda')
wandb.watch(model)
optimizer = torch.optim.Adam(params=model.parameters(), lr=CFG.lr)
fit(
        model=model,
        processor=processor,
        train_loader=train_dataloader,
        valid_loader=valid_dataloader,
        optimizer=optimizer,
        scaler=GradScaler(),
    )

[34m[1mwandb[0m: [32m[41mERROR[0m Control-C detected -- Run data was not synced


In [26]:
processor.push_to_hub("hoangphu7122002ai/pix2struct_v0",
                        commit_message=f"valid best lost")
model.push_to_hub("hoangphu7122002ai/pix2struct_v0",
                            commit_message=f"valid best lost")

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/1.13G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/hoangphu7122002ai/pix2struct_v0/commit/c039b1ca978884a0f703433998bfd0e2898e0e2c', commit_message='valid best lost', commit_description='', oid='c039b1ca978884a0f703433998bfd0e2898e0e2c', pr_url=None, pr_revision=None, pr_num=None)