In [2]:
import os
from functools import partial
from pathlib import Path
import sys
from typing import List

import torch
import pypdf
from tqdm import tqdm

In [3]:
# Make the Nougat sources under src/nougat importable. The entry is a relative
# path, so the imports below only resolve when the kernel's working directory
# is the one that contains src/nougat.
sys.path.append("src/nougat")
from nougat import NougatModel
from nougat.utils.dataset import LazyDataset
from nougat.utils.checkpoint import get_checkpoint
from nougat.postprocessing import markdown_compatible

  alb.ElasticTransform(


In [4]:
from pydantic_settings import BaseSettings, SettingsConfigDict

class Settings(BaseSettings):
    """Notebook configuration loaded through pydantic-settings.

    Values are looked up in the process environment and in the ``../.env``
    file (read as UTF-8); settings present there but not declared on this
    class are ignored.
    """

    # Directory of the pretrained Nougat checkpoint; this is the value handed
    # to NougatModel.from_pretrained when the model is loaded.
    nougat_model_dir: str

    model_config = SettingsConfigDict(
        extra="ignore",
        env_file="../.env",
        env_file_encoding="utf-8",
    )


# Instantiate once at cell level so later cells can read settings.nougat_model_dir.
settings = Settings()

In [5]:
def load_model_to_gpu(model_dir: str, device="mps") -> NougatModel:
    """Load a pretrained Nougat model and prepare it for inference.

    Args:
        model_dir: Location passed to ``NougatModel.from_pretrained``.
        device: Target device for the weights (defaults to ``"mps"``).

    Returns:
        The loaded model, moved to ``device``, cast to ``torch.bfloat16`` and
        switched to evaluation mode.
    """
    print(f"Loading model to device {device}")
    model = NougatModel.from_pretrained(model_dir)
    # Same two-step conversion as a chained call: move first, then cast.
    on_device = model.to(device)
    on_device.to(torch.bfloat16)
    model.eval()
    return model

In [6]:
# using nougat 0.1.0-base
# The checkpoint location comes from settings.nougat_model_dir; the target
# device is hard-coded to PyTorch's "mps" backend for this run.
model = load_model_to_gpu(settings.nougat_model_dir, device="mps")

Loading model to device mps


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [7]:
# model

# Process PDF

In [12]:
# Sample document: "Attention Is All You Need" (arXiv 1706.03762v7).
sample_fname = "1706.03762v7.pdf"
pdf_path = Path("samples") / sample_fname
# One results directory per document, named after the PDF without its final
# extension. Path.stem only strips the last suffix (unlike str.replace, which
# would remove every ".pdf" occurrence) and matches how document_id is derived
# from pdf_path later in the notebook.
output_path = Path("results/nougat") / Path(sample_fname).stem
# Create the directory tree if needed; exist_ok avoids the separate
# exists()-then-create check and makes the cell safe to re-run.
output_path.mkdir(parents=True, exist_ok=True)
pdf_path, output_path

(PosixPath('samples/1706.03762v7.pdf'), PosixPath('results/1706.03762v7'))

In [9]:
 # get document ID (remove .pdf and use full path structure)
document_id = pdf_path.stem
# Name of the PDF's parent directory ("samples" for this file); it is only
# printed below and not used by the rest of the cell.
month_dir = pdf_path.parent.name
print(document_id, month_dir)
try:
    # prepare dataset for all pages
    full_dataset = LazyDataset(
        str(pdf_path), partial(model.encoder.prepare_input, random_padding=False)
    )
except pypdf.errors.PdfStreamError as e:
    # Show the parser error, then re-raise. Swallowing it would leave
    # full_dataset undefined (NameError on the next line with a fresh kernel)
    # or, worse, silently reuse a dataset left over from an earlier run.
    print(e)
    raise

full_dataset

1706.03762v7 samples


<nougat.utils.dataset.LazyDataset at 0x172d6cf40>

In [10]:
# Number of page samples requested per batch; the inference loop reuses this
# value to compute page numbers.
batch_size = 8
# Keep pages in document order (no shuffling) and collate with the dataset's
# own None-tolerant collate function.
loader_options = dict(
    batch_size=batch_size,
    shuffle=False,
    collate_fn=LazyDataset.ignore_none_collate,
)
dataloader = torch.utils.data.DataLoader(full_dataset, **loader_options)

In [13]:
# Gradients are never needed here, so autograd is disabled once for the whole
# loop instead of re-entering the context manager for every batch.
with torch.no_grad():
    for batch_idx, (sample, is_last_page) in enumerate(
        tqdm(dataloader, desc=f"Processing {document_id}")
    ):
        # Cast the page images to bfloat16 to match the dtype the model
        # weights were converted to when the model was loaded.
        model_output = model.inference(
            image_tensors=sample.to(torch.bfloat16),
            early_stopping=False,
        )
        for j, output in enumerate(model_output["predictions"]):
            # 1-based page number derived from the batch position.
            # NOTE(review): this assumes every earlier batch held exactly
            # batch_size pages; if the collate function drops pages, the
            # numbering will drift from the real PDF page numbers. Confirm.
            page_num = batch_idx * batch_size + j + 1
            formatted_output = markdown_compatible(output.strip())
            save_path = output_path / f"{document_id}_{page_num}.mmd"
            # Write with an explicit encoding: the Markdown can contain
            # non-ASCII characters (math, author names), and the default
            # text encoding is platform-dependent.
            save_path.write_text(formatted_output, encoding="utf-8")

Processing 1706.03762v7: 100%|██████████| 2/2 [05:19<00:00, 159.77s/it]


# Postprocessing

In [21]:
from arxiver.postprocess import (
    read_mmd,
    has_abstract,
    detect_headers,
    find_references,
    remove_authors,
    remove_references
)

In [24]:
# Load the Markdown file for page 1; the name follows the
# "<document_id>_<page>.mmd" pattern used when the pages were saved.
first_page_name = f"{document_id}_1.mmd"
page1_mmd = output_path.joinpath(first_page_name)
mmd_content = read_mmd(str(page1_mmd))
print(mmd_content)

# Attention Is All You Need

 Ashish Vaswani

Google Brain

avaswani@google.com

&Noam Shazeer1

Google Brain

noam@google.com

&Niki Parmar1

Google Research

nikip@google.com

&Jakob Uszkoreit1

Google Research

usz@google.com

&Llion Jones1

Google Research

llion@google.com

&Aidan N. Gomez1

University of Toronto

aidan@cs.toronto.edu

&Lukasz Kaiser1

Google Brain

lukaszkaiser@google.com

&Illia Polosukhin1

illia.polosukhin@gmail.com

Equal contribution. Listing order is random. Jakob proposed replacing RNNs with self-attention and started the effort to evaluate this idea. Ashish, with Illia, designed and implemented the first Transformer models and has been crucially involved in every aspect of this work. Noam proposed scaled dot-product attention, multi-head attention and the parameter-free position representation and became the other person involved in nearly every detail. Niki designed, implemented, tuned and evaluated countless model variants in our original codebase and t

In [18]:
# Check page 1 for an abstract. has_abstract comes from arxiver.postprocess;
# its exact criteria are not visible in this notebook.
has_abstract(mmd_content)

True

In [19]:
# List the Markdown headers found on page 1. In the recorded run each entry is
# an (int, header line) pair; presumably the int is the line index (TODO confirm).
detect_headers(mmd_content)

[(0, '# Attention Is All You Need'), (54, '###### Abstract')]

In [20]:
# Look for a references section on page 1. The recorded run returned False,
# presumably meaning none was found; confirm against arxiver.postprocess.
find_references(mmd_content)

False

In [23]:
# The string literal below is never executed: it is a reference copy of how
# remove_authors is expected to work, kept next to the call for readers. The
# function actually called is the one imported from arxiver.postprocess.
# NOTE(review): confirm the copy still matches that implementation. As written
# in the copy, a page without an "abstract" header leaves abstract_line at 0,
# so the title line would be emitted twice.
'''
Removes author names between title (first) & abstract

def remove_authors(mmd: str) -> str:
    """Remove author names while preserving layout."""
    lines = mmd.splitlines()
    abstract_line = 0
    for i, line in enumerate(lines):
        if line.startswith("#") and "abstract" in line.lower():
            abstract_line = i
            break
    return "\n".join([lines[0], ""] + lines[abstract_line:])
'''
# Strip the author block from page 1 and show the result.
processed_mmd = remove_authors(mmd_content)
print(processed_mmd)

# Attention Is All You Need

###### Abstract

The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models