In [1]:
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from marker.config.parser import ConfigParser
import glob
import os

from git_root import git_root
# Resolve the repository root once; later cells interpolate it into the input
# (data/pdf/) and output (marker_ocr.csv) paths.
# NOTE(review): presumably git_root() returns an absolute path string — confirm.
my_git_root = git_root()

In [2]:
# Options handed to marker's ConfigParser: markdown output, no image
# extraction, OCR forced on, language set to English.
config = {
    'output_format': 'markdown',
    'disable_image_extraction': True,
    'force_ocr': True,
    'languages': 'en'
}
config_parser = ConfigParser(config)

# Build the converter's inputs step by step, in the same order the keyword
# arguments were originally evaluated.
marker_config = config_parser.generate_config_dict()
# NOTE(review): create_model_dict() appears to be what loads the models
# reported in this cell's output — confirm.
model_artifacts = create_model_dict()
processors = config_parser.get_processors()
renderer = config_parser.get_renderer()

converter = PdfConverter(
    config=marker_config,
    artifact_dict=model_artifacts,
    processor_list=processors,
    renderer=renderer,
)

Loaded layout model datalab-to/surya_layout on device cuda with dtype torch.float16


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loaded texify model to cuda with torch.float16 dtype
Loaded recognition model vikp/surya_rec2 on device cuda with dtype torch.float16
Loaded table recognition model vikp/surya_tablerec on device cuda with dtype torch.float16
Loaded detection model vikp/surya_det3 on device cuda with dtype torch.float16


In [3]:
def get_filenames(location, filetype):
    """Return the sorted names of the files in ``location`` with a given extension.

    The directory is searched in place: the process working directory is
    left untouched, so relative paths used elsewhere in the notebook keep
    resolving the same way before and after this call.

    Args:
        location: Directory to search (str or path-like). Not searched
            recursively.
        filetype: Extension to match, without the leading dot (e.g. ``'pdf'``).
            It is inserted into a glob pattern, so glob wildcards in it are
            honoured.

    Returns:
        A sorted list of bare file names (no directory component) matching
        ``*.<filetype>`` inside ``location``. Names starting with a dot are
        not matched, following ``glob`` semantics.

    Raises:
        FileNotFoundError: ``location`` does not exist.
        NotADirectoryError: ``location`` exists but is not a directory.
    """
    location = os.fspath(location)
    # Fail loudly for a bad directory instead of silently returning [].
    if not os.path.isdir(location):
        if os.path.exists(location):
            raise NotADirectoryError(f'Not a directory: {location!r}')
        raise FileNotFoundError(f'No such directory: {location!r}')

    # Escape the directory part so characters such as '[' in the path are
    # taken literally; only the file-name part is a wildcard pattern.
    pattern = os.path.join(glob.escape(location), f'*.{filetype}')
    return sorted(os.path.basename(path) for path in glob.glob(pattern))

In [4]:
# Directory holding the input PDFs, relative to the repository root. The
# trailing slash is kept because later code appends file names to this string.
pdf_location = f'{my_git_root}/data/pdf/'
# Sorted names of every *.pdf file in that directory.
filenames = get_filenames(location=pdf_location, filetype='pdf')

In [5]:
# Convert each PDF with the marker converter and collect the extracted text,
# one entry per file, in the same order as `filenames`.
documents = []

for filename in filenames:
    # os.path.join yields the right path whether or not pdf_location ends
    # with a separator, unlike plain string concatenation.
    rendered = converter(os.path.join(pdf_location, filename))
    # text_from_rendered returns three values; only the first (the text) is
    # needed here, so the other two are discarded.
    text, _, _ = text_from_rendered(rendered)
    documents.append(text)

Recognizing layout: 100%|██████████| 1/1 [00:01<00:00,  1.01s/it]
100%|██████████| 1/1 [00:00<00:00, 130.48it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.99it/s]
Recognizing Text: 100%|██████████| 4/4 [00:03<00:00,  1.03it/s]
Recognizing equations: 0it [00:00, ?it/s]
Recognizing tables: 0it [00:00, ?it/s]
Recognizing layout: 100%|██████████| 1/1 [00:00<00:00,  3.19it/s]
100%|██████████| 1/1 [00:00<00:00, 274.28it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  3.65it/s]
Recognizing Text: 100%|██████████| 3/3 [00:02<00:00,  1.21it/s]
Recognizing equations: 0it [00:00, ?it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  7.61it/s]
Recognizing Text: 100%|██████████| 1/1 [00:00<00:00,  1.69it/s]
Recognizing tables: 100%|██████████| 1/1 [00:00<00:00,  9.39it/s]
Recognizing layout: 100%|██████████| 2/2 [00:00<00:00,  3.24it/s]
100%|██████████| 2/2 [00:00<00:00, 309.84it/s]
Detecting bboxes: 100%|██████████| 2/2 [00:01<00:00,  1.95it/s]
Recognizing Text: 100%|█

In [6]:
import pandas as pd

# One row per PDF: the source file name next to its extracted text.
# Both lists were built in the same order, so rows line up by position.
df = pd.DataFrame({'name': filenames, 'text': documents})

In [7]:
# Preview the assembled frame; the rendered output elides the middle rows.
df

Unnamed: 0,name,text
0,aalto-university.pdf,Aalto University Research Data Management Poli...
1,aberystwyth-university.pdf,## Research Data Management Policy\n\nVersion ...
2,aston-university.pdf,# Research Data Management Policy\n\nJuly 2023...
3,bangor-university.pdf,# Data Protection Policy\n\n| Rev | Date | Pur...
4,brunel-university-london.pdf,# Brunel University Research Data Management P...
...,...,...
137,universität-rostock.pdf,# Research Data Policy of the University of Ro...
138,universität-siegen.pdf,## CREATING A COMMON FUTURE\n\n## Research-Dat...
139,universität-stuttgart.pdf,## Research data management policy of the Univ...
140,utrecht-university.pdf,# University policy framework for research dat...


In [9]:
# Write the name/text table to the repository root, without the row index.
csv_path = f'{my_git_root}/marker_ocr.csv'
df.to_csv(csv_path, index=False)