In [1]:
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from marker.config.parser import ConfigParser
import glob
import os

from git_root import git_root
# Base directory used to build every data path in this notebook.
# NOTE(review): presumably the top-level directory of the enclosing git
# repository, as returned by the `git_root` package -- confirm.
my_git_root = git_root()

In [2]:
# Options handed to marker's ConfigParser for this run: markdown output,
# image extraction disabled, OCR not forced, language set to 'en'.
config = dict(
    output_format='markdown',
    disable_image_extraction=True,
    force_ocr=False,
    languages='en',
)
config_parser = ConfigParser(config)

# Assemble the PDF converter from the parsed configuration.  The recorded cell
# output shows the layout, texify, recognition, table-recognition and detection
# models being loaded (on `cuda` in that run) when this cell executes, so it is
# the expensive set-up step of the notebook.
converter = PdfConverter(
    config=config_parser.generate_config_dict(),
    artifact_dict=create_model_dict(),
    processor_list=config_parser.get_processors(),
    renderer=config_parser.get_renderer(),
)

Loaded layout model datalab-to/surya_layout on device cuda with dtype torch.float16


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loaded texify model to cuda with torch.float16 dtype
Loaded recognition model vikp/surya_rec2 on device cuda with dtype torch.float16
Loaded table recognition model vikp/surya_tablerec on device cuda with dtype torch.float16
Loaded detection model vikp/surya_det3 on device cuda with dtype torch.float16


In [3]:
def get_filenames(location, filetype):
    """Return the sorted names of the files in ``location`` with a given extension.

    Only ``location`` itself is searched (no recursion) and the returned names
    carry no directory component.  The search uses an explicit path pattern,
    so the process working directory is never changed.

    Args:
        location: Directory to search (``str`` or path-like).
        filetype: Extension without the leading dot, e.g. ``'pdf'``.  It is
            inserted into the glob pattern as-is.

    Returns:
        Alphabetically sorted list of matching file names.

    Raises:
        FileNotFoundError: If ``location`` is not an existing directory.
    """
    directory = os.fsdecode(location)
    if not os.path.isdir(directory):
        # Fail loudly rather than silently returning an empty list for a
        # mistyped or missing input directory.
        raise FileNotFoundError(f'No such directory: {directory!r}')

    # Escape the directory part so characters such as '[' in the path are not
    # interpreted as glob syntax; only the file-name part is a pattern.
    pattern = os.path.join(glob.escape(directory), f'*.{filetype}')
    return sorted(os.path.basename(match) for match in glob.glob(pattern))

In [4]:
# Input directory with the source PDFs, located under `my_git_root`.
pdf_location = f'{my_git_root}/data/pdf/'
# Sorted list of the bare *.pdf file names (no directory part) found there.
filenames = get_filenames(pdf_location, 'pdf')

In [5]:
# Convert every PDF to text, one entry per file and in the same order as
# `filenames`, so the two lists can later be zipped into a table.
documents = []

for filename in filenames:
    # os.path.join keeps the path valid whether or not `pdf_location` ends
    # with a separator.
    rendered = converter(os.path.join(pdf_location, filename))
    # text_from_rendered returns a 3-tuple; only the first element (the text)
    # is kept, the remaining two values are not needed here.
    text, _, _ = text_from_rendered(rendered)
    documents.append(text)

Recognizing layout: 100%|██████████| 1/1 [00:01<00:00,  1.09s/it]
100%|██████████| 1/1 [00:00<00:00, 104.49it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing equations: 0it [00:00, ?it/s]
Recognizing tables: 0it [00:00, ?it/s]
Recognizing layout: 100%|██████████| 1/1 [00:00<00:00,  3.11it/s]
100%|██████████| 1/1 [00:00<00:00, 263.96it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing equations: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 1/1 [00:00<00:00,  5.41it/s]
Recognizing layout: 100%|██████████| 2/2 [00:00<00:00,  3.28it/s]
100%|██████████| 2/2 [00:00<00:00, 169.07it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing equations: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 1/1 [00:00<00:00,  4.48it/s]
Recognizing layout: 100%|██████████| 3/3 [00:01<00:00,  2.80it/s]
100%|██████████| 3/3 [00:00<00:00, 155.25it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing equations: 0it [00:00, ?it/s]
Recognizing tables: 100%|██████████| 1/1 [00:00<00:00,  7.01i

In [6]:
import pandas as pd

# One row per PDF: the file name and the text extracted from it.
# `filenames` and `documents` are parallel lists, so they must have the same
# length for the frame to be constructed.
df = pd.DataFrame(dict(name=filenames, text=documents))

In [7]:
# Show the frame as the cell output to sanity-check the file names and the
# start of each extracted text (pandas truncates long frames and long strings).
df

Unnamed: 0,name,text
0,aalto-university.pdf,Aalto University Research Data Management Poli...
1,aberystwyth-university.pdf,## **Research Data Management Policy**\n\nVers...
2,aston-university.pdf,# **Research Data Management Policy**\n\nJuly ...
3,bangor-university.pdf,# **Data Protection Policy**\n\n| Rev | Date |...
4,brunel-university-london.pdf,# **Brunel University Research Data Management...
...,...,...
137,universität-rostock.pdf,# **Research Data Policy of the University of ...
138,universität-siegen.pdf,## **Research-Data-Policy of the University of...
139,universität-stuttgart.pdf,## **Research data management policy of the Un...
140,utrecht-university.pdf,# **University policy framework for research d...


In [8]:
# Persist the extracted texts under the repository's data/ directory;
# index=False keeps the positional row index out of the CSV.
df.to_csv(f'{my_git_root}/data/marker.csv', index=False)