In [1]:
import os
import sys
import random
import torch
import torchvision
import pytesseract
from pdf2image import convert_from_path
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from torchvision.transforms import transforms

import cv2
import numpy as np

from utils import (
    overlay_ann,
    overlay_mask,
    show,
    extract_elements
)

In [2]:
# Reproducibility: seed every random number generator the notebook has
# imported. NumPy's global generator is seeded too so that any np.random use
# further down the pipeline is repeatable between runs.
seed = 1234
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Ask cuDNN for deterministic algorithms and switch off its benchmark-driven
# algorithm selection, which can otherwise vary from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


# Maps the integer class ids predicted by the detector to the label names
# used for the extracted elements and the annotations.
CATEGORIES2LABELS = {
    0: "bg",
    1: "text",
    2: "title",
    3: "list",
    4: "table",
    5: "figure"
}

In [3]:
def get_instance_segmentation_model(num_classes):
    """Build a Mask R-CNN (ResNet-50 FPN) with freshly sized prediction heads.

    The network is created through torchvision with ``pretrained=True``; its
    box predictor and mask predictor are then swapped for new ones that emit
    ``num_classes`` outputs.

    Args:
        num_classes: Number of classes the replacement heads should predict.

    Returns:
        The model with both ROI-head predictors replaced.
    """
    mask_hidden_channels = 256

    net = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
    heads = net.roi_heads

    # Box head: keep the incoming feature width, change only the class count.
    box_in_features = heads.box_predictor.cls_score.in_features
    heads.box_predictor = FastRCNNPredictor(box_in_features, num_classes)

    # Mask head: same approach, sized from the existing conv5_mask layer.
    mask_in_channels = heads.mask_predictor.conv5_mask.in_channels
    heads.mask_predictor = MaskRCNNPredictor(
        mask_in_channels, mask_hidden_channels, num_classes
    )

    return net

In [4]:
# Inference settings used by get_page_elements and its helper.
_NUM_CLASSES = 6        # head size handed to get_instance_segmentation_model
_TARGET_HEIGHT = 1300   # pages are rescaled to this many pixel rows
_MIN_SCORE = 0.7        # detections scoring below this are skipped
_PAGE_MODEL_CACHE = {}  # keeps the loaded model/device between calls


def _load_page_model():
    """Return ``(model, device)``, loading the checkpoint on the first call only."""
    if not _PAGE_MODEL_CACHE:
        if os.path.exists('model_196000.pth'):
            checkpoint_path = "model_196000.pth"
        else:
            checkpoint_path = "../../../Downloads/model_196000.pth"
        assert os.path.exists(checkpoint_path), (
            "Checkpoint not found: %s" % checkpoint_path
        )

        # Use the GPU when one is present instead of failing on CPU-only hosts.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        model = get_instance_segmentation_model(_NUM_CLASSES)
        # NOTE(security): torch.load unpickles the file; only point this at
        # checkpoints from a trusted source.
        checkpoint = torch.load(checkpoint_path, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        model.to(device)
        model.eval()

        _PAGE_MODEL_CACHE["model"] = model
        _PAGE_MODEL_CACHE["device"] = device
    return _PAGE_MODEL_CACHE["model"], _PAGE_MODEL_CACHE["device"]


def get_page_elements(image_path):
    """Detect layout elements on one page image and write the results to disk.

    The page is resized to a height of 1300 pixels and run through the
    segmentation model. Every detection scoring at least 0.7 is handed to
    ``extract_elements`` and drawn with ``overlay_ann``. Output goes to a
    ``<image name>_elements`` directory next to the input image; the
    annotated page is stored there as ``masked_<image file name>``.

    The model is loaded once and reused by later calls, so processing many
    pages does not reload the checkpoint for each of them.

    Args:
        image_path: Path of the page image to analyse.

    Raises:
        AssertionError: If ``image_path`` or the model checkpoint is missing.
        ValueError: If OpenCV cannot decode the image.
        OSError: If the output directory cannot be created or the annotated
            page cannot be written.
    """
    assert os.path.exists(image_path), "Image not found: %s" % image_path

    image_file = os.path.basename(image_path)
    # splitext keeps dotted stems intact ("a.b.png" -> "a.b").
    image_name = os.path.splitext(image_file)[0]
    print(image_name)

    # os.path.join copes with an empty directory part; plain string
    # formatting would yield "/<name>_elements" at the filesystem root.
    elements_path = os.path.join(os.path.dirname(image_path), f'{image_name}_elements')
    newly_created = not os.path.isdir(elements_path)
    # An already existing directory is reused; genuine failures propagate
    # instead of being hidden behind a printed message.
    os.makedirs(elements_path, exist_ok=True)
    if newly_created:
        print("Successfully created the directory %s " % elements_path)

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals an unreadable file by returning None.
        raise ValueError("Could not decode image: %s" % image_path)
    rat = _TARGET_HEIGHT / image.shape[0]
    image = cv2.resize(image, None, fx=rat, fy=rat)

    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.ToTensor()
    ])
    image = transform(image)

    model, device = _load_page_model()
    with torch.no_grad():
        prediction = model([image.to(device)])

    # Back to an 8-bit height x width x channel array for the OpenCV helpers.
    image = torch.squeeze(image, 0).permute(1, 2, 0).mul(255).numpy().astype(np.uint8)

    # Crops are taken from the untouched page while annotations accumulate on
    # a separate copy, so drawings for earlier detections cannot end up
    # inside later crops.
    # NOTE(review): assumes overlay_ann draws on / returns an annotated image
    # and that extract_elements only reads its input - confirm in utils.
    page = image
    annotated = page.copy()

    ROI_number = 0
    for pred in prediction:
        for idx, mask in enumerate(pred['masks']):
            score = pred["scores"][idx].item()
            if score < _MIN_SCORE:
                continue

            m = mask[0].mul(255).byte().cpu().numpy()
            box = list(map(int, pred["boxes"][idx].tolist()))
            label = CATEGORIES2LABELS[pred["labels"][idx].item()]

            extract_elements(page, box, label, ROI_number, elements_path)
            annotated = overlay_ann(annotated, m, box, label, score)
            ROI_number += 1

    image_save_path = os.path.join(elements_path, f'masked_{image_file}')
    # cv2.imwrite reports failure through its return value, not an exception.
    if not cv2.imwrite(image_save_path, annotated):
        raise IOError("Could not write annotated page to %s" % image_save_path)

In [5]:
def main(argv):
    """Render a PDF to page images and extract the layout elements of each page.

    Every page is rendered at 200 dpi, saved as ``<page index>.png`` under
    ``../tmp/images/<pdf name>/`` and passed to ``get_page_elements``.

    Args:
        argv: Command-line arguments without the program name. ``argv[0]`` is
            used as the PDF path when it names an existing file; otherwise
            ``'CVPR2017.pdf'`` is processed.
            NOTE(review): the existence check presumably also filters out
            unrelated launcher arguments (e.g. when run from a notebook
            kernel) - confirm before making a missing path an error.

    Raises:
        OSError: If the output directory cannot be created.
    """
    if len(argv) > 0 and os.path.exists(argv[0]):
        file_path = argv[0]
    else:
        file_path = 'CVPR2017.pdf'

    pages = convert_from_path(file_path, dpi=200)

    # Name the output directory after the PDF itself: drop directory
    # components and the extension ("../docs/a.pdf" -> "a").
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    output_dir = f'../tmp/images/{file_name}'

    newly_created = not os.path.isdir(output_dir)
    # Creates missing parents as well and tolerates an existing directory;
    # real failures surface here rather than later when saving pages.
    os.makedirs(output_dir, exist_ok=True)
    if newly_created:
        print("Successfully created the directory %s " % output_dir)

    for idx, page in enumerate(pages):
        # Save into the directory created above rather than a fixed folder,
        # so PDFs other than the default one work too.
        image_path = f'{output_dir}/{idx}.png'
        page.save(image_path, 'PNG')
        get_page_elements(image_path)

In [6]:
# Script entry point: pass everything after the program name to main().
# `sys` is already imported in the notebook's first cell.
if __name__ == "__main__":
    argv = sys.argv[1:]
    main(argv)

Successfully created the directory ../tmp/images/CVPR2017 
0
Successfully created the directory ../tmp/images/CVPR2017/0_elements 
Document semantic structure extraction (DSSE) is an
ctively-researched area dedicated to understanding images
f documents. The goal is to split a document image into re-
ions of interest and to recognize the role of each region. It
s usually done in two steps: the first step, often referred to
s page segmentation, is appearance-based and attempts to
istinguish text regions from regions like figures, tables and
ine segments. The second step, often referred to as logical
tructure analysis, is semantics-based and categorizes each
egion into semantically-relevant classes like paragraph and
aption

Label:
text0
We present an end-to-end, multimodal, fully convolu-
tional network for extracting semantic structures from doc-
ument images. We consider document semantic structure
extraction as a pixel-wise segmentation task, and propose a
unified model that classifi

2. Background

Label:
title9
2
Successfully created the directory ../tmp/images/CVPR2017/2_elements 
Unsupervised Learning. Several methods have been
proposed to use unsupervised learning to improve super-
vised learning tasks. Mairal et al. [36] proposed a sparse
coding method that leans sparse local features by sparsity-
constrained reconstruction loss functions. Zhao et al. [58]
proposed a Stacked What-Where Auto-Encoder that uses
unpooling during reconstruction, By injecting noise into the
input and the middle features, a denoising auto-encoder [51]
can learn robust filters that recover uncorrupted input. The
main focus in unsupervised learning has been image-level
classification and generative approaches, whereas in this pa-
per we explore the potential of such methods for pixel-wise
semantic segmentation.

Label:
text0
As shown In Fig. 2, our MPCN mode! has tour parts:
an encoder, two decoders and a bridge. The encoder and
Jecoder parts roughly follow the architecture guideline

Figure 4: Left: A dilated block that contains 5 dilated
convolutional layers with different dilation d.  Batch-
Normalization and non-linearity are not shown for brevity.
Right: The skip-gram model for word embeddings.

Label:
text7
3.2. Text Embedding Map

Label:
title8
Although our synthetic documents (Sec. 4) provide a
jarge amount of labeled data for training, they are limited
in the Variations of their layouts. To this end, we define two
unsupervised loss functions to make use of real documents
and to encourage better representation learning.

Label:
text9
ROCORSTUCHION PRsk, st has Ocen shown Mat recon-
struction can help learning better representations and there-
fore improves performance for supervised tasks [58, 57]
We thus introduce a second decoder pathway (Fig. 2 - axil-
lary decoder), denoted as Dee, and define a reconstruction
loss at intermediate features. This auxiliary decoder only
exists during the training phase.

Label:
text10
W. given an input word w; 1s define

ENR I ERIE SINE COME LIOIES LOBE TORE BEOMINEN SET
Model2. Note that this model is similar to the SharpMask
model. We observe a mean IoU of 65.4%, 4% better than
the base model. The improvements are even more signifi-
cant for small objects like captions.

Label:
text3
intersection-over-union (1oU), which 1s standard in seman-
‘ic segmentation tasks. We optimize the architecture of
our MFCN model based on the DSSE-200 dataset since
i contains both appearance-based and semantics-based la-
pels. Sec. 6.4 compares our results to state-of-the-art meth-
ods on the ICDAR2015 and SectLabel datasets.

Label:
text4
Finally, we investigate the use of dilated convolutions.
Model3 is equivalent to using dilated convolution when
d = 1. Modeld sets d = 8 while Model5 uses the di-
lated block illustrated in Fig. 4 (left). The number of output
channels are adjusted such that the total number of parame-

Label:
text5
We first systematically evaluate the effectiveness of dif-
ferent network architect

This work started during Xiao Yang’s internship at
Adobe Research. This work was supported by NSF grant
CCF 1317560 and Adobe Systems Inc.

Label:
text5
Table 4: IoU scores (%) for page segmentation on the
ICDAR2015 dataset. For comparison purpose, only IoU
scores for non-text, text and figure are shown. However our
model can make fine-grained predictions as well.

Label:
text6
ing that our MFCN model simultaneously predicts both
appearance-based and semantics-based classes while other
methods can not,

Label:
text7
| Methods | non-text | text |
Teptonica [8] MT [868
Bukhari et al. [10] 90.6 | 90.3
Ours (binary) 945 | 91.0
Fernandez et al. {17} | 70.1 | 85.8
Ours (binary) 771 | 91.0

Label:
table8
fable 5: Fl scores on the SectLabel dataset. Note that our
model can also identify non-text classes such as figures and
ables.

Label:
text9
7. Conclusion:

Label:
title10
Acknowledgment

Label:
title11
Table 3: IoU scores (%) when using different training ob-
jectives on DSSE-200 data

References

Label:
title2
9
Successfully created the directory ../tmp/images/CVPR2017/9_elements 
Proceedings of ine Fourth internationat Conjerence on, vol-
ume 2, pages 984-988. IEEE, 1997. 2

[32] T-Y. Lin, M. Maire, S. Belongie, J. Hays, P. Perona, D. Ra-
manan, P, Dollar, and C. L. Zitnick. Microsoft coco: Com-
mon objects in context. In European Conference on Com-
puter Vision, pages 740-755. Springer, 2014. 3, 5

[33] J. Long, E. Shelhamer, and T. Darrell. Fully convolutional
networks for semantic segmentation, In Proceedings of the
IEEE Conference on Computer Vision and Pattern Recogni-
tion, pages 3431-3440, 2015. 3

[34] Z. Lu, Z. Fu, T. Xiang, P. Han, L. Wang, and X. Gao. Learn-
ing from weak and noisy labels for semantic segmentation,
2016. 3

[35] M.-T, Luong, T. D. Nguyen, and M.-Y. Kan, Logical struc-
ture recovery in scholarly articles with rich document fea-
tures, Multimedia Storage and Retrieval Innovations for Dig-
ital Library Systems, 270, 2012. 2, 6,8

[36] J. M