
## Description
This notebook builds on the outputs created beforehand by running the demo.py script, together with additional OCR information.
For convenience, the DSG outputs and an additional OCR text file for one sample are included in this directory.

Because this notebook uses relative paths, please make sure you are in the base directory (`DSG`) of this repository.

In [None]:
import os
# Last path component of the directory the kernel is currently running in.
cwd = os.path.basename(os.getcwd())
print('current dir: {}'.format(cwd))

# If the notebook was started from inside `sysdemo`, step up one level so the
# relative paths used further down resolve against the parent directory.
# (`cwd == 'sysdemo'` already implies `cwd != 'DSG'`, so one test suffices.)
if cwd == 'sysdemo':
    os.chdir('..')
    print('changed dir to: {}'.format(os.getcwd()))
#assert cwd == 'DSG'

In [None]:
from xmlrpc.client import MAXINT
import torch
import os
from PIL import Image
from pathlib import Path

from segmentationsg.data import add_dataset_config, register_datasets
from segmentationsg.modeling.roi_heads.scenegraph_head import add_scenegraph_config
from detectron2.config import get_cfg

import glob
def setup_cfg(config_file, output_dir, confidence_threshold, config_list):
    """Build a frozen detectron2 config for the scene-graph demo.

    Args:
        config_file: Path to the YAML config file merged on top of the defaults.
        output_dir: Not used by this function; kept in the signature so
            existing calls keep working.
        confidence_threshold: Score threshold written to the RetinaNet,
            ROI-heads and panoptic-FPN test settings.
        config_list: Flat list of config keys and values handed to
            ``cfg.merge_from_list`` (e.g. ``["MODEL.WEIGHTS", "<path>"]``).

    Returns:
        The config node, frozen after the datasets have been registered.

    Raises:
        AssertionError: If the scene-graph head mode is not one of the
            accepted values.
    """
    cfg = get_cfg()
    add_dataset_config(cfg)
    add_scenegraph_config(cfg)

    # This check runs before the file/list overrides are merged, so it
    # validates the mode as it stands right after add_scenegraph_config().
    # NOTE(review): 'sgls' may be meant as 'sgcls' -- confirm against the project.
    mode = cfg.MODEL.ROI_SCENEGRAPH_HEAD.MODE
    # The message previously formatted a misspelled config path, which would
    # have failed on its own instead of reporting the unsupported mode.
    assert mode in ['predcls', 'sgls', 'sgdet'], "Mode {} not supported".format(mode)

    # File settings first, then the explicit key/value overrides.
    cfg.merge_from_file(config_file)
    cfg.merge_from_list(config_list)

    # Set score_threshold for builtin models
    cfg.MODEL.RETINANET.SCORE_THRESH_TEST = confidence_threshold
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = confidence_threshold
    cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = confidence_threshold

    register_datasets(cfg)
    cfg.freeze()
    return cfg

In [None]:
# The datasets and checkpoints folders downloaded from the Google Drive must
# have been extracted to the correct folder, and DSG-specific commands must be
# executed from the base folder (checkpoints and configs are resolved relative to it).

# score threshold handed to setup_cfg
confidence_threshold = 0.5
# folder the demo sample files are read from
output_dir = './sysdemo/'
# path to config file
config_file = './configs/sgg_end2end_EP.yaml'
# config overrides as key followed by value: here the path to the model checkpoint
config_list = [
    "MODEL.WEIGHTS",
    './checkpoints/DSG_E2E_eperiodica/dsg_e2e_eperiodica_checkpoint.pth',
]

cfg = setup_cfg(
    config_file,
    output_dir,
    confidence_threshold,
    config_list,
)

from segmentationsg.utils.visualizer import SGVisualizer
from detectron2.data import MetadataCatalog
from detectron2.utils.visualizer import ColorMode
from detectron2.data.detection_utils import read_image



# Use the first configured test dataset for the metadata lookup; when no test
# dataset is configured, fall back to the placeholder name "__unused".
if len(cfg.DATASETS.TEST):
    metadata_name = cfg.DATASETS.TEST[0]
else:
    metadata_name = "__unused"
metadata = MetadataCatalog.get(metadata_name)

In [None]:
# define class mapping list that we use for postprocessing and hocr file creation
# (read from the `thing_classes` attribute of the `metadata` object; printed for inspection)
class_mapping_list = metadata.thing_classes
print(class_mapping_list)

In [None]:
# we run our example demonstration on the image chr-001_1974_013_0199 from the eperiodica dataset
# `sample_id` is the shared file-name stem; change it to point all four paths
# at another sample stored in `output_dir`.
sample_id = "chr-001_1974_013_0199"
raw_tensor_path = os.path.join(output_dir, sample_id + ".pt")
ocr_file_path = os.path.join(output_dir, sample_id + ".txt")
image_path = os.path.join(output_dir, sample_id + ".jpg")
image_instances_path = os.path.join(output_dir, sample_id + "_instances.jpg")

In [None]:
# show the path of the OCR text file that the later hocr step reads
print(ocr_file_path)

In [None]:
# image that we look at
# NOTE: PIL's Image.show() hands the image to an external viewer; it does not
# render inline in the notebook output.
im = Image.open(image_path)
im.show()

In [None]:
# detected instances (the "_instances" image stored next to the sample image)
# NOTE: PIL's Image.show() hands the image to an external viewer; it does not
# render inline in the notebook output.
im_instances = Image.open(image_instances_path)
im_instances.show()

In [None]:
# tensor that contains the predicted bounding boxes, the predicted classes/categories, and relationships + score
# for each instance pair
# NOTE(review): torch.load unpickles the file, so only load .pt files from a trusted source.
# NOTE(review): if the file was saved from a GPU tensor, loading on a CPU-only machine
# may need map_location -- confirm how the sample was saved.
raw_tensor = torch.load(raw_tensor_path)
#raw_tensor

In [None]:
# postprocess the raw_tensor; the output is described as a tensor with a tree structure for parentof relations
from segmentationsg.utils import postprocessing
# the call yields two values, bound here to the tensor before postprocessing and the postprocessed tensor
# (presumably in that order, going by the target names -- verify against postprocess_raw_tensor)
tensor_before_postprocessing, postprocessed_tensor = postprocessing.postprocess_raw_tensor(raw_tensor, class_mapping_list)

#postprocessed_tensor

In [None]:
# create hocr file from postprocessed tensor
from segmentationsg.utils.makehocr import create_hocr
#output_folder = "./"
# Derive the hocr file name by swapping only the final extension of the OCR
# file name. (str.replace(".txt", ".hocr") would rewrite every ".txt"
# occurrence in the name and leave a name without ".txt" unchanged.)
filename_hocr = os.path.splitext(os.path.basename(ocr_file_path))[0] + ".hocr"
root_hocr = create_hocr(postprocessed_tensor, ocr_file_path, class_mapping_list, output_dir, filename_hocr)
# path the following cells parse the hocr file from (presumably where create_hocr wrote it -- verify)
output_path_hocr = os.path.join(output_dir, filename_hocr)

In [None]:
# some example queries using lxml
from lxml import etree as ET
# re-read the hocr file from disk; this rebinds `root_hocr` (previously the
# return value of create_hocr). Note that ET.parse returns an ElementTree
# object for the whole document rather than the root element itself.
root_hocr = ET.parse(output_path_hocr)
root_hocr

In [None]:
# get rows that belong to a tabular: every div with dgg_class="row" that sits
# exactly two levels below a div with dgg_class="tabular" (the "/*/" step
# allows any single element in between, so these are grandchildren)
rows = root_hocr.xpath('//div[@dgg_class="tabular"]/*/div[@dgg_class="row"]')
print(rows)

In [None]:
# find the node that contains the word "Schriftsteller"
# (the trailing "/.." steps from each matching span up to its parent element)
schriftsteller_query = '//span[text()="Schriftsteller"]/..'
contains_schriftsteller = root_hocr.xpath(schriftsteller_query)

# .xpath always returns a list of the elements matching the path, even when
# only a single node matches, so the node itself is contains_schriftsteller[0]
print(contains_schriftsteller)
print()

first_match = contains_schriftsteller[0]
print(first_match.attrib)
print()

# print all words contained in the row that contains the word "Schriftsteller"
for child in first_match.iterchildren():
    print(child.text)

In [None]:
# find all headings (divs whose dgg_class attribute is "header")
headings = root_hocr.xpath('//div[@dgg_class="header"]')

# for the first 3 headings, print the id followed by the text of each child element
for heading_node in headings[:3]:
    heading_id = heading_node.attrib["dgg_id"]
    print("heading id: "+heading_id)
    for child in heading_node.iterchildren():
        print(child.text)
    print()

In [None]:
# find the textblock that follows a heading that contains the word "Biographie"
from segmentationsg.utils.makehocr import followedby
# header div that has a span whose text is "Biographie"
heading_query = '//div[@dgg_class="header"]/span[text()="Biographie"]/..'
# candidate elements: divs with dgg_class="contentblock"
block_query = '//div[@dgg_class="contentblock"]'
textblock_after_biographie = followedby(heading_query, block_query, root_hocr)
print(textblock_after_biographie[0].attrib)

In [None]:
# print the text of the first 15 child elements (words) of that textblock
first_textblock = textblock_after_biographie[0]
for position, child in enumerate(first_textblock.iterchildren()):
    if position >= 15:
        break
    print(child.text)