In [1]:
import os
import cv2
import json
import numpy as np
import pandas as pd

from PIL import Image
from shutil import copy
from pathlib import Path

# Folder holding the downloaded Zenodo record: the annotation JSON files are read
# from here, and the page images from its "images/images" sub-folder.
zenodo_path = "zenodo"
# NOTE(review): not read by any cell visible in this notebook; generate_dataset
# defines its own local variable with the same name — confirm whether it is still needed.
metadata_path = "metadata"
# Folder into which generate_dataset writes the exported images and metadata files.
data_path = "data"

%load_ext autoreload
%autoreload 2

# Raphaël Barman: Stocks dataset
This notebook extracts every annotated stock table from Raphaël Barman's dataset: https://zenodo.org/record/3706863.

In [2]:
def _read_annotation_file(file_name):
    """Parse and return the JSON annotation file ``file_name`` stored in ``zenodo_path``."""
    with open(os.path.join(zenodo_path, file_name), "r") as handle:
        return json.load(handle)


# One annotation file per journal.
GDL_annotations = _read_annotation_file("GDL.json")
IMP_annotations = _read_annotation_file("IMP.json")
JDG_annotations = _read_annotation_file("JDG.json")

In [3]:
def _pages_with_regions(project):
    """Return the ``_via_img_metadata`` entries of ``project`` that have at least one region."""
    annotated_pages = {}
    for page_key, page in project['_via_img_metadata'].items():
        if len(page['regions']) > 0:
            annotated_pages[page_key] = page
    return annotated_pages


# Drop every page that carries no annotated region at all.
GDL_annotations = _pages_with_regions(GDL_annotations)
IMP_annotations = _pages_with_regions(IMP_annotations)
JDG_annotations = _pages_with_regions(JDG_annotations)

In [4]:
def filter_stocks(annotations):
    """Keep only the regions labelled ``'stocks'`` and the pages that contain some.

    Parameters
    ----------
    annotations : dict
        Mapping of page key to page annotation. Each page annotation is a dict
        with a ``'regions'`` list; each region stores its label under
        ``region['region_attributes']['label']``.

    Returns
    -------
    dict
        A new mapping holding, for every page with at least one ``'stocks'``
        region, a shallow copy of the page annotation whose ``'regions'`` list
        contains only those regions. The input mapping and its region lists are
        left untouched, so the function can safely be called again on the same input.
    """
    stocks_annotations = dict()
    for k, v in annotations.items():
        # Regions without a label (or without attributes) are simply not stocks tables.
        stocks_regions = [
            region for region in v['regions']
            if region.get('region_attributes', {}).get('label') == 'stocks'
        ]

        if len(stocks_regions) > 0:
            # Shallow copy of the page so the caller's dict keeps its full region list.
            stocks_annotations[k] = {**v, 'regions': stocks_regions}

    return stocks_annotations

# Filter each journal down to its stocks tables and report how many were found.
GDL_stocks = filter_stocks(GDL_annotations)
gdl_table_count = sum(len(page['regions']) for page in GDL_stocks.values())
print(f"GDL has {gdl_table_count} stocks tables.")

IMP_stocks = filter_stocks(IMP_annotations)
imp_table_count = sum(len(page['regions']) for page in IMP_stocks.values())
print(f"IMP has {imp_table_count} stocks tables.")

JDG_stocks = filter_stocks(JDG_annotations)
jdg_table_count = sum(len(page['regions']) for page in JDG_stocks.values())
print(f"JDG has {jdg_table_count} stocks tables.")

GDL has 290 stocks tables.
IMP has 109 stocks tables.
JDG has 429 stocks tables.


In [5]:
def _crop_polygon(image_path, shape_attributes, cropped_image_export_path):
    """Save the polygon region of ``image_path``, with pixels outside the polygon set to black.

    Raises ``OSError`` when the file does not end with the JPEG end-of-image
    marker or when OpenCV cannot read it.
    """
    # https://stackoverflow.com/questions/33548956/detect-avoid-premature-end-of-jpeg-in-cv2-python
    with open(image_path, 'rb') as f:
        check_chars = f.read()[-2:]
    if check_chars != b'\xff\xd9':
        raise OSError('Not complete image')

    # cv2.imread signals failure by returning None instead of raising.
    image_cv = cv2.imread(image_path)
    if image_cv is None:
        raise OSError('Could not read image')

    # https://stackoverflow.com/questions/48301186/cropping-concave-polygon-from-image-using-opencv-python
    # OpenCV contour functions expect int32 points; the default integer dtype is platform dependent.
    points = np.array(list(zip(shape_attributes['all_points_x'],
                               shape_attributes['all_points_y'])), dtype=np.int32)

    ## (1) Crop the bounding rect
    x, y, w, h = cv2.boundingRect(points)
    cropped_image_cv = image_cv[y:(y + h), x:(x + w)].copy()

    ## (2) make mask (polygon coordinates shifted into the cropped frame)
    points = points - points.min(axis=0)
    mask = np.zeros(cropped_image_cv.shape[:2], np.uint8)
    cv2.drawContours(mask, [points], -1, (255, 255, 255), -1, cv2.LINE_AA)

    ## (3) do bit-op
    dst = cv2.bitwise_and(cropped_image_cv, cropped_image_cv, mask=mask)

    cv2.imwrite(cropped_image_export_path, dst)


def _merge_into_json_file(path, entries):
    """Overlay the content already stored in ``path`` (if any) onto ``entries`` and write the result back."""
    if Path(path).exists():
        with open(path, "r") as f:
            # Entries already present in the file take precedence over the new ones.
            entries.update(json.load(f))

    with open(path, "w") as f:
        json.dump(entries, f)


def generate_dataset(annotations, journal_name):
    """Export the annotated pages of one journal and the tables cropped from them.

    This method extracts all the newspaper pages that contain tables, as well as the tables.
    Generates a file named RB_metadata.parquet necessary for the GUI.
    Generates a file named RB_metadata_images.json storing the size of each image.
    Also writes the annotations to via_annotations_RB.json.

    All three files live in ``data_path`` and are merged with their previous
    content when they already exist, so the function can be called once per journal.

    Parameters
    ----------
    annotations : dict
        Mapping of page key to page annotation (``'filename'`` and ``'regions'``).
        Regions whose crop fails with an ``OSError`` (e.g. a truncated image) are
        removed from the page's ``'regions'`` list in place and reported on stdout.
    journal_name : str
        Name of the sub-folder of ``<zenodo_path>/images/images`` holding the page images.

    Notes
    -----
    A metadata row is recorded only for regions whose cropped image exists
    (already on disk or just written). Cropped image ids are ``<page key>-<i>``
    where ``i`` is the position of the region in the page's region list as it
    was when the page started being processed.
    """
    input_path = os.path.join(zenodo_path, "images/images")
    export_path = data_path

    cropped_images_export_path = os.path.join(export_path, "cropped images")
    os.makedirs(cropped_images_export_path, exist_ok=True)
    full_images_export_path = os.path.join(export_path, "full images")
    os.makedirs(full_images_export_path, exist_ok=True)

    metadata_columns = ['id', 'pid', 'id_loc', 'pid_loc']
    metadata_rows = []
    metadata_images = {}

    # Shallow copy: the page dicts are shared, so regions removed below are
    # also absent from the exported annotations.
    export_annotations = annotations.copy()
    for k, v in annotations.items():
        name = v['filename']
        image_path = os.path.join(input_path, journal_name, name)
        copy(image_path, full_images_export_path)

        # Context manager so the underlying file handle is released after each page.
        with Image.open(image_path) as image:
            # The last two characters of the page key are dropped
            # (presumably a "-1" suffix, as in "GDL-1987-03-24-a-p0007-1" — TODO confirm).
            metadata_images[k[:-2]] = {'height': image.height,
                                       'width': image.width,
                                       'resized_height': image.height,
                                       'resized_width': image.width}

            # Iterate over a snapshot: failed regions are removed from v['regions']
            # below, and removing from a list while iterating it skips elements.
            for i, region in enumerate(list(v['regions'])):
                cropped_image_id = k + f"-{i}"
                cropped_image_export_path = os.path.join(cropped_images_export_path, cropped_image_id + ".jpg")
                shape_attributes = region['shape_attributes']

                try:
                    if not Path(cropped_image_export_path).exists():
                        if shape_attributes['name'] == 'rect':
                            x = shape_attributes['x']
                            y = shape_attributes['y']
                            width = shape_attributes['width']
                            height = shape_attributes['height']
                            cropped_image = image.crop((x, y, x + width, y + height))
                            cropped_image.save(cropped_image_export_path)
                        elif shape_attributes['name'] == 'polygon':
                            _crop_polygon(image_path, shape_attributes, cropped_image_export_path)
                        else:
                            print(f"Could not process region of type {shape_attributes['name']}.")
                            # No cropped image was produced, so no metadata row either.
                            continue
                except OSError as e:
                    v['regions'].remove(region)
                    print(f"{e}: {k} and {region}")
                    continue

                metadata_rows.append([cropped_image_id,
                                      k,
                                      os.path.abspath(cropped_image_export_path),
                                      os.path.abspath(os.path.join(full_images_export_path, name))])

    _merge_into_json_file(os.path.join(export_path, "via_annotations_RB.json"), export_annotations)
    _merge_into_json_file(os.path.join(export_path, "RB_metadata_images.json"), metadata_images)

    # Built in one go rather than appended row by row.
    metadata = pd.DataFrame(metadata_rows, columns=metadata_columns)
    metadata_file = os.path.join(export_path, "RB_metadata.parquet")
    if Path(metadata_file).exists():
        metadata = pd.concat([metadata, pd.read_parquet(metadata_file)])
        # Re-running the export must not duplicate rows for the same cropped image.
        metadata = metadata.drop_duplicates(subset='id').reset_index(drop=True)

    metadata.to_parquet(metadata_file)

In [6]:
# Export the GDL pages and their stocks tables; regions that cannot be cropped are printed below.
generate_dataset(GDL_stocks, "GDL")

image file is truncated (32 bytes not processed): GDL-1987-03-24-a-p0007-1 and {'shape_attributes': {'name': 'rect', 'x': 974, 'y': 710, 'width': 292, 'height': 179}, 'region_attributes': {'label': 'stocks'}}
Not complete image
: GDL-1987-03-24-a-p0007-1 and {'shape_attributes': {'name': 'polygon', 'all_points_x': [661, 664, 661, 1265, 1266, 977, 971], 'all_points_y': [1416, 1408, 896, 903, 1478, 1479, 1415]}, 'region_attributes': {'label': 'stocks'}}
image file is truncated (32 bytes not processed): GDL-1987-03-24-a-p0007-1 and {'shape_attributes': {'name': 'rect', 'x': 974, 'y': 1507, 'width': 295, 'height': 187}, 'region_attributes': {'label': 'stocks'}}


In [7]:
# Export the IMP pages and their stocks tables into the same output folder.
generate_dataset(IMP_stocks, "IMP")

In [8]:
# Export the JDG pages and their stocks tables into the same output folder.
generate_dataset(JDG_stocks, "JDG")