In [1]:
# Use for checking files in dir
import os

# Extract each pdf page to image
from pdf2image import convert_from_path 
from pdf2image.exceptions import (
 PDFInfoNotInstalledError,
 PDFPageCountError,
 PDFSyntaxError
)

# Image Processing
import cv2
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec

In [2]:
# MAIN INPUT
# Candidate input reports (only .pdf files are supported).
path1 = 'reports/DBS Sustainability Report 2020.pdf'
path2 = 'reports/UOB-Sustainability-Report-2020.pdf'
path3 = 'reports/Daikin_SR_report.pdf'
path4 = 'reports/pingan SR report.pdf'

# Report to extract graphs/charts from; point this at one of the paths above.
path = path3

# TODO: Accept a list of paths instead of a single report

In [3]:
# To create a folder name by extracting file name (for e.g. <fileName>_Output)
def get_header(fileName):
    """Derive the output folder name for a PDF report path.

    Only the final path component is inspected, so directory names that
    happen to contain ".pdf" are ignored. The extension check is
    case-insensitive (".pdf" and ".PDF" are both accepted).

    Args:
        fileName: Path to a PDF file, e.g. 'reports/Daikin_SR_report.pdf'.

    Returns:
        The file name without its extension followed by "_Output"
        (e.g. 'Daikin_SR_report_Output'), or None when the path does not
        point at a file with a .pdf extension.
    """
    base_name = os.path.basename(fileName)
    stem, extension = os.path.splitext(base_name)
    if extension.lower() != ".pdf":
        # Not a PDF: keep the original "no result" contract of returning None.
        return None
    return stem + "_Output"

In [4]:
# Derive the output folder name and render every PDF page to an image.
fileHeader = get_header(path)
images = convert_from_path(path)
# exist_ok=True keeps this cell re-runnable: without it a second run fails
# with FileExistsError because the folder was created by the first run.
os.makedirs(fileHeader, exist_ok=True)

In [5]:
# Save every rendered page as <fileHeader>/page<N>.png, with N starting at 1
for page_number, page_image in enumerate(images, start=1):
    page_file = f"page{page_number}.png"
    page_image.save(f"{fileHeader}/{page_file}", "PNG")

In [6]:
# Create the folder that receives the cropped graph/chart images
output_file = fileHeader + "/articles"
# exist_ok=True so re-running the cell does not raise FileExistsError.
os.makedirs(output_file, exist_ok=True)

In [1]:
# Set parameters for graph detection
# A blank column gap splits a row band into separate cells only when its
# width (in pixel columns) is strictly greater than this value; see the
# gap-size filter in page_to_articles.
set_column_gap = 15
# checkDim keeps a candidate region only if its height, width and area each
# strictly exceed the corresponding limit below.
set_height_limit = 180
set_width_limit = 180
set_area_limit = 80000

# Padding added on every side of an accepted bounding box before cropping;
# page_to_articles applies int(scale_horizontal) and int(scale_vertical).
set_scale_factor = 2.4 # Extend horizontal and vertical axis of bounding boxes
# NOTE(review): the base value 64 is not explained in this notebook - confirm its origin.
scale_horizontal = set_scale_factor*64
scale_vertical = set_scale_factor*64

def zero_runs(a):
    """Locate the runs of consecutive zeros in a 1-D sequence.

    Returns an array of shape (k, 2) with one row per run: column 0 is the
    index of the first zero of the run and column 1 is the index just past
    its last zero (half-open interval).
    """
    # 1 where the input is zero, 0 elsewhere.
    zero_mask = np.equal(a, 0).astype(np.int8)
    # A 0 sentinel on both ends guarantees every run has a rising and a
    # falling edge, even when it touches the start or end of the input.
    padded = np.concatenate(([0], zero_mask, [0]))
    edges = np.flatnonzero(np.abs(np.diff(padded)) == 1)
    # Edges alternate start, end, start, end, ...
    return edges.reshape(-1, 2)

# Method to check if extracted article is valid
def checkDim(height, width, area):
    """Return True only when height, width and area all exceed their limits.

    A region is rejected as soon as any one of the three values is less than
    or equal to set_height_limit, set_width_limit or set_area_limit
    respectively.
    """
    too_small = (
        height <= set_height_limit
        or width <= set_width_limit
        or area <= set_area_limit
    )
    return not too_small
    
# Main image processing method to crop candidate graphs/charts out of a page
def page_to_articles(pageNum):
    """Crop candidate graph/chart regions out of one rendered page image.

    Reads ``<fileHeader>/page<pageNum>.png``, cuts the page into horizontal
    bands at fully blank rows, cuts each band into cells at blank column gaps
    wider than ``set_column_gap``, and writes every cell accepted by
    ``checkDim`` (after padding it on all sides) to
    ``<output_file>/page<pageNum>_ex_<k>.png``. ``k`` is the 1-based index of
    the cell among ALL candidate cells of the page, so the numbering has gaps
    wherever a cell was rejected.

    A row or column counts as blank only when every pixel in it is pure
    white (grey value 255), because its inverted mean must be exactly 0.

    Args:
        pageNum: 1-based page number of the page image to process.

    Raises:
        FileNotFoundError: if the page image cannot be read.
        OSError: if a cropped image cannot be written.
    """
    print(f"=>starting on page {pageNum}")
    page_path = fileHeader+'/page%d.png' %pageNum
    img = cv2.imread(page_path, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read page image: {page_path}")
    img_height, img_width = img.shape[:2]

    img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Invert so that a pure-white background becomes 0; blank rows/columns
    # then have a mean of exactly 0, which is what zero_runs looks for.
    img_gray_inverted = 255 - img_gray

    # Horizontal cut lines: the middle of every run of blank rows.
    row_means = cv2.reduce(img_gray_inverted, 1, cv2.REDUCE_AVG, dtype=cv2.CV_32F).flatten()
    row_gaps = zero_runs(row_means)
    row_cutpoints = (row_gaps[:,0] + row_gaps[:,1] - 1) / 2

    # Each box is ((x_top, y_top), (x_bot, y_bot)): top-left and bottom-right corners.
    bounding_boxes = []
    for start, end in zip(row_cutpoints, row_cutpoints[1:]):
        band_gray_inverted = img_gray_inverted[int(start):int(end)]

        # Vertical cut lines inside this band: the middle of every blank
        # column gap that is wider than set_column_gap.
        column_means = cv2.reduce(band_gray_inverted, 0, cv2.REDUCE_AVG, dtype=cv2.CV_32F).flatten()
        column_gaps = zero_runs(column_means)
        column_gap_sizes = column_gaps[:,1] - column_gaps[:,0]
        column_cutpoints = (column_gaps[:,0] + column_gaps[:,1] - 1) / 2

        filtered_cutpoints = column_cutpoints[column_gap_sizes > set_column_gap] # this part can use ML too

        for xstart, xend in zip(filtered_cutpoints, filtered_cutpoints[1:]):
            bounding_boxes.append(((int(xstart), int(start)), (int(xend), int(end))))

    pad_x = int(scale_horizontal)
    pad_y = int(scale_vertical)
    article_output_file = output_file + "/page" + str(pageNum)

    for count, ((x_top, y_top), (x_bot, y_bot)) in enumerate(bounding_boxes, start=1):
        height = y_bot - y_top
        width = x_bot - x_top
        if not checkDim(height, width, height * width):
            continue

        # Extend the box by the padding, clamped to the image on all four sides.
        crop_x_top = max(x_top - pad_x, 0)
        crop_y_top = max(y_top - pad_y, 0)
        crop_x_bot = min(x_bot + pad_x, img_width)
        crop_y_bot = min(y_bot + pad_y, img_height)

        # Nothing is drawn on the page, so cropping straight from img is safe
        # and avoids copying the whole page once per bounding box.
        cropped_image = img[crop_y_top:crop_y_bot, crop_x_top:crop_x_bot]

        crop_path = article_output_file + "_ex_%d.png" % count
        # cv2.imwrite returns False on failure; do not report success then.
        if not cv2.imwrite(crop_path, cropped_image):
            raise OSError(f"Could not write extracted image: {crop_path}")
        print(f"Extracted output file: {article_output_file}_ex_{count}.png")
    
# TODO: filter page with relevant ESG keywords
def check_page(pageNum):
    """Placeholder page filter: accept a page only when its number is even."""
    return pageNum % 2 == 0

In [2]:
# Collect the 1-based numbers of the pages accepted by check_page
filtered_list = []
for pageNum in range(1, len(images) + 1):
    print(f"==> Now doing page {pageNum} ...")
    if check_page(pageNum):
        filtered_list.append(pageNum)
filtered_list

NameError: name 'images' is not defined

In [25]:
# Run the chart extraction on every page that passed the filter
for page_number in filtered_list:
    print(f"==> Now doing page {page_number} ...")
    page_to_articles(page_number)
    print(f"==> Finished Page {page_number} ...\n")

==> Now doing page 2 ...
=>starting on page 2
Extracted output file: Daikin_SR_report_Output/articles/page2_ex_1.png
Extracted output file: Daikin_SR_report_Output/articles/page2_ex_5.png
Extracted output file: Daikin_SR_report_Output/articles/page2_ex_6.png
==> Finished Page 2 ...

==> Now doing page 3 ...
=>starting on page 3
==> Finished Page 3 ...

==> Now doing page 4 ...
=>starting on page 4
==> Finished Page 4 ...

==> Now doing page 5 ...
=>starting on page 5
==> Finished Page 5 ...

==> Now doing page 6 ...
=>starting on page 6
==> Finished Page 6 ...

==> Now doing page 7 ...
=>starting on page 7
==> Finished Page 7 ...

==> Now doing page 8 ...
=>starting on page 8
==> Finished Page 8 ...

==> Now doing page 9 ...
=>starting on page 9
==> Finished Page 9 ...

==> Now doing page 10 ...
=>starting on page 10
Extracted output file: Daikin_SR_report_Output/articles/page10_ex_1.png
Extracted output file: Daikin_SR_report_Output/articles/page10_ex_3.png
Extracted output file: Daik

Extracted output file: Daikin_SR_report_Output/articles/page35_ex_51.png
Extracted output file: Daikin_SR_report_Output/articles/page35_ex_52.png
==> Finished Page 35 ...

==> Now doing page 36 ...
=>starting on page 36
Extracted output file: Daikin_SR_report_Output/articles/page36_ex_1.png
Extracted output file: Daikin_SR_report_Output/articles/page36_ex_2.png
Extracted output file: Daikin_SR_report_Output/articles/page36_ex_12.png
Extracted output file: Daikin_SR_report_Output/articles/page36_ex_13.png
Extracted output file: Daikin_SR_report_Output/articles/page36_ex_18.png
==> Finished Page 36 ...



Observations:
- The extracted articles are noisy.
- All charts were successfully extracted.
- If the report uses a landscape layout, the extraction method is inaccurate.
- Works well on reports with a portrait layout, e.g. Daikin, Ping An.