# Chart Extraction 
## 1. Install all the prerequisite 

In [None]:
!pip install pdf2image
!pip install pytesseract
!pip install opencv-python
!python --version

## 2. Import relevant packages

In [1]:
# Use for checking files in dir
import os

# Extract each pdf page to image
from pdf2image import convert_from_path 
from pdf2image.exceptions import (
 PDFInfoNotInstalledError,
 PDFPageCountError,
 PDFSyntaxError
)

# Image Processing
import cv2
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
import pytesseract

## 3. Gather Inputs

In [116]:
# MAIN INPUT 
# Input codes (only in .pdf files)
path1 = 'reports/DBS Sustainability Report 2020.pdf'
path2 = 'reports/Daikin_SR_report.pdf'
path3 = 'reports/pingan SR report.pdf'
path4 = 'ChartExtraction_Output/Citibank_nan'
path5 = 'ChartExtraction_Output/ICBC_2020'
path6 = 'ChartExtraction_Output/BDO Unibank_2020'

# Change report
path = path4 
source_path = path + '/pages'
output_path = path + '/test'
print(source_path)
print(output_path)

ChartExtraction_Output/Citibank_nan/pages
ChartExtraction_Output/Citibank_nan/test


## 4. Program Methods

In [117]:
# Optimal parameters for graph detection
set_column_gap = 15
set_height_limit = 0
set_width_limit = 0
set_area_limit = 80000

set_scale_factor = 2.5 # Extend horizontal and vertical axis of bounding boxes
scale_horizontal = set_scale_factor*64
scale_vertical = set_scale_factor*64

def zero_runs(a):
    # Create an array that is 1 where a is 0, and pad each end with an extra 0.
    iszero = np.concatenate(([0], np.equal(a, 0).view(np.int8), [0]))
    absdiff = np.abs(np.diff(iszero))
    # Runs start and end where absdiff is 1.
    ranges = np.where(absdiff == 1)[0].reshape(-1, 2)
    return ranges

# Method to check if extracted article is valid
def checkDim(height, width, area):
    #print(f"{height} x {width} = {area}")
    if(height <= set_height_limit):
        return False
    if(width <= set_width_limit):
        return False
    if(area <= set_area_limit):
        return False
    return True
def process_image(image, pageNum, task):
    original = image.copy()
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20,10))
    dilate = cv2.dilate(thresh, kernel, iterations=2)
    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    for c in cnts:
        x,y,w,h = cv2.boundingRect(c)
        area = cv2.contourArea(c)
        if w/h > 2 and area > 1000:
            cv2.drawContours(dilate, [c], -1, (0,0,0), -1)   
            
    boxes = []
    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    for c in cnts:
        x, y, w, h = cv2.boundingRect(c)
        boxes.append([x,y, x+w,y+h])
    
    boxes = np.asarray(boxes)
    try:
        x = np.min(boxes[:,0])
        y = np.min(boxes[:,1])
        w = np.max(boxes[:,2]) - x
        h = np.max(boxes[:,3]) - y
    except:
        print("error")
    cv2.rectangle(image, (x,y), (x + w,y + h), (36,255,12), 3)
    ROI = original[y:y+h, x:x+w]

    cv2.imwrite(path + "/test/GDFI_"+str(pageNum)+str(task)+"_img.png", image)
    cv2.imwrite(path +"/test/GDFI_"+str(pageNum)+str(task)+"_thresh.png", thresh)
    cv2.imwrite(path +"/test/GDFI_"+str(pageNum)+str(task)+"_dilate.png", dilate)
    cv2.imwrite(path +"/test/GDFI_"+str(pageNum)+str(task)+"_ROI.png", ROI)
    
def process_image2(img):
    original = image.copy()
    img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # turn img to grey
    img_gray_inverted = 255 - img_gray # Invert back to normal

    row_means = cv2.reduce(img_gray_inverted, 1, cv2.REDUCE_AVG, dtype=cv2.CV_32F).flatten()
    row_gaps = zero_runs(row_means)
    row_cutpoints = (row_gaps[:,0] + row_gaps[:,1] - 1) / 2
    
    bounding_boxes = []
    for n,(start,end) in enumerate(zip(row_cutpoints, row_cutpoints[1:])):
        line = img[int(start):int(end)]
        line_gray_inverted = img_gray_inverted[int(start):int(end)]

        column_means = cv2.reduce(line_gray_inverted, 0, cv2.REDUCE_AVG, dtype=cv2.CV_32F).flatten()
        column_gaps = zero_runs(column_means)
        column_gap_sizes = column_gaps[:,1] - column_gaps[:,0]
        column_cutpoints = (column_gaps[:,0] + column_gaps[:,1] - 1) / 2

        filtered_cutpoints = column_cutpoints[column_gap_sizes > set_column_gap] # this part can use ML too 

        for xstart,xend in zip(filtered_cutpoints, filtered_cutpoints[1:]):
            bounding_boxes.append(((int(xstart), int(start)), (int(xend), int(end))))

    count = 0
    result = img.copy()

    for bounding_box in bounding_boxes:
        
        height = bounding_box[1][1]-bounding_box[0][1]
        width = bounding_box[1][0]-bounding_box[0][0]
        area = height * width
       
        # represents the top left corner of rectangle
        # (x_top, y_top) 
        # x_top = bounding_box[0][0]
        # y_top = bounding_box[0][1]
        
        # represents the bottom right corner of rectangle
        # (x_bot, y_tbot) 
        # x_bot = bounding_box[1][0]
        # y_bot = bounding_box[1][1]
        
        # height = y_bot - y_top
        # height =  bounding_box[1][1] - bounding_box[0][1]
        
        # width = x_bot - x_top
        # width = bounding_box[1][0] - bounding_box[0][0]
        
        if(checkDim(height, width, area) == True):
            count = count + 1
            # Extend bounding lines
            new_x_top = bounding_box[0][0]-int(scale_horizontal)
            new_y_top = bounding_box[0][1]-int(scale_vertical)
            new_x_bot = bounding_box[1][0]+int(scale_horizontal)
            new_y_bot = bounding_box[1][1]+int(scale_vertical)
            
            # To prevent error on -ve values
            if(new_x_top < 0):
                new_x_top = 0
            if(new_y_top < 0):
                new_y_top = 0
            if(new_x_bot < 0):
                new_x_bot = 0
            if(new_y_bot < 0):
                new_y_bot = 0
                
            # Re-calculate bounding lines
            new_width = new_x_bot - new_x_top
            new_height = new_y_bot - new_y_top
            
            # if condition pass, crop image and output
            X, Y, W, H = new_x_top, new_y_top, new_width, new_height
            cropped_image = result[Y:Y+H, X:X+W]
            
            output_test_img = path + "/articles/page" + str(pageNum) + "_"+ str(task) +"_" + str(count) +".png"
            cv2.imwrite(output_test_img, cropped_image)
            cv2.rectangle(result, (new_x_top,new_y_top), (new_x_bot,new_y_bot), (0, 0, 255), 2)
    cv2.imwrite(output_path+"/page"+str(pageNum) + "_part"+str(task)+".png" , result)
            
# Main image processing method to draw bounding lines of graphs/charts
def isLandscape(h,w):
    if(w>h):
        print("landscape")
        return True
    else:
        print("portrait")
        return False
    
def page_to_articles(pageNum, fileHeader):
    print(f"=>starting on page {pageNum} : {fileHeader}")
    img_list = []
    img = cv2.imread(fileHeader, cv2.IMREAD_COLOR) # Identify img
    h,w,c = img.shape
    print (h, w, c)
    if(isLandscape(h,w)):
        width_cutoff = w // 2
        s1 = img[:, :width_cutoff]
        s2 = img[:, width_cutoff:]        
        process_image(s1, pageNum, "a")
        process_image(s2, pageNum, "b")
    else:
        process_image(img, pageNum,"0")


# TODO: filter page with relevant ESG keywords
def filter_pages(pageNum):
    if(pageNum%2==0):
        return True
    else:
        return False

In [118]:
counter = 0
for filename in os.listdir(source_path):
    if filename.endswith(".jpg") or filename.endswith(".png"):
        counter = counter + 1
        print(f"{counter} : {os.path.join(source_path, filename)}")
        imagepath = os.path.join(source_path, filename)
        
        # Read first image to check orientation
        img = cv2.imread(imagepath, cv2.IMREAD_COLOR)
        h,w,c = img.shape
        #print (h, w, c)
        
        # Check orientation
        if(w>h):
            print("landscape")
            # Split image into 2 
            width_cutoff = w // 2
            s1 = img[:, :width_cutoff]
            s2 = img[:, width_cutoff:]
            page_to_articles(counter,imagepath)
            counter = counter + 1
            page_to_articles(counter,imagepath)
        else:
            print("portrait")
            page_to_articles(counter,imagepath)
            # run extraction
        
    else:
        continue
        
print("JOB COMPLETE+++++++++")

1 : ChartExtraction_Output/Citibank_nan/pages/9.png
portrait
=>starting on page 1 : ChartExtraction_Output/Citibank_nan/pages/9.png
2339 1654 3
portrait
2 : ChartExtraction_Output/Citibank_nan/pages/17.png
portrait
=>starting on page 2 : ChartExtraction_Output/Citibank_nan/pages/17.png
2339 1654 3
portrait
3 : ChartExtraction_Output/Citibank_nan/pages/16.png
portrait
=>starting on page 3 : ChartExtraction_Output/Citibank_nan/pages/16.png
2339 1654 3
portrait
4 : ChartExtraction_Output/Citibank_nan/pages/10.png
portrait
=>starting on page 4 : ChartExtraction_Output/Citibank_nan/pages/10.png
2339 1654 3
portrait
5 : ChartExtraction_Output/Citibank_nan/pages/21.png
portrait
=>starting on page 5 : ChartExtraction_Output/Citibank_nan/pages/21.png
2339 1654 3
portrait
6 : ChartExtraction_Output/Citibank_nan/pages/20.png
portrait
=>starting on page 6 : ChartExtraction_Output/Citibank_nan/pages/20.png
2339 1654 3
portrait
7 : ChartExtraction_Output/Citibank_nan/pages/22.png
portrait
=>starting

## 5.Run Program

In [None]:
    
            filter_img = cv2.imread(image_path, cv2.IMREAD_COLOR)        
    # list of keywords
            dictionary = ["carbon","co2","environment","GHG emissions","Greenhouse Gas",
                                     "carbon footprint","carbon emissions","Scope 1","Scope 2",
                                     "Scope 3", "WACI","Carbon Intensity","carbon pricing","net-zero",
                                     "metrics and targets","TCFD","sustainability goals","decarbonisation",
                                     "climate",'energy','emission', 'emissions', 'renewable', 'carbon', 'fuel', 'power', 
                                     'green', 'gas', 'green energy', 'sustainable', 'climate', 'sustainability', 
                                     'environmental', 'environment', 'GHG','decarbon', 'energy consumption', 
                                     'paper consumption','water consumption', 'carbon intensity', 'waste management', 
                                     'electricity consumption', 'cdp', 'global warming', 'business travel', 
                         ]


            # list of units
            units = ['tonnes', 'tons', 'kWh', ' kg ', 'kilogram', 'kilowatt hour', 
                   'gigajoules', 'GJ', 'litre', 'liter', 'CO2e', 'tCO', 't CO', 'MWh', 
                   'megawatt hour', '%', 'cubic metres', 'per employee']

            # Filter images with too much text
            text = pytesseract.image_to_string(filter_img)
            list_of_text = text.split()
            #print(list_of_text)
            # Remove numbers
            
            n = 0
            if any(keyword in text.lower() for keyword in dictionary) and any(unit in text for unit in units):
                print(f"============FOUND KEYWORD {pageNum}_{count}_{n}===================")
                if(10 < len(list_of_text) < 150):
                    # Output result
                    n = n + 1
                    print(f"============FOUND {pageNum}_{count}_{n}, LENGTH = {len(list_of_text)}===================")
                    ROI_image_path = fileHeader + "/final_page_" +str(pageNum)+ "_"+ str(count) + "_"+ str(n) + "_ROI.png"
                    cv2.imwrite(ROI_image_path, filter_img)
                    img_list.append(ROI_image_path)
                else:
                    print(f"{image_path} = {len(list_of_text)}")
            else:
                print(f"============REJECTED {pageNum}_{count}_{n}, LENGTH = {len(list_of_text)}===================")
    return img_list

In [None]:
def chart_extraction(url, pages, copy_to_path):
# check if URL is pdf
    if ".pdf" not in url:
        print("URL is not a PDF.")
        return "nan"

    try:
        response = requests.get(url)
    except:
        print("Requests failed.")
        return "nan"
    
    os.makedirs(copy_to_path)
    os.makedirs(copy_to_path + '/pages') # Create dir for page output
    os.makedirs(copy_to_path + '/articles') # Create dir for chart extraction output
    os.makedirs(copy_to_path + '/test') # Create dir for test outputs
    
    # convert pdf URL to image
    images = convert_from_bytes(response.content, size=1000)

    i = 1
    image_path_obj = {}
    
    for page in filtered_list:
        json_imgs = []
        print(f"==> Now doing page {page} ...")
        img_list = page_to_articles(page, copy_to_path)
        print(f"==> Finished Page {page} ...\n")
        json_imgs.append(img_list)    
        image_path_obj[str(page)] = json_imgs
    
    # remove program folders
    os.remove(copy_to_path + '/pages') # Create dir for page output
    os.remove(copy_to_path + '/articles') # Create dir for chart extraction output
    os.remove(copy_to_path + '/test') # Create dir for test outputs


In [None]:
# Opening JSON file
f = open('all_asian_banks_preprocessed_vfinal.json',)
 
# returns JSON object as a dictionary
data = json.load(f)
 
# Iterating through the json list
source = "gdrive/MyDrive/TableExtraction_Output/"

json_lst = []

for i in data[:5]:
    company = i['company']
    year = i['year']
    pdf_url = i['url']
    pages = []
    for j in i['filtered_report_tables_direct']:
        pages.append(j)
    
    json_obj = {}
    json_obj['company'] = company
    json_obj['year'] = year
    json_obj['pdf_url'] = pdf_url

    path = source + company + '_' + year
    os.mkdir(path)
    try:
        json_obj['images_path'] = chart_extraction(pdf_url, pages, path)
    except:
        print("Error occurred in chart_extraction")
        json_obj['images_path'] = "nan"

    json_lst.append(json_obj)
 

 
# Closing file
f.close()