# Chart Extraction 
## 1. Install all the prerequisites

In [None]:
# Install three of the third-party packages this notebook imports
# (pdf2image, pytesseract, and opencv-python, which provides `cv2`).
# NOTE(review): pdf2image and pytesseract presumably rely on the Poppler and
# Tesseract system tools, which pip does not install - confirm they are
# available on this machine before running the extraction.
!pip install pdf2image
!pip install pytesseract
!pip install opencv-python
# Show which Python version the `python` command resolves to.
!python --version

## 2. Import relevant packages

In [78]:
# Use for checking files in dir
import os

# Extract each pdf page to image
from pdf2image import convert_from_path, convert_from_bytes 
from pdf2image.exceptions import (
 PDFInfoNotInstalledError,
 PDFPageCountError,
 PDFSyntaxError
)

# Image Processing
import cv2
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
import pytesseract
import json
import requests
from pathlib import Path
import re
import datetime

## 2. Program Methods

In [79]:
# Optimal parameters for graph detection
# Blank column gaps must be wider than this to be used as a vertical cut point
# in process_image.
set_column_gap = 50
# A candidate region must be strictly taller than this to pass checkDim.
set_height_limit = 180
# NOTE(review): not referenced by any code visible in this notebook - confirm
# whether a width check was intended.
set_width_limit = 180
# A candidate region must have a strictly larger area than this to pass checkDim.
set_area_limit = 100000

set_scale_factor = 3.2 # Extend horizontal and vertical axis of bounding boxes
# Padding added on each side of a bounding box by scale_image, which truncates
# these values to int before use.
scale_horizontal = set_scale_factor*64
scale_vertical = set_scale_factor*64

def zero_runs(a):
    """Locate every maximal run of zeros in the 1-D sequence ``a``.

    Returns an integer array of shape ``(n_runs, 2)``; each row holds the
    index of the first zero of a run and the index just past its last zero.
    """
    # 1 where the input is zero, 0 elsewhere, with a 0 sentinel on both sides
    # so that runs touching either end still yield a start and a stop edge.
    zero_mask = np.equal(a, 0).view(np.int8)
    padded = np.concatenate(([0], zero_mask, [0]))
    # Every change between neighbouring entries marks a run boundary.
    edges = np.flatnonzero(np.abs(np.diff(padded)) == 1)
    return edges.reshape(-1, 2)

# Primary size filter for an extracted region
def checkDim(height, area):
    """Return True only when the region is both tall enough and large enough.

    A region is rejected when ``height`` does not exceed ``set_height_limit``
    or ``area`` does not exceed ``set_area_limit``.
    """
    too_short = height <= set_height_limit
    too_small = area <= set_area_limit
    return not (too_short or too_small)

def process_Num(text):
    """Return the unique number-like tokens found in ``text``.

    Every character that is not a digit, a newline or a dot is replaced by a
    space, the result is split on whitespace, and a lone ``"."`` token is
    discarded. Tokens therefore consist only of digits and dots (for example
    ``"12"``, ``"3.5"`` or ``"2020."``).

    Returns:
        A sorted list of the distinct tokens. Sorting makes the order
        deterministic; the original set-based order was arbitrary.
    """
    # Raw string: the previous non-raw literal contained the invalid escape
    # sequence "\." which newer Python versions warn about.
    digits_only = re.sub(r'[^0-9\n.]', ' ', text)
    tokens = set(digits_only.split())
    tokens.discard(".")
    return sorted(tokens)

def process_text(text):
    """Return the unique alphabetic tokens found in ``text``.

    Every character that is not an ASCII letter, a newline or a dot is
    replaced by a space, the result is split on whitespace, and a lone ``"."``
    token is discarded. Matching is case-sensitive and a trailing dot stays
    attached to its word (``"world."``).

    Returns:
        A sorted list of the distinct tokens. Sorting makes the order
        deterministic; the original set-based order was arbitrary.
    """
    # Raw string: the previous non-raw literal contained the invalid escape
    # sequence "\." which newer Python versions warn about.
    letters_only = re.sub(r'[^a-zA-Z\n.]', ' ', text)
    tokens = set(letters_only.split())
    tokens.discard(".")
    return sorted(tokens)

def find_keywords(text):
    """Count how many distinct keywords occur as tokens in ``text``.

    The text is stripped of everything except letters, digits, newlines and
    dots, lower-cased and split on whitespace. The result is the number of
    unique tokens that appear in the keyword list.

    The keyword list is lower-cased before comparison. Previously entries such
    as "WACI", "kWh", "GJ", "MWh", "CO2e" and "tCO" could never match because
    only the text was lower-cased.
    """
    no_punct = re.sub(r'[^a-zA-Z0-9\n.]', ' ', text)
    # list of keywords
    # NOTE(review): matching is per single token, so the multi-word entries
    # ("kilowatt hour", "t CO", "megawatt hour", "cubic metres",
    # "per employee"), "%" and "net-zero" still cannot match: spaces split
    # tokens, and "%" and "-" are removed by the regex above. Confirm whether
    # phrase matching was intended before changing the filter's sensitivity.
    dictionary = ['carbon', 'ghg', 'emission',
                  'emissions', "scope", "WACI", "net-zero",
                  'energy', 'water', 'waste', 'coal', 'power', 'green', 'paper',
                  'consumption', 'renewable',
                  'breakdown', 'loans', 'tonnes', 'tons', 'kWh', 'kg', 'kilogram',
                  'kilowatt hour',
                  'gigajoules', 'GJ', 'litre', 'liter', 'CO2e', 'tCO', 't CO', 'MWh',
                  'megawatt hour', '%', 'cubic metres', 'per employee', 'm3', 'co2', 'o2',
                  'million', 'total', 'trillion', 'set',
                  ]
    keywords = {entry.lower() for entry in dictionary}
    tokens = set(no_punct.lower().split())
    return len(tokens & keywords)

def count_clean_text(text):
    """Count the alphabetic tokens in ``text``, duplicates included.

    Characters other than ASCII letters, newlines and dots are replaced by
    spaces before splitting on whitespace. A token made only of dots is
    counted too.
    """
    # Raw string avoids the invalid "\." escape of the previous literal.
    return len(re.sub(r'[^a-zA-Z\n.]', ' ', text).split())

def count_clean_num(text):
    """Count the number-like tokens in ``text``, duplicates included.

    Characters other than digits, newlines and dots are replaced by spaces
    before splitting on whitespace. A token made only of dots is counted too.
    """
    # Raw string avoids the invalid "\." escape of the previous literal.
    return len(re.sub(r'[^0-9\n.]', ' ', text).split())

def getTotalLen(text):
    """Count all alphanumeric tokens in ``text``, duplicates included.

    Punctuation other than dots is replaced by spaces (newlines are kept as
    separators) before splitting on whitespace.
    """
    # Raw string avoids the invalid "\." escape of the previous literal.
    return len(re.sub(r'[^a-zA-Z0-9\n.]', ' ', text).split())

def filter_relevance(filter_img):
    """Decide whether a cropped BGR image looks like a relevant chart.

    The image is run through OCR and accepted only when all of these hold:
    at least one keyword was found, the share of alphabetic tokens among all
    tokens is below 0.99, the black/white pixel ratio of the thresholded
    image exceeds 14, fewer than 90 unique words and more than one unique
    number were read, and the number density per pixel exceeds the threshold.

    Args:
        filter_img: image array as returned by ``cv2.imread(..., IMREAD_COLOR)``.

    Returns:
        True or False for the decision, or None when OCR fails.
    """
    try:
        text = pytesseract.image_to_string(filter_img)
    except Exception:
        # Exception, not a bare except, so Ctrl-C can still stop a long
        # batch run.
        print("ERROR at Textserract")
        return None

    # Total number of tokens read, and the unique keyword / word / number counts
    total_len = getTotalLen(text)
    keywords = find_keywords(text)
    textonly_len = len(process_text(text))
    numonly_len = len(process_Num(text))

    # Share of alphabetic tokens among all tokens; 0 when nothing was read
    tt_ratio = count_clean_text(text) / total_len if total_len else 0

    height, width = filter_img.shape[:2]
    area = height * width

    # Number tokens per pixel, scaled by 10**5 to get a readable magnitude
    na_ratio = count_clean_num(text) / area * 10**5

    # Work on a binarised image rather than colours, which were found to be
    # too similar between charts and non-charts.
    gray = cv2.cvtColor(filter_img, cv2.COLOR_BGR2GRAY)
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY_INV, 11, 2)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
    dilate = cv2.dilate(thresh, kernel, iterations=1)

    white_pix = np.sum(dilate == 255)
    black_pix = np.sum(dilate == 0)
    # An image with no white pixels has an unbounded ratio; state that
    # explicitly rather than relying on numpy's divide-by-zero warning.
    bw_ratio = black_pix / white_pix if white_pix else float("inf")

    return bool(
        keywords > 0
        and tt_ratio < 0.99
        and bw_ratio > 14
        and textonly_len < 90
        and numonly_len > 1
        and na_ratio > 0.45
    )

def filterImage(img):
    """Black/white filter: decide whether a cropped BGR image may be a chart.

    The image is accepted when the mean intensity of its thresholded version
    lies strictly between 7 and 26, it has between 7000 and 90000 white
    pixels (exclusive), OCR read fewer than 68 tokens, and the density of
    alphabetic tokens per pixel is below the threshold.

    Args:
        img: image array as returned by ``cv2.imread(..., IMREAD_COLOR)``.

    Returns:
        True or False for the decision, or None when OCR fails.
    """
    try:
        text = pytesseract.image_to_string(img)
    except Exception:
        # Exception, not a bare except, so Ctrl-C can still stop a long
        # batch run.
        print("ERROR at Textserract")
        return None

    # Total number of tokens read; used to reject text-heavy regions
    total_len = getTotalLen(text)

    height, width = img.shape[:2]
    area = height * width
    # Alphabetic tokens per pixel, scaled by 10**5 to get a readable magnitude
    ta_ratio = count_clean_text(text) / area * 10**5

    # Work on a binarised image rather than colours, which were found to be
    # too similar between charts and non-charts.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY_INV, 11, 2)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
    dilate = cv2.dilate(thresh, kernel, iterations=1)

    # cv2.mean returns a 4-tuple; the first entry is the mean of the single
    # channel (the original read it as element 3 of the reversed tuple).
    dilated_region = int(cv2.mean(dilate)[0])

    white_pix = np.sum(dilate == 255)

    return bool(
        7 < dilated_region < 26
        and 7000 < white_pix < 90000
        and total_len < 68
        and ta_ratio < 10
    )
    

def scale_image(x_top, y_top, x_bot, y_bot ,result):
    """Crop ``result`` to the given box enlarged by the module-level padding.

    The box is widened by ``int(scale_horizontal)`` on the left and right and
    by ``int(scale_vertical)`` on the top and bottom. Coordinates that become
    negative are clamped to 0 before slicing.
    """
    pad_x = int(scale_horizontal)
    pad_y = int(scale_vertical)

    # Negative indices would wrap around in a slice, so floor them at 0.
    left = max(x_top - pad_x, 0)
    top = max(y_top - pad_y, 0)
    right = max(x_bot + pad_x, 0)
    bottom = max(y_bot + pad_y, 0)

    return result[top:bottom, left:right]
    
def process_image(img, pageNum, task,  img_list, fileHeader):
    """Segment a page image into candidate regions and keep the chart-like ones.

    The page is cut into horizontal bands at the midpoints of fully blank row
    runs. Each band is cut at the midpoints of blank column runs wider than
    ``set_column_gap``. Every resulting box then goes through three filters in
    order: size (``checkDim``), black/white (``filterImage``) and relevance
    (``filter_relevance``). Intermediate crops are written to
    ``<fileHeader>/filter_1`` and ``<fileHeader>/filter_2``; accepted regions
    are written to ``<fileHeader>/ROI_<page>_<task>_<n>.png``.

    Args:
        img: BGR page image (or half page).
        pageNum: page identifier used in file names and log lines.
        task: label distinguishing the parts of one page ("a", "b" or "0").
        img_list: list that accepted ROI paths are appended to (mutated).
        fileHeader: output directory of the current report; the ``filter_1``
            and ``filter_2`` sub-directories must already exist.

    Returns:
        The same ``img_list`` object.
    """
    img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Invert so that a white background becomes 0 and blank rows/columns
    # average to exactly 0.
    img_gray_inverted = 255 - img_gray

    row_means = cv2.reduce(img_gray_inverted, 1, cv2.REDUCE_AVG, dtype=cv2.CV_32F).flatten()
    row_gaps = zero_runs(row_means)
    row_cutpoints = (row_gaps[:, 0] + row_gaps[:, 1] - 1) / 2

    bounding_boxes = []
    for start, end in zip(row_cutpoints, row_cutpoints[1:]):
        band_gray_inverted = img_gray_inverted[int(start):int(end)]

        column_means = cv2.reduce(band_gray_inverted, 0, cv2.REDUCE_AVG, dtype=cv2.CV_32F).flatten()
        column_gaps = zero_runs(column_means)
        column_gap_sizes = column_gaps[:, 1] - column_gaps[:, 0]
        column_cutpoints = (column_gaps[:, 0] + column_gaps[:, 1] - 1) / 2

        # Only wide blank gaps separate regions; narrow ones are spaces
        # within a region.
        filtered_cutpoints = column_cutpoints[column_gap_sizes > set_column_gap]

        for xstart, xend in zip(filtered_cutpoints, filtered_cutpoints[1:]):
            bounding_boxes.append(((int(xstart), int(start)), (int(xend), int(end))))

    result = img.copy()

    # count numbers every candidate box, including rejected ones
    for count, ((x_top, y_top), (x_bot, y_bot)) in enumerate(bounding_boxes, start=1):
        tag = f"{pageNum}_{task}_{count}"

        height = y_bot - y_top
        width = x_bot - x_top
        area = height * width

        # Primary Filtering
        if not checkDim(height, area):
            print(f"============ REJECTED for AREA FILTER {tag}===================")
            continue
        print(f"============ PASSED for AREA FILTER {tag}===================")

        cropped_image = result[y_top:y_bot, x_top:x_bot]
        article_output_file = f"{fileHeader}/filter_1/page{tag}.png"
        cv2.imwrite(article_output_file, cropped_image)

        # Black-White Filtering (on the image as stored on disk)
        filter_img = cv2.imread(article_output_file, cv2.IMREAD_COLOR)
        if not filterImage(filter_img):
            print(f"============ REJECTED for COLOR FILTER {tag}===================")
            continue

        # Re-crop with extra margin around the box
        scale_image_size = scale_image(x_top, y_top, x_bot, y_bot, result)
        article_output_file = f"{fileHeader}/filter_2/page{tag}.png"
        try:
            cv2.imwrite(article_output_file, scale_image_size)
        except Exception:
            print("ERROR at Black-White Filter")
            print(scale_image_size)
            # Skip this region: reading the path back would find nothing, or
            # a stale file left by an earlier run.
            continue

        filter_img = cv2.imread(article_output_file, cv2.IMREAD_COLOR)
        # Relevance Filtering
        if filter_relevance(filter_img):
            print(f"============ ACCEPTED {tag}===================")
            ROI_image_path = f"{fileHeader}/ROI_{tag}.png"
            cv2.imwrite(ROI_image_path, filter_img)
            img_list.append(ROI_image_path)
        else:
            print(f"============ REJECTED at RELEVANCE FILTER {tag}===================")

    return img_list

def isLandscape(h,w):
    """Report whether an image is wider than it is tall.

    Prints "landscape" or "portrait" and returns True only when ``w > h``;
    a square image counts as portrait.
    """
    wide = bool(w > h)
    print("landscape" if wide else "portrait")
    return wide
    
def page_to_articles(pageNum, fileHeader):
    """Extract chart regions from one rendered page of a report.

    Reads ``<fileHeader>/pages/<pageNum>.png``. A landscape page is split
    down the middle and each half is processed separately (labels "a" and
    "b"); a portrait page is processed whole (label "0").

    Args:
        pageNum: page identifier, used to build the image file name.
        fileHeader: output directory of the current report.

    Returns:
        List of paths of the accepted ROI images for this page.

    Raises:
        FileNotFoundError: if the page image is missing or unreadable.
    """
    print(f"=>starting on page {pageNum} : {fileHeader}")
    img_list = []
    page_path = fileHeader + '/pages/%s.png' % pageNum
    img = cv2.imread(page_path, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread signals failure by returning None; fail with a clear
        # message rather than an AttributeError on `.shape`.
        raise FileNotFoundError(f"Page image could not be read: {page_path}")

    h, w = img.shape[:2]
    if isLandscape(h, w):
        width_cutoff = w // 2
        process_image(img[:, :width_cutoff], pageNum, "a", img_list, fileHeader)
        process_image(img[:, width_cutoff:], pageNum, "b", img_list, fileHeader)
    else:
        process_image(img, pageNum, "0", img_list, fileHeader)

    return img_list
    
def chart_extraction(url, pages, copy_to_path):
    """Download a PDF report and extract chart images from selected pages.

    The PDF is fetched once (30 s timeout) and rendered to images. The
    requested pages are saved under ``<copy_to_path>/pages`` and each is
    passed to ``page_to_articles``.

    Args:
        url: address of the report; must contain ".pdf".
        pages: iterable of page identifiers (strings or ints) to process.
        copy_to_path: output directory of this report.

    Returns:
        A dict mapping ``str(page)`` to the list of ROI image paths, or to
        ``"nan"`` when that page failed. Returns the string ``"nan"`` when the
        URL is not a PDF or the download fails.

    Raises:
        Whatever ``convert_from_bytes`` raises when the downloaded content is
        not a readable PDF (callers are expected to handle it).
    """
    # check if URL is pdf
    if ".pdf" not in url:
        print("URL is not a PDF.")
        return "nan"

    # Single download with a timeout. The original fetched the file twice,
    # the first time with no timeout and the second time outside the try.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException:
        print("Requests failed.")
        return "nan"

    # Convert the PDF to one image per page
    images = convert_from_bytes(response.content)

    # Create the output directories used by the processing steps
    pages_dir = copy_to_path + "/pages"
    for sub_dir in (pages_dir, copy_to_path + "/filter_1", copy_to_path + "/filter_2"):
        os.makedirs(sub_dir, exist_ok=True)

    # NOTE(review): `i` is the 0-based index of the rendered page; confirm
    # that the identifiers in `pages` use the same numbering.
    wanted = {str(page) for page in pages}
    for i, image in enumerate(images):
        if str(i) not in wanted:
            continue
        print(f"==> Convert page {i} of pdf to image...")
        image.save(f"{pages_dir}/{i}.png")

    image_path_obj = {}
    for page in pages:
        # Handle failures per page so one bad page does not abort the rest.
        try:
            print(f"==> Now doing page {page} ...")
            img_list = page_to_articles(page, copy_to_path)
            print(f"==> Finished Page {page} ...\n")
            image_path_obj[str(page)] = img_list
        except Exception as e:
            print(f"ERROR at page_to_article: ")
            print(getattr(e, 'message', e))
            image_path_obj[str(page)] = "nan"

    return image_path_obj


In [80]:
# Opening JSON file
#test_set = open('preprocessed/test_set.json',)

def run_extraction_main(img_path, out_folder, max_reports=50):
    """Run chart extraction for the reports listed in a JSON file.

    The JSON file must contain a list of records with the keys ``company``,
    ``year``, ``url`` and ``filtered_report_tables_direct`` (whose items are
    the pages to process). For each record a directory
    ``<out_folder>/<company>_<year>`` is created and ``chart_extraction`` is
    run on the report.

    Args:
        img_path: path of the JSON file describing the reports.
        out_folder: directory that receives one sub-directory per report.
        max_reports: number of records to process from the start of the
            file; ``None`` processes all of them. Defaults to 50.

    Returns:
        A list with one dict per processed report, holding ``company``,
        ``year``, ``pdf_url`` and ``images_path`` (the result of
        ``chart_extraction``, or ``"nan"`` when it raised). The original
        built this list but never returned it.
    """
    # The context manager closes the file even if parsing fails.
    with open(img_path, encoding="utf-8") as f:
        data = json.load(f)

    os.makedirs(out_folder, exist_ok=True)

    json_lst = []
    for report in data[:max_reports]:
        company = report['company']
        year = report['year']
        pdf_url = report['url']
        pages = list(report['filtered_report_tables_direct'])

        json_obj = {
            'company': company,
            'year': year,
            'pdf_url': pdf_url,
        }

        # f-string formatting also works when `year` is stored as a number.
        path = f"{out_folder}/{company}_{year}"
        os.makedirs(path, exist_ok=True)

        try:
            json_obj['images_path'] = chart_extraction(pdf_url, pages, path)
        except Exception as e:
            print(f"ERROR at chart_extraction: ")
            print(getattr(e, 'message', e))
            json_obj['images_path'] = "nan"

        json_lst.append(json_obj)
        print(f"++++++++++ Report done ++++++++++")

    print(f"================================ JOB COMPLETE =======================================")

    return json_lst

## 3. Starting point of program
- Before you start, create a folder in your current directory to store the output (e.g. `ChartExtraction_Output`).
- Ensure you have enough free disk space for the extracted images (more than 3 GB is recommended for all FIs, more than 1 GB for a single FI).
- The program will create sub-folders inside it and save the images there for processing.
- Run `ls` below to list your current directory and confirm that the folder you created is there.

In [81]:
# Directory that receives one sub-folder per processed report
out_folder = "ChartExtraction_Output"
if os.path.exists(out_folder):
    print("===> Folder exist")
else:
    os.mkdir(out_folder)

===> Folder exist


In [82]:
ls

[34mChartExtraction_Output[m[m/          Precision and Recall.xlsx
[34mChartExtraction_Output_old[m[m/      base.csv
[34mData[m[m/                            chart_output.json
OPENCV_Bounding_Example.ipynb    filter_tester.ipynb
OPENCV_Contouring_Example.ipynb  [34mpreprocessed[m[m/
PDF_to_Charts.ipynb              results_filter1a.csv
PDF_to_Charts_tester.ipynb       ~$Precision and Recall.xlsx


In [83]:
# IMPORTANT
# INPUT : only the .json files that XM created can be read
#         (e.g. all_asian_banks_preprocessed.json); other files must have the
#         same structure.
# OUTPUT: ROI images
# Set the directory below to the location of your .json data source(s).

data_source_file = "preprocessed/"
for filename in os.listdir(data_source_file):
    # Anything that is not a .json file is ignored
    if not filename.endswith(".json"):
        continue
    img_path = os.path.join(data_source_file, filename)
    run_extraction_main(img_path, out_folder)


==> Convert page 5 of pdf to image...
==> Convert page 51 of pdf to image...
==> Convert page 52 of pdf to image...
==> Convert page 69 of pdf to image...
==> Now doing page 5 ...
=>starting on page 5 : ChartExtraction_Output/Hang Seng Investment Management_2019
landscape
==> Finished Page 5 ...

==> Now doing page 51 ...
=>starting on page 51 : ChartExtraction_Output/Hang Seng Investment Management_2019
landscape
==> Finished Page 51 ...

==> Now doing page 52 ...
=>starting on page 52 : ChartExtraction_Output/Hang Seng Investment Management_2019
landscape
==> Finished Page 52 ...

==> Now doing page 69 ...
=>starting on page 69 : ChartExtraction_Output/Hang Seng Investment Management_2019
landscape
==> Finished Page 69 ...

++++++++++ Report done ++++++++++
==> Convert page 46 of pdf to image...
==> Convert page 47 of pdf to image...
==> Convert page 49 of pdf to image...
==> Now doing page 46 ...
=>starting on page 46 : ChartExtraction_Output/CMBC Capital_2017
portrait
==> Finished 

portrait
==> Finished Page 47 ...

==> Now doing page 49 ...
=>starting on page 49 : ChartExtraction_Output/CMBC Capital_2017
portrait
==> Finished Page 49 ...

++++++++++ Report done ++++++++++
URL is not a PDF.
++++++++++ Report done ++++++++++
==> Convert page 7 of pdf to image...
==> Convert page 27 of pdf to image...
==> Convert page 37 of pdf to image...
==> Convert page 38 of pdf to image...
==> Convert page 54 of pdf to image...
==> Convert page 60 of pdf to image...
==> Convert page 61 of pdf to image...
==> Convert page 66 of pdf to image...
==> Convert page 67 of pdf to image...
==> Convert page 68 of pdf to image...
==> Convert page 76 of pdf to image...
==> Convert page 78 of pdf to image...
==> Convert page 80 of pdf to image...
==> Convert page 88 of pdf to image...
==> Convert page 92 of pdf to image...
==> Convert page 96 of pdf to image...
==> Convert page 97 of pdf to image...
==> Convert page 98 of pdf to image...
==> Convert page 101 of pdf to image...
==> Convert 

portrait
==> Finished Page 27 ...

==> Now doing page 37 ...
=>starting on page 37 : ChartExtraction_Output/UBS Asset Management_2019
portrait
==> Finished Page 37 ...

==> Now doing page 38 ...
=>starting on page 38 : ChartExtraction_Output/UBS Asset Management_2019
portrait


==> Finished Page 38 ...

==> Now doing page 54 ...
=>starting on page 54 : ChartExtraction_Output/UBS Asset Management_2019
portrait
==> Finished Page 54 ...

==> Now doing page 60 ...
=>starting on page 60 : ChartExtraction_Output/UBS Asset Management_2019
portrait


==> Finished Page 60 ...

==> Now doing page 61 ...
=>starting on page 61 : ChartExtraction_Output/UBS Asset Management_2019
portrait
==> Finished Page 61 ...

==> Now doing page 66 ...
=>starting on page 66 : ChartExtraction_Output/UBS Asset Management_2019
portrait
==> Finished Page 66 ...

==> Now doing page 67 ...
=>starting on page 67 : ChartExtraction_Output/UBS Asset Management_2019
portrait


==> Finished Page 67 ...

==> Now doing page 68 ...
=>starting on page 68 : ChartExtraction_Output/UBS Asset Management_2019
portrait
==> Finished Page 68 ...

==> Now doing page 76 ...
=>starting on page 76 : ChartExtraction_Output/UBS Asset Management_2019
portrait


==> Finished Page 76 ...

==> Now doing page 78 ...
=>starting on page 78 : ChartExtraction_Output/UBS Asset Management_2019
portrait
==> Finished Page 78 ...

==> Now doing page 80 ...
=>starting on page 80 : ChartExtraction_Output/UBS Asset Management_2019
portrait
==> Finished Page 80 ...

==> Now doing page 88 ...
=>starting on page 88 : ChartExtraction_Output/UBS Asset Management_2019
portrait
==> Finished Page 88 ...

==> Now doing page 92 ...
=>starting on page 92 : ChartExtraction_Output/UBS Asset Management_2019
portrait


==> Finished Page 92 ...

==> Now doing page 96 ...
=>starting on page 96 : ChartExtraction_Output/UBS Asset Management_2019
portrait


==> Finished Page 96 ...

==> Now doing page 97 ...
=>starting on page 97 : ChartExtraction_Output/UBS Asset Management_2019
portrait
==> Finished Page 97 ...

==> Now doing page 98 ...
=>starting on page 98 : ChartExtraction_Output/UBS Asset Management_2019
portrait


==> Finished Page 98 ...

==> Now doing page 101 ...
=>starting on page 101 : ChartExtraction_Output/UBS Asset Management_2019
portrait
==> Finished Page 101 ...

==> Now doing page 124 ...
=>starting on page 124 : ChartExtraction_Output/UBS Asset Management_2019
portrait
==> Finished Page 124 ...

==> Now doing page 130 ...
=>starting on page 130 : ChartExtraction_Output/UBS Asset Management_2019
portrait


==> Finished Page 130 ...

==> Now doing page 131 ...
=>starting on page 131 : ChartExtraction_Output/UBS Asset Management_2019
portrait
==> Finished Page 131 ...

==> Now doing page 135 ...
=>starting on page 135 : ChartExtraction_Output/UBS Asset Management_2019
portrait
==> Finished Page 135 ...

==> Now doing page 141 ...
=>starting on page 141 : ChartExtraction_Output/UBS Asset Management_2019
landscape
==> Finished Page 141 ...

==> Now doing page 142 ...
=>starting on page 142 : ChartExtraction_Output/UBS Asset Management_2019
landscape
==> Finished Page 142 ...

==> Now doing page 146 ...
=>starting on page 146 : ChartExtraction_Output/UBS Asset Management_2019
landscape
==> Finished Page 146 ...

==> Now doing page 148 ...
=>starting on page 148 : ChartExtraction_Output/UBS Asset Management_2019
portrait
==> Finished Page 148 ...

==> Now doing page 170 ...
=>starting on page 170 : ChartExtraction_Output/UBS Asset Management_2019
portrait


==> Finished Page 170 ...

++++++++++ Report done ++++++++++
++++++++++ Report done ++++++++++
++++++++++ Report done ++++++++++
ERROR at chart_extraction: 
Unable to get page count.
Syntax Error (2): Illegal character <21> in hex string
Syntax Error (4): Illegal character <4f> in hex string
Syntax Error (6): Illegal character <54> in hex string
Syntax Error (7): Illegal character <59> in hex string
Syntax Error (8): Illegal character <50> in hex string
Syntax Error (11): Illegal character <68> in hex string
Syntax Error (12): Illegal character <74> in hex string
Syntax Error (13): Illegal character <6d> in hex string
Syntax Error (14): Illegal character <6c> in hex string
Syntax Error (17): Illegal character <68> in hex string
Syntax Error (18): Illegal character <74> in hex string
Syntax Error (19): Illegal character <6d> in hex string
Syntax Error (20): Illegal character <6c> in hex string
Syntax Error (22): Illegal character <6c> in hex string
Syntax Error (24): Illegal character <

==> Finished Page 44 ...

==> Now doing page 63 ...
=>starting on page 63 : ChartExtraction_Output/Value Partners_2020
portrait
==> Finished Page 63 ...

==> Now doing page 70 ...
=>starting on page 70 : ChartExtraction_Output/Value Partners_2020
portrait
==> Finished Page 70 ...

==> Now doing page 71 ...
=>starting on page 71 : ChartExtraction_Output/Value Partners_2020
portrait


==> Finished Page 71 ...

++++++++++ Report done ++++++++++
ERROR at chart_extraction: 
Unable to get page count.
Syntax Error (2): Illegal character <68> in hex string
Syntax Error (3): Illegal character <74> in hex string
Syntax Error (4): Illegal character <6d> in hex string
Syntax Error (5): Illegal character <6c> in hex string
Syntax Error (10): Illegal character <68> in hex string
Syntax Error (16): Illegal character <74> in hex string
Syntax Error (17): Illegal character <69> in hex string
Syntax Error (18): Illegal character <74> in hex string
Syntax Error (19): Illegal character <6c> in hex string
Syntax Error (36): Illegal character <2f> in hex string
Syntax Error (37): Illegal character <74> in hex string
Syntax Error (38): Illegal character <69> in hex string
Syntax Error (39): Illegal character <74> in hex string
Syntax Error (40): Illegal character <6c> in hex string
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error

==> Finished Page 46 ...

==> Now doing page 48 ...
=>starting on page 48 : ChartExtraction_Output/Pictet Asset Management_2021
portrait


==> Finished Page 48 ...

++++++++++ Report done ++++++++++
==> Convert page 15 of pdf to image...
==> Convert page 61 of pdf to image...
==> Convert page 62 of pdf to image...
==> Convert page 63 of pdf to image...
==> Convert page 64 of pdf to image...
==> Convert page 89 of pdf to image...
==> Now doing page 15 ...
=>starting on page 15 : ChartExtraction_Output/Hang Seng Investment Management_2015
landscape
==> Finished Page 15 ...

==> Now doing page 61 ...
=>starting on page 61 : ChartExtraction_Output/Hang Seng Investment Management_2015
landscape
==> Finished Page 61 ...

==> Now doing page 62 ...
=>starting on page 62 : ChartExtraction_Output/Hang Seng Investment Management_2015
landscape
==> Finished Page 62 ...

==> Now doing page 63 ...
=>starting on page 63 : ChartExtraction_Output/Hang Seng Investment Management_2015
landscape
==> Finished Page 63 ...

==> Now doing page 64 ...
=>starting on page 64 : ChartExtraction_Output/Hang Seng Investment Management_2015
landscape
==

==> Finished Page 5 ...

==> Now doing page 6 ...
=>starting on page 6 : ChartExtraction_Output/BlackRock_2020
landscape
==> Finished Page 6 ...

==> Now doing page 8 ...
=>starting on page 8 : ChartExtraction_Output/BlackRock_2020
portrait
==> Finished Page 8 ...

==> Now doing page 12 ...
=>starting on page 12 : ChartExtraction_Output/BlackRock_2020
portrait
==> Finished Page 12 ...

==> Now doing page 14 ...
=>starting on page 14 : ChartExtraction_Output/BlackRock_2020
portrait
==> Finished Page 14 ...

==> Now doing page 15 ...
=>starting on page 15 : ChartExtraction_Output/BlackRock_2020
portrait
==> Finished Page 15 ...

==> Now doing page 16 ...
=>starting on page 16 : ChartExtraction_Output/BlackRock_2020
portrait


==> Finished Page 16 ...

==> Now doing page 20 ...
=>starting on page 20 : ChartExtraction_Output/BlackRock_2020
portrait
==> Finished Page 20 ...

==> Now doing page 27 ...
=>starting on page 27 : ChartExtraction_Output/BlackRock_2020
portrait
==> Finished Page 27 ...

==> Now doing page 28 ...
=>starting on page 28 : ChartExtraction_Output/BlackRock_2020
portrait


==> Finished Page 28 ...

==> Now doing page 29 ...
=>starting on page 29 : ChartExtraction_Output/BlackRock_2020
portrait
==> Finished Page 29 ...

==> Now doing page 34 ...
=>starting on page 34 : ChartExtraction_Output/BlackRock_2020
portrait
==> Finished Page 34 ...

==> Now doing page 35 ...
=>starting on page 35 : ChartExtraction_Output/BlackRock_2020
portrait
==> Finished Page 35 ...

==> Now doing page 36 ...
=>starting on page 36 : ChartExtraction_Output/BlackRock_2020
portrait
==> Finished Page 36 ...

==> Now doing page 37 ...
=>starting on page 37 : ChartExtraction_Output/BlackRock_2020
portrait
==> Finished Page 37 ...

==> Now doing page 39 ...
=>starting on page 39 : ChartExtraction_Output/BlackRock_2020
portrait
==> Finished Page 39 ...

==> Now doing page 41 ...
=>starting on page 41 : ChartExtraction_Output/BlackRock_2020
portrait


==> Finished Page 41 ...

==> Now doing page 46 ...
=>starting on page 46 : ChartExtraction_Output/BlackRock_2020
portrait
==> Finished Page 46 ...

==> Now doing page 47 ...
=>starting on page 47 : ChartExtraction_Output/BlackRock_2020
portrait
==> Finished Page 47 ...

==> Now doing page 48 ...
=>starting on page 48 : ChartExtraction_Output/BlackRock_2020
portrait
==> Finished Page 48 ...

==> Now doing page 49 ...
=>starting on page 49 : ChartExtraction_Output/BlackRock_2020
portrait
==> Finished Page 49 ...

++++++++++ Report done ++++++++++


==> Convert page 45 of pdf to image...
==> Convert page 47 of pdf to image...
==> Convert page 48 of pdf to image...
==> Convert page 71 of pdf to image...
==> Convert page 72 of pdf to image...
==> Convert page 73 of pdf to image...
==> Convert page 78 of pdf to image...
==> Convert page 79 of pdf to image...
==> Convert page 111 of pdf to image...
==> Now doing page 45 ...
=>starting on page 45 : ChartExtraction_Output/Goldman Sachs Asset Management_2020
portrait
==> Finished Page 45 ...

==> Now doing page 47 ...
=>starting on page 47 : ChartExtraction_Output/Goldman Sachs Asset Management_2020
portrait
==> Finished Page 47 ...

==> Now doing page 48 ...
=>starting on page 48 : ChartExtraction_Output/Goldman Sachs Asset Management_2020
portrait
==> Finished Page 48 ...

==> Now doing page 71 ...
=>starting on page 71 : ChartExtraction_Output/Goldman Sachs Asset Management_2020
portrait


==> Finished Page 71 ...

==> Now doing page 72 ...
=>starting on page 72 : ChartExtraction_Output/Goldman Sachs Asset Management_2020
portrait
==> Finished Page 72 ...

==> Now doing page 73 ...
=>starting on page 73 : ChartExtraction_Output/Goldman Sachs Asset Management_2020
portrait
==> Finished Page 73 ...

==> Now doing page 78 ...
=>starting on page 78 : ChartExtraction_Output/Goldman Sachs Asset Management_2020
portrait


==> Finished Page 78 ...

==> Now doing page 79 ...
=>starting on page 79 : ChartExtraction_Output/Goldman Sachs Asset Management_2020
portrait
==> Finished Page 79 ...

==> Now doing page 111 ...
=>starting on page 111 : ChartExtraction_Output/Goldman Sachs Asset Management_2020
portrait
==> Finished Page 111 ...

++++++++++ Report done ++++++++++
==> Convert page 28 of pdf to image...
==> Convert page 32 of pdf to image...
==> Convert page 50 of pdf to image...
==> Now doing page 28 ...
=>starting on page 28 : ChartExtraction_Output/CMBC Capital_2020
portrait
==> Finished Page 28 ...

==> Now doing page 32 ...
=>starting on page 32 : ChartExtraction_Output/CMBC Capital_2020
portrait
==> Finished Page 32 ...

==> Now doing page 50 ...
=>starting on page 50 : ChartExtraction_Output/CMBC Capital_2020
portrait
==> Finished Page 50 ...

++++++++++ Report done ++++++++++
==> Convert page 33 of pdf to image...
==> Now doing page 33 ...
=>starting on page 33 : ChartExtraction_Output/E Fund M

==> Finished Page 47 ...

==> Now doing page 54 ...
=>starting on page 54 : ChartExtraction_Output/PineBridge Investments_2020
portrait
==> Finished Page 54 ...

==> Now doing page 57 ...
=>starting on page 57 : ChartExtraction_Output/PineBridge Investments_2020
portrait
==> Finished Page 57 ...

==> Now doing page 58 ...
=>starting on page 58 : ChartExtraction_Output/PineBridge Investments_2020
portrait
==> Finished Page 58 ...

==> Now doing page 77 ...
=>starting on page 77 : ChartExtraction_Output/PineBridge Investments_2020
portrait


==> Finished Page 77 ...

++++++++++ Report done ++++++++++
==> Convert page 5 of pdf to image...
==> Convert page 15 of pdf to image...
==> Convert page 17 of pdf to image...
==> Convert page 33 of pdf to image...
==> Convert page 41 of pdf to image...
==> Now doing page 5 ...
=>starting on page 5 : ChartExtraction_Output/First State Investments_2020
portrait
==> Finished Page 5 ...

==> Now doing page 15 ...
=>starting on page 15 : ChartExtraction_Output/First State Investments_2020
portrait
==> Finished Page 15 ...

==> Now doing page 17 ...
=>starting on page 17 : ChartExtraction_Output/First State Investments_2020
portrait
==> Finished Page 17 ...

==> Now doing page 33 ...
=>starting on page 33 : ChartExtraction_Output/First State Investments_2020
portrait
==> Finished Page 33 ...

==> Now doing page 41 ...
=>starting on page 41 : ChartExtraction_Output/First State Investments_2020
portrait
==> Finished Page 41 ...

++++++++++ Report done ++++++++++
URL is not a PDF.
++++++++++ R

==> Finished Page 5 ...

==> Now doing page 6 ...
=>starting on page 6 : ChartExtraction_Output/Fidelity International_2021
portrait
==> Finished Page 6 ...

==> Now doing page 7 ...
=>starting on page 7 : ChartExtraction_Output/Fidelity International_2021
portrait
==> Finished Page 7 ...

==> Now doing page 15 ...
=>starting on page 15 : ChartExtraction_Output/Fidelity International_2021
portrait
==> Finished Page 15 ...

++++++++++ Report done ++++++++++


==> Convert page 25 of pdf to image...
==> Convert page 27 of pdf to image...
==> Convert page 29 of pdf to image...
==> Convert page 33 of pdf to image...
==> Now doing page 25 ...
=>starting on page 25 : ChartExtraction_Output/BOCOM International Asset Management_2020
portrait
==> Finished Page 25 ...

==> Now doing page 27 ...
=>starting on page 27 : ChartExtraction_Output/BOCOM International Asset Management_2020
portrait
==> Finished Page 27 ...

==> Now doing page 29 ...
=>starting on page 29 : ChartExtraction_Output/BOCOM International Asset Management_2020
portrait
==> Finished Page 29 ...

==> Now doing page 33 ...
=>starting on page 33 : ChartExtraction_Output/BOCOM International Asset Management_2020
portrait
==> Finished Page 33 ...

++++++++++ Report done ++++++++++
==> Convert page 23 of pdf to image...
==> Convert page 24 of pdf to image...
==> Convert page 25 of pdf to image...
==> Now doing page 23 ...
=>starting on page 23 : ChartExtraction_Output/CMBC Capital_2016
po

==> Finished Page 23 ...

==> Now doing page 24 ...
=>starting on page 24 : ChartExtraction_Output/CMBC Capital_2016
portrait
==> Finished Page 24 ...

==> Now doing page 25 ...
=>starting on page 25 : ChartExtraction_Output/CMBC Capital_2016
portrait
==> Finished Page 25 ...

++++++++++ Report done ++++++++++
URL is not a PDF.
++++++++++ Report done ++++++++++
==> Convert page 65 of pdf to image...
==> Convert page 66 of pdf to image...
==> Now doing page 65 ...
=>starting on page 65 : ChartExtraction_Output/Hang Seng Investment Management_2018
landscape
==> Finished Page 65 ...

==> Now doing page 66 ...
=>starting on page 66 : ChartExtraction_Output/Hang Seng Investment Management_2018
landscape
==> Finished Page 66 ...

++++++++++ Report done ++++++++++
==> Convert page 13 of pdf to image...
==> Convert page 14 of pdf to image...
==> Convert page 16 of pdf to image...
==> Convert page 18 of pdf to image...
==> Convert page 19 of pdf to image...
==> Convert page 20 of pdf to image..

==> Finished Page 16 ...

==> Now doing page 18 ...
=>starting on page 18 : ChartExtraction_Output/J.P.Morgan Asset Management_2019
portrait
==> Finished Page 18 ...

==> Now doing page 19 ...
=>starting on page 19 : ChartExtraction_Output/J.P.Morgan Asset Management_2019
portrait
==> Finished Page 19 ...

==> Now doing page 20 ...
=>starting on page 20 : ChartExtraction_Output/J.P.Morgan Asset Management_2019
portrait
==> Finished Page 20 ...

==> Now doing page 21 ...
=>starting on page 21 : ChartExtraction_Output/J.P.Morgan Asset Management_2019
portrait
==> Finished Page 21 ...

==> Now doing page 22 ...
=>starting on page 22 : ChartExtraction_Output/J.P.Morgan Asset Management_2019
portrait
==> Finished Page 22 ...

==> Now doing page 25 ...
=>starting on page 25 : ChartExtraction_Output/J.P.Morgan Asset Management_2019
portrait


==> Finished Page 25 ...

==> Now doing page 33 ...
=>starting on page 33 : ChartExtraction_Output/J.P.Morgan Asset Management_2019
portrait
==> Finished Page 33 ...

==> Now doing page 34 ...
=>starting on page 34 : ChartExtraction_Output/J.P.Morgan Asset Management_2019
portrait
==> Finished Page 34 ...

++++++++++ Report done ++++++++++


==> Convert page 43 of pdf to image...
==> Convert page 54 of pdf to image...
==> Convert page 79 of pdf to image...
==> Now doing page 43 ...
=>starting on page 43 : ChartExtraction_Output/Lazard Asset Management_2020
portrait
==> Finished Page 43 ...

==> Now doing page 54 ...
=>starting on page 54 : ChartExtraction_Output/Lazard Asset Management_2020
portrait
==> Finished Page 54 ...

==> Now doing page 79 ...
=>starting on page 79 : ChartExtraction_Output/Lazard Asset Management_2020
portrait
==> Finished Page 79 ...

++++++++++ Report done ++++++++++


++++++++++ Report done ++++++++++
==> Convert page 26 of pdf to image...
==> Convert page 27 of pdf to image...
==> Convert page 28 of pdf to image...
==> Convert page 29 of pdf to image...
==> Convert page 34 of pdf to image...
==> Convert page 41 of pdf to image...
==> Now doing page 26 ...
=>starting on page 26 : ChartExtraction_Output/T.Rowe Price_2020
portrait
==> Finished Page 26 ...

==> Now doing page 27 ...
=>starting on page 27 : ChartExtraction_Output/T.Rowe Price_2020
portrait
==> Finished Page 27 ...

==> Now doing page 28 ...
=>starting on page 28 : ChartExtraction_Output/T.Rowe Price_2020
portrait


==> Finished Page 28 ...

==> Now doing page 29 ...
=>starting on page 29 : ChartExtraction_Output/T.Rowe Price_2020
portrait
==> Finished Page 29 ...

==> Now doing page 34 ...
=>starting on page 34 : ChartExtraction_Output/T.Rowe Price_2020
portrait
==> Finished Page 34 ...

==> Now doing page 41 ...
=>starting on page 41 : ChartExtraction_Output/T.Rowe Price_2020
portrait
==> Finished Page 41 ...

++++++++++ Report done ++++++++++


==> Convert page 7 of pdf to image...
==> Convert page 9 of pdf to image...
==> Convert page 15 of pdf to image...
==> Convert page 25 of pdf to image...
==> Convert page 30 of pdf to image...
==> Convert page 34 of pdf to image...
==> Convert page 42 of pdf to image...
==> Convert page 69 of pdf to image...
==> Convert page 77 of pdf to image...
==> Convert page 89 of pdf to image...
==> Convert page 90 of pdf to image...
==> Convert page 91 of pdf to image...
==> Convert page 92 of pdf to image...
==> Convert page 93 of pdf to image...
==> Convert page 134 of pdf to image...
==> Convert page 138 of pdf to image...
==> Convert page 420 of pdf to image...
==> Convert page 423 of pdf to image...
==> Convert page 424 of pdf to image...
==> Convert page 426 of pdf to image...
==> Convert page 432 of pdf to image...
==> Convert page 433 of pdf to image...
==> Convert page 434 of pdf to image...
==> Convert page 435 of pdf to image...
==> Convert page 436 of pdf to image...
==> Convert page

==> Finished Page 77 ...

==> Now doing page 89 ...
=>starting on page 89 : ChartExtraction_Output/Aegon N.V._2020
portrait
==> Finished Page 89 ...

==> Now doing page 90 ...
=>starting on page 90 : ChartExtraction_Output/Aegon N.V._2020
portrait
==> Finished Page 90 ...

==> Now doing page 91 ...
=>starting on page 91 : ChartExtraction_Output/Aegon N.V._2020
portrait
==> Finished Page 91 ...

==> Now doing page 92 ...
=>starting on page 92 : ChartExtraction_Output/Aegon N.V._2020
portrait
==> Finished Page 92 ...

==> Now doing page 93 ...
=>starting on page 93 : ChartExtraction_Output/Aegon N.V._2020
portrait
==> Finished Page 93 ...

==> Now doing page 134 ...
=>starting on page 134 : ChartExtraction_Output/Aegon N.V._2020
portrait
==> Finished Page 134 ...

==> Now doing page 138 ...
=>starting on page 138 : ChartExtraction_Output/Aegon N.V._2020
portrait
==> Finished Page 138 ...

==> Now doing page 420 ...
=>starting on page 420 : ChartExtraction_Output/Aegon N.V._2020
portrait


==> Finished Page 15 ...

==> Now doing page 17 ...
=>starting on page 17 : ChartExtraction_Output/Vontobel Asset Management_2017
portrait
==> Finished Page 17 ...

==> Now doing page 18 ...
=>starting on page 18 : ChartExtraction_Output/Vontobel Asset Management_2017
portrait


==> Finished Page 18 ...

==> Now doing page 19 ...
=>starting on page 19 : ChartExtraction_Output/Vontobel Asset Management_2017
portrait
==> Finished Page 19 ...

==> Now doing page 20 ...
=>starting on page 20 : ChartExtraction_Output/Vontobel Asset Management_2017
portrait
==> Finished Page 20 ...

==> Now doing page 21 ...
=>starting on page 21 : ChartExtraction_Output/Vontobel Asset Management_2017
portrait


==> Finished Page 21 ...

==> Now doing page 22 ...
=>starting on page 22 : ChartExtraction_Output/Vontobel Asset Management_2017
portrait
==> Finished Page 22 ...

==> Now doing page 25 ...
=>starting on page 25 : ChartExtraction_Output/Vontobel Asset Management_2017
portrait
==> Finished Page 25 ...

++++++++++ Report done ++++++++++


==> Convert page 30 of pdf to image...
==> Convert page 31 of pdf to image...
==> Convert page 32 of pdf to image...
==> Convert page 33 of pdf to image...
==> Convert page 50 of pdf to image...
==> Now doing page 30 ...
=>starting on page 30 : ChartExtraction_Output/Eastspring Investments_2020
portrait
==> Finished Page 30 ...

==> Now doing page 31 ...
=>starting on page 31 : ChartExtraction_Output/Eastspring Investments_2020
portrait
==> Finished Page 31 ...

==> Now doing page 32 ...
=>starting on page 32 : ChartExtraction_Output/Eastspring Investments_2020
portrait
==> Finished Page 32 ...

==> Now doing page 33 ...
=>starting on page 33 : ChartExtraction_Output/Eastspring Investments_2020
portrait
==> Finished Page 33 ...

==> Now doing page 50 ...
=>starting on page 50 : ChartExtraction_Output/Eastspring Investments_2020
portrait
==> Finished Page 50 ...

++++++++++ Report done ++++++++++
==> Convert page 6 of pdf to image...
==> Convert page 15 of pdf to image...
==> Convert pa

==> Finished Page 24 ...

++++++++++ Report done ++++++++++
++++++++++ Report done ++++++++++
==> Convert page 4 of pdf to image...
==> Convert page 22 of pdf to image...
==> Convert page 23 of pdf to image...
==> Convert page 24 of pdf to image...
==> Convert page 25 of pdf to image...
==> Convert page 27 of pdf to image...
==> Convert page 42 of pdf to image...
==> Convert page 66 of pdf to image...
==> Convert page 73 of pdf to image...
==> Convert page 75 of pdf to image...
==> Now doing page 4 ...
=>starting on page 4 : ChartExtraction_Output/Nomura Asset Management_2020
portrait
==> Finished Page 4 ...

==> Now doing page 22 ...
=>starting on page 22 : ChartExtraction_Output/Nomura Asset Management_2020
portrait


==> Finished Page 22 ...

==> Now doing page 23 ...
=>starting on page 23 : ChartExtraction_Output/Nomura Asset Management_2020
portrait
==> Finished Page 23 ...

==> Now doing page 24 ...
=>starting on page 24 : ChartExtraction_Output/Nomura Asset Management_2020
portrait


==> Finished Page 24 ...

==> Now doing page 25 ...
=>starting on page 25 : ChartExtraction_Output/Nomura Asset Management_2020
portrait
==> Finished Page 25 ...

==> Now doing page 27 ...
=>starting on page 27 : ChartExtraction_Output/Nomura Asset Management_2020
portrait
==> Finished Page 27 ...

==> Now doing page 42 ...
=>starting on page 42 : ChartExtraction_Output/Nomura Asset Management_2020
portrait
==> Finished Page 42 ...

==> Now doing page 66 ...
=>starting on page 66 : ChartExtraction_Output/Nomura Asset Management_2020
portrait


==> Finished Page 66 ...

==> Now doing page 73 ...
=>starting on page 73 : ChartExtraction_Output/Nomura Asset Management_2020
portrait
==> Finished Page 73 ...

==> Now doing page 75 ...
=>starting on page 75 : ChartExtraction_Output/Nomura Asset Management_2020
portrait
==> Finished Page 75 ...

++++++++++ Report done ++++++++++
==> Convert page 59 of pdf to image...
==> Convert page 64 of pdf to image...
==> Convert page 71 of pdf to image...
==> Convert page 72 of pdf to image...
==> Convert page 92 of pdf to image...
==> Convert page 93 of pdf to image...
==> Convert page 100 of pdf to image...
==> Convert page 101 of pdf to image...
==> Now doing page 59 ...
=>starting on page 59 : ChartExtraction_Output/Bank of New York Mellon_2020
portrait
==> Finished Page 59 ...

==> Now doing page 64 ...
=>starting on page 64 : ChartExtraction_Output/Bank of New York Mellon_2020
portrait


==> Finished Page 64 ...

==> Now doing page 71 ...
=>starting on page 71 : ChartExtraction_Output/Bank of New York Mellon_2020
portrait
==> Finished Page 71 ...

==> Now doing page 72 ...
=>starting on page 72 : ChartExtraction_Output/Bank of New York Mellon_2020
portrait
==> Finished Page 72 ...

==> Now doing page 92 ...
=>starting on page 92 : ChartExtraction_Output/Bank of New York Mellon_2020
portrait


==> Finished Page 92 ...

==> Now doing page 93 ...
=>starting on page 93 : ChartExtraction_Output/Bank of New York Mellon_2020
portrait
==> Finished Page 93 ...

==> Now doing page 100 ...
=>starting on page 100 : ChartExtraction_Output/Bank of New York Mellon_2020
portrait
==> Finished Page 100 ...

==> Now doing page 101 ...
=>starting on page 101 : ChartExtraction_Output/Bank of New York Mellon_2020
portrait
==> Finished Page 101 ...

++++++++++ Report done ++++++++++
==> Convert page 5 of pdf to image...
==> Convert page 14 of pdf to image...
==> Convert page 15 of pdf to image...
==> Convert page 21 of pdf to image...
==> Convert page 22 of pdf to image...
==> Convert page 23 of pdf to image...
==> Convert page 24 of pdf to image...
==> Convert page 25 of pdf to image...
==> Convert page 27 of pdf to image...
==> Convert page 28 of pdf to image...
==> Convert page 34 of pdf to image...
==> Convert page 35 of pdf to image...
==> Convert page 36 of pdf to image...
==> Convert page 

==> Finished Page 15 ...

==> Now doing page 21 ...
=>starting on page 21 : ChartExtraction_Output/Allianz Global Investors_2020
landscape
==> Finished Page 21 ...

==> Now doing page 22 ...
=>starting on page 22 : ChartExtraction_Output/Allianz Global Investors_2020
landscape
==> Finished Page 22 ...

==> Now doing page 23 ...
=>starting on page 23 : ChartExtraction_Output/Allianz Global Investors_2020
landscape
==> Finished Page 23 ...

==> Now doing page 24 ...
=>starting on page 24 : ChartExtraction_Output/Allianz Global Investors_2020
landscape


==> Finished Page 24 ...

==> Now doing page 25 ...
=>starting on page 25 : ChartExtraction_Output/Allianz Global Investors_2020
landscape
==> Finished Page 25 ...

==> Now doing page 27 ...
=>starting on page 27 : ChartExtraction_Output/Allianz Global Investors_2020
landscape
==> Finished Page 27 ...

==> Now doing page 28 ...
=>starting on page 28 : ChartExtraction_Output/Allianz Global Investors_2020
landscape
==> Finished Page 28 ...

==> Now doing page 34 ...
=>starting on page 34 : ChartExtraction_Output/Allianz Global Investors_2020
landscape
==> Finished Page 34 ...

==> Now doing page 35 ...
=>starting on page 35 : ChartExtraction_Output/Allianz Global Investors_2020
landscape


==> Finished Page 35 ...

==> Now doing page 36 ...
=>starting on page 36 : ChartExtraction_Output/Allianz Global Investors_2020
landscape
==> Finished Page 36 ...

==> Now doing page 38 ...
=>starting on page 38 : ChartExtraction_Output/Allianz Global Investors_2020
landscape
==> Finished Page 38 ...

==> Now doing page 39 ...
=>starting on page 39 : ChartExtraction_Output/Allianz Global Investors_2020
landscape
==> Finished Page 39 ...

==> Now doing page 42 ...
=>starting on page 42 : ChartExtraction_Output/Allianz Global Investors_2020
landscape


==> Finished Page 42 ...

==> Now doing page 43 ...
=>starting on page 43 : ChartExtraction_Output/Allianz Global Investors_2020
landscape
==> Finished Page 43 ...

==> Now doing page 44 ...
=>starting on page 44 : ChartExtraction_Output/Allianz Global Investors_2020
landscape
==> Finished Page 44 ...

++++++++++ Report done ++++++++++


==> Convert page 22 of pdf to image...
==> Convert page 23 of pdf to image...
==> Now doing page 22 ...
=>starting on page 22 : ChartExtraction_Output/Ashmore Group_2020
portrait
==> Finished Page 22 ...

==> Now doing page 23 ...
=>starting on page 23 : ChartExtraction_Output/Ashmore Group_2020
portrait
==> Finished Page 23 ...

++++++++++ Report done ++++++++++
==> Convert page 8 of pdf to image...
==> Convert page 12 of pdf to image...
==> Convert page 13 of pdf to image...
==> Convert page 15 of pdf to image...
==> Convert page 17 of pdf to image...
==> Convert page 25 of pdf to image...
==> Convert page 35 of pdf to image...
==> Convert page 53 of pdf to image...
==> Convert page 54 of pdf to image...
==> Now doing page 8 ...
=>starting on page 8 : ChartExtraction_Output/Morgan Stanley Investment Management_2020
portrait
==> Finished Page 8 ...

==> Now doing page 12 ...
=>starting on page 12 : ChartExtraction_Output/Morgan Stanley Investment Management_2020
portrait
==> Finished 

==> Finished Page 13 ...

==> Now doing page 15 ...
=>starting on page 15 : ChartExtraction_Output/Morgan Stanley Investment Management_2020
portrait
==> Finished Page 15 ...

==> Now doing page 17 ...
=>starting on page 17 : ChartExtraction_Output/Morgan Stanley Investment Management_2020
portrait
==> Finished Page 17 ...

==> Now doing page 25 ...
=>starting on page 25 : ChartExtraction_Output/Morgan Stanley Investment Management_2020
portrait
==> Finished Page 25 ...

==> Now doing page 35 ...
=>starting on page 35 : ChartExtraction_Output/Morgan Stanley Investment Management_2020
portrait
==> Finished Page 35 ...

==> Now doing page 53 ...
=>starting on page 53 : ChartExtraction_Output/Morgan Stanley Investment Management_2020
portrait


==> Finished Page 53 ...

==> Now doing page 54 ...
=>starting on page 54 : ChartExtraction_Output/Morgan Stanley Investment Management_2020
portrait
==> Finished Page 54 ...

++++++++++ Report done ++++++++++
==> Convert page 13 of pdf to image...
==> Convert page 15 of pdf to image...
==> Convert page 20 of pdf to image...
==> Convert page 35 of pdf to image...
==> Convert page 37 of pdf to image...
==> Convert page 38 of pdf to image...
==> Convert page 39 of pdf to image...
==> Now doing page 13 ...
=>starting on page 13 : ChartExtraction_Output/PIMCO_2019
portrait
==> Finished Page 13 ...

==> Now doing page 15 ...
=>starting on page 15 : ChartExtraction_Output/PIMCO_2019
portrait


==> Finished Page 15 ...

==> Now doing page 20 ...
=>starting on page 20 : ChartExtraction_Output/PIMCO_2019
portrait
==> Finished Page 20 ...

==> Now doing page 35 ...
=>starting on page 35 : ChartExtraction_Output/PIMCO_2019
portrait
==> Finished Page 35 ...

==> Now doing page 37 ...
=>starting on page 37 : ChartExtraction_Output/PIMCO_2019
portrait
==> Finished Page 37 ...

==> Now doing page 38 ...
=>starting on page 38 : ChartExtraction_Output/PIMCO_2019
portrait


==> Finished Page 38 ...

==> Now doing page 39 ...
=>starting on page 39 : ChartExtraction_Output/PIMCO_2019
portrait


==> Finished Page 39 ...

++++++++++ Report done ++++++++++
==> Convert page 25 of pdf to image...
==> Now doing page 25 ...
=>starting on page 25 : ChartExtraction_Output/Vanguard Group_2020
portrait
==> Finished Page 25 ...

++++++++++ Report done ++++++++++
ERROR at chart_extraction: 
Unable to get page count.
Syntax Error (2): Illegal character <21> in hex string
Syntax Error (4): Illegal character <4f> in hex string
Syntax Error (6): Illegal character <54> in hex string
Syntax Error (7): Illegal character <59> in hex string
Syntax Error (8): Illegal character <50> in hex string
Syntax Error (11): Illegal character <48> in hex string
Syntax Error (12): Illegal character <54> in hex string
Syntax Error (13): Illegal character <4d> in hex string
Syntax Error (14): Illegal character <4c> in hex string
Syntax Error (16): Illegal character <50> in hex string
Syntax Error (17): Illegal character <55> in hex string
Syntax Error (19): Illegal character <4c> in hex string
Syntax Error (20):

ERROR at chart_extraction: 
Unable to get page count.
Syntax Error (2): Illegal character <21> in hex string
Syntax Error (4): Illegal character <4f> in hex string
Syntax Error (6): Illegal character <54> in hex string
Syntax Error (7): Illegal character <59> in hex string
Syntax Error (8): Illegal character <50> in hex string
Syntax Error (11): Illegal character <68> in hex string
Syntax Error (12): Illegal character <74> in hex string
Syntax Error (13): Illegal character <6d> in hex string
Syntax Error (14): Illegal character <6c> in hex string
Syntax Error (17): Illegal character <68> in hex string
Syntax Error (18): Illegal character <74> in hex string
Syntax Error (19): Illegal character <6d> in hex string
Syntax Error (20): Illegal character <6c> in hex string
Syntax Error (22): Illegal character <6c> in hex string
Syntax Error (24): Illegal character <6e> in hex string
Syntax Error (25): Illegal character <67> in hex string
Syntax Error (26): Illegal character <3d> in hex string

==> Finished Page 8 ...

==> Now doing page 9 ...
=>starting on page 9 : ChartExtraction_Output/CMB Wealth Management_2020
portrait
==> Finished Page 9 ...

==> Now doing page 10 ...
=>starting on page 10 : ChartExtraction_Output/CMB Wealth Management_2020
portrait
==> Finished Page 10 ...

==> Now doing page 40 ...
=>starting on page 40 : ChartExtraction_Output/CMB Wealth Management_2020
portrait
==> Finished Page 40 ...

==> Now doing page 47 ...
=>starting on page 47 : ChartExtraction_Output/CMB Wealth Management_2020
portrait
==> Finished Page 47 ...

==> Now doing page 48 ...
=>starting on page 48 : ChartExtraction_Output/CMB Wealth Management_2020
portrait
==> Finished Page 48 ...

==> Now doing page 49 ...
=>starting on page 49 : ChartExtraction_Output/CMB Wealth Management_2020
portrait
==> Finished Page 49 ...

==> Now doing page 110 ...
=>starting on page 110 : ChartExtraction_Output/CMB Wealth Management_2020
portrait


==> Finished Page 110 ...

==> Now doing page 116 ...
=>starting on page 116 : ChartExtraction_Output/CMB Wealth Management_2020
portrait
==> Finished Page 116 ...

==> Now doing page 117 ...
=>starting on page 117 : ChartExtraction_Output/CMB Wealth Management_2020
portrait
==> Finished Page 117 ...

==> Now doing page 118 ...
=>starting on page 118 : ChartExtraction_Output/CMB Wealth Management_2020
portrait
==> Finished Page 118 ...

==> Now doing page 126 ...
=>starting on page 126 : ChartExtraction_Output/CMB Wealth Management_2020
portrait
==> Finished Page 126 ...

++++++++++ Report done ++++++++++
==> Convert page 5 of pdf to image...
==> Convert page 18 of pdf to image...
==> Convert page 28 of pdf to image...
==> Convert page 30 of pdf to image...
==> Convert page 31 of pdf to image...
==> Now doing page 5 ...
=>starting on page 5 : ChartExtraction_Output/Aegon N.V._2017
landscape
==> Finished Page 5 ...

==> Now doing page 18 ...
=>starting on page 18 : ChartExtraction_Outpu

==> Finished Page 18 ...

==> Now doing page 28 ...
=>starting on page 28 : ChartExtraction_Output/Aegon N.V._2017
landscape
==> Finished Page 28 ...

==> Now doing page 30 ...
=>starting on page 30 : ChartExtraction_Output/Aegon N.V._2017
landscape
==> Finished Page 30 ...

==> Now doing page 31 ...
=>starting on page 31 : ChartExtraction_Output/Aegon N.V._2017
landscape
==> Finished Page 31 ...

++++++++++ Report done ++++++++++
==> Convert page 4 of pdf to image...
==> Convert page 6 of pdf to image...
==> Convert page 8 of pdf to image...
==> Convert page 20 of pdf to image...
==> Convert page 22 of pdf to image...
==> Convert page 24 of pdf to image...
==> Convert page 27 of pdf to image...
==> Convert page 33 of pdf to image...
==> Convert page 41 of pdf to image...
==> Convert page 47 of pdf to image...
==> Convert page 48 of pdf to image...
==> Convert page 49 of pdf to image...
==> Convert page 54 of pdf to image...
==> Convert page 57 of pdf to image...
==> Convert page 66 of

landscape
==> Finished Page 22 ...

==> Now doing page 24 ...
=>starting on page 24 : ChartExtraction_Output/BOCHK Asset Management_2019
landscape
==> Finished Page 24 ...

==> Now doing page 27 ...
=>starting on page 27 : ChartExtraction_Output/BOCHK Asset Management_2019
landscape
==> Finished Page 27 ...

==> Now doing page 33 ...
=>starting on page 33 : ChartExtraction_Output/BOCHK Asset Management_2019
landscape
==> Finished Page 33 ...

==> Now doing page 41 ...
=>starting on page 41 : ChartExtraction_Output/BOCHK Asset Management_2019
landscape
==> Finished Page 41 ...

==> Now doing page 47 ...
=>starting on page 47 : ChartExtraction_Output/BOCHK Asset Management_2019
landscape
==> Finished Page 47 ...

==> Now doing page 48 ...
=>starting on page 48 : ChartExtraction_Output/BOCHK Asset Management_2019
landscape
==> Finished Page 48 ...

==> Now doing page 49 ...
=>starting on page 49 : ChartExtraction_Output/BOCHK Asset Management_2019
landscape
==> Finished Page 49 ...

==> N

==> Finished Page 20 ...

==> Now doing page 21 ...
=>starting on page 21 : ChartExtraction_Output/Vontobel Asset Management_2016
portrait
==> Finished Page 21 ...

==> Now doing page 22 ...
=>starting on page 22 : ChartExtraction_Output/Vontobel Asset Management_2016
portrait
==> Finished Page 22 ...

==> Now doing page 23 ...
=>starting on page 23 : ChartExtraction_Output/Vontobel Asset Management_2016
portrait


==> Finished Page 23 ...

++++++++++ Report done ++++++++++
++++++++++ Report done ++++++++++


In [84]:
import shutil # high-level folder management library
def rm_processing_folders(out_folder):
    """Delete intermediate processing folders and count what is left.

    Every sub-directory of ``out_folder`` is treated as one report folder.
    Inside each report folder, directories whose name ends with
    ``filter_1``, ``filter_2`` or ``pages`` are removed recursively; the
    entries that remain afterwards are counted as ROIs.

    Args:
        out_folder: Path of the output root holding one folder per report.

    Returns:
        The number of entries left across all report folders, or 0 when
        ``out_folder`` does not exist.
    """
    total_ROI = 0
    if not os.path.exists(out_folder):
        print("===> Folder do not exist")
        return total_ROI

    # Drop the macOS Finder metadata file so it is not mistaken for a report.
    hidden_file = os.path.join(out_folder, ".DS_Store")
    if os.path.exists(hidden_file):
        os.remove(hidden_file)

    processing_suffixes = ("filter_1", "filter_2", "pages")
    for filename in os.listdir(out_folder):
        img_path = os.path.join(out_folder, filename)
        if not os.path.isdir(img_path):
            # A stray regular file in the output root is not a report folder;
            # listing it would raise NotADirectoryError and abort the cleanup.
            continue

        print(f"[Cleaning {filename}]")
        removed_folders = []
        for sub_filename in os.listdir(img_path):
            delete_path = os.path.join(img_path, sub_filename)
            # Only directories are removed: rmtree fails on a plain file
            # whose name merely happens to end with one of the suffixes.
            if sub_filename.endswith(processing_suffixes) and os.path.isdir(delete_path):
                shutil.rmtree(delete_path)
                removed_folders.append(delete_path)

        # Whatever is still in the report folder counts as an ROI.
        roi_count = len(os.listdir(img_path))
        total_ROI = total_ROI + roi_count

        print(f"=> ROI count: {roi_count} ")
        print(f"=> Succesfuly removed: {removed_folders}\n")

    return total_ROI

# Delete processing folders and print a summary of the run.
# `out_folder` is expected to be defined by an earlier cell,
# e.g. out_folder = "ChartExtraction_Output"
try:
    ROI_count = rm_processing_folders(out_folder)
except Exception as e: 
    # Best-effort cleanup: report the failure instead of stopping the notebook.
    print(f"Fail to remove: {e}" )
else:
    # now() must be called; the bare attribute prints the method's repr
    # instead of the completion timestamp.
    print(f"[Summary Report]\n Time Completed: {datetime.datetime.now()}  \nTotal ROI: {ROI_count}")

[Cleaning Vanguard Group_2020]
=> ROI count: 0 
=> Succesfuly removed: ['ChartExtraction_Output/Vanguard Group_2020/filter_2', 'ChartExtraction_Output/Vanguard Group_2020/filter_1', 'ChartExtraction_Output/Vanguard Group_2020/pages']

[Cleaning Northern Trust Asset Management_2020]
=> ROI count: 0 
=> Succesfuly removed: ['ChartExtraction_Output/Northern Trust Asset Management_2020/filter_2', 'ChartExtraction_Output/Northern Trust Asset Management_2020/filter_1', 'ChartExtraction_Output/Northern Trust Asset Management_2020/pages']

[Cleaning Fullerton Fund Management_2021]
=> ROI count: 0 
=> Succesfuly removed: []

[Cleaning Aberdeen Standard Investments_2020]
=> ROI count: 0 
=> Succesfuly removed: []

[Cleaning Legal & General Investment Management_2020-2021]
=> ROI count: 0 
=> Succesfuly removed: []

[Cleaning Nikko Asset Management_2019]
=> ROI count: 0 
=> Succesfuly removed: ['ChartExtraction_Output/Nikko Asset Management_2019/filter_2', 'ChartExtraction_Output/Nikko Asset Mana

In [None]:
# The program ends when this cell has run.
# Approx timing: 30-45 mins for all asian banks (40 reports) 
# Serialise before opening the file: opening with 'w' truncates it, so a
# non-serialisable entry in json_lst would otherwise leave a partial or
# empty chart_output.json behind.
json_text = json.dumps(json_lst)
with open('chart_output.json', 'w') as f:
    f.write(json_text)