In [1]:
# use tabula-py library to convert pdf to Excel - use keyword and get the line to the end of the page

In [2]:
## Writing the content into an existing Excel file FAILED,
## so this version collects all the tables as DataFrames first and converts them to Excel in a single pass

In [3]:
# MUST RUN !!!
# Sets JAVA_HOME for this kernel process before any tabula call is made.
# NOTE(review): tabula-py presumably needs this to locate a Java runtime — confirm;
# the path below is a placeholder and has to point at a real local JDK install.
import os
os.environ["JAVA_HOME"] = r"path\to\Java\jdk-21"

#### About parameters below
- 'keyword_pattern' - two ways to build the regex:
    1. the cell contains the keyword anywhere - re.compile(re.escape("NET INTEREST INCOME"),re.IGNORECASE)
    2. the cell contains only the keyword (nothing before or after it) - re.compile(r'^\s*\bNET INTEREST INCOME\b\s*$', re.IGNORECASE)
- 'check_nan' - whether to require every other column in the matched row to be NaN; useful for checking that the keyword is a heading. Example (keyword_pattern=abc):
|column1|column2|column3|result|
|:---:|:---:|:---:|:---:|
|123|NaN|NaN|no|
|abc|NaN|NaN|yes|
|abc|NaN|123|no|
|NaN|abc|NaN|yes|

In [4]:
# function 1 - search with keyword and return page number, line number, and column number
import tabula
import re
import pandas as pd

def find_keyword_in_table(pdf_path, keyword_pattern, check_nan=True):
    """Find, for every extracted table, the first row containing the keyword.

    Args:
        pdf_path: Path of the PDF passed to ``tabula.read_pdf``.
        keyword_pattern: Pattern handed to ``re.search`` and tested against
            ``str(cell)`` for each cell of each row.
        check_nan: When True, a matching cell only counts if every other
            column in that row is null; otherwise that row is skipped.

    Returns:
        A list of ``(table_number, row_number, column_number)`` tuples. All
        three values are 1-based and stored as strings, and there is at most
        one tuple per table (scanning of a table stops at its first hit).
    """
    # Extraction window in points: [top, left, bottom, right].
    full_page_area = [0, 0, 1000, 1000]

    extracted_tables = tabula.read_pdf(input_path=pdf_path, pages='all', stream=True, encoding='utf-8', area=full_page_area)

    def first_hit(frame):
        """Return (row_number, column_number) of the first accepted match, or None."""
        for line_no, record in enumerate(frame.itertuples(), start=1):
            cells = record[1:]  # position 0 of the namedtuple is the index
            matched_col = next(
                (pos for pos, value in enumerate(cells, start=1)
                 if re.search(keyword_pattern, str(value))),
                None,
            )
            if matched_col is None:
                continue
            if check_nan:
                # Only the first matching cell of a row is considered; the row
                # is rejected when any of its other cells holds a value.
                neighbours = cells[:matched_col - 1] + cells[matched_col:]
                if any(pd.notnull(value) for value in neighbours):
                    continue
            return line_no, matched_col
        return None

    hits = []
    # NOTE(review): table_no is the position in the list tabula returned;
    # keyword_content later passes it to tabula as a page number — confirm
    # that tabula yields exactly one table per page for these PDFs.
    for table_no, frame in enumerate(extracted_tables, start=1):
        found = first_hit(frame)
        if found is not None:
            hits.append((str(table_no), str(found[0]), str(found[1])))
    return hits

In [5]:
# function 2 - remove duplicated pages (if locations have same page, get the combination of lowest line number)

def unique_page(locations):
    """Keep one location per page: the one with the smallest line number.

    Args:
        locations: Iterable of location lists. Each location is a
            ``(page, line, column)`` triple whose items are ints or numeric
            strings. Empty lists are skipped.

    Returns:
        A list of single-item lists ``[(page, line, column)]``, one per
        distinct page, ordered by ascending page number. The values inside
        each tuple are returned unchanged (e.g. still strings).
    """
    best_by_page = {}

    # Every location of every sublist takes part, not just the first one of
    # each sublist, so a keyword found on several pages keeps all its pages.
    for sublist in locations:
        for entry in sublist or ():
            # Compare numerically: as strings, "10" would sort before "9".
            page_no, line_no = int(entry[0]), int(entry[1])
            current = best_by_page.get(page_no)
            # Strict "<" keeps the first-seen entry on ties, so two patterns
            # hitting the same line do not yield the same page twice.
            if current is None or line_no < int(current[1]):
                best_by_page[page_no] = tuple(entry)

    return [[best_by_page[page_no]] for page_no in sorted(best_by_page)]

In [6]:
# function 3 - get the table and store to a df (from the keyword line to the end of the page) 
## 'location' parameter - get from the 1st function
import re
from openpyxl import load_workbook

def keyword_content(pdf_path, location, check_nan=True):
    """Return the table rows from a keyword's line down to the end of its table.

    Args:
        pdf_path: Path of the PDF passed to ``tabula.read_pdf``.
        location: Iterable of ``(page, line, column)`` triples as produced by
            ``find_keyword_in_table`` (``line`` is 1-based). When it holds
            several triples, the result of the last one that yields a table
            is returned.
        check_nan: Unused; kept so existing callers keep working.

    Returns:
        A DataFrame holding the rows of the first table on the page, starting
        at the keyword's line. An empty DataFrame is returned when
        ``location`` is empty or tabula extracts no table for the page.
    """
    # Extraction window in points: [top, left, bottom, right].
    full_page_area = [0, 0, 1000, 1000]

    # Start from an empty frame so an empty `location` no longer ends in an
    # UnboundLocalError at the return statement.
    df = pd.DataFrame()

    for page, line, _column in location:
        tables = tabula.read_pdf(input_path=pdf_path, pages=page, stream=True, encoding='utf-8', area=full_page_area)
        if not tables:
            # Nothing extracted for this page; avoid IndexError on tables[0].
            continue

        # `line` is 1-based; slice by position from the keyword row onwards.
        first_row = max(int(line) - 1, 0)
        df = tables[0].iloc[first_row:]

    return df

In [7]:
# function 4 - convert the data frame to Excel
def df_to_excel(dfs, output_path):
    """Write every DataFrame in ``dfs`` to its own sheet of one Excel workbook.

    Sheets are named ``Table_1``, ``Table_2``, ... in input order and are
    written without index and without header row.

    Args:
        dfs: Iterable of DataFrames to export.
        output_path: Destination path of the workbook.

    Returns:
        None. Any exception raised while writing is caught and reported with
        ``print`` instead of being propagated.
    """
    try:
        with pd.ExcelWriter(output_path, engine='openpyxl') as workbook:
            for sheet_no, frame in enumerate(dfs, start=1):
                sheet_title = f"Table_{sheet_no}"
                frame.to_excel(workbook, sheet_name=sheet_title, index=False, header=False)
    except Exception as exc:
        print(f"An error occurred: {exc}")

In [8]:
# function 5 - combined all the functions above
def pdf_to_excel_by_keyword(heading_keywords, keywords, pdf_path=None, output_path=None, check_nan=True):
    """Extract the tables that follow the given keywords from a PDF into Excel.

    Pipeline: build one regex per keyword, locate each with
    ``find_keyword_in_table``, reduce to one location per page with
    ``unique_page``, read the rows below each location with
    ``keyword_content`` and write every result to its own sheet with
    ``df_to_excel``.

    Args:
        heading_keywords: Keywords that must make up the whole cell (only
            surrounding whitespace allowed). May be None or empty.
        keywords: Keywords that may appear anywhere in a cell. May be None
            or empty.
        pdf_path: Path of the PDF to read.
        output_path: Path of the Excel file to write.
        check_nan: Forwarded to ``find_keyword_in_table`` and
            ``keyword_content``.

    Raises:
        ValueError: If ``pdf_path`` or ``output_path`` is None, or if both
            keyword lists are empty/None.
    """
    if pdf_path is None or output_path is None:
        raise ValueError("Please provide data for 'pdf_path' or 'output_path'")

    # Treat None like an empty list so supplying only one of the two lists
    # works instead of failing later with a TypeError during iteration.
    heading_keywords = heading_keywords or []
    keywords = keywords or []

    if not heading_keywords and not keywords:
        raise ValueError("Heading keywords and Keywords list are empty")

    # Heading keywords: the cell holds the keyword and nothing else. The
    # keyword is escaped (as for `keywords`) so characters such as "(" or "."
    # are matched literally. "^\s*" / "\s*$" already bound the keyword, so no
    # "\b" is used; it would reject headings that begin or end with punctuation.
    heading_patterns = [
        re.compile(rf'^\s*{re.escape(heading_keyword)}\s*$', re.IGNORECASE)
        for heading_keyword in heading_keywords
    ]

    # Plain keywords: the cell contains the keyword anywhere.
    keyword_patterns = [
        re.compile(re.escape(keyword), re.IGNORECASE)
        for keyword in keywords
    ]

    all_patterns = heading_patterns + keyword_patterns

    # function 1 - get the locations of every keyword
    # NOTE: each call re-reads the whole PDF, once per pattern.
    locations = [
        find_keyword_in_table(pdf_path=pdf_path, keyword_pattern=pattern, check_nan=check_nan)
        for pattern in all_patterns
    ]

    # function 2 - keep one location per page (the one with the lowest line number)
    locations_unique_page = unique_page(locations=locations)

    # function 3 - get the content from the keyword's line to the end of the table
    dfs = [
        keyword_content(pdf_path=pdf_path, location=location, check_nan=check_nan)
        for location in locations_unique_page
    ]

    if not dfs:
        # A workbook needs at least one sheet; skip the write instead of
        # letting the Excel writer fail on an empty workbook.
        print(f"No keyword matches found in {pdf_path}; no Excel file written.")
        return

    # function 4 - convert the DataFrames to Excel - one table per sheet
    df_to_excel(dfs=dfs, output_path=output_path)

In [9]:
# Heading keywords: the cell must contain only the keyword (nothing before or after it)
heading_keywords = ["INCOME STATEMENTS",
                    "NET INTEREST INCOME",
                    "KEY INTEREST BEARING ASSETS AND LIABILITIES",
                    "STATEMENTS OF FINANCIAL POSITION"]

# Keywords: the cell only needs to contain the keyword somewhere
keywords = ["Impaired loans, advances and financing by economic purpose",
            "Impaired loans, advances and financing by geographical distribution",
            "Loans, advances and financing analysed by type of customers",
            "Loans, advances and financing analysed by geographical locations",
            "Loans, advances and financing analysed by economic purpose",
            "Movements in impaired loans, advances and financing (“impaired loans”)",
            "The capital adequacy ratios of the Group and of the Bank",
            "The breakdown of RWA by each major risk categories for the Group and the Bank"]

import os

# Raw string: in a normal literal the "\t" of "path\to\pdfs" is read as a TAB character.
folder_path = r"path\to\pdfs"

# Case-insensitive extension check so ".PDF" files are picked up too; sorted for a stable order.
pdf_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith('.pdf'))

for pdf_file in pdf_files:
    # Take the text after the last "AR" up to the next "_" as the year. The
    # extension is stripped first so a name with no "_" after the year
    # (e.g. "...AR2022.pdf") gives "2022" instead of "2022.pdf".
    stem = os.path.splitext(pdf_file)[0]
    year = stem.split('AR')[-1].split('_')[0]
    pdf_path = os.path.join(folder_path, pdf_file)
    output_path = os.path.join(folder_path, f"Maybank-AR{year}-raw.xlsx")

    pdf_to_excel_by_keyword(heading_keywords=heading_keywords, keywords=keywords, check_nan=True, pdf_path=pdf_path, output_path=output_path)