In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from urllib.parse import quote
import re
import camelot
import PyPDF2
from pathlib import Path
import shutil
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import text, create_engine
from wand.image import Image
from multiprocessing import Pool
import os
import pandas as pd
import shutil

# --- Network-share folders used by the notebook -------------------------------------------
pdf_files_folder = Path("//luxor/data/board/Dev/PCMR/pdf_files")
csv_tables_folder = Path("//luxor/data/board/Dev/PCMR/csv_tables")
jpg_tables_folder = Path("//luxor/data/board/Dev/PCMR/jpg_tables")
pdf_files = list(pdf_files_folder.glob("*.pdf"))

# Report the first missing folder (checked in the order pdf, jpg, csv); the for/else
# branch only runs when no folder was reported missing.
for _folder in (pdf_files_folder, jpg_tables_folder, csv_tables_folder):
    if not _folder.exists():
        print(_folder, "does not exist!")
        break
else:
    print("All paths are accessible.")

# --- Database connections -----------------------------------------------------------------
# Credentials for the MySQL database come from the environment (optionally a .env file).
load_dotenv()
host, database, user, password = (
    os.getenv(key) for key in ("DB_HOST", "DB_DATABASE", "DB_USER", "DB_PASS")
)
# NOTE(review): user/password are placed in the URL verbatim; a value containing
# characters such as "@" or "/" would need URL-escaping — confirm how .env stores them.
engine = create_engine(f"mysql+mysqldb://{user}:{password}@{host}/{database}?charset=utf8")

# Second engine: the SQL Server database CS_Prod on host psql21cap (no credentials in the URL).
engine2 = create_engine("mssql+pyodbc://psql21cap/CS_Prod?driver=SQL+Server+Native+Client+11.0")

# --- Selenium / REGDOCS search settings (not referenced by the cells visible here) --------
chrome_options = Options()
chrome_options.add_argument("--headless")
search_url = "https://apps.cer-rec.gc.ca/REGDOCS/Search?txthl="
search_params = "&w=0&m=0&lf=2"

In [None]:
def get_number_of_pages(pdf_path):
    """Return the number of pages in the PDF located at ``pdf_path``.

    Args:
        pdf_path: path-like object exposing ``open("rb")`` (e.g. ``pathlib.Path``).

    Returns:
        The page count reported by ``PyPDF2.PdfFileReader.getNumPages()``.
    """
    with pdf_path.open("rb") as stream:
        pdf_reader = PyPDF2.PdfFileReader(stream)
        # Encrypted documents are unlocked with an empty password before counting pages.
        if pdf_reader.isEncrypted:
            pdf_reader.decrypt("")
        return pdf_reader.getNumPages()

def check_if_file_is_in_db_already(pdf_path):
    """Tell whether the ``pdfs`` table already has a row for this PDF.

    The lookup key is ``pdf_path.stem`` matched against the ``pdfName`` column.

    Args:
        pdf_path: path-like object whose ``stem`` is the PDF name stored in the DB.

    Returns:
        ``True`` if at least one matching row exists, ``False`` otherwise.
    """
    # Only existence matters, so ask for a single constant row instead of SELECT *.
    statement = text("SELECT 1 FROM pdfs WHERE pdfName = :pdf_name LIMIT 1;")
    with engine.connect() as conn:
        # Fetch a row rather than trusting ``rowcount``: for SELECT statements the
        # DBAPI/SQLAlchemy do not guarantee a meaningful rowcount (it may be -1).
        row = conn.execute(statement, {"pdf_name": pdf_path.stem}).first()
    return row is not None

# insert_pdf: gets the metadata for a PDF from REGDOCS and adds it to the DB

def insert_pdf(pdf_path):
    """Look up a PDF's metadata and insert one row for it into the ``pdfs`` table.

    Files that ``check_if_file_is_in_db_already`` reports as present are skipped.
    Any failure is reported with ``print`` and swallowed so that a batch run over
    many files continues with the next file.

    Args:
        pdf_path: ``pathlib.Path`` of the PDF; ``stem`` is used as the PDF name and
            ``stat().st_size`` for its size.

    Returns:
        None.
    """
    try:
        if check_if_file_is_in_db_already(pdf_path):
            return

        metadata = get_pdf_metadata(pdf_path)
        metadata["pdf_name"] = pdf_path.stem
        # Size in MiB, truncated (not rounded) to two decimal places.
        metadata["pdf_size"] = int(pdf_path.stat().st_size / 1024 / 1024 * 100) / 100
        metadata["total_pages"] = get_number_of_pages(pdf_path)

        statement = text("INSERT INTO pdfs (pdfId, pdfName, pdfSize, filingId, date, totalPages) " +
                         "VALUE (:DataID,:pdf_name,:pdf_size,:ParentID,:CreateDate,:total_pages);")
        # engine.begin() commits on success and rolls back on error, so the write
        # does not depend on the driver's/library's implicit autocommit behaviour.
        with engine.begin() as conn:
            inserted = conn.execute(statement, metadata).rowcount
        print(f"Successfully inserted {inserted} rows for {pdf_path.stem}")
    except Exception as e:
        # Use the function's own argument here: the previous `pdf.stem` referred to
        # a global that only exists while a caller happens to loop with that name.
        print(f"Error for {pdf_path.stem}: {e}")

def get_pdf_metadata(pdf_path):
    """Fetch ParentID, DataID and CreateDate for a PDF from ``DTreeCore``.

    Rows are matched with ``Name LIKE '<stem>%'`` through ``engine2``; when several
    rows match, the first one returned by the database is used.

    Args:
        pdf_path: path-like object whose ``stem`` is the beginning of the ``Name`` value.

    Returns:
        dict with the keys ``ParentID``, ``DataID`` and ``CreateDate``.

    Raises:
        IndexError: if no row matches (same exception type as before, but with a
            message that names the file instead of "list index out of range").
    """
    statement = text("SELECT ParentID, DataID, CreateDate FROM [CS_Prod].[dbo].[DTreeCore] " +
                     "WHERE Name LIKE :file_name")
    with engine2.connect() as conn:
        matches = pd.read_sql(statement, conn, params={"file_name": pdf_path.stem + "%"})

    records = matches.to_dict("records")
    if not records:
        raise IndexError(f"no DTreeCore record found with a Name starting with '{pdf_path.stem}'")
    return records[0]

In [None]:
# Driver cell (not a function): calls insert_pdf for every PDF collected in pdf_files,
# so each file's metadata is looked up and added to the DB. insert_pdf itself skips
# files that are already present and prints its own success/error line per file.
print(f"Working on {len(pdf_files)} items:")

for pdf in pdf_files:
    insert_pdf(pdf)

print(f"Done processing {len(pdf_files)} PDF files")

In [None]:
#############################################################################
# The following cells are for processing the tables after capturing is done #
#############################################################################

def populate_coordinates(table):
    """Scale a table's bounding box to the PDF page size and store it in ``tables``.

    The page is opened with wand to obtain its pixel width/height; the box
    (x1, x2, y1, y2), expressed relative to ``pageWidth``/``pageHeight``, is scaled
    to those dimensions and written to pdfWidth, pdfHeight, pdfX1, pdfX2, pdfY1 and
    pdfY2 of the row identified by ``tableId``. Failures are printed and swallowed
    so a batch run continues with the next table.

    Args:
        table: row-like object with the attributes pdfName, page, x1, x2, y1, y2,
            pageWidth, pageHeight and tableId (e.g. a ``DataFrame.itertuples`` row).

    Returns:
        None.
    """
    try:
        pdf = pdf_files_folder.joinpath(f"{table.pdfName}.pdf").resolve()
        # The "[n]" suffix asks wand/ImageMagick for a single page; the "- 1"
        # presumably converts a 1-based table.page to a 0-based index — TODO confirm.
        with Image(filename=f"{pdf}[{table.page - 1}]") as page_image:
            pdf_width = page_image.width
            pdf_height = page_image.height

        params = {
            "pdf_width": pdf_width,
            "pdf_height": pdf_height,
            "x1": int(table.x1 * pdf_width / table.pageWidth),
            "x2": int(table.x2 * pdf_width / table.pageWidth),
            "y1": int(table.y1 * pdf_height / table.pageHeight),
            "y2": int(table.y2 * pdf_height / table.pageHeight),
            "table_id": table.tableId,
        }
        # Bound parameters (like the other queries in this notebook) instead of
        # interpolating values into the SQL string.
        statement = text("UPDATE tables SET pdfWidth = :pdf_width, pdfHeight = :pdf_height, " +
                         "pdfX1 = :x1, pdfX2 = :x2, pdfY1 = :y1, pdfY2 = :y2 " +
                         "WHERE tableId = :table_id;")
        # Connect only once the (slow) page rendering is done; begin() commits on success.
        with engine.begin() as conn:
            conn.execute(statement, params)
        print(f"Populated coordinates for table ID {table.tableId}")
    except Exception as e:
        print(f"Error for {table.pdfName} - page {table.page}: {e}")

In [None]:
# Driver cell: load every row of `tables` whose pdfX1 is still NULL and run
# populate_coordinates on each of them.
statement = text("SELECT * FROM tables WHERE pdfX1 IS NULL;")
with engine.connect() as conn:
    df = pd.read_sql(statement, conn)
tables = [*df.itertuples()]
print(f"Working on {len(tables)} items:")

# populate_coordinates reports through print; `results` just collects its return values.
results = list(map(populate_coordinates, tables))
print("Done")

In [None]:
def create_args_for_image_extraction():
    """Build the argument tuples for extracting table images.

    Reads every row of ``tables`` whose ``imageExtracted`` column is NULL.

    Returns:
        A list with one ``(row_dict, engine, pdf_files_folder, jpg_tables_folder)``
        tuple per such row, where ``row_dict`` maps column names to values.
    """
    query = text("SELECT * FROM tables WHERE imageExtracted IS NULL;")
    with engine.connect() as connection:
        pending_rows = pd.read_sql(query, connection).to_dict("records")

    return [(row, engine, pdf_files_folder, jpg_tables_folder) for row in pending_rows]
# create_args_for_image_extraction()

In [None]:
# Driver cell: run ext_funcs.extract_image once per pending table and show each outcome.
import ext_funcs

args = create_args_for_image_extraction()

# Each element of `args` is passed to extract_image as a single tuple argument.
results = list(map(ext_funcs.extract_image, args))

for result in results:
    print(result)