In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from urllib.parse import quote
import re
import camelot
import PyPDF2
from pathlib import Path
import shutil
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import text, create_engine
from wand.image import Image
from multiprocessing import Pool
import os
import pandas as pd
import shutil

# --- Folders on the network share (UNC paths) --------------------------------
prod_pdf_files_folder = Path("//luxor/data/board/Dev/PCMR/pdf")
pdf_files_folder = Path("//luxor/data/board/Dev/PCMR/original_pdf")
csv_tables_folder_path = Path("//luxor/data/board/Dev/PCMR/csv_tables")
jpg_tables_folder_path = Path("//luxor/data/board/Dev/PCMR/jpg_tables")

# Snapshot of the source PDFs, taken once when this cell runs.
pdf_files = [*pdf_files_folder.glob("*.pdf")]

# Warn about (but do not stop on) any working folder that is missing.
for _folder in (pdf_files_folder, jpg_tables_folder_path, csv_tables_folder_path):
    if not _folder.exists():
        print(_folder, "does not exist!")

# --- Databases ---------------------------------------------------------------
# MySQL credentials are read from the environment / a .env file.
load_dotenv()
host = os.getenv("DB_HOST")
database = os.getenv("DB_DATABASE")
user = os.getenv("DB_USER")
password = os.getenv("DB_PASS")
engine = create_engine(
    f"mysql+mysqldb://{user}:{password}@{host}/{database}?charset=utf8"
)

# SQL Server engine for the CS_Prod database (no credentials in the URL).
engine2 = create_engine(
    "mssql+pyodbc://psql21cap/CS_Prod?driver=SQL+Server+Native+Client+11.0"
)

# --- Selenium / REGDOCS search settings --------------------------------------
chrome_options = Options()
chrome_options.add_argument("--headless")
search_url = "https://apps.cer-rec.gc.ca/REGDOCS/Search?txthl="
search_params = "&w=0&m=0&lf=2"

In [None]:
def copy_and_rename_pdfs():
    """Copy every PDF listed in the ``pdfs`` table into the production folder.

    For each row, ``<pdfName>.pdf`` in ``pdf_files_folder`` is copied to
    ``<pdfId>.pdf`` in ``prod_pdf_files_folder``. Errors from the query or
    from a copy are not caught here; the first one stops the run.
    """
    query = text("SELECT * FROM pdfs;")
    with engine.connect() as connection:
        pdf_rows = pd.read_sql(query, connection).to_dict("records")
        for row in pdf_rows:
            source = pdf_files_folder / (str(row["pdfName"]) + ".pdf")
            target = prod_pdf_files_folder / (str(row["pdfId"]) + ".pdf")
            shutil.copy(source, target)


# Runs the copy as soon as this cell is executed.
copy_and_rename_pdfs()

In [None]:
def get_number_of_pages(pdf_path):
    """Return the page count of the PDF at ``pdf_path``.

    Args:
        pdf_path: ``pathlib.Path``-like object; it is opened in binary mode.

    Returns:
        The value reported by the reader's ``getNumPages()``.
    """
    with pdf_path.open("rb") as stream:
        pdf_reader = PyPDF2.PdfFileReader(stream)
        # Encrypted files are unlocked with an empty password before counting.
        if pdf_reader.isEncrypted:
            pdf_reader.decrypt("")
        return pdf_reader.getNumPages()

In [None]:
def get_pdf_metadata(pdf_path):
    """Look up a PDF's record in the ``DTreeCore`` table of ``CS_Prod``.

    Rows are matched with ``Name LIKE '<stem>%'``, i.e. any name that starts
    with the file name minus its extension. Because this is a LIKE pattern,
    ``%`` and ``_`` inside the stem act as wildcards.

    Args:
        pdf_path: ``pathlib.Path`` of the PDF; only ``pdf_path.stem`` is used.

    Returns:
        dict with the keys ``ParentID``, ``DataID`` and ``CreateDate`` of the
        first matching row. The query has no ORDER BY, so when several rows
        match, which one comes first is up to the database.

    Raises:
        IndexError: if no row matches the file name.
    """
    statement = text("SELECT ParentID, DataID, CreateDate FROM [CS_Prod].[dbo].[DTreeCore] WHERE Name LIKE :file_name")
    with engine2.connect() as conn:
        df = pd.read_sql(statement, conn, params={"file_name": pdf_path.stem + "%"})

    records = df.to_dict("records")
    if not records:
        # Name the file in the error instead of failing with a bare
        # "list index out of range".
        raise IndexError(f"No DTreeCore record found for '{pdf_path.stem}'")
    return records[0]

In [None]:
def check_if_file_is_in_db_already(pdf_path):
    """Return True when the ``pdfs`` table already has a row for this PDF.

    Args:
        pdf_path: ``pathlib.Path`` of the PDF; ``pdf_path.stem`` is compared
            with the ``pdfName`` column.

    Returns:
        bool: True if at least one matching row exists, otherwise False.
    """
    # Only existence matters, so select a constant and stop at the first match.
    statement = text("SELECT 1 FROM pdfs WHERE pdfName = :pdf_name LIMIT 1;")
    with engine.connect() as conn:
        result = conn.execute(statement, {"pdf_name": pdf_path.stem})
        # Fetch a row instead of reading result.rowcount: rowcount is only
        # dependable for UPDATE/DELETE and may be -1 or 0 for a SELECT,
        # depending on the driver and cursor type.
        return result.first() is not None

In [None]:
# Function gets the metadata for a PDF from REGDOCS and adds it to the DB

def insert_pdf(pdf_path):
    """Add one PDF to the ``pdfs`` table unless it is already there.

    The row combines the record returned by ``get_pdf_metadata`` (``DataID``,
    ``ParentID``, ``CreateDate``) with the file name, the file size in MB and
    the page count.

    Args:
        pdf_path: ``pathlib.Path`` of the PDF file on disk.

    Returns:
        None. The outcome is reported with ``print``. Any exception is caught
        and printed so that a batch run can continue with the next file.
    """
    try:
        if check_if_file_is_in_db_already(pdf_path):
            return

        metadata = get_pdf_metadata(pdf_path)
        metadata["pdf_name"] = pdf_path.stem
        # Size in MB, truncated (not rounded) to two decimals.
        metadata["pdf_size"] = int(pdf_path.stat().st_size / 1024 / 1024 * 100) / 100
        metadata["total_pages"] = get_number_of_pages(pdf_path)

        statement = text(
            "INSERT INTO pdfs (pdfId, pdfName, pdfSize, filingId, date, totalPages) "
            " VALUE (:DataID, :pdf_name, :pdf_size, :ParentID, :CreateDate, :total_pages);"
        )
        # engine.begin() commits when the block exits cleanly and rolls back
        # on error. A plain engine.connect() leaves the INSERT uncommitted on
        # SQLAlchemy versions without autocommit.
        with engine.begin() as conn:
            result = conn.execute(statement, metadata)
            # Read the count while the connection is still open.
            inserted_rows = result.rowcount
        print(f"Successfully inserted {inserted_rows} rows for {pdf_path.stem}")
    except Exception as e:
        # Report against this call's own argument, not a global loop variable.
        print(f"Error for {pdf_path.stem}: {e}")

In [None]:
# Register every PDF found in pdf_files_folder in the DB. insert_pdf skips
# files that are already present and prints (rather than raises) any per-file
# error, so one bad file does not stop the batch.

for pdf in pdf_files:
    insert_pdf(pdf)

print(f"Done processing {len(pdf_files)} PDF file")