In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from urllib.parse import quote
import re
import camelot
import PyPDF2
from pathlib import Path
import shutil
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import text, create_engine
from wand.image import Image
from multiprocessing import Pool
import os
import pandas as pd
import importlib
import time
import tika
from tika import parser

%reload_ext autoreload

# Shared network folders: the source PDFs plus the output folders for the
# extracted CSV tables and table images.
pdf_files_folder = Path("//luxor/data/board/Dev/PCMR/pdf_files")
csv_tables_folder = Path("//luxor/data/board/Dev/PCMR/csv_tables")
jpg_tables_folder = Path("//luxor/data/board/Dev/PCMR/jpg_tables")
pdf_files = list(pdf_files_folder.glob("*.pdf"))

# Report every inaccessible folder (not only the first one found) so that all
# path problems can be fixed in a single pass.
missing_folders = [
    folder
    for folder in (pdf_files_folder, jpg_tables_folder, csv_tables_folder)
    if not folder.exists()
]
for folder in missing_folders:
    print(folder, "does not exist!")
if not missing_folders:
    print("All paths are accessible.")

# DB settings are read from the environment (load_dotenv() is called first so
# that a .env file can supply them).
load_dotenv()
host = os.getenv("DB_HOST")
database = os.getenv("DB_DATABASE")
user = os.getenv("DB_USER")
password = os.getenv("DB_PASS")

# An unset variable would otherwise be formatted into the URL as the literal
# text "None" and only fail later with a confusing connection error.
missing_settings = [
    name
    for name in ("DB_HOST", "DB_DATABASE", "DB_USER", "DB_PASS")
    if os.getenv(name) is None
]
if missing_settings:
    print("Missing environment variables:", ", ".join(missing_settings))

# NOTE(review): user and password are interpolated verbatim. SQLAlchemy expects
# special characters (e.g. "@", "/", ":") in URL credentials to be
# percent-encoded -- confirm the stored values are URL-safe.
engine_string = f"mysql+mysqldb://{user}:{password}@{host}/{database}?charset=utf8mb4"
engine = create_engine(engine_string)
# Second connection (SQL Server via pyodbc); the URL carries no credentials.
engine2_string = "mssql+pyodbc://psql21cap/CS_Prod?driver=SQL+Server+Native+Client+11.0"
engine2 = create_engine(engine2_string)

In [None]:
############################################################################
# The following cells are for importing PDFs to the DB to commence capturing
############################################################################

In [None]:
# Run insert_pdf over every PDF found in pdf_files_folder. Per the original
# note, insert_pdf fetches each PDF's metadata from REGDOCS and adds it to the DB.
from ext_funcs import insert_pdf

# One task per PDF: the file path plus both connection strings.
args = [(pdf, engine_string, engine2_string) for pdf in pdf_files]
print(f"Items to process: {len(args)}")
start_time = time.time()

# To debug a single worker, run sequentially instead of through the pool:
#     for arg in args:
#         print(insert_pdf(arg)[:-1])

# Parallel run: Pool() sizes itself to the CPU count; chunksize=1 hands out
# one PDF at a time.
with Pool() as pool:
    results = pool.map(insert_pdf, args, chunksize=1)

# Emit every worker message back to back, exactly as returned.
print(*results, sep='', end='', flush=True)

duration = round(time.time() - start_time)
print(f"Done {len(args)} in {duration} seconds ({round(duration/60, 2)} min or {round(duration/3600, 2)} hours)")

In [None]:
#############################################################################
# The following cells are for processing the tables after capturing is done #
#############################################################################

In [None]:
# CAREFUL! DELETES **ALL** THE TABLES!!!
# with engine.connect() as conn:
#     result = conn.execute("DELETE FROM tables;")
#     print(f"Deleted {result.rowcount} tables from DB")

# csvs = list(csv_tables_folder.glob("*.csv"))
# for f in csvs:
#     f.unlink()
# print(f"Deleted {len(csvs)} CSV files")

# jpgs = list(jpg_tables_folder.glob("*.jpg"))
# for f in jpgs:
#     f.unlink()
# print(f"Deleted {len(jpgs)} JPG files")

In [None]:
# CAREFUL! DELETES **ALL** THE CSVS AND IMAGES, and resets the CORRECT_CSV fields!!!
# NOTE: this cell is destructive and runs unconditionally, so it also fires
# during "Restart & Run All".
#
# All DB changes are issued inside engine.begin(), which commits when the
# block exits normally and rolls back if a statement raises. Files are removed
# only after the DB step has finished, so a failed file deletion cannot
# interrupt the DB reset half-way through.
with engine.begin() as conn:
    deleted_csv_rows = conn.execute(text("DELETE FROM csvs;")).rowcount
    reset_csvs_extracted = conn.execute(
        text("UPDATE tables SET csvsExtracted = NULL WHERE csvsExtracted IS NOT NULL;")
    ).rowcount
    reset_correct_csv = conn.execute(
        text("UPDATE tables SET correct_csv = NULL WHERE correct_csv IS NOT NULL;")
    ).rowcount
    reset_image_extracted = conn.execute(
        text("UPDATE tables SET imageExtracted = NULL WHERE imageExtracted IS NOT NULL;")
    ).rowcount

# Reported only once the transaction block has completed successfully.
print(f"Deleted {deleted_csv_rows} csvs from DB")
print(f"Reset {reset_csvs_extracted} tables (csvsExtracted) from DB")
print(f"Reset {reset_correct_csv} tables (correct_csv) from DB")
print(f"Reset {reset_image_extracted} tables (imageExtracted) from DB")

# Remove the extracted CSV files.
csvs = list(csv_tables_folder.glob("*.csv"))
for f in csvs:
    f.unlink()
print(f"Deleted {len(csvs)} CSV files")

# Remove the extracted table images.
jpgs = list(jpg_tables_folder.glob("*.jpg"))
for f in jpgs:
    f.unlink()
print(f"Deleted {len(jpgs)} JPG files")

In [15]:
def populate_coordinates(table):
    """Rescale a table's bounding box to the loaded PDF page and save it to the DB.

    The table's page is opened with Wand to read its width and height. The
    corners ``x1``/``x2``/``y1``/``y2`` (expressed relative to ``pageWidth``
    and ``pageHeight``) are rescaled to that size, truncated to integers and
    written, together with the page size, to the ``pdfWidth``, ``pdfHeight``,
    ``pdfX1``, ``pdfX2``, ``pdfY1`` and ``pdfY2`` columns of the matching row
    in ``tables``.

    Args:
        table: Row-like object exposing the attributes ``pdfName``, ``page``,
            ``x1``, ``x2``, ``y1``, ``y2``, ``pageWidth``, ``pageHeight`` and
            ``tableId``. ``page - 1`` is used as the page index when opening
            the PDF.

    Returns:
        None. Any exception is printed and swallowed so that one failing
        table does not abort a batch run.
    """
    try:
        # Read the page size first so no DB connection is held open while the
        # PDF page is being loaded.
        pdf = pdf_files_folder.joinpath(f"{table.pdfName}.pdf").resolve()
        with Image(filename=f"{pdf}[{table.page - 1}]") as i:
            pdf_width = i.width
            pdf_height = i.height

        params = {
            "pdf_width": pdf_width,
            "pdf_height": pdf_height,
            "x1": int(table.x1 * pdf_width / table.pageWidth),
            "x2": int(table.x2 * pdf_width / table.pageWidth),
            "y1": int(table.y1 * pdf_height / table.pageHeight),
            "y2": int(table.y2 * pdf_height / table.pageHeight),
            # The original query compared tableId against a quoted literal;
            # binding it as a string keeps that comparison while letting the
            # driver handle the escaping.
            "table_id": str(table.tableId),
        }
        query = text(
            "UPDATE tables SET pdfWidth=:pdf_width, pdfHeight=:pdf_height, "
            "pdfX1=:x1, pdfX2=:x2, pdfY1=:y1, pdfY2=:y2 "
            "WHERE tableId=:table_id;"
        )
        # engine.begin() commits on success and rolls back on error.
        with engine.begin() as conn:
            conn.execute(query, params)
    except Exception as e:
        print(f"Error for {table.pdfName} - page {table.page}: {e}")

In [16]:
# Fill in the PDF-space coordinates for every table row whose pdfX1 is still NULL.
start_time = time.time()

statement = text("SELECT * FROM tables WHERE pdfX1 IS NULL;")
with engine.connect() as conn:
    df = pd.read_sql(statement, conn)

# Named tuples give populate_coordinates attribute access to each column.
tables = list(df.itertuples())
print(f"Working on {len(tables)} items:")

# Sequential on purpose: populate_coordinates uses the notebook-level engine.
for table in tables:
    populate_coordinates(table)

duration = round(time.time() - start_time)
print(
    f"Done {len(tables)} in {duration} seconds ({round(duration/60, 2)} min or {round(duration/3600, 2)} hours)"
)

Working on 4 items:
Done 4 in 2 seconds (0.03 min or 0.0 hours)


In [17]:
def create_args_for_image_extraction():
    """Build one argument tuple per table that still needs its image extracted.

    Selects every row of ``tables`` where ``imageExtracted`` is NULL and
    ``pdfX1`` is set, and pairs each row (as a dict) with the DB connection
    string and the PDF / JPG folder paths converted to strings.

    Returns:
        list[tuple]: ``(table_row, engine_string, pdf_folder, jpg_folder)``
        for every selected table.
    """
    query = text(
        "SELECT * FROM tables WHERE imageExtracted IS NULL AND pdfX1 IS NOT NULL;"
    )
    with engine.connect() as connection:
        rows = pd.read_sql(query, connection).to_dict("records")

    pdf_dir = str(pdf_files_folder)
    jpg_dir = str(jpg_tables_folder)
    return [(row, engine_string, pdf_dir, jpg_dir) for row in rows]

In [18]:
# Extract an image for every table that has coordinates but no image yet.
from ext_funcs import extract_image

args = create_args_for_image_extraction()
print(f"Items to process: {len(args)}")
start_time = time.time()

# To debug a single worker, run sequentially instead of through the pool:
#     results = [extract_image(arg) for arg in args]

# Parallel run: chunksize=1 hands the tables out one at a time.
with Pool() as pool:
    results = pool.map(extract_image, args, chunksize=1)

# Emit every worker message back to back, exactly as returned.
print(*results, sep='', end='', flush=True)

duration = round(time.time() - start_time)
print(f"Done {len(args)} in {duration} seconds ({round(duration/60, 2)} min or {round(duration/3600, 2)} hours)")

Items to process: 4
Done 4 in 6 seconds (0.1 min or 0.0 hours)


In [None]:
def create_args_for_csv_extraction():
    """Build one argument tuple per table that still needs its CSVs extracted.

    Selects every row of ``tables`` where ``pdfX1`` is set and
    ``csvsExtracted`` is NULL, and pairs each row (as a dict) with the DB
    connection string and the PDF / CSV folder paths converted to strings.

    Returns:
        list[tuple]: ``(table_row, engine_string, pdf_folder, csv_folder)``
        for every selected table.
    """
    query = text(
        "SELECT * FROM tables WHERE pdfX1 IS NOT NULL AND csvsExtracted IS NULL;"
    )
    with engine.connect() as connection:
        rows = pd.read_sql(query, connection).to_dict("records")

    pdf_dir = str(pdf_files_folder)
    csv_dir = str(csv_tables_folder)
    return [(row, engine_string, pdf_dir, csv_dir) for row in rows]

In [19]:
# Extract the CSVs for every table that has coordinates but no CSVs yet.
from ext_funcs import extract_csv

args = create_args_for_csv_extraction()
print(f"Items to process: {len(args)}")
start_time = time.time()

# To debug a single worker, run sequentially instead of through the pool:
#     results = [extract_csv(arg) for arg in args]

# Parallel run: chunksize=1 hands the tables out one at a time.
with Pool() as pool:
    results = pool.map(extract_csv, args, chunksize=1)

# Emit every worker message back to back, exactly as returned.
print(*results, sep='', end='', flush=True)

duration = round(time.time() - start_time)
print(f"Done {len(args)} in {duration} seconds ({round(duration/60, 2)} min or {round(duration/3600, 2)} hours)")

Items to process: 4
Done 4 in 7 seconds (0.12 min or 0.0 hours)
