In [None]:
from pathlib import Path
from dotenv import load_dotenv
from multiprocessing import Pool
import time
from sqlalchemy import text, create_engine
import os
import pandas as pd
import json

%load_ext autoreload
%reload_ext autoreload
%autoreload 2

# Input PDFs and the output folder for extracted CSV tables live on a network share.
# Forward slashes only: pathlib accepts them on Windows, and a bare backslash
# before "V" would be an invalid escape sequence in a normal string literal.
pdf_files_folder = Path("//luxor/data/branch/Environmental Baseline Data/Version 4 - Final/PDF")
csv_tables_folder = Path("//luxor/data/branch/Environmental Baseline Data/Version 4 - Final/all_csvs")

# Check every folder independently so that all missing paths are reported at once.
missing_folders = [
    folder for folder in (pdf_files_folder, csv_tables_folder) if not folder.exists()
]
for folder in missing_folders:
    print(folder, "does not exist!")
if not missing_folders:
    print("All paths are accessible.")


# Database credentials come from the environment / .env file;
# override=True lets values in .env replace variables that are already set.
load_dotenv(override=True)
host = os.getenv("DB_HOST")
# NOTE(review): DB_DATABASE is read here, but the connection URL below hardcodes
# the "esa" schema — confirm which one is intended.
database = os.getenv("DB_DATABASE")
user = os.getenv("DB_USER")
password = os.getenv("DB_PASS")
# The URL is kept as a plain string because it is also placed in the per-page
# argument tuples built by create_args_for_csv_extraction().
engine_string = f"mysql+mysqldb://{user}:{password}@{host}/esa?charset=utf8"
engine = create_engine(engine_string)

In [None]:
# Reset the state left by a previous run: empty the esa.csvs table, then delete
# every *.csv file in csv_tables_folder.
# engine.begin() opens a transaction that is committed when the block exits
# without an error, so the DELETE is persisted even where connections do not
# autocommit. text() wraps the raw SQL, matching how the SELECT statement in
# create_args_for_csv_extraction() is built.
with engine.begin() as conn:
    result = conn.execute(text("DELETE FROM esa.csvs;"))
    print(f"Deleted {result.rowcount} csvs from DB")

# Materialise the glob first so the reported count matches what was deleted.
csvs = list(csv_tables_folder.glob("*.csv"))
for csv_file in csvs:
    csv_file.unlink()
print(f"Deleted {len(csvs)} CSV files")

In [None]:
def create_args_for_csv_extraction():
    """Build one argument tuple per PDF page for the CSV-extraction step.

    Reads every row of ``esa.pdfs`` (ordered by ``totalPages`` ascending) and
    expands each PDF into its pages, numbered from 1 to ``totalPages``.

    Returns:
        list[tuple]: tuples of the form
        ``(pdfId, page_number, engine_string, pdf_folder, csv_folder)``,
        where the two folders are the string forms of ``pdf_files_folder``
        and ``csv_tables_folder``.
    """
    query = text("SELECT * FROM esa.pdfs ORDER BY totalPages ASC;")
    with engine.connect() as connection:
        pdf_records = pd.read_sql(query, connection).to_dict("records")

    # The folder strings are identical for every page, so convert them once.
    pdf_dir = str(pdf_files_folder)
    csv_dir = str(csv_tables_folder)

    return [
        (record["pdfId"], page_number, engine_string, pdf_dir, csv_dir)
        for record in pdf_records
        for page_number in range(1, record["totalPages"] + 1)
    ]

In [None]:
from external import extract_csv

# Run extract_csv over every page argument tuple in a process pool and keep a
# transcript of the run in log.txt (also echoed to the cell output).
with Path("log.txt").open("w", encoding="utf-8-sig") as log_file:

    def log_it(message):
        """Write ``message`` to the log file and print it."""
        log_file.write(message)
        print(message)

    start_time = time.time()
    pages = create_args_for_csv_extraction()
    log_it(f"Items to process: {len(pages)} at {time.ctime(start_time)}\n\n")

    # pool.map returns the results in the same order as the input tuples.
    with Pool() as pool:
        results = pool.map(extract_csv, pages)

    # Log each worker result only after the whole pool has finished.
    for result in results:
        log_it(result)

    duration = round(time.time() - start_time)
    log_it(f"\nDone {len(pages)} items in {duration} seconds ({round(duration/60, 2)} min or {round(duration/3600, 2)} hours)")