# In [None]:
# Elevation Certificate Parser with Region Preview Tool (Sections A–D + C2 Box Tuning)

import os
import re
import pytesseract
import pandas as pd
from PIL import Image, ImageDraw
from pdf2image import convert_from_path
import cv2
import numpy as np
import matplotlib.pyplot as plt

# --- Preview tool for bounding box tuning ---
def preview_c2_regions(pdf_path, page_num=3):
    """Show each Section C2 crop next to its raw OCR text, for box tuning.

    Renders one page of the PDF at 300 DPI, crops the eight hard-coded C2
    regions from it, and displays a two-column matplotlib figure: the
    grayscale crop on the left, and the Tesseract output for the thresholded
    crop on the right.

    Args:
        pdf_path: Path to the elevation certificate PDF.
        page_num: 1-based number of the page to preview (default 3).

    Raises:
        ValueError: If ``page_num`` is smaller than 1.
        IndexError: If the PDF has fewer than ``page_num`` pages.
    """
    if page_num < 1:
        raise ValueError(f"page_num must be >= 1, got {page_num}")

    # Rasterize only the requested page; converting the whole document at
    # 300 DPI just to look at one page wastes time and memory.
    pages = convert_from_path(
        pdf_path, dpi=300, first_page=page_num, last_page=page_num
    )
    if not pages:
        raise IndexError(f"{pdf_path} has no page {page_num}")
    image = pages[0].convert("RGB")

    # (x, y, width, height) in pixels of the page image rendered above.
    boxes = {
        "C2a_Top of Bottom Floor": (180, 650, 250, 35),
        "C2b_Top of Next Higher Floor": (180, 690, 250, 35),
        "C2c_Bottom of Lowest Horizontal Structural Member": (180, 730, 250, 35),
        "C2d_Attached Garage (Top of Slab)": (180, 770, 250, 35),
        "C2e_Lowest Elevation of Machinery": (180, 810, 250, 35),
        "C2f_Lowest Adjacent Grade (LAG)": (180, 850, 250, 35),
        "C2g_Highest Adjacent Grade (HAG)": (180, 890, 250, 35),
        "C2h_Finished LAG at Lowest Deck/Stairs": (180, 930, 250, 35)
    }

    # One row per box: column 0 shows the crop, column 1 the OCR text.
    fig, axs = plt.subplots(len(boxes), 2, figsize=(10, 20))
    for idx, (label, (x, y, w, h)) in enumerate(boxes.items()):
        crop = image.crop((x, y, x + w, y + h)).convert("L")
        axs[idx, 0].imshow(crop, cmap="gray")
        axs[idx, 0].axis("off")
        axs[idx, 0].set_title(f"{label}")

        # Same preprocessing as the region extractor: binarize at 150, then
        # OCR with page segmentation mode 7 (treat the image as one text line).
        crop_np = np.array(crop)
        _, thresh = cv2.threshold(crop_np, 150, 255, cv2.THRESH_BINARY)
        text = pytesseract.image_to_string(thresh, config="--psm 7")
        axs[idx, 1].text(0.1, 0.5, text.strip(), fontsize=12)
        axs[idx, 1].axis("off")
        axs[idx, 1].set_title("OCR Output")

    plt.tight_layout()
    plt.show()

# --- Region-Based OCR for C2 fields ---
def extract_c2_fields_by_region(image, boxes=None):
    """OCR the Section C2 elevation fields from fixed regions of a page image.

    Each region is cropped, converted to grayscale, binarized at 150 and
    passed to Tesseract in single-line mode (``--psm 7``). The first signed
    decimal number found in the OCR text becomes the field value.

    Args:
        image: PIL image of the page that holds Section C2.
        boxes: Optional mapping of field label to ``(x, y, width, height)``
            in pixels of ``image``. When omitted, the built-in coordinates
            for the eight C2 fields are used.

    Returns:
        dict mapping each label to a float, or ``None`` when no decimal
        number was recognized in that region.
    """
    if boxes is None:
        boxes = {
            "C2a_Top of Bottom Floor": (180, 650, 250, 35),
            "C2b_Top of Next Higher Floor": (180, 690, 250, 35),
            "C2c_Bottom of Lowest Horizontal Structural Member": (180, 730, 250, 35),
            "C2d_Attached Garage (Top of Slab)": (180, 770, 250, 35),
            "C2e_Lowest Elevation of Machinery": (180, 810, 250, 35),
            "C2f_Lowest Adjacent Grade (LAG)": (180, 850, 250, 35),
            "C2g_Highest Adjacent Grade (HAG)": (180, 890, 250, 35),
            "C2h_Finished LAG at Lowest Deck/Stairs": (180, 930, 250, 35)
        }

    # The optional leading "-" keeps negative elevations negative; without it
    # "-3.2" would be captured as 3.2. A decimal point is still required so
    # stray digits from OCR noise are not mistaken for a value.
    elevation_re = re.compile(r"(-?\d+\.\d{1,3})")

    results = {}
    for label, (x, y, w, h) in boxes.items():
        crop = image.crop((x, y, x + w, y + h)).convert('L')
        crop_np = np.array(crop)
        _, thresh = cv2.threshold(crop_np, 150, 255, cv2.THRESH_BINARY)
        text = pytesseract.image_to_string(thresh, config='--psm 7')
        match = elevation_re.search(text)
        results[label] = float(match.group(1)) if match else None
    return results

# --- OCR Parsers for A, B, D ---
def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is dropped, so OCR output spread over
    several lines becomes one space-separated line.
    """
    tokens = text.split()
    return " ".join(tokens)

def extract_section_a(text):
    """Search *text* for the Section A fields (A1-A5).

    Each pattern locates a field label and lazily captures what follows, up
    to the next expected label or the end of the text.

    Returns:
        dict mapping each field label to the ``re.Match`` for that field
        (captured value in group 1), or ``None`` when the label is absent.
    """
    field_patterns = (
        ("A1_Building Owner", r"A1\.\s*Building Owner['’]s Name:?\s*(.*?)(?=A2\.|$)"),
        ("A2_Building Address", r"A2\.\s*Building.*?Address.*?:\s*(.*?)(?=City:|$)"),
        ("A3_Property Description", r"A3\.\s*Property Description.*?:\s*(.*?)(?=A4\.|$)"),
        ("A4_Building Use", r"A4\.\s*Building Use.*?:\s*(.*?)(?=A5\.|$)"),
        ("A5_Lat_Long", r"A5\.\s*Latitude/Longitude.*?:\s*(.*?)(?=A6\.|$)"),
    )
    found = {}
    for field, pattern in field_patterns:
        found[field] = re.search(pattern, text)
    return found

def extract_section_b(text):
    """Search *text* for the Section B fields (B1, B2, B4, B8, B9).

    Each pattern locates a field label and lazily captures what follows, up
    to the next expected label or the end of the text.

    Returns:
        dict mapping each field label to the ``re.Match`` for that field
        (captured value in group 1), or ``None`` when the label is absent.
    """
    return {
        "B1_Community Name": re.search(r"B1\.a\.\s*NFIP Community Name:?\s*(.*?)(?=B1\.b\.|$)", text),
        "B2_County": re.search(r"B2\.\s*County Name:?\s*(.*?)(?=B3\.|$)", text),
        "B4_Map Panel No": re.search(r"B4\.\s*Map/Panel No\.:?\s*(.*?)(?=B5\.|$)", text),
        "B8_Flood Zones": re.search(r"B8\.\s*Flood Zone\(s\):?\s*(.*?)(?=B9\.|$)", text),
        # The B9 label may continue with "(s)" and a parenthetical note before
        # the value. A lazy ".*?" followed by an optional ":" consumes nothing,
        # which left that label tail inside the captured value, so the tail is
        # matched explicitly here.
        "B9_BFE": re.search(
            r"B9\.\s*Base Flood Elevation(?:\(s\))?(?:\s*\([^)]*\))?\s*:?\s*(.*?)(?=B10\.|$)",
            text,
        )
    }

def extract_section_d(text):
    """Search *text* for the Section D (certifier) fields.

    Labels such as "Address:", "City:", "State:" and "ZIP Code:" are generic
    and can also appear earlier in the document (for example in the building
    address). To avoid picking up those earlier occurrences, every pattern is
    searched starting at the "Certifier's Name" label. When that label is not
    found, the whole text is searched.

    Returns:
        dict mapping each field label to the ``re.Match`` for that field
        (captured value in group 1), or ``None`` when nothing matched.
    """
    anchor = re.search(r"Certifier['’]s Name", text)
    start = anchor.start() if anchor else 0

    patterns = {
        "D1_Certifier Name": r"Certifier['’]s Name:?\s*(.*?)(?=License Number:|Title:|$)",
        "D2_License Number": r"License Number:?\s*(.*?)(?=Title:|$)",
        "D3_Company Name": r"Company Name:?\s*(.*?)(?=Address:|$)",
        "D4_Address": r"Address:?\s*(.*?)(?=City:|$)",
        "D5_City": r"City:?\s*(.*?)(?=State:|$)",
        "D6_State": r"State:?\s*([A-Z]{2})",
        "D7_Zip": r"ZIP Code:?\s*(\d{5})",
    }
    # Pattern.search(text, pos) starts scanning at `pos` but still reports
    # match offsets relative to the full string.
    return {
        field: re.compile(pattern).search(text, start)
        for field, pattern in patterns.items()
    }

# --- Main Parser ---
def _match_text(match):
    """Return the stripped first capture group of *match*, or None if no match."""
    return match.group(1).strip() if isinstance(match, re.Match) else None


def parse_elevation_certificate(pdf_path):
    """Extract Section A, B, C2 and D fields from one elevation certificate PDF.

    The first four pages are rendered at 300 DPI and OCR'd into a single
    whitespace-normalized string for the regex-based A/B/D parsers. The C2
    elevations are read from fixed regions of page 3.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        dict of field label to extracted value (``None`` where a field was
        not found), plus ``"Source File"`` holding the PDF's base name. When
        the PDF has fewer than three pages the C2 keys are omitted rather
        than raising.
    """
    # convert_from_path already yields PIL images, so they are used directly
    # instead of being copied through numpy arrays first.
    pages = convert_from_path(pdf_path, dpi=300, first_page=1, last_page=4)
    full_text = " ".join(pytesseract.image_to_string(page) for page in pages)
    text = clean_text(full_text)

    sections = {}
    sections.update({k: _match_text(v) for k, v in extract_section_a(text).items()})
    sections.update({k: _match_text(v) for k, v in extract_section_b(text).items()})
    # Section C is usually on page 3; skip it for shorter documents so the
    # text-based fields above and below are still returned.
    if len(pages) >= 3:
        sections.update(extract_c2_fields_by_region(pages[2]))
    sections.update({k: _match_text(v) for k, v in extract_section_d(text).items()})
    sections["Source File"] = os.path.basename(pdf_path)

    return sections

# --- Batch Runner ---
def run_on_directory(directory):
    """Parse every PDF directly inside *directory* into one DataFrame.

    Files are processed in sorted name order so the row order is
    reproducible between runs (``os.listdir`` returns entries in arbitrary
    order). A file that fails to parse does not stop the batch; it
    contributes a row holding only ``"Source File"`` and the ``"Error"``
    message.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        pandas.DataFrame with one row per ``.pdf`` file (extension matched
        case-insensitively); empty when the folder holds no PDFs.
    """
    rows = []
    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith(".pdf"):
            continue
        filepath = os.path.join(directory, filename)
        try:
            rows.append(parse_elevation_certificate(filepath))
        except Exception as e:
            # Deliberate per-file boundary: record the failure and move on.
            rows.append({"Source File": filename, "Error": str(e)})
    return pd.DataFrame(rows)

# --- Single File Runner ---
def run_on_file(pdf_path):
    """Parse a single certificate PDF and return it as a one-row DataFrame."""
    record = parse_elevation_certificate(pdf_path)
    return pd.DataFrame([record])

# --- Export Utility ---
def export_to_excel_csv(df, base_filename="elevation_certificates"):
    """Write *df* to ``<base_filename>.xlsx`` and ``<base_filename>.csv``.

    Both files are written without the index column, and a confirmation
    line naming the two files is printed afterwards.
    """
    excel_path = f"{base_filename}.xlsx"
    csv_path = f"{base_filename}.csv"
    df.to_excel(excel_path, index=False)
    df.to_csv(csv_path, index=False)
    print(f"Exported to {base_filename}.xlsx and {base_filename}.csv")
