In [None]:
import json
import glob
from pathlib import Path

# Running tallies for the BI HTML scan; the three sets are independent objects.
total_rows = 0
distinct_names, distinct_ids, ids_with_email = set(), set(), set()

# Folder holding the BI HTML JSON exports, and every file inside it whose
# name matches the "wa_bi_*.json" pattern.
bi_dir = Path("output_wa_pdf_proxy/bi_html")
json_files = glob.glob(str(bi_dir.joinpath("wa_bi_*.json")))

def has_email_in_pdfs(biz: dict) -> bool:
    """
    Report whether any entry in ``biz["PDFSummaries"]`` carries a usable email.

    Each summary is inspected under both the ``"email"`` and ``"Email"`` keys.
    A value counts as an email when it is truthy and still non-empty after
    being converted to ``str`` and stripped of surrounding whitespace.

    Args:
        biz: Business record. A missing, ``None`` or non-list
            ``"PDFSummaries"`` value is treated as "no summaries".

    Returns:
        True if at least one summary holds a non-blank email under either
        key, otherwise False.
    """
    pdf_summaries = biz.get("PDFSummaries") or []
    if not isinstance(pdf_summaries, (list, tuple)):
        # Unexpected shape (e.g. a bare string or number): nothing to iterate safely.
        return False

    for pdf in pdf_summaries:
        if not isinstance(pdf, dict):
            # Malformed entry such as null or a plain string: no keys to look up.
            continue
        # Test each key variant on its own so that a blank "email" value
        # cannot hide a populated "Email" value in the same summary.
        for key in ("email", "Email"):
            value = pdf.get(key)
            if value and str(value).strip():
                return True
    return False


# Scan every matched BI HTML file and accumulate the summary counters.
for file_path in json_files:
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # The root is expected to be a list of business dicts; as a fallback,
        # accept a dict that wraps the list under one of a few known keys.
        if isinstance(data, list):
            records = data
        elif isinstance(data, dict):
            records = (
                data.get("businesses")
                or data.get("BusinessList")
                or data.get("business_list")
                or []
            )
        else:
            records = []

        for biz in records:
            total_rows += 1

            if not isinstance(biz, dict):
                # Still a row in the file, but it has no fields to read.
                # Skipping it here keeps one malformed entry from raising and
                # discarding every record that follows it in the same file.
                continue

            biz_id = biz.get("BusinessID")
            biz_name = biz.get("BusinessName")

            if biz_name:
                # Strip before storing so a whitespace-only name is not
                # recorded as the distinct name "".
                cleaned_name = str(biz_name).strip()
                if cleaned_name:
                    distinct_names.add(cleaned_name)

            # `is not None` (rather than truthiness) keeps falsy IDs such as 0.
            if biz_id is not None:
                distinct_ids.add(biz_id)

                if has_email_in_pdfs(biz):
                    ids_with_email.add(biz_id)

    except Exception as e:
        # Best-effort scan: report the failing file and move on to the next.
        # Rows counted before the failure stay in the totals.
        print(f"Error processing {file_path}: {e}")

print("======== BI_HTML SUMMARY RESULTS ========")
print(f"Total Rows: {total_rows}")
print(f"Distinct Business Names: {len(distinct_names)}")
print(f"Distinct Business IDs: {len(distinct_ids)}")
print(f"Business IDs With Any Email (from PDFSummaries): {len(ids_with_email)}")


Total Rows: 17
Distinct Business Names: 17
Distinct Business IDs: 17
Business IDs With Any Email (from PDFSummaries): 9


In [3]:
import json
import glob
from pathlib import Path

# Running tallies for the API scan; the three sets are independent objects.
total_rows = 0
distinct_names, distinct_ids, ids_with_email = set(), set(), set()

# Folder holding the API JSON exports, and every file inside it whose
# name matches the "wa_api_*.json" pattern.
api_dir = Path("output_wa_pdf_proxy/api")
json_files = glob.glob(str(api_dir.joinpath("wa_api_*.json")))


def extract_email_candidates(biz):
    """Collect the values stored at each known email location of a business dict.

    Returns a five-element list in a fixed order: the ``EmailAddress`` of
    ``Agent``, ``OnlineReportAgent`` and ``PrincipalOffice``, then the
    ``CorrespondenceEmailAddress`` of ``CorrespondenceAddress``, then the
    top-level ``EmailAddress``. An entry is ``None`` when its section or key
    is absent; a section that is missing or falsy is treated as an empty dict.

    NOTE(review): the run recorded in this notebook reports 0 IDs with an
    email out of 35 rows - confirm these key names against a real payload.
    """
    nested_locations = (
        ("Agent", "EmailAddress"),
        ("OnlineReportAgent", "EmailAddress"),
        ("PrincipalOffice", "EmailAddress"),
        ("CorrespondenceAddress", "CorrespondenceEmailAddress"),
    )

    candidates = []
    for section_name, email_key in nested_locations:
        section = biz.get(section_name) or {}
        candidates.append(section.get(email_key))

    # Top-level field, used as a fallback if present.
    candidates.append(biz.get("EmailAddress"))
    return candidates


# Scan every matched API file and accumulate the summary counters.
for file_path in json_files:
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # WA API files nest records as pages -> business_list.
        # `or []` also covers keys that are present but null, and a non-dict
        # root is treated as having no pages instead of raising.
        pages = (data.get("pages") or []) if isinstance(data, dict) else []

        for page in pages:
            if not isinstance(page, dict):
                # A malformed page has no business_list to read.
                continue
            business_list = page.get("business_list") or []

            for biz in business_list:
                total_rows += 1

                if not isinstance(biz, dict):
                    # Still a row, but it has no fields to read. Skipping it
                    # keeps one malformed entry from raising and discarding
                    # every record that follows it in the same file.
                    continue

                biz_id = biz.get("BusinessID")
                biz_name = biz.get("BusinessName")

                if biz_name:
                    # str() tolerates non-string names; stripping first avoids
                    # recording a whitespace-only name as the distinct name "".
                    cleaned_name = str(biz_name).strip()
                    if cleaned_name:
                        distinct_names.add(cleaned_name)

                # `is not None` (rather than truthiness) keeps falsy IDs such
                # as 0, matching how the BI HTML scan treats BusinessID.
                if biz_id is not None:
                    distinct_ids.add(biz_id)

                    # An email counts when it is truthy and non-blank once
                    # converted to str; this never calls .strip() on a non-string.
                    emails = extract_email_candidates(biz)
                    if any(e and str(e).strip() for e in emails):
                        ids_with_email.add(biz_id)

    except Exception as e:
        # Best-effort scan: report the failing file and move on to the next.
        # Rows counted before the failure stay in the totals.
        print(f"Error processing {file_path}: {e}")


print("======== SUMMARY RESULTS ========")
print(f"Total Rows: {total_rows}")
print(f"Distinct Business Names: {len(distinct_names)}")
print(f"Distinct Business IDs: {len(distinct_ids)}")
print(f"Business IDs With Any Email: {len(ids_with_email)}")


Total Rows: 35
Distinct Business Names: 35
Distinct Business IDs: 35
Business IDs With Any Email: 0
