# Validation and correction

While working with the scraped data alongside the initial CSV files given to the LRC, we validated and corrected the initial CSV files, and those corrections are not reflected in the scraped datasets. Now that our data has been loaded into SQLite, the next step is to cross-validate the two sources and update the SQLite data accordingly, which is done below.

In [1]:
## setup and imports
import os
import re
import csv
import sqlite3
import hashlib
import glob
from pathlib import Path
from datetime import datetime
from collections import defaultdict, Counter
from typing import List, Dict, Tuple, Optional, Union, Any
import logging
import pandas as pd
import numpy as np

# Notebook-wide logging: timestamped records at INFO level and above.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

Note: the original local datasets initially given to the LRC are stored in `nahuatl_processing.db` under `sqLiteDb`, specifically in the table `checkpoint_after_empty_p_tag_removal_20251002` (the cells below read from the tables `checkpoint_after_bibl_restoration_20251030` and `checkpoint_llm_validated_20251030`, which appear to be later checkpoints). The scraped data is stored in `nahuatl.db` under `scrapedDataDb`. The schema is under `config/`.

In [2]:
# Directories holding the two SQLite databases that are cross-validated below.
scraped_data_dir = Path("../../../data/scrapedDataDb/")
local_data_dir = Path("../../../data/sqLiteDb/")

# sqlite3.connect() silently creates an empty database when the file does not
# exist, so check the database files themselves (not just their directories)
# before connecting.
scraped_db_path = scraped_data_dir / "nahuatl.db"
local_db_path = local_data_dir / "nahuatl_processing.db"

if not scraped_db_path.is_file():
    raise FileNotFoundError(f"Scraped database not found: {scraped_db_path}")
if not local_db_path.is_file():
    raise FileNotFoundError(f"Local database not found: {local_db_path}")

# Database connection
scraped_db = sqlite3.connect(scraped_db_path)
local_db = sqlite3.connect(local_db_path)


# Read in the table(s). For local_db we use one table per dataset (one for the
# WHP dataset and one for the IDIEZ dataset), while scraped_db has multiple
# tables because of the relational structure we want to keep.
tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table'", local_db)
WHP_TABLE = "checkpoint_after_bibl_restoration_20251030"
IDIEZ_TABLE = "IDIEZ_modern_nahuatl-all-2024-03-27T09-45-31"

In [3]:
# WHP data: scraped-DB field name -> local-DB column name.
WHP_FIELD_MAPPING = {
    # Core entry fields
    "node_id": "Ref",
    "headword": "Headword",
    "orthographic_variants": "Orthographic Variants",
    "translation_english": "Principal English Translation",
    "spanish_loanword": "Spanish Loanword",
    # Authority fields (the scraped DB keeps these in authority_citations)
    "authority_molina": "Alonso de Molina",
    "authority_karttunen": "Frances Karttunen",
    "authority_carochi": "Horacio Carochi / English",
    "authority_olmos": "Andrés de Olmos",
    "authority_lockhart": "Lockhart’s Nahuatl as Written",
    # Attestations (the scraped DB keeps these in attestations)
    "attestations_english": "Attestations from sources in English",
    "attestations_spanish": "Attestations from sources in Spanish",
    # Metadata
    "themes": "themes",
}

# IDIEZ data: scraped-DB field name -> local-DB column name.
IDIEZ_FIELD_MAPPING = {
    "node_id": "Ref",
    "headword_idiez": "tlahtolli",
    "translation_english_idiez": "IDIEZ traduc. inglés",
    "definition_nahuatl_idiez": "IDIEZ def. náhuatl",
    "definition_spanish_idiez": "IDIEZ def. español",
    "morfologia_idiez": "IDIEZ morfología",
    "gramatica_idiez": "IDIEZ gramática",
}


In [4]:
def compare_dataframes(df_scraped, df_local, field_mapping, key_field="node_id"):
    """Compare the mapped fields of the scraped and local frames row by row.

    Rows are paired with an inner join between ``df_scraped[key_field]`` and
    the local column that ``field_mapping`` assigns to ``key_field``. Both keys
    are cast to ``str`` and a leading ``WHP-``/``IDIEZ-`` prefix is removed
    from the local key first. Field values are compared as stripped strings,
    with missing values treated as empty strings. A per-field summary is
    printed as a side effect.

    Args:
        df_scraped: DataFrame read from the scraped database (not modified).
        df_local: DataFrame read from the local database (not modified).
        field_mapping: Maps scraped column names to local column names.
        key_field: Name of the scraped column holding the join key.

    Returns:
        dict with ``field_discrepancies`` (per scraped field: ``count``,
        ``local_field`` and up to ten ``sample_rows`` keys),
        ``total_discrepancies``, ``rows_compared`` and ``sample_discrepancies``
        (at most five examples, the first differing row of a field, values
        truncated to 100 characters).
    """
    local_key = field_mapping.get(key_field, key_field)
    df_scraped = df_scraped.copy()
    df_local = df_local.copy()
    df_scraped[key_field] = df_scraped[key_field].astype(str)
    df_local[local_key] = (
        df_local[local_key].astype(str).str.replace(r"^(WHP-|IDIEZ-)", "", regex=True)
    )

    merged = df_scraped.merge(
        df_local,
        left_on=key_field,
        right_on=local_key,
        how="inner",
        suffixes=("_scraped", "_local"),
    )
    print(f"Rows in scraped DB: {len(df_scraped):,}")
    print(f"Rows in local DB: {len(df_local):,}")
    print(f"Rows matched: {len(merged):,}")

    discrepancies = {
        "field_discrepancies": {},
        "total_discrepancies": 0,
        "rows_compared": len(merged),
        "sample_discrepancies": [],
    }

    for scraped_field, local_field in field_mapping.items():
        if scraped_field == key_field:
            continue
        if (
            scraped_field not in df_scraped.columns
            or local_field not in df_local.columns
        ):
            print(f"Skipping {scraped_field} (not in both datasets)")
            continue

        # merge() only appends a suffix when the same name exists on both sides.
        scraped_col = (
            f"{scraped_field}_scraped"
            if scraped_field in df_local.columns
            else scraped_field
        )
        local_col = (
            f"{local_field}_local" if local_field in df_scraped.columns else local_field
        )

        scraped_values = merged[scraped_col].fillna("").astype(str).str.strip()
        local_values = merged[local_col].fillna("").astype(str).str.strip()

        # Keep the mask in a local variable: storing it as merged["diff"] would
        # overwrite a genuine data column that happens to be called "diff".
        # Two missing values both become "" above, so they compare as equal.
        diff_mask = scraped_values != local_values
        diff_count = int(diff_mask.sum())

        if diff_count > 0:
            diff_rows = merged[diff_mask]
            print(f"{scraped_field:30} {diff_count:>6,} discrepancies")
            discrepancies["field_discrepancies"][scraped_field] = {
                "count": diff_count,
                "local_field": local_field,
                "sample_rows": diff_rows[key_field].head(10).tolist(),
            }
            discrepancies["total_discrepancies"] += diff_count

            if len(discrepancies["sample_discrepancies"]) < 5:
                sample = diff_rows.iloc[0]
                discrepancies["sample_discrepancies"].append(
                    {
                        "node_id": sample[key_field],
                        "field": scraped_field,
                        "scraped_value": str(sample[scraped_col])[:100],
                        "local_value": str(sample[local_col])[:100],
                    }
                )
        else:
            print(f"{scraped_field:30} all match")

    print(f"Total discrepancies: {discrepancies['total_discrepancies']:,}")
    return discrepancies


def create_update_dataframe(scraped_df, local_df, field_mapping, key_field="node_id"):
    """Build one row per differing field value between the two frames.

    Rows are paired with an inner join between ``scraped_df[key_field]`` and
    the local column that ``field_mapping`` assigns to ``key_field`` (keys cast
    to ``str``, leading ``WHP-``/``IDIEZ-`` prefix removed from the local key).
    Values are compared as stripped strings with missing values treated as
    empty strings.

    Args:
        scraped_df: DataFrame read from the scraped database (not modified).
        local_df: DataFrame read from the local database (not modified).
        field_mapping: Maps scraped column names to local column names.
        key_field: Name of the scraped column holding the join key.

    Returns:
        DataFrame with columns ``node_id``, ``field`` (scraped field name),
        ``current_value`` (scraped value), ``new_value`` (local value) and
        ``action`` (always ``"UPDATE"``). The columns are present even when
        there is nothing to update, so callers can index or group the result
        without special-casing the empty case.
    """
    update_columns = ["node_id", "field", "current_value", "new_value", "action"]
    local_key = field_mapping.get(key_field, key_field)

    scraped_df = scraped_df.copy()
    local_df = local_df.copy()
    scraped_df[key_field] = scraped_df[key_field].astype(str)
    local_df[local_key] = (
        local_df[local_key].astype(str).str.replace(r"^(WHP-|IDIEZ-)", "", regex=True)
    )

    merged = scraped_df.merge(
        local_df,
        left_on=key_field,
        right_on=local_key,
        how="inner",
        suffixes=("_scraped", "_local"),
    )

    updates = []

    for scraped_field, local_field in field_mapping.items():
        if scraped_field == key_field:
            continue

        if (
            scraped_field not in scraped_df.columns
            or local_field not in local_df.columns
        ):
            continue

        # merge() only appends a suffix when the same name exists on both sides.
        scraped_col = (
            f"{scraped_field}_scraped"
            if scraped_field in local_df.columns
            else scraped_field
        )
        local_col = (
            f"{local_field}_local" if local_field in scraped_df.columns else local_field
        )
        scraped_values = merged[scraped_col].fillna("").astype(str).str.strip()
        local_values = merged[local_col].fillna("").astype(str).str.strip()

        # Two missing values both become "" above, so they compare as equal.
        diff_mask = scraped_values != local_values

        # Read the values column-wise: iterrows() may upcast a row's values to
        # a common dtype, which would change how numbers are rendered.
        updates.extend(
            {
                "node_id": node_id,
                "field": scraped_field,
                "current_value": current_value,
                "new_value": new_value,
                "action": "UPDATE",
            }
            for node_id, current_value, new_value in zip(
                merged.loc[diff_mask, key_field],
                scraped_values[diff_mask],
                local_values[diff_mask],
            )
        )

    return pd.DataFrame(updates, columns=update_columns)


def apply_updates(updates_df, conn, table_name="dictionary_entries", dry_run=True):
    """Apply the field updates in ``updates_df`` to ``table_name``.

    One ``UPDATE ... WHERE node_id = ?`` statement is built per ``node_id``,
    setting every field listed for that node. Progress and final statistics
    are printed as a side effect.

    Args:
        updates_df: DataFrame with at least the columns ``node_id``, ``field``
            and ``new_value`` (the shape produced by
            ``create_update_dataframe``). An empty frame is a no-op.
        conn: Open ``sqlite3`` connection to the database to update.
        table_name: Table to update. It is interpolated into the SQL as
            given, so it must come from trusted code, not user input.
        dry_run: When true, statements are built but not executed and nothing
            is committed; the updates are still counted as successful.

    Returns:
        dict with ``total_updates``, ``successful_updates``,
        ``failed_updates`` and ``updates_by_field`` (a ``defaultdict`` that
        counts every requested field update).
    """
    print(f"APPLYING UPDATES TO {table_name}")
    print(f"Mode: {'DRY RUN' if dry_run else 'LIVE UPDATES'}")
    stats = {
        "total_updates": 0,
        "successful_updates": 0,
        "failed_updates": 0,
        "updates_by_field": defaultdict(int),
    }

    cursor = conn.cursor()
    # An empty frame may not even have a "node_id" column, so skip grouping.
    grouped = updates_df.groupby("node_id") if not updates_df.empty else ()

    for node_id, group in grouped:
        try:
            set_clauses = []
            values = []

            for field, new_value in zip(group["field"], group["new_value"]):
                # Column names cannot be bound as parameters; quote them so
                # names with spaces or punctuation stay valid identifiers.
                quoted_field = '"' + str(field).replace('"', '""') + '"'
                set_clauses.append(f"{quoted_field} = ?")
                values.append(new_value)
                stats["updates_by_field"][field] += 1

            values.append(node_id)
            sql = f"UPDATE {table_name} SET {', '.join(set_clauses)} WHERE node_id = ?"

            if not dry_run:
                cursor.execute(sql, values)

            stats["successful_updates"] += len(group)
            stats["total_updates"] += len(group)

        except sqlite3.Error as e:
            # Best effort: report the failing node and carry on with the rest.
            print(f"Error updating node_id {node_id}: {e}")
            stats["failed_updates"] += len(group)
            stats["total_updates"] += len(group)

    if not dry_run:
        conn.commit()
        print("Changes committed")
    else:
        print("Dry run complete - no changes made")

    print(f"\nStatistics:")
    print(f"  Total updates: {stats['total_updates']:,}")
    print(f"  Successful: {stats['successful_updates']:,}")
    print(f"  Failed: {stats['failed_updates']:,}")
    print(f"\nUpdates by field:")
    for field, count in stats["updates_by_field"].items():
        print(f"  - {field}: {count:,}")

    return stats

In [5]:
def strip_html_better(text):
    """Return ``text`` with HTML tags and attribute-like fragments removed.

    Missing values and empty strings give ``""``. Besides ``<...>`` tags, any
    ``name="value"`` or ``name='value'`` fragment is dropped, and runs of
    whitespace are collapsed to single spaces.
    """
    if pd.isna(text) or text == "":
        return ""

    cleaned = str(text)
    # Order matters: tags first, then leftover double- and single-quoted attributes.
    for pattern in (r"<[^>]+>", r'\w+="[^"]*"', r"\w+=\'[^\']*\'"):
        cleaned = re.sub(pattern, "", cleaned)

    words = cleaned.split()
    return " ".join(words).strip()

To me it makes methodical sense to go through the tables in nahuatl.db (see config/schema.sql) and check each one against WHP_TABLE and IDIEZ_TABLE, so let's begin with the largest tables first and build up from there.
It would also be smart, as we cross-validate, to investigate where the cross-reference columns actually come from.
As a side note, since we have made no manual corrections or fixes to the IDIEZ fields, we can begin by validating the local IDIEZ data against the scraped IDIEZ data.

In [6]:
# IDIEZ columns as stored in the scraped DB (IDIEZ and HYBRID entries only).
scraped_idiez_query = """
    SELECT 
        node_id,
        headword_idiez,
        translation_english_idiez,
        definition_nahuatl_idiez,
        definition_spanish_idiez,
        morfologia_idiez,
        gramatica_idiez,
        source_dataset
    FROM dictionary_entries
    WHERE source_dataset IN ('IDIEZ', 'HYBRID')
"""
scraped_idiez = pd.read_sql(scraped_idiez_query, scraped_db)

print(f"Loaded {len(scraped_idiez):,} IDIEZ/HYBRID entries from scraped DB")

# Matching IDIEZ columns from the local DB table.
local_idiez_query = f"""
    SELECT 
        Ref,
        tlahtolli,
        "IDIEZ traduc. inglés",
        "IDIEZ def. náhuatl",
        "IDIEZ def. español",
        "IDIEZ morfología",
        "IDIEZ gramática"
    FROM [{IDIEZ_TABLE}]
"""
local_idiez = pd.read_sql(local_idiez_query, local_db)

print(f"Loaded {len(local_idiez):,} IDIEZ entries from local DB")

# Field-by-field comparison of the two sources.
idiez_discrepancies = compare_dataframes(
    scraped_idiez, local_idiez, IDIEZ_FIELD_MAPPING, key_field="node_id"
)

# Print up to five example discrepancies, numbered from 1.
idiez_samples = idiez_discrepancies["sample_discrepancies"]
if idiez_samples:
    print("\nSample IDIEZ discrepancies:")
    for position, sample in enumerate(idiez_samples[:5], start=1):
        print(f"\n{position}. node_id={sample['node_id']}, field={sample['field']}")
        print(f"   Scraped: {sample['scraped_value']}")
        print(f"   Local:   {sample['local_value']}")

Loaded 6,846 IDIEZ/HYBRID entries from scraped DB
Loaded 6,846 IDIEZ entries from local DB
Rows in scraped DB: 6,846
Rows in local DB: 6,846
Rows matched: 6,844
headword_idiez                 all match
translation_english_idiez          11 discrepancies
definition_nahuatl_idiez           24 discrepancies
definition_spanish_idiez          425 discrepancies
morfologia_idiez               all match
gramatica_idiez                all match
Total discrepancies: 460

Sample IDIEZ discrepancies:

1. node_id=187252, field=translation_english_idiez
   Scraped: 
   Local:   to transport rocks.

2. node_id=176130, field=definition_nahuatl_idiez
   Scraped: ICPATL. tlat. Tlamalintli iloh, iixnezca chipahuac quitequihuah ica quichihchihuah cantelah o ica tl
   Local:   ICPATL. tlat. Tlamalintli iloh, iixnezca chipahuac quitequihuah ica quichihchihuah cantelah o ica tl

3. node_id=172003, field=definition_spanish_idiez
   Scraped: A.1. se enfrìa. “Se enfría la tierra de noche. 2. Se va la luz. “Cuan

In [7]:
# creating the IDIEZ report
# Writing the CSV is opt-in (it was disabled in this cell); the printed
# message now reports what actually happened instead of always claiming
# that the report was saved.
SAVE_IDIEZ_UPDATE_REPORT = False
IDIEZ_UPDATE_REPORT_PATH = "idiez_updates_needed.csv"

if idiez_discrepancies["total_discrepancies"] > 0:
    print("\nCreating IDIEZ update report...")

    idiez_updates = create_update_dataframe(
        scraped_idiez, local_idiez, IDIEZ_FIELD_MAPPING
    )

    print(f"Total IDIEZ updates needed: {len(idiez_updates):,}")
    print("\nUpdates by field:")
    print(idiez_updates["field"].value_counts())

    if SAVE_IDIEZ_UPDATE_REPORT:
        idiez_updates.to_csv(
            IDIEZ_UPDATE_REPORT_PATH, index=False, encoding="utf-8-sig"
        )
        print(f"\nUpdate report saved to: {IDIEZ_UPDATE_REPORT_PATH}")
    else:
        print("\nUpdate report not saved (SAVE_IDIEZ_UPDATE_REPORT is False)")
else:
    print("\nNo IDIEZ updates needed - data matches perfectly!")
    idiez_updates = pd.DataFrame()


Creating IDIEZ update report...
Total IDIEZ updates needed: 460

Updates by field:
field
definition_spanish_idiez     425
definition_nahuatl_idiez      24
translation_english_idiez     11
Name: count, dtype: int64

Update report saved to: idiez_updates_needed.csv


In [8]:
# Core WHP entry columns from the scraped DB.
scraped_whp_query = """
    SELECT 
        node_id,
        headword,
        orthographic_variants,
        translation_english,
        spanish_loanword,
        source_dataset
    FROM dictionary_entries
    WHERE source_dataset = 'WHP'
"""
scraped_whp = pd.read_sql(scraped_whp_query, scraped_db)
print(f"Loaded {len(scraped_whp):,} WHP entries from scraped DB")

# WHP columns selected from the local DB table.
local_whp_query = f"""
    SELECT 
        Ref,
        Headword,
        "Orthographic Variants",
        "Principal English Translation",
        "Attestations from sources in English",
        "Attestations from sources in Spanish",
        "Alonso de Molina",
        "Frances Karttunen",
        "Horacio Carochi / English",
        "Andrés de Olmos",
        "Lockhart’s Nahuatl as Written",
        "themes",
        "Spanish Loanword",
        "Citations",
        "Number_of_Citations",
        "Cross_References",
        "Number_of_Cross_References",
        "CrossRef_Types"
    FROM [{WHP_TABLE}]
"""
local_whp = pd.read_sql(local_whp_query, local_db)
print(f"Loaded {len(local_whp):,} WHP entries from local DB")

Loaded 31,742 WHP entries from scraped DB
Loaded 31,806 WHP entries from local DB


As you can see from the output, some of the differences are only HTML markup differences, so we need to look at the actual content of the cells to see whether there is any real difference.

In [9]:
# Normalise both key columns to plain string ids (the local Ref column may
# carry a WHP-/IDIEZ- prefix) and compare them as sets.
scraped_id_values = scraped_whp["node_id"].astype(str)
local_id_values = (
    local_whp["Ref"].astype(str).str.replace(r"^(WHP-|IDIEZ-)", "", regex=True)
)
scraped_ids = set(scraped_id_values)
local_ids = set(local_id_values)

# Ids present on one side only
missing_in_scraped = local_ids - scraped_ids
missing_in_local = scraped_ids - local_ids

print(f"\nEntries in LOCAL but NOT in SCRAPED: {len(missing_in_scraped):,}")
print(f"Entries in SCRAPED but NOT in LOCAL: {len(missing_in_local):,}")

if missing_in_scraped:
    # Local rows whose id never appears in the scraped data
    missing_scraped_df = local_whp[local_id_values.isin(missing_in_scraped)]
    missing_scraped_df.to_csv('./whp_missing_in_scraped.csv', index=False, encoding='utf-8-sig')
    print(f"\nSaved {len(missing_scraped_df):,} entries to whp_missing_in_scraped.csv")

if missing_in_local:
    # Scraped rows whose id never appears in the local data
    missing_local_df = scraped_whp[scraped_id_values.isin(missing_in_local)]
    print(f"After filtering: {len(missing_local_df)} rows")

    if missing_local_df.empty:
        print("No matching rows found - type mismatch issue")
    else:
        missing_local_df.to_csv('./whp_missing_in_local.csv', index=False, encoding='utf-8-sig')
        print(f"Saved {len(missing_local_df):,} entries to whp_missing_in_local.csv")
        print(f"\nSample entries:")
        print(missing_local_df[["node_id", "headword"]].head(10))


Entries in LOCAL but NOT in SCRAPED: 340
Entries in SCRAPED but NOT in LOCAL: 276

Saved 340 entries to whp_missing_in_scraped.csv
After filtering: 276 rows
Saved 276 entries to whp_missing_in_local.csv

Sample entries:
       node_id       headword
31466   211071   yecaxochitl.
31467   211072     texochitl.
31468   211073        teicui.
31469   211074  Itzcahuatzin.
31470   211075  Itzehecatzin.
31471   211076   Tlamatzinco.
31472   211077     amo ihual.
31473   211078    Techahuatl.
31474   211079     hualiloti.
31475   211080     Cualaztli.


In [10]:
# Helper functions
def get_authority_citations(node_id, db_conn):
    """Get all authority citations for a node, grouped by authority.

    Args:
        node_id: Id of the entry; it is bound as a text query parameter.
        db_conn: Open connection to the database holding
            ``authority_citations``.

    Returns:
        dict keyed by "Molina", "Karttunen", "Carochi", "Olmos" and
        "Lockhart". Each value is that authority's citation texts joined with
        " | " in ``citation_order``, or ``None`` when the node has no
        (non-NULL) citation text for that authority.
    """
    # Bind node_id instead of formatting it into the SQL text, so a quote in
    # the value can neither break nor alter the query.
    auth_data = pd.read_sql(
        """
        SELECT authority_name, citation_text, citation_order
        FROM authority_citations
        WHERE node_id = ?
        ORDER BY authority_name, citation_order
        """,
        db_conn,
        params=(str(node_id),),
    )

    result = {}
    for auth_name in ["Molina", "Karttunen", "Carochi", "Olmos", "Lockhart"]:
        # NULL citation texts are skipped; str.join would fail on None.
        texts = auth_data.loc[
            auth_data["authority_name"] == auth_name, "citation_text"
        ].dropna()
        result[auth_name] = " | ".join(texts) if not texts.empty else None

    return result


def get_attestations(node_id, db_conn):
    """Get attestations for a node, grouped by language.

    Args:
        node_id: Id of the entry; it is bound as a text query parameter.
        db_conn: Open connection to the database holding ``attestations``.

    Returns:
        dict with the keys "English" and "Spanish". Each value is that
        language's attestation texts joined with " | " in the order the query
        returns them, or ``None`` when there is no (non-NULL) text. Rows in
        other languages are ignored.
    """
    # Bind node_id instead of formatting it into the SQL text, so a quote in
    # the value can neither break nor alter the query.
    attest_data = pd.read_sql(
        """
        SELECT language, attestation_text
        FROM attestations
        WHERE node_id = ?
        """,
        db_conn,
        params=(str(node_id),),
    )

    result = {"English": None, "Spanish": None}

    for lang in result:
        # NULL texts are skipped; concatenating None would raise TypeError.
        texts = attest_data.loc[
            attest_data["language"] == lang, "attestation_text"
        ].dropna()
        if not texts.empty:
            result[lang] = " | ".join(texts)

    return result


def get_cross_references(node_id, db_conn):
    """Get cross-references for a node.

    Args:
        node_id: Id of the source entry; it is bound as a text query
            parameter.
        db_conn: Open connection to the database holding
            ``entry_cross_references``.

    Returns:
        The target node ids of the node's cross-references, rendered as
        strings and joined with " | ", or ``None`` when there are none.
    """
    # Bind node_id instead of formatting it into the SQL text, so a quote in
    # the value can neither break nor alter the query.
    xref_data = pd.read_sql(
        """
        SELECT target_node_id, reference_type
        FROM entry_cross_references
        WHERE source_node_id = ?
        """,
        db_conn,
        params=(str(node_id),),
    )

    if xref_data.empty:
        return None

    return " | ".join(xref_data["target_node_id"].astype(str).tolist())


def strip_div_br_tags(text):
    """Remove div and br tags from ``text`` and tidy the whitespace.

    Missing values and empty strings give ``""``. All other markup is kept.
    """
    if pd.isna(text) or text == "":
        return ""

    # Applied in this order; each entry is (pattern, replacement).
    substitutions = (
        (r"<div[^>]*>", ""),  # opening div tags, with any attributes
        (r"</div>", ""),  # closing div tags
        (r"<br\s*/?>", ""),  # br tags
        (r"\s+", " "),  # collapse whitespace runs
        (r"\s+</", "</"),  # no whitespace before a closing tag
        (r">\s+", ">"),  # no whitespace right after any tag
    )

    cleaned = str(text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()


def strip_punctuation_for_comparison(text):
    """Drop ``. , ; : ! ?`` from ``text`` and collapse whitespace.

    Missing values and empty strings give ``""``. Used to tell whether two
    values differ only in common punctuation.
    """
    is_blank = pd.isna(text) or text == ""
    if is_blank:
        return ""

    # Strip the punctuation first, then squeeze the whitespace it leaves behind.
    collapsed = re.sub(r"\s+", " ", re.sub(r"[.,;:!?]", "", str(text)))
    return collapsed.strip()


# Load base scraped data
scraped_enriched = pd.read_sql(
    """
    SELECT 
        de.node_id,
        de.headword,
        de.orthographic_variants,
        de.translation_english,
        de.spanish_loanword,
        de.source_dataset,
        de.url_alias
    FROM dictionary_entries de
    WHERE de.source_dataset = 'WHP'
    """,
    scraped_db,
)

# Enrich with authority citations, attestations, and cross-references.
# Each child table is read once and grouped in memory. Running three
# pd.read_sql() calls per entry means roughly three queries for every WHP row,
# which dominates the runtime of this cell.
auth_cols = [
    "scraped_molina",
    "scraped_karttunen",
    "scraped_carochi",
    "scraped_olmos",
    "scraped_lockhart",
]
attest_cols = ["scraped_attest_english", "scraped_attest_spanish"]


def _join_texts_by_node(rows, key_col, text_col):
    """Map each ``key_col`` value (as str) to its ``text_col`` values joined by ' | '.

    Rows with a missing text are skipped and row order is kept within a key.
    """
    rows = rows.dropna(subset=[text_col])
    if rows.empty:
        return {}
    keys = rows[key_col].astype(str)
    texts = rows[text_col].astype(str)
    return texts.groupby(keys, sort=False).agg(lambda s: " | ".join(s)).to_dict()


# Ids are matched as strings so integer and text id columns line up.
node_keys = scraped_enriched["node_id"].astype(str)

auth_rows = pd.read_sql(
    """
    SELECT node_id, authority_name, citation_text
    FROM authority_citations
    WHERE node_id IS NOT NULL
    ORDER BY authority_name, citation_order
    """,
    scraped_db,
)
for col, authority in zip(
    auth_cols, ["Molina", "Karttunen", "Carochi", "Olmos", "Lockhart"]
):
    lookup = _join_texts_by_node(
        auth_rows[auth_rows["authority_name"] == authority], "node_id", "citation_text"
    )
    # dict.get returns None for entries without citations from this authority.
    scraped_enriched[col] = node_keys.map(lookup.get)

attest_rows = pd.read_sql(
    """
    SELECT node_id, language, attestation_text
    FROM attestations
    WHERE node_id IS NOT NULL
    """,
    scraped_db,
)
for col, language in zip(attest_cols, ["English", "Spanish"]):
    lookup = _join_texts_by_node(
        attest_rows[attest_rows["language"] == language], "node_id", "attestation_text"
    )
    scraped_enriched[col] = node_keys.map(lookup.get)

# Cross-references with a NULL target are skipped rather than rendered as text.
xref_rows = pd.read_sql(
    """
    SELECT source_node_id, target_node_id
    FROM entry_cross_references
    WHERE source_node_id IS NOT NULL AND target_node_id IS NOT NULL
    """,
    scraped_db,
)
scraped_enriched["scraped_crossrefs"] = node_keys.map(
    _join_texts_by_node(xref_rows, "source_node_id", "target_node_id").get
)
# Local WHP table used for the enriched comparison below.
WHP_TABLE_PUNCT_FIXED = "checkpoint_llm_validated_20251030"

# Local columns needed for the translation, authority and attestation reports.
local_enriched_query = f"""
    SELECT 
        Ref,
        Headword,
        "Orthographic Variants",
        "Principal English Translation",
        "Spanish Loanword",
        "Attestations from sources in English",
        "Attestations from sources in Spanish",
        "Alonso de Molina",
        "Frances Karttunen",
        "Horacio Carochi / English",
        "Andrés de Olmos",
        "Lockhart’s Nahuatl as Written",
        "Citations",
        "Cross_References"
    FROM [{WHP_TABLE_PUNCT_FIXED}]
    """
local_enriched = pd.read_sql(local_enriched_query, local_db)

# Bring both join keys to plain string ids (drop the WHP-/IDIEZ- prefix on Ref).
scraped_enriched["node_id"] = scraped_enriched["node_id"].astype(str)
local_ref_ids = local_enriched["Ref"].astype(str)
local_enriched["Ref"] = local_ref_ids.str.replace(
    r"^(WHP-|IDIEZ-)", "", regex=True
)

# Keep only the entries present in both sources.
merged = pd.merge(
    scraped_enriched,
    local_enriched,
    how="inner",
    left_on="node_id",
    right_on="Ref",
    suffixes=("_scraped", "_local"),
)

# Add div/br-stripped "<side>_<name>_clean" copies of every compared column.
# Each entry is (name, scraped source column, local source column).
_CLEAN_COLUMN_SOURCES = [
    ("translation", "translation_english", "Principal English Translation"),
    ("molina", "scraped_molina", "Alonso de Molina"),
    ("karttunen", "scraped_karttunen", "Frances Karttunen"),
    ("carochi", "scraped_carochi", "Horacio Carochi / English"),
    ("olmos", "scraped_olmos", "Andrés de Olmos"),
    ("lockhart", "scraped_lockhart", "Lockhart’s Nahuatl as Written"),
    ("attest_english", "scraped_attest_english", "Attestations from sources in English"),
    ("attest_spanish", "scraped_attest_spanish", "Attestations from sources in Spanish"),
]

for clean_name, scraped_source, local_source in _CLEAN_COLUMN_SOURCES:
    merged[f"scraped_{clean_name}_clean"] = merged[scraped_source].apply(
        strip_div_br_tags
    )
    merged[f"local_{clean_name}_clean"] = merged[local_source].apply(
        strip_div_br_tags
    )

# Report 1: Translation differences
# A row is reported when the cleaned translations differ and the local value
# is not the literal string "None".
translations_differ = merged["scraped_translation_clean"].ne(
    merged["local_translation_clean"]
)
local_is_not_none_text = merged["local_translation_clean"].ne("None")
translation_diff = merged[translations_differ & local_is_not_none_text].copy()

translation_report_columns = [
    "node_id",
    "headword",
    "orthographic_variants",
    "url_alias",
    "scraped_translation_clean",
    "local_translation_clean",
]
translation_report = translation_diff[translation_report_columns]

translation_report.to_csv(
    "./report_translation_mismatches.csv", index=False, encoding="utf-8-sig"
)
print(f"Translation mismatches: {len(translation_report)}")

def _collect_mismatches(frame, column_triples, label_key):
    """List the rows of ``frame`` whose scraped and local values differ.

    ``column_triples`` holds (label, scraped column, local column) entries and
    ``label_key`` is the dict key under which the label is stored. A pair is
    skipped when the local value is the literal string "None".
    """
    found = []
    for _, entry in frame.iterrows():
        for label, scraped_name, local_name in column_triples:
            scraped_value = entry[scraped_name]
            local_value = entry[local_name]
            if scraped_value != local_value and str(local_value).strip() != "None":
                # Check if only difference is punctuation
                punctuation_only = strip_punctuation_for_comparison(
                    scraped_value
                ) == strip_punctuation_for_comparison(local_value)
                action = "use_scraped" if punctuation_only else "manual_review"
                found.append(
                    {
                        "node_id": entry["node_id"],
                        "headword": entry["headword"],
                        "url_alias": entry["url_alias"],
                        label_key: label,
                        "scraped_value": scraped_value,
                        "local_value": local_value,
                        "punctuation_only_diff": punctuation_only,
                        "recommended_action": action,
                    }
                )
    return found


# Report 2: Authority citation differences
authority_diffs = _collect_mismatches(
    merged,
    [
        ("Molina", "scraped_molina_clean", "local_molina_clean"),
        ("Karttunen", "scraped_karttunen_clean", "local_karttunen_clean"),
        ("Carochi", "scraped_carochi_clean", "local_carochi_clean"),
        ("Olmos", "scraped_olmos_clean", "local_olmos_clean"),
        ("Lockhart", "scraped_lockhart_clean", "local_lockhart_clean"),
    ],
    "authority",
)

if not authority_diffs:
    print("Authority citation mismatches: 0")
else:
    authority_report = pd.DataFrame(authority_diffs)
    authority_report.to_csv(
        "./report_authority_mismatches.csv", index=False, encoding="utf-8-sig"
    )

    punct_only_count = authority_report["punctuation_only_diff"].sum()
    print(f"Authority citation mismatches: {len(authority_report)}")
    print(f"  - Punctuation-only differences: {punct_only_count}")
    print(f"  - Other differences: {len(authority_report) - punct_only_count}")

# Report 3: Attestation differences
attestation_diffs = _collect_mismatches(
    merged,
    [
        ("English", "scraped_attest_english_clean", "local_attest_english_clean"),
        ("Spanish", "scraped_attest_spanish_clean", "local_attest_spanish_clean"),
    ],
    "language",
)

if not attestation_diffs:
    print("Attestation mismatches: 0")
else:
    attestation_report = pd.DataFrame(attestation_diffs)
    attestation_report.to_csv(
        "./report_attestation_mismatches.csv", index=False, encoding="utf-8-sig"
    )

    punct_only_count = attestation_report["punctuation_only_diff"].sum()
    print(f"Attestation mismatches: {len(attestation_report)}")
    print(f"  - Punctuation-only differences: {punct_only_count}")
    print(f"  - Other differences: {len(attestation_report) - punct_only_count}")

print("\nReports generated:")
for report_file in (
    "report_translation_mismatches.csv",
    "report_authority_mismatches.csv",
    "report_attestation_mismatches.csv",
):
    print(f"  - {report_file}")

Translation mismatches: 23
Authority citation mismatches: 292
  - Punctuation-only differences: 6
  - Other differences: 286
Attestation mismatches: 369
  - Punctuation-only differences: 0
  - Other differences: 369

Reports generated:
  - report_translation_mismatches.csv
  - report_authority_mismatches.csv
  - report_attestation_mismatches.csv


In [11]:
# scraped_db.close()
# local_db.close()
# print("Database connections closed")