# Validation and correction

As we've continued working with the scraped data versus the changes we've made from the initial csv files given to the LRC. We've done validation and corrections to the initial csv files that are not reflected in the scraped datasets. So now since we've uploaded our data into SQLite the next step will be to cross validate and update the SQLite. Which will be done below

In [110]:
## setup and imports
import os
import re
import csv
import sqlite3
import hashlib
import glob
from pathlib import Path
from datetime import datetime
from collections import defaultdict, Counter
from typing import List, Dict, Tuple, Optional, Union, Any
import logging
import pandas as pd
import numpy as np

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

note the original local datasets given to the LRC initially are hosted in the nahuatl_processing.db under sqLiteDb specifically under the table name: checkpoint_after_empty_p_tag_removal_20251002. For the scraped data, that is under scrapedDataDb under the nahuatl.db file. The schema is under config/ 

In [111]:
scraped_data_dir = Path("../../../data/scrapedDataDb/")
local_data_dir = Path("../../../data/sqLiteDb/")

if not scraped_data_dir.exists():
    raise FileNotFoundError(f"Scraped database not found: {scraped_data_dir}")
if not local_data_dir.exists():
    raise FileNotFoundError(f"Local database not found: {local_data_dir}")

# Database connection
scraped_db = sqlite3.connect(scraped_data_dir / "nahuatl.db")
local_db = sqlite3.connect(local_data_dir / "nahuatl_processing.db")


# read in the table(s), for the local_db it's only one table (actually two one for the WHP dataset and one for the IDIEZ dataset) while for the 
# scraped_db there are multiple tables due to the relationl structure we want to keep
tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table'", local_db)
WHP_TABLE = "checkpoint_after_empty_p_tag_removal_20251002"
IDIEZ_TABLE = "IDIEZ_modern_nahuatl-all-2024-03-27T09-45-31"

In [112]:
# Map scraped DB fields to local DB fields for WHP data
WHP_FIELD_MAPPING = {
    # Scraped field: Local field
    'node_id': 'Ref',
    'headword': 'Headword',
    'orthographic_variants': 'Orthographic Variants',
    'translation_english': 'Principal English Translation',
    'spanish_loanword': 'Spanish Loanword',
    # Authority fields (stored in authority_citations table in scraped DB)
    'authority_molina': 'Alonso de Molina',
    'authority_karttunen': 'Frances Karttunen',
    'authority_carochi': 'Horacio Carochi / English',
    'authority_olmos': 'Andrés de Olmos',
    'authority_lockhart': "Lockhart’s Nahuatl as Written",
    # Attestations (stored in attestations table in scraped DB)
    'attestations_english': 'Attestations from sources in English',
    'attestations_spanish': 'Attestations from sources in Spanish',
    # Metadata
    'themes': 'themes',
}

# Map scraped DB fields to local DB fields for IDIEZ data
IDIEZ_FIELD_MAPPING = {
    'node_id': 'Ref',
    'headword_idiez': 'tlahtolli',
    'translation_english_idiez': 'IDIEZ traduc. inglés',
    'definition_nahuatl_idiez': 'IDIEZ def. náhuatl',
    'definition_spanish_idiez': 'IDIEZ def. español',
    'morfologia_idiez': 'IDIEZ morfología',
    'gramatica_idiez': 'IDIEZ gramática',
}


In [113]:
def compare_dataframes(df_scraped, df_local, field_mapping, key_field="node_id"):

    scraped_key = field_mapping.get(key_field, key_field)
    df_scraped = df_scraped.copy()
    df_local = df_local.copy()
    df_scraped[key_field] = df_scraped[key_field].astype(str)
    df_local[scraped_key] = df_local[scraped_key].astype(str)
    df_local[scraped_key] = (
        df_local[scraped_key].astype(str).str.replace(r"^(WHP-|IDIEZ-)", "", regex=True)
    )

    merged = df_scraped.merge(
        df_local,
        left_on=key_field,
        right_on=scraped_key,
        how="inner",
        suffixes=("_scraped", "_local"),
    )
    print(f"Rows in scraped DB: {len(df_scraped):,}")
    print(f"Rows in local DB: {len(df_local):,}")
    print(f"Rows matched: {len(merged):,}")

    discrepancies = {
        "field_discrepancies": {},
        "total_discrepancies": 0,
        "rows_compared": len(merged),
        "sample_discrepancies": [],
    }

    for scraped_field, local_field in field_mapping.items():
        if scraped_field == key_field:
            continue
        if (
            scraped_field not in df_scraped.columns
            or local_field not in df_local.columns
        ):
            print(f"Skipping {scraped_field} (not in both datasets)")
            continue
        scraped_col = (
            f"{scraped_field}_scraped"
            if scraped_field in df_local.columns
            else scraped_field
        )
        local_col = (
            f"{local_field}_local" if local_field in df_scraped.columns else local_field
        )

        scraped_values = merged[scraped_col].fillna("").astype(str).str.strip()
        local_values = merged[local_col].fillna("").astype(str).str.strip()

        merged["diff"] = ~(
            (scraped_values == local_values)
            | (merged[scraped_col].isna() & merged[local_col].isna())
        )
        diff_count = merged["diff"].sum()

        if diff_count > 0:
            print(f"{scraped_field:30} {diff_count:>6,} discrepancies")
            discrepancies["field_discrepancies"][scraped_field] = {
                "count": int(diff_count),
                "local_field": local_field,
                "sample_rows": merged[merged["diff"]][key_field].head(10).tolist(),
            }
            discrepancies["total_discrepancies"] += int(diff_count)

            if len(discrepancies["sample_discrepancies"]) < 5:
                sample = merged[merged["diff"]].iloc[0]
                discrepancies["sample_discrepancies"].append(
                    {
                        "node_id": sample[key_field],
                        "field": scraped_field,
                        "scraped_value": str(sample[scraped_col])[:100],
                        "local_value": str(sample[local_col])[:100],
                    }
                )
        else:
            print(f"{scraped_field:30} all match")

    print(f"Total discrepancies: {discrepancies['total_discrepancies']:,}")
    return discrepancies


def create_update_dataframe(scraped_df, local_df, field_mapping, key_field="node_id"):
    """Create dataframe showing what needs updating"""
    scraped_key = field_mapping.get(key_field, key_field)

    scraped_df = scraped_df.copy()
    local_df = local_df.copy()
    scraped_df[key_field] = scraped_df[key_field].astype(str)
    local_df[scraped_key] = (
        local_df[scraped_key].astype(str).str.replace(r"^(WHP-|IDIEZ-)", "", regex=True)
    )

    merged = scraped_df.merge(
        local_df,
        left_on=key_field,
        right_on=scraped_key,
        how="inner",
        suffixes=("_scraped", "_local"),
    )

    updates = []

    for scraped_field, local_field in field_mapping.items():
        if scraped_field == key_field:
            continue

        if (
            scraped_field not in scraped_df.columns
            or local_field not in local_df.columns
        ):
            continue

        scraped_col = (
            f"{scraped_field}_scraped"
            if scraped_field in local_df.columns
            else scraped_field
        )
        local_col = (
            f"{local_field}_local" if local_field in scraped_df.columns else local_field
        )
        scraped_values = merged[scraped_col].fillna("").astype(str).str.strip()
        local_values = merged[local_col].fillna("").astype(str).str.strip()

        diff_mask = ~(
            (scraped_values == local_values)
            | (merged[scraped_col].isna() & merged[local_col].isna())
        )

        diff_rows = merged[diff_mask]

        for idx, row in diff_rows.iterrows():

            current_stripped = (
                str(row[scraped_col]).strip() if pd.notna(row[scraped_col]) else ""
            )
            new_stripped = (
                str(row[local_col]).strip() if pd.notna(row[local_col]) else ""
            )

            updates.append(
                {
                    "node_id": row[key_field],
                    "field": scraped_field,
                    "current_value": current_stripped,
                    "new_value": new_stripped,
                    "action": "UPDATE",
                }
            )

    return pd.DataFrame(updates)


def apply_updates(updates_df, conn, table_name="dictionary_entries", dry_run=True):

    print(f"APPLYING UPDATES TO {table_name}")
    print(f"Mode: {'DRY RUN' if dry_run else 'LIVE UPDATES'}")
    stats = {
        "total_updates": 0,
        "successful_updates": 0,
        "failed_updates": 0,
        "updates_by_field": defaultdict(int),
    }

    cursor = conn.cursor()
    grouped = updates_df.groupby("node_id")

    for node_id, group in grouped:
        try:
            set_clauses = []
            values = []

            for _, row in group.iterrows():
                set_clauses.append(f"{row['field']} = ?")
                values.append(row["new_value"])
                stats["updates_by_field"][row["field"]] += 1

            values.append(node_id)
            sql = f"UPDATE {table_name} SET {', '.join(set_clauses)} WHERE node_id = ?"

            if not dry_run:
                cursor.execute(sql, values)

            stats["successful_updates"] += len(group)
            stats["total_updates"] += len(group)

        except Exception as e:
            print(f"Error updating node_id {node_id}: {e}")
            stats["failed_updates"] += len(group)
            stats["total_updates"] += len(group)

    if not dry_run:
        conn.commit()
        print("Changes committed")
    else:
        print("Dry run complete - no changes made")

    print(f"\nStatistics:")
    print(f"  Total updates: {stats['total_updates']:,}")
    print(f"  Successful: {stats['successful_updates']:,}")
    print(f"  Failed: {stats['failed_updates']:,}")
    print(f"\nUpdates by field:")
    for field, count in stats["updates_by_field"].items():
        print(f"  - {field}: {count:,}")

    return stats

In [114]:
def find_bibl_tags(text):
    """Find all <bibl> tags and their positions"""
    if pd.isna(text) or text == "":
        return []

    pattern = re.compile(r"<bibl[^>]*>.*?</bibl>", re.DOTALL | re.IGNORECASE)
    matches = []
    for match in pattern.finditer(str(text)):
        matches.append(
            {
                "full_tag": match.group(0),
                "start": match.start(),
                "end": match.end(),
                "before_text": str(text)[max(0, match.start() - 30) : match.start()],
                "after_text": str(text)[match.end() : min(len(text), match.end() + 30)],
            }
        )
    return matches


def strip_html_better(text):
    """More aggressive HTML stripping"""
    if pd.isna(text) or text == "":
        return ""
    text = str(text)

    text = re.sub(r"<[^>]+>", "", text)

    text = re.sub(r'\w+="[^"]*"', "", text)
    text = re.sub(r"\w+=\'[^\']*\'", "", text)

    text = " ".join(text.split())
    return text.strip()


def find_crossref_positions_in_scraped_improved(scraped_text):
    """Find cross-references with better context extraction"""
    if pd.isna(scraped_text):
        return []

    pattern = re.compile(r"\(see\s+([^)]+)\)", re.IGNORECASE)
    positions = []

    clean_text = strip_html_better(scraped_text)

    for match in pattern.finditer(clean_text):
        content = match.group(1).strip()

        before_start = max(0, match.start() - 30)
        before_text = clean_text[before_start : match.start()].strip()

        words_before = before_text.split()
        anchor = (
            " ".join(words_before[-5:])
            if len(words_before) >= 5
            else " ".join(words_before)
        )

        positions.append(
            {
                "full_match": match.group(0),
                "content": content,
                "anchor_before": anchor,
                "start": match.start(),
                "end": match.end(),
            }
        )

    return positions


def reinsert_crossref_improved(local_text, anchor_text, crossref_text):
    """Insert cross-ref with better matching"""
    if pd.isna(local_text):
        return local_text, False

    local_str = str(local_text)

    local_stripped = strip_html_better(local_str)
    anchor_stripped = strip_html_better(anchor_text)

    if anchor_stripped and anchor_stripped in local_stripped:

        anchor_words = anchor_stripped.split()
        if anchor_words:
            last_word = anchor_words[-1].strip(".,;:!?")

            word_pos = local_str.rfind(last_word)
            if word_pos != -1:

                insertion_point = word_pos + len(last_word)

                if not crossref_text.startswith("("):
                    crossref_text = f" (see {crossref_text})"
                else:
                    crossref_text = f" {crossref_text}"

                reinserted = (
                    local_str[:insertion_point]
                    + crossref_text
                    + local_str[insertion_point:]
                )
                return reinserted, True

    if "</p>" in local_str:
        insertion_point = local_str.rfind("</p>")
    else:
        insertion_point = len(local_str)

    if not crossref_text.startswith("("):
        crossref_text = f" (see {crossref_text})"
    else:
        crossref_text = f" {crossref_text}"

    reinserted = (
        local_str[:insertion_point] + crossref_text + local_str[insertion_point:]
    )
    return reinserted, True

for me it methodically makes sense to go down the tables that nahuat.db has (check config/schema.sql) and then check each one against the WHP_table_name and the IDIEZ_table_name as such let's begin with the largest tables first and build up from the tables. 
it would also be smart that as we cross validate we proceed to investigate where the cross references columns actually come from
also a side note since we've done no manual corrections or fixes to the IDIEZ fields we can begin with validating local IDIEZ with scraped IDIEZ data

In [115]:
scraped_idiez = pd.read_sql(
    """
    SELECT 
        node_id,
        headword_idiez,
        translation_english_idiez,
        definition_nahuatl_idiez,
        definition_spanish_idiez,
        morfologia_idiez,
        gramatica_idiez,
        source_dataset
    FROM dictionary_entries
    WHERE source_dataset IN ('IDIEZ', 'HYBRID')
""",
    scraped_db,
)

print(f"Loaded {len(scraped_idiez):,} IDIEZ/HYBRID entries from scraped DB")

# Load IDIEZ from local DB
local_idiez = pd.read_sql(
    f"""
    SELECT 
        Ref,
        tlahtolli,
        "IDIEZ traduc. inglés",
        "IDIEZ def. náhuatl",
        "IDIEZ def. español",
        "IDIEZ morfología",
        "IDIEZ gramática"
    FROM [{IDIEZ_TABLE}]
""",
    local_db,
)

print(f"Loaded {len(local_idiez):,} IDIEZ entries from local DB")

# Compare
idiez_discrepancies = compare_dataframes(
    scraped_idiez, local_idiez, IDIEZ_FIELD_MAPPING, key_field="node_id"
)

# Show samples
if idiez_discrepancies["sample_discrepancies"]:
    print("\nSample IDIEZ discrepancies:")
    for i, sample in enumerate(idiez_discrepancies["sample_discrepancies"][:5], 1):
        print(f"\n{i}. node_id={sample['node_id']}, field={sample['field']}")
        print(f"   Scraped: {sample['scraped_value']}")
        print(f"   Local:   {sample['local_value']}")

Loaded 6,846 IDIEZ/HYBRID entries from scraped DB
Loaded 6,846 IDIEZ entries from local DB
Rows in scraped DB: 6,846
Rows in local DB: 6,846
Rows matched: 6,844
headword_idiez                 all match
translation_english_idiez          11 discrepancies
definition_nahuatl_idiez           24 discrepancies
definition_spanish_idiez          425 discrepancies
morfologia_idiez               all match
gramatica_idiez                all match
Total discrepancies: 460

Sample IDIEZ discrepancies:

1. node_id=187252, field=translation_english_idiez
   Scraped: 
   Local:   to transport rocks.

2. node_id=176130, field=definition_nahuatl_idiez
   Scraped: ICPATL. tlat. Tlamalintli iloh, iixnezca chipahuac quitequihuah ica quichihchihuah cantelah o ica tl
   Local:   ICPATL. tlat. Tlamalintli iloh, iixnezca chipahuac quitequihuah ica quichihchihuah cantelah o ica tl

3. node_id=172003, field=definition_spanish_idiez
   Scraped: A.1. se enfrìa. “Se enfría la tierra de noche. 2. Se va la luz. “Cuan

In [116]:
# creating the IDIEZ report
if idiez_discrepancies["total_discrepancies"] > 0:
    print("\nCreating IDIEZ update report...")

    idiez_updates = create_update_dataframe(
        scraped_idiez, local_idiez, IDIEZ_FIELD_MAPPING
    )

    print(f"Total IDIEZ updates needed: {len(idiez_updates):,}")
    print("\nUpdates by field:")
    print(idiez_updates["field"].value_counts())

    # idiez_updates.to_csv('idiez_updates_needed.csv', index=False, encoding='utf-8-sig')
    print("\nUpdate report saved to: idiez_updates_needed.csv")
else:
    print("\nNo IDIEZ updates needed - data matches perfectly!")
    idiez_updates = pd.DataFrame()


Creating IDIEZ update report...
Total IDIEZ updates needed: 460

Updates by field:
field
definition_spanish_idiez     425
definition_nahuatl_idiez      24
translation_english_idiez     11
Name: count, dtype: int64

Update report saved to: idiez_updates_needed.csv


In [117]:
scraped_whp = pd.read_sql(
    """
    SELECT 
        node_id,
        headword,
        orthographic_variants,
        translation_english,
        spanish_loanword,
        source_dataset
    FROM dictionary_entries
    WHERE source_dataset = 'WHP'
""",
    scraped_db,
)
print(f"Loaded {len(scraped_whp):,} WHP entries from scraped DB")

local_whp = pd.read_sql(
    f"""
    SELECT 
        Ref,
        Headword,
        "Orthographic Variants",
        "Principal English Translation",
        "Attestations from sources in English",
        "Attestations from sources in Spanish",
        "Alonso de Molina",
        "Frances Karttunen",
        "Horacio Carochi / English",
        "Andrés de Olmos",
        "Lockhart’s Nahuatl as Written",
        "themes",
        "Spanish Loanword",
        "Citations",
        "Number_of_Citations",
        "Cross_References",
        "Number_of_Cross_References",
        "CrossRef_Types"
    FROM [{WHP_TABLE}]
""",
    local_db,
)
print(f"Loaded {len(local_whp):,} WHP entries from local DB")
print("COMPARING WHP DATA")

whp_field_mapping = {
    "node_id": "Ref",
    "headword": "Headword",
    "orthographic_variants": "Orthographic Variants",
    "translation_english": "Principal English Translation",
    "spanish_loanword": "Spanish Loanword",
}

whp_discrepancies = compare_dataframes(
    scraped_whp, local_whp, whp_field_mapping, key_field="node_id"
)

if whp_discrepancies["sample_discrepancies"]:
    print("\nSample WHP discrepancies:")
    for i, sample in enumerate(whp_discrepancies["sample_discrepancies"][:5], 1):
        print(f"\n{i}. node_id={sample['node_id']}, field={sample['field']}")
        print(f"   Scraped: {sample['scraped_value']}")
        print(f"   Local:   {sample['local_value']}")

print("WHP COMPARISON COMPLETE")
print(f"Total discrepancies found: {whp_discrepancies['total_discrepancies']:,}")
print(f"Fields with discrepancies: {len(whp_discrepancies['field_discrepancies'])}")

Loaded 31,742 WHP entries from scraped DB
Loaded 31,806 WHP entries from local DB
COMPARING WHP DATA
Rows in scraped DB: 31,742
Rows in local DB: 31,806
Rows matched: 31,466
headword                           76 discrepancies
orthographic_variants             187 discrepancies
translation_english            31,466 discrepancies
spanish_loanword               31,466 discrepancies
Total discrepancies: 63,195

Sample WHP discrepancies:

1. node_id=172881, field=headword
   Scraped: -icxitlan.
   Local:   -icxtlan.

2. node_id=171987, field=orthographic_variants
   Scraped: canah
   Local:   None

3. node_id=171879, field=translation_english
   Scraped: <div class="field-item even"><p>perhaps not (adverb) (see Molina)</p>
</div>
   Local:   <p>perhaps not (adverb)</p>

4. node_id=171879, field=spanish_loanword
   Scraped: 
   Local:   No
WHP COMPARISON COMPLETE
Total discrepancies found: 63,195
Fields with discrepancies: 4


In [118]:
# Prepare node_id sets for comparison
scraped_ids = set(scraped_whp["node_id"].astype(str))
local_ids = set(
    local_whp["Ref"].astype(str).str.replace(r"^(WHP-|IDIEZ-)", "", regex=True)
)

# Find missing entries
missing_in_scraped = local_ids - scraped_ids
missing_in_local = scraped_ids - local_ids

print(f"\nEntries in LOCAL but NOT in SCRAPED: {len(missing_in_scraped):,}")
print(f"Entries in SCRAPED but NOT in LOCAL: {len(missing_in_local):,}")

if missing_in_scraped:
    missing_scraped_df = local_whp[
        local_whp["Ref"]
        .astype(str)
        .str.replace(r"^(WHP-|IDIEZ-)", "", regex=True)
        .isin(missing_in_scraped)
    ]
    # missing_scraped_df.to_csv('./whp_missing_in_scraped.csv', index=False, encoding='utf-8-sig')
    print(f"\nSaved {len(missing_scraped_df):,} entries to whp_missing_in_scraped.csv")
if missing_in_local:
    # Ensure types match
    missing_local_df = scraped_whp[
        scraped_whp["node_id"].astype(str).isin(missing_in_local)
    ]
    print(f"After filtering: {len(missing_local_df)} rows")

    if len(missing_local_df) > 0:
        # missing_local_df.to_csv('./whp_missing_in_local.csv', index=False, encoding='utf-8-sig')
        print(f"Saved {len(missing_local_df):,} entries to whp_missing_in_local.csv")
        print(f"\nSample entries:")
        print(missing_local_df[["node_id", "headword"]].head(10))
    else:
        print("No matching rows found - type mismatch issue")


Entries in LOCAL but NOT in SCRAPED: 340
Entries in SCRAPED but NOT in LOCAL: 276

Saved 340 entries to whp_missing_in_scraped.csv
After filtering: 276 rows
Saved 276 entries to whp_missing_in_local.csv

Sample entries:
       node_id       headword
31466   211071   yecaxochitl.
31467   211072     texochitl.
31468   211073        teicui.
31469   211074  Itzcahuatzin.
31470   211075  Itzehecatzin.
31471   211076   Tlamatzinco.
31472   211077     amo ihual.
31473   211078    Techahuatl.
31474   211079     hualiloti.
31475   211080     Cualaztli.


In [119]:
# Compare translation_english Content (HTML stripped)


scraped_whp_copy = scraped_whp.copy()
local_whp_copy = local_whp.copy()
scraped_whp_copy["node_id"] = scraped_whp_copy["node_id"].astype(str)
local_whp_copy["Ref"] = (
    local_whp_copy["Ref"].astype(str).str.replace(r"^(WHP-|IDIEZ-)", "", regex=True)
)

merged = scraped_whp_copy.merge(
    local_whp_copy, left_on="node_id", right_on="Ref", how="inner"
)

merged["scraped_text"] = merged["translation_english"].apply(strip_html_better)
merged["local_text"] = merged["Principal English Translation"].apply(strip_html_better)
merged["content_differs"] = merged["scraped_text"] != merged["local_text"]

content_diff_count = merged["content_differs"].sum()

print(f"Rows compared: {len(merged):,}")
print(f"Actual content differences: {content_diff_count:,}")
print(f"HTML wrapper differences only: {len(merged) - content_diff_count:,}")

if content_diff_count > 0:

    content_diff_df = merged[merged["content_differs"]][
        ["node_id", "headword", "scraped_text", "local_text"]
    ]
    # content_diff_df.to_csv("./whp_translation_content_diffs.csv", index=False)
    print(f"\nSaved to whp_translation_content_diffs.csv")

Rows compared: 31,466
Actual content differences: 27,474
HTML wrapper differences only: 3,992

Saved to whp_translation_content_diffs.csv


In [120]:
# Apply Cross-Reference Reinsertion (LIVE RUN)

# Create working copy
local_whp_updated = local_whp.copy()
local_whp_updated["Ref_clean"] = (
    local_whp_updated["Ref"].astype(str).str.replace(r"^(WHP-|IDIEZ-)", "", regex=True)
)

stats = {"rows_updated": 0, "crossrefs_reinserted": 0, "successful": 0, "failed": 0}

for idx, local_row in local_whp_updated.iterrows():
    if idx % 5000 == 0:  # type: ignore
        print(f"Processing row {idx:,}...")

    node_id = local_row["Ref_clean"]
    crossrefs = local_row.get("Cross_References", "")

    if pd.isna(crossrefs) or str(crossrefs).strip() == "":
        continue

    scraped_row = scraped_whp[scraped_whp["node_id"].astype(str) == str(node_id)]
    if scraped_row.empty:
        continue

    scraped_row = scraped_row.iloc[0]
    positions = find_crossref_positions_in_scraped_improved(
        scraped_row["translation_english"]
    )

    if not positions:
        continue

    result_text = local_row["Principal English Translation"]
    successful = 0

    for pos in positions:
        result_text, success = reinsert_crossref_improved(
            result_text, pos["anchor_before"], pos["content"]
        )
        if success:
            successful += 1

    # Update the dataframe
    local_whp_updated.at[idx, "Principal English Translation"] = result_text  # type: ignore

    stats["rows_updated"] += 1
    stats["crossrefs_reinserted"] += len(positions)
    stats["successful"] += successful

print(f"Rows updated: {stats['rows_updated']:,}")
print(f"Cross-references reinserted: {stats['crossrefs_reinserted']:,}")
print(f"Successful insertions: {stats['successful']:,}")

# Save to CSV as checkpoint
local_whp_updated.to_csv(
    "./whp_with_crossrefs_reinserted.csv", index=False, encoding="utf-8-sig"
)
print(f"\nSaved to whp_with_crossrefs_reinserted.csv")

Processing row 0...
Processing row 5,000...
Processing row 10,000...
Processing row 15,000...
Processing row 20,000...
Processing row 25,000...
Processing row 30,000...
Rows updated: 16,687
Cross-references reinserted: 17,286
Successful insertions: 17,286

Saved to whp_with_crossrefs_reinserted.csv


In [121]:
def find_bibl_positions_with_context(text):
    """Find <bibl> tags with surrounding context for anchor matching"""
    if pd.isna(text) or text == "":
        return []

    pattern = re.compile(r"<bibl[^>]*>.*?</bibl>", re.DOTALL | re.IGNORECASE)
    positions = []

    for match in pattern.finditer(str(text)):
        full_tag = match.group(0)

        # Get context before the bibl tag (30 chars)
        before_start = max(0, match.start() - 50)
        before_text = text[before_start : match.start()]

        # Get context after the bibl tag (30 chars)
        after_end = min(len(text), match.end() + 50)
        after_text = text[match.end() : after_end]

        # Extract clean anchor words
        before_clean = strip_html_better(before_text)
        after_clean = strip_html_better(after_text)

        # Get last few words before as anchor
        words_before = before_clean.split()
        anchor_before = (
            " ".join(words_before[-5:])
            if len(words_before) >= 5
            else " ".join(words_before)
        )

        # Get first few words after as secondary anchor
        words_after = after_clean.split()
        anchor_after = (
            " ".join(words_after[:5])
            if len(words_after) >= 5
            else " ".join(words_after)
        )

        positions.append(
            {
                "full_tag": full_tag,
                "anchor_before": anchor_before,
                "anchor_after": anchor_after,
                "start": match.start(),
                "end": match.end(),
            }
        )

    return positions


def reinsert_bibl_with_anchor(local_text, anchor_before, anchor_after, bibl_tag):
    """Insert <bibl> tag using anchor matching"""
    if pd.isna(local_text):
        return local_text, False

    local_str = str(local_text)
    local_stripped = strip_html_better(local_str)

    # Try to find position using before anchor
    if anchor_before and anchor_before in local_stripped:
        # Find the last word of anchor in HTML text
        anchor_words = anchor_before.split()
        if anchor_words:
            last_word = anchor_words[-1].strip(".,;:!?")
            word_pos = local_str.rfind(last_word)

            if word_pos != -1:
                insertion_point = word_pos + len(last_word)
                reinserted = (
                    local_str[:insertion_point]
                    + f" {bibl_tag}"
                    + local_str[insertion_point:]
                )
                return reinserted, True

    # Try after anchor if before failed
    if anchor_after and anchor_after in local_stripped:
        # Find first word of after anchor
        anchor_words = anchor_after.split()
        if anchor_words:
            first_word = anchor_words[0].strip(".,;:!?")
            word_pos = local_str.find(first_word)

            if word_pos != -1:
                # Insert before this word
                reinserted = (
                    local_str[:word_pos] + f"{bibl_tag} " + local_str[word_pos:]
                )
                return reinserted, True

    # Fallback: append before closing tag or at end
    if "</p>" in local_str:
        insertion_point = local_str.rfind("</p>")
    else:
        insertion_point = len(local_str)

    reinserted = (
        local_str[:insertion_point] + f" {bibl_tag}" + local_str[insertion_point:]
    )
    return reinserted, True


AUTHORITY_MAPPING = {
    "Molina": "Alonso de Molina",
    "Karttunen": "Frances Karttunen",
    "Carochi": "Horacio Carochi / English",
    "Olmos": "Andrés de Olmos",
    "Lockhart": "Lockhart’s Nahuatl as Written",
}


def get_citations_for_node(node_id, scraped_db):
    """Get all citations for a node from scraped DB (by column) - UPDATED"""
    citations_by_column = {}

    # 1. Get translation_english bibl tags
    entry = pd.read_sql(
        f"""
        SELECT translation_english 
        FROM dictionary_entries 
        WHERE node_id = '{node_id}'
    """,
        scraped_db,
    )

    if not entry.empty and pd.notna(entry.iloc[0]["translation_english"]):
        bibl_tags = find_bibl_tags(entry.iloc[0]["translation_english"])
        if bibl_tags:
            citations_by_column["Principal English Translation"] = bibl_tags

    # 2. Get authority citations from authority_citations table
    auth_cites = pd.read_sql(
        f"""
        SELECT authority_name, citation_text
        FROM authority_citations
        WHERE node_id = '{node_id}'
        ORDER BY citation_order
    """,
        scraped_db,
    )

    for _, row in auth_cites.iterrows():
        auth_name = row["authority_name"]
        if auth_name in AUTHORITY_MAPPING:
            local_col = AUTHORITY_MAPPING[auth_name]
            bibl_tags = find_bibl_tags(row["citation_text"])
            if bibl_tags:
                if local_col not in citations_by_column:
                    citations_by_column[local_col] = []
                citations_by_column[local_col].extend(bibl_tags)

    # 3. Get attestations with bibl tags
    attestations = pd.read_sql(
        f"""
        SELECT language, attestation_text
        FROM attestations
        WHERE node_id = '{node_id}'
    """,
        scraped_db,
    )

    for _, row in attestations.iterrows():
        language = row["language"]
        attestation_text = row["attestation_text"]

        # Map to local column
        if language == "English":
            local_col = "Attestations from sources in English"
        elif language == "Spanish":
            local_col = "Attestations from sources in Spanish"
        else:
            continue

        bibl_tags = find_bibl_tags(attestation_text)
        if bibl_tags:
            if local_col not in citations_by_column:
                citations_by_column[local_col] = []
            citations_by_column[local_col].extend(bibl_tags)

    return citations_by_column

In [122]:
# Load the dataset with cross-refs already reinserted
# If you already ran the cross-ref reinsertion and have the CSV:
# local_whp_for_citations = pd.read_csv('./whp_with_crossrefs_reinserted.csv', encoding='utf-8-sig')

# OR if you want to use the in-memory version from earlier:
local_whp_for_citations = local_whp_updated.copy()

local_whp_for_citations['Ref_clean'] = local_whp_for_citations['Ref'].astype(str).str.replace(r'^(WHP-|IDIEZ-)', '', regex=True)

print(f"Loaded {len(local_whp_for_citations):,} rows")
print(f"Columns: {local_whp_for_citations.columns.tolist()[:10]}...")

Loaded 31,806 rows
Columns: ['Ref', 'Headword', 'Orthographic Variants', 'Principal English Translation', 'Attestations from sources in English', 'Attestations from sources in Spanish', 'Alonso de Molina', 'Frances Karttunen', 'Horacio Carochi / English', 'Andrés de Olmos']...


In [124]:
print("APPLYING CITATION REINSERTION (LIVE)")
local_whp_final = local_whp_for_citations.copy()

apply_stats = {"rows_updated": 0, "citations_inserted": 0}

for idx, local_row in local_whp_final.iterrows():
    if idx % 5000 == 0: #type: ignore
        print(f"Processing row {idx:,}...")

    node_id = local_row["Ref_clean"]
    citations_by_column = get_citations_for_node(node_id, scraped_db)

    if not citations_by_column:
        continue

    row_updated = False

    for col_name, bibl_list in citations_by_column.items():
        if col_name not in local_row.index:
            continue

        current_value = local_row[col_name]
        if pd.isna(current_value): #type: ignore
            current_value = ""

        # Get scraped text
        if col_name == "Principal English Translation":
            scraped_row = scraped_whp[scraped_whp["node_id"] == str(node_id)]
            if scraped_row.empty:
                continue
            scraped_text = scraped_row.iloc[0]["translation_english"]
            
        elif col_name in ["Attestations from sources in English", "Attestations from sources in Spanish"]:
            # Handle attestation columns
            language = "English" if "English" in col_name else "Spanish"
            attestation = pd.read_sql(f"""
                SELECT attestation_text 
                FROM attestations 
                WHERE node_id = '{node_id}' AND language = '{language}'
                LIMIT 1
            """, scraped_db)
            
            if attestation.empty:
                continue
            scraped_text = attestation.iloc[0]["attestation_text"]
        else:
            auth_name = [k for k, v in AUTHORITY_MAPPING.items() if v == col_name]
            if not auth_name:
                continue
            auth_cite = pd.read_sql(
                f"""
                SELECT citation_text 
                FROM authority_citations 
                WHERE node_id = '{node_id}' AND authority_name = '{auth_name[0]}'
                LIMIT 1
            """,
                scraped_db,
            )
            if auth_cite.empty:
                continue
            scraped_text = auth_cite.iloc[0]["citation_text"]

        positions = find_bibl_positions_with_context(scraped_text)
        if not positions:
            continue

        updated_value = str(current_value)
        for pos in positions:
            updated_value, _ = reinsert_bibl_with_anchor(
                updated_value,
                pos["anchor_before"],
                pos["anchor_after"],
                pos["full_tag"],
            )
            apply_stats["citations_inserted"] += 1

        local_whp_final.at[idx, col_name] = updated_value #type: ignore
        row_updated = True

    if row_updated:
        apply_stats["rows_updated"] += 1

print("CITATION REINSERTION COMPLETE")
print(f"Rows updated: {apply_stats['rows_updated']:,}")
print(f"Citations inserted: {apply_stats['citations_inserted']:,}")

# Save final dataset
local_whp_final.to_csv(
    "./whp_with_all_reinserted.csv", index=False, encoding="utf-8-sig"
)
print("\nSaved final dataset to whp_with_all_reinserted.csv")

APPLYING CITATION REINSERTION (LIVE)
Processing row 0...
Processing row 5,000...
Processing row 10,000...
Processing row 15,000...
Processing row 20,000...
Processing row 25,000...
Processing row 30,000...
CITATION REINSERTION COMPLETE
Rows updated: 30,135
Citations inserted: 50,657

Saved final dataset to whp_with_all_reinserted.csv


In [127]:
# ============================================================================
# VALIDATION: Verify Only Citations/Cross-Refs Changed
# ============================================================================

print("=" * 80)
print("VALIDATING CHANGES - Ensuring Only Citations/Cross-Refs Modified")
print("=" * 80)

# Load original and final datasets
original = pd.read_csv('./whp_with_crossrefs_reinserted.csv', encoding='utf-8-sig')  # Before citations
final = local_whp_final.copy()

print(f"\nOriginal rows: {len(original):,}")
print(f"Final rows: {len(final):,}")
print(f"Row count match: {len(original) == len(final)}")

# Columns that SHOULD have changed
expected_changed_columns = [
    'Principal English Translation',  # cross-refs + citations
    'Alonso de Molina',
    'Frances Karttunen',
    'Horacio Carochi / English',
    'Andrés de Olmos',
    "Lockhart’s Nahuatl as Written",
    'Attestations from sources in English',
    'Attestations from sources in Spanish'
]

# Check each column for changes
validation_results = {}

for col in original.columns:
    if col in ['Ref_clean']:  # Skip helper columns
        continue
    
    # Compare values
    original_values = original[col].fillna('').astype(str)
    final_values = final[col].fillna('').astype(str)
    
    changes = (original_values != final_values).sum()
    
    validation_results[col] = {
        'changes': changes,
        'expected': col in expected_changed_columns
    }

# Print results
print("\n" + "-" * 80)
print("COLUMN CHANGE ANALYSIS")
print("-" * 80)

print("\nExpected changes (citations/cross-refs):")
for col in expected_changed_columns:
    if col in validation_results:
        changes = validation_results[col]['changes']
        print(f"  {col}: {changes:,} rows changed")

print("\nUnexpected changes (should be 0):")
unexpected_changes = False
for col, info in validation_results.items():
    if not info['expected'] and info['changes'] > 0:
        print(f"  ⚠️  {col}: {info['changes']:,} rows changed (UNEXPECTED!)")
        unexpected_changes = True

if not unexpected_changes:
    print("  ✓ None - all other columns unchanged")

# Detailed check: verify content of unchanged columns
print("\n" + "-" * 80)
print("SPOT CHECK: Verifying unchanged columns")
print("-" * 80)

spot_check_cols = ['Headword', 'Orthographic Variants', 'Spanish Loanword']
all_match = True

for col in spot_check_cols:
    if col in original.columns and col in final.columns:
        matches = (original[col].fillna('') == final[col].fillna('')).all()
        print(f"  {col}: {'✓ All match' if matches else '✗ MISMATCH'}")
        if not matches:
            all_match = False
            # Show sample differences
            diffs = original[col].fillna('') != final[col].fillna('')
            print(f"    {diffs.sum()} differences found")

print("\n" + "=" * 80)
print("VALIDATION SUMMARY")
print("=" * 80)

if all_match and not unexpected_changes:
    print("✓ VALIDATION PASSED")
    print("  - Only expected columns changed")
    print("  - All other columns unchanged")
    print("  - Safe to update database")
else:
    print("⚠️  VALIDATION FAILED")
    print("  - Review unexpected changes before proceeding")

print("=" * 80)

VALIDATING CHANGES - Ensuring Only Citations/Cross-Refs Modified

Original rows: 31,806
Final rows: 31,806
Row count match: True

--------------------------------------------------------------------------------
COLUMN CHANGE ANALYSIS
--------------------------------------------------------------------------------

Expected changes (citations/cross-refs):
  Principal English Translation: 7,749 rows changed
  Alonso de Molina: 31,780 rows changed
  Frances Karttunen: 31,785 rows changed
  Horacio Carochi / English: 31,798 rows changed
  Andrés de Olmos: 31,803 rows changed
  Lockhart’s Nahuatl as Written: 31,458 rows changed
  Attestations from sources in English: 31,329 rows changed
  Attestations from sources in Spanish: 31,771 rows changed

Unexpected changes (should be 0):
  ✓ None - all other columns unchanged

--------------------------------------------------------------------------------
SPOT CHECK: Verifying unchanged columns
-----------------------------------------------------

In [129]:
# ============================================================================
# UPDATE DATABASE - Create New Checkpoint with Reinserted Data
# ============================================================================

print("=" * 80)
print("UPDATING nahuatl_processing.db WITH REINSERTED DATA")
print("=" * 80)

# Generate timestamp for new checkpoint
from datetime import datetime
timestamp = datetime.now().strftime("%Y%m%d")

new_table_name = f"checkpoint_after_citation_crossref_reinsertion_{timestamp}"

# Drop Ref_clean helper column before saving
local_whp_final_clean = local_whp_final.drop(columns=['Ref_clean'], errors='ignore')

# Write to database
print("\nWriting to database...")
local_whp_final_clean.to_sql(
    new_table_name,
    local_db,
    if_exists='replace',
    index=False
)

print(f"✓ Created table: {new_table_name}")

# Verify the insert
verification = pd.read_sql(
    f"SELECT COUNT(*) as count FROM [{new_table_name}]",
    local_db
)

print(f"✓ Verified row count: {verification.iloc[0]['count']:,}")

# Show available checkpoints
tables = pd.read_sql(
    "SELECT name FROM sqlite_master WHERE type='table' AND name LIKE 'checkpoint%'",
    local_db
)

print(f"\nAvailable checkpoints in database:")
for table in tables['name']:
    row_count = pd.read_sql(f"SELECT COUNT(*) as count FROM [{table}]", local_db)
    print(f"  - {table}: {row_count.iloc[0]['count']:,} rows")

print("\n" + "=" * 80)
print("DATABASE UPDATE COMPLETE")
print("=" * 80)
print(f"✓ New checkpoint created: {new_table_name}")
print(f"✓ Original checkpoint preserved: {WHP_TABLE}")
print("=" * 80)

UPDATING nahuatl_processing.db WITH REINSERTED DATA

Writing to database...
✓ Created table: checkpoint_after_citation_crossref_reinsertion_20251030
✓ Verified row count: 31,806

Available checkpoints in database:
  - checkpoint_metadata: 93 rows
  - checkpoint_after_citation_and_crossref_extraction_20250922: 31,806 rows
  - checkpoint_initial_20250922: 31,806 rows
  - checkpoint_initial_20250929: 31,806 rows
  - checkpoint_removed_empty_p_tags_20250929: 31,806 rows
  - checkpoint_removed_empty_p_tags_20251002: 31,806 rows
  - checkpoint_after_lockhart_citation_extraction_20251002: 31,806 rows
  - checkpoint_after_empty_p_tag_removal_20251002: 31,806 rows
  - checkpoint_initial_20251002: 31,806 rows
  - checkpoint_initial_20251009: 38,652 rows
  - checkpoint_initial_20251010: 38,652 rows
  - checkpoint_after_citation_crossref_reinsertion_20251030: 31,806 rows

DATABASE UPDATE COMPLETE
✓ New checkpoint created: checkpoint_after_citation_crossref_reinsertion_20251030
✓ Original checkpoin

In [None]:
# scraped_db.close()
# local_db.close()
# print("Database connections closed")