# Validation and correction

As we've continued working with the scraped data versus the changes we've made from the initial csv files given to the LRC. We've done validation and corrections to the initial csv files that are not reflected in the scraped datasets. So now since we've uploaded our data into SQLite the next step will be to cross validate and update the SQLite. Which will be done below

In [151]:
## setup and imports
import os
import re
import csv
import sqlite3
import hashlib
import glob
from pathlib import Path
from datetime import datetime
from collections import defaultdict, Counter
from typing import List, Dict, Tuple, Optional, Union, Any
import logging
import pandas as pd
import numpy as np

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

note the original local datasets given to the LRC initially are hosted in the nahuatl_processing.db under sqLiteDb specifically under the table name: checkpoint_after_empty_p_tag_removal_20251002. For the scraped data, that is under scrapedDataDb under the nahuatl.db file. The schema is under config/ 

In [152]:
scraped_data_dir = Path("../../../data/scrapedDataDb/")
local_data_dir = Path("../../../data/sqLiteDb/")

if not scraped_data_dir.exists():
    raise FileNotFoundError(f"Scraped database not found: {scraped_data_dir}")
if not local_data_dir.exists():
    raise FileNotFoundError(f"Local database not found: {local_data_dir}")

# Database connection
scraped_db = sqlite3.connect(scraped_data_dir / "nahuatl.db")
local_db = sqlite3.connect(local_data_dir / "nahuatl_processing.db")


# read in the table(s), for the local_db it's only one table (actually two one for the WHP dataset and one for the IDIEZ dataset) while for the 
# scraped_db there are multiple tables due to the relationl structure we want to keep
tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table'", local_db)
WHP_TABLE = "checkpoint_after_empty_p_tag_removal_20251002"
IDIEZ_TABLE = "IDIEZ_modern_nahuatl-all-2024-03-27T09-45-31"

In [154]:
# Map scraped DB fields to local DB fields for WHP data
WHP_FIELD_MAPPING = {
    # Scraped field: Local field
    'node_id': 'Ref',
    'headword': 'Headword',
    'orthographic_variants': 'Orthographic Variants',
    'translation_english': 'Principal English Translation',
    'spanish_loanword': 'Spanish Loanword',
    # Authority fields (stored in authority_citations table in scraped DB)
    'authority_molina': 'Alonso de Molina',
    'authority_karttunen': 'Frances Karttunen',
    'authority_carochi': 'Horacio Carochi / English',
    'authority_olmos': 'Andrés de Olmos',
    'authority_lockhart': "Lockhart’s Nahuatl as Written",
    # Attestations (stored in attestations table in scraped DB)
    'attestations_english': 'Attestations from sources in English',
    'attestations_spanish': 'Attestations from sources in Spanish',
    # Metadata
    'themes': 'themes',
}

# Map scraped DB fields to local DB fields for IDIEZ data
IDIEZ_FIELD_MAPPING = {
    'node_id': 'Ref',
    'headword_idiez': 'tlahtolli',
    'translation_english_idiez': 'IDIEZ traduc. inglés',
    'definition_nahuatl_idiez': 'IDIEZ def. náhuatl',
    'definition_spanish_idiez': 'IDIEZ def. español',
    'morfologia_idiez': 'IDIEZ morfología',
    'gramatica_idiez': 'IDIEZ gramática',
}


In [155]:
# helper functions to compare data between scraped and local DBs
from turtle import right

def compare_dataframes(df_scraped, df_local, field_mapping, key_field='node_id'):
    # compares two dfs field by field
    scraped_key = field_mapping.get(key_field, key_field)
    df_scraped = df_scraped.copy()
    df_local = df_local.copy()
    df_scraped[key_field] = df_scraped[key_field].astype(str)
    df_local[scraped_key] = df_local[scraped_key].astype(str)
    df_local[scraped_key] = df_local[scraped_key].astype(str).str.replace(r'^(WHP-|IDIEZ-)', '', regex=True)
    # merge 
    merged = df_scraped.merge(
        df_local,
        left_on = key_field,
        right_on = scraped_key,
        how = 'inner',
        suffixes=('_scraped', '_local')
    )
    print(f"Rows in scraped DB: {len(df_scraped):,}")
    print(f"Rows in local DB: {len(df_local):,}")
    print(f"Rows matched: {len(merged):,}")
    
    discrepancies = {
        'field_discrepancies': {},
        'total_discrepancies': 0,
        'rows_compared': len(merged),
        'sample_discrepancies': []
    }
    # compare each field
    for scraped_field, local_field in field_mapping.items():
        if scraped_field == key_field:
            continue
        if scraped_field not in df_scraped.columns or local_field not in df_local.columns:
            print(f"Skipping {scraped_field} (not in both datasets)")
            continue
        scraped_col = f"{scraped_field}_scraped" if scraped_field in df_local.columns else scraped_field
        local_col = f"{local_field}_local" if local_field in df_scraped.columns else local_field
        
        # Find differences
        scraped_values = merged[scraped_col].fillna('').astype(str).str.strip()
        local_values = merged[local_col].fillna('').astype(str).str.strip()
        
        # Find differences
        merged['diff'] = ~(
            (scraped_values == local_values) |
            (merged[scraped_col].isna() & merged[local_col].isna())
        )
        diff_count = merged['diff'].sum()
        
        if diff_count > 0:
            print(f"{scraped_field:30} {diff_count:>6,} discrepancies")
            discrepancies['field_discrepancies'][scraped_field] = {
                'count': int(diff_count),
                'local_field': local_field,
                'sample_rows': merged[merged['diff']][key_field].head(10).tolist()
            }
            discrepancies['total_discrepancies'] += int(diff_count)
            
            if len(discrepancies['sample_discrepancies']) < 5:
                sample = merged[merged['diff']].iloc[0]
                discrepancies['sample_discrepancies'].append({
                    'node_id': sample[key_field],
                    'field': scraped_field,
                    'scraped_value': str(sample[scraped_col])[:100],
                    'local_value': str(sample[local_col])[:100]
                })
        else:
            print(f"{scraped_field:30} all match")
    
    print(f"Total discrepancies: {discrepancies['total_discrepancies']:,}")
    return discrepancies

def create_update_dataframe(scraped_df, local_df, field_mapping, key_field='node_id'):
    """Create dataframe showing what needs updating"""
    scraped_key = field_mapping.get(key_field, key_field)
    
    scraped_df = scraped_df.copy()
    local_df = local_df.copy()
    scraped_df[key_field] = scraped_df[key_field].astype(str)
    local_df[scraped_key] = local_df[scraped_key].astype(str).str.replace(r'^(WHP-|IDIEZ-)', '', regex=True)
    
    merged = scraped_df.merge(
        local_df,
        left_on=key_field,
        right_on=scraped_key,
        how='inner',
        suffixes=('_scraped', '_local')
    )
    
    updates = []
    
    for scraped_field, local_field in field_mapping.items():
        if scraped_field == key_field:
            continue
        
        if scraped_field not in scraped_df.columns or local_field not in local_df.columns:
            continue
        
        scraped_col = f"{scraped_field}_scraped" if scraped_field in local_df.columns else scraped_field
        local_col = f"{local_field}_local" if local_field in scraped_df.columns else local_field
        scraped_values = merged[scraped_col].fillna('').astype(str).str.strip()
        local_values = merged[local_col].fillna('').astype(str).str.strip()
        
        diff_mask = ~(
            (scraped_values == local_values) |
            (merged[scraped_col].isna() & merged[local_col].isna())
        )
        
        diff_rows = merged[diff_mask]
        
        for idx, row in diff_rows.iterrows():
            # Store STRIPPED values
            current_stripped = str(row[scraped_col]).strip() if pd.notna(row[scraped_col]) else ''
            new_stripped = str(row[local_col]).strip() if pd.notna(row[local_col]) else ''
            
            updates.append({
                'node_id': row[key_field],
                'field': scraped_field,
                'current_value': current_stripped,
                'new_value': new_stripped,
                'action': 'UPDATE'
            })
    
    return pd.DataFrame(updates)  

def apply_updates(updates_df, conn, table_name='dictionary_entries', dry_run=True):
    # apply updates to db
    print(f"APPLYING UPDATES TO {table_name}")
    print(f"Mode: {'DRY RUN' if dry_run else 'LIVE UPDATES'}")
    stats = {
        'total_updates': 0,
        'successful_updates': 0,
        'failed_updates': 0,
        'updates_by_field': defaultdict(int)
    }
    
    cursor = conn.cursor()
    grouped = updates_df.groupby('node_id')
    
    for node_id, group in grouped:
        try:
            set_clauses = []
            values = []
            
            for _, row in group.iterrows():
                set_clauses.append(f"{row['field']} = ?")
                values.append(row['new_value'])
                stats['updates_by_field'][row['field']] += 1
            
            values.append(node_id)
            sql = f"UPDATE {table_name} SET {', '.join(set_clauses)} WHERE node_id = ?"
            
            if not dry_run:
                cursor.execute(sql, values)
            
            stats['successful_updates'] += len(group)
            stats['total_updates'] += len(group)
            
        except Exception as e:
            print(f"Error updating node_id {node_id}: {e}")
            stats['failed_updates'] += len(group)
            stats['total_updates'] += len(group)
    
    if not dry_run:
        conn.commit()
        print("Changes committed")
    else:
        print("Dry run complete - no changes made")
    
    print(f"\nStatistics:")
    print(f"  Total updates: {stats['total_updates']:,}")
    print(f"  Successful: {stats['successful_updates']:,}")
    print(f"  Failed: {stats['failed_updates']:,}")
    print(f"\nUpdates by field:")
    for field, count in stats['updates_by_field'].items():
        print(f"  - {field}: {count:,}")
    
    return stats


for me it methodically makes sense to go down the tables that nahuat.db has (check config/schema.sql) and then check each one against the WHP_table_name and the IDIEZ_table_name as such let's begin with the largest tables first and build up from the tables. 
it would also be smart that as we cross validate we proceed to investigate where the cross references columns actually come from
also a side note since we've done no manual corrections or fixes to the IDIEZ fields we can begin with validating local IDIEZ with scraped IDIEZ data

In [145]:
scraped_idiez = pd.read_sql("""
    SELECT 
        node_id,
        headword_idiez,
        translation_english_idiez,
        definition_nahuatl_idiez,
        definition_spanish_idiez,
        morfologia_idiez,
        gramatica_idiez,
        source_dataset
    FROM dictionary_entries
    WHERE source_dataset IN ('IDIEZ', 'HYBRID')
""", scraped_db)

print(f"Loaded {len(scraped_idiez):,} IDIEZ/HYBRID entries from scraped DB")

# Load IDIEZ from local DB
local_idiez = pd.read_sql(f"""
    SELECT 
        Ref,
        tlahtolli,
        "IDIEZ traduc. inglés",
        "IDIEZ def. náhuatl",
        "IDIEZ def. español",
        "IDIEZ morfología",
        "IDIEZ gramática"
    FROM [{IDIEZ_TABLE}]
""", local_db)

print(f"Loaded {len(local_idiez):,} IDIEZ entries from local DB")

# Compare
idiez_discrepancies = compare_dataframes(
    scraped_idiez,
    local_idiez,
    IDIEZ_FIELD_MAPPING,
    key_field='node_id'
)

# Show samples
if idiez_discrepancies['sample_discrepancies']:
    print("\nSample IDIEZ discrepancies:")
    for i, sample in enumerate(idiez_discrepancies['sample_discrepancies'][:5], 1):
        print(f"\n{i}. node_id={sample['node_id']}, field={sample['field']}")
        print(f"   Scraped: {sample['scraped_value']}")
        print(f"   Local:   {sample['local_value']}")

Loaded 6,846 IDIEZ/HYBRID entries from scraped DB
Loaded 6,846 IDIEZ entries from local DB
Rows in scraped DB: 6,846
Rows in local DB: 6,846
Rows matched: 6,844
headword_idiez                 all match
translation_english_idiez          11 discrepancies
definition_nahuatl_idiez           24 discrepancies
definition_spanish_idiez          425 discrepancies
morfologia_idiez               all match
gramatica_idiez                all match
Total discrepancies: 460

Sample IDIEZ discrepancies:

1. node_id=187252, field=translation_english_idiez
   Scraped: 
   Local:   to transport rocks.

2. node_id=176130, field=definition_nahuatl_idiez
   Scraped: ICPATL. tlat. Tlamalintli iloh, iixnezca chipahuac quitequihuah ica quichihchihuah cantelah o ica tl
   Local:   ICPATL. tlat. Tlamalintli iloh, iixnezca chipahuac quitequihuah ica quichihchihuah cantelah o ica tl

3. node_id=172003, field=definition_spanish_idiez
   Scraped: A.1. se enfrìa. “Se enfría la tierra de noche. 2. Se va la luz. “Cuan

In [None]:
# creating the IDIEZ report
if idiez_discrepancies['total_discrepancies'] > 0:
    print("\nCreating IDIEZ update report...")
    
    idiez_updates = create_update_dataframe(
        scraped_idiez,
        local_idiez,
        IDIEZ_FIELD_MAPPING
    )
    
    print(f"Total IDIEZ updates needed: {len(idiez_updates):,}")
    print("\nUpdates by field:")
    print(idiez_updates['field'].value_counts())
    
    # idiez_updates.to_csv('idiez_updates_needed.csv', index=False, encoding='utf-8-sig')
    print("\nUpdate report saved to: idiez_updates_needed.csv")
else:
    print("\nNo IDIEZ updates needed - data matches perfectly!")
    idiez_updates = pd.DataFrame()


Creating IDIEZ update report...
Total IDIEZ updates needed: 460

Updates by field:
field
definition_spanish_idiez     425
definition_nahuatl_idiez      24
translation_english_idiez     11
Name: count, dtype: int64

Update report saved to: idiez_updates_needed.csv


In [159]:
scraped_whp = pd.read_sql("""
    SELECT 
        node_id,
        headword,
        orthographic_variants,
        translation_english,
        spanish_loanword,
        source_dataset
    FROM dictionary_entries
    WHERE source_dataset = 'WHP'
""", scraped_db)
print(f"Loaded {len(scraped_whp):,} WHP entries from scraped DB")

local_whp = pd.read_sql(f"""
    SELECT 
        Ref,
        Headword,
        "Orthographic Variants",
        "Principal English Translation",
        "Spanish Loanword"
    FROM [{WHP_TABLE}]
""", local_db)
print(f"Loaded {len(local_whp):,} WHP entries from local DB")
print("COMPARING WHP DATA")

whp_field_mapping = {
    'node_id': 'Ref',
    'headword': 'Headword',
    'orthographic_variants': 'Orthographic Variants',
    'translation_english': 'Principal English Translation',
    'spanish_loanword': 'Spanish Loanword',
}

whp_discrepancies = compare_dataframes(
    scraped_whp,
    local_whp,
    whp_field_mapping,
    key_field='node_id'
)

if whp_discrepancies['sample_discrepancies']:
    print("\nSample WHP discrepancies:")
    for i, sample in enumerate(whp_discrepancies['sample_discrepancies'][:5], 1):
        print(f"\n{i}. node_id={sample['node_id']}, field={sample['field']}")
        print(f"   Scraped: {sample['scraped_value']}")
        print(f"   Local:   {sample['local_value']}")

print("WHP COMPARISON COMPLETE")
print(f"Total discrepancies found: {whp_discrepancies['total_discrepancies']:,}")
print(f"Fields with discrepancies: {len(whp_discrepancies['field_discrepancies'])}")

Loaded 31,742 WHP entries from scraped DB
Loaded 31,806 WHP entries from local DB
COMPARING WHP DATA
Rows in scraped DB: 31,742
Rows in local DB: 31,806
Rows matched: 31,466
headword                           76 discrepancies
orthographic_variants             187 discrepancies
translation_english            31,466 discrepancies
spanish_loanword               31,466 discrepancies
Total discrepancies: 63,195

Sample WHP discrepancies:

1. node_id=172881, field=headword
   Scraped: -icxitlan.
   Local:   -icxtlan.

2. node_id=171987, field=orthographic_variants
   Scraped: canah
   Local:   None

3. node_id=171879, field=translation_english
   Scraped: <div class="field-item even"><p>perhaps not (adverb) (see Molina)</p>
</div>
   Local:   <p>perhaps not (adverb)</p>

4. node_id=171879, field=spanish_loanword
   Scraped: 
   Local:   No
WHP COMPARISON COMPLETE
Total discrepancies found: 63,195
Fields with discrepancies: 4


In [None]:
# Prepare node_id sets for comparison
scraped_ids = set(scraped_whp['node_id'].astype(str))
local_ids = set(local_whp['Ref'].astype(str).str.replace(r'^(WHP-|IDIEZ-)', '', regex=True))

# Find missing entries
missing_in_scraped = local_ids - scraped_ids
missing_in_local = scraped_ids - local_ids

print(f"\nEntries in LOCAL but NOT in SCRAPED: {len(missing_in_scraped):,}")
print(f"Entries in SCRAPED but NOT in LOCAL: {len(missing_in_local):,}")

if missing_in_scraped:
    missing_scraped_df = local_whp[local_whp['Ref'].astype(str).str.replace(r'^(WHP-|IDIEZ-)', '', regex=True).isin(missing_in_scraped)]
    missing_scraped_df.to_csv('./whp_missing_in_scraped.csv', index=False, encoding='utf-8-sig')
    print(f"\nSaved {len(missing_scraped_df):,} entries to whp_missing_in_scraped.csv")

if missing_in_local:
    # Ensure types match
    missing_local_df = scraped_whp[scraped_whp['node_id'].astype(str).isin(missing_in_local)]
    print(f"After filtering: {len(missing_local_df)} rows")
    
    if len(missing_local_df) > 0:
        missing_local_df.to_csv('./whp_missing_in_local.csv', index=False, encoding='utf-8-sig')
        print(f"Saved {len(missing_local_df):,} entries to whp_missing_in_local.csv")
        print(f"\nSample entries:")
        print(missing_local_df[['node_id', 'headword']].head(10))
    else:
        print("No matching rows found - type mismatch issue")
        


Entries in LOCAL but NOT in SCRAPED: 340
Entries in SCRAPED but NOT in LOCAL: 276

Saved 340 entries to whp_missing_in_scraped.csv
After filtering: 276 rows
Saved 276 entries to whp_missing_in_local.csv

Sample entries:
       node_id       headword
31466   211071   yecaxochitl.
31467   211072     texochitl.
31468   211073        teicui.
31469   211074  Itzcahuatzin.
31470   211075  Itzehecatzin.
31471   211076   Tlamatzinco.
31472   211077     amo ihual.
31473   211078    Techahuatl.
31474   211079     hualiloti.
31475   211080     Cualaztli.

INVESTIGATION COMPLETE


In [None]:
# Compare translation_english Content (HTML stripped)

def strip_html(text):
    if pd.isna(text) or text == '':
        return ''
    text = re.sub(r'<[^>]+>', '', str(text))
    text = ' '.join(text.split())
    return text.strip()

scraped_whp_copy = scraped_whp.copy()
local_whp_copy = local_whp.copy()
scraped_whp_copy['node_id'] = scraped_whp_copy['node_id'].astype(str)
local_whp_copy['Ref'] = local_whp_copy['Ref'].astype(str).str.replace(r'^(WHP-|IDIEZ-)', '', regex=True)

merged = scraped_whp_copy.merge(
    local_whp_copy,
    left_on='node_id',
    right_on='Ref',
    how='inner'
)

merged['scraped_text'] = merged['translation_english'].apply(strip_html)
merged['local_text'] = merged['Principal English Translation'].apply(strip_html)
merged['content_differs'] = merged['scraped_text'] != merged['local_text']

content_diff_count = merged['content_differs'].sum()

print(f"Rows compared: {len(merged):,}")
print(f"Actual content differences: {content_diff_count:,}")
print(f"HTML wrapper differences only: {len(merged) - content_diff_count:,}")

if content_diff_count > 0:
    content_diffs = merged[merged['content_differs']].head(20)
    for idx, row in content_diffs.iterrows():
        print(f"\nNode ID: {row['node_id']} - {row['headword']}")
        print(f"  Scraped: {row['scraped_text'][:150]}")
        print(f"  Local:   {row['local_text'][:150]}")
    
    content_diff_df = merged[merged['content_differs']][['node_id', 'headword', 'scraped_text', 'local_text']]
    content_diff_df.to_csv('./whp_translation_content_diffs.csv', index=False)
    print(f"\nSaved to whp_translation_content_diffs.csv")

Rows compared: 31,466
Actual content differences: 27,474
HTML wrapper differences only: 3,992

Node ID: 171879 - acazomo.
  Scraped: perhaps not (adverb) (see Molina)
  Local:   perhaps not (adverb)

Node ID: 171881 - ayac.
  Scraped: no one; nobody; or, for someone to be absent (see Lockhart)
  Local:   no one; nobody; or, for someone to be absent

Node ID: 171882 - acan.
  Scraped: nowhere, no place (see Molina, Karttunen, Lockhart, etc.)
  Local:   nowhere, no place

Node ID: 171884 - acatto.
  Scraped: first (see Karttunen)
  Local:   first

Node ID: 171885 - achi.
  Scraped: a little, a small amount; or, somehow (see Molina; and see our entry for achitzin)
  Local:   a little, a small amount; or, somehow

Node ID: 171886 - achitzin.
  Scraped: a bit, a little; not much (see also our entry for achi)
  Local:   a bit, a little; not much

Node ID: 171888 - achica.
  Scraped: often, frequently (see Molina); a little bit, a little bit of time; shortly; a little bit more
  Local:   ofte

In [148]:
scraped_db.close()
local_db.close()
print("Database connections closed")

Database connections closed
