# Nahuatl Notebook for the WHP_EarlyNahuatl_Dataset

This notebook processes Nahuatl dictionary data: it analyzes HTML tags, repairs malformed tags, and extracts citations and cross-references. It is a merged version of Todd's notebook and my own, and it uses a SQLite-based data management approach.

## Step 1: Setup and Imports

In [59]:
import os
import re
import csv
import sqlite3
import hashlib
import glob
from pathlib import Path
from datetime import datetime
from collections import defaultdict, Counter
from typing import List, Dict, Tuple, Optional, Union, Any

# Data processing
import pandas as pd
import numpy as np

# Character encoding detection
import chardet
from unidecode import unidecode

# HTML/XML processing
from bs4 import BeautifulSoup
from inscriptis import get_text

In [60]:
# Configuration constants
# Directory for intermediate CSV/Excel outputs (DataSaver creates it if missing).
WORKING_DIR = 'working_files'
DB_PATH = '../../data/sqLiteDb/nahuatl_processing.db'  # Single database for all operations
DEFAULT_ENCODING = 'utf-8-sig'  # UTF-8 with BOM: the encoding that keeps certain Nahuatl characters intact when outputting to CSV files.
CHECKPOINT_STAGES = ['initial', 'cleaned', 'final']  # Only save these to SQLite

## Step 2: Data Import and Working Copy Creation

In [61]:
class DataLoader:
    """Unified data loader for CSV and SQLite sources.

    Every loader returns a ``pandas.DataFrame`` and prints a short progress
    report to stdout.
    """

    def __init__(self, encoding: str = DEFAULT_ENCODING):
        """
        Args:
            encoding: Encoding handed to ``pd.read_csv`` unless auto-detection
                is requested.
        """
        self.encoding = encoding

    def detect_encoding(self, filepath: str) -> str:
        """Guess a file's encoding with chardet.

        Only the first 10,000 bytes are sampled.

        Returns:
            The encoding chardet reports, or ``'utf-8'`` when it reports none.
        """
        with open(filepath, 'rb') as file:
            raw_data = file.read(10000)  # Read first 10KB
        result = chardet.detect(raw_data)
        return result['encoding'] or 'utf-8'

    def load_from_csv(self, filepath: str, auto_detect_encoding: bool = False) -> pd.DataFrame:
        """Load data from CSV with encoding protection.

        Every column is read as ``str``; empty fields become NaN.

        Args:
            filepath: Path of the CSV file.
            auto_detect_encoding: Use ``detect_encoding`` instead of
                ``self.encoding``.

        Returns:
            The loaded DataFrame.

        Raises:
            UnicodeDecodeError: If decoding fails with the auto-detected
                encoding (there is nothing left to fall back to).
        """
        # Detect encoding if requested
        encoding = self.detect_encoding(filepath) if auto_detect_encoding else self.encoding

        print(f"Loading data from CSV: {filepath}")
        print(f"Encoding: {encoding}")

        try:
            df = pd.read_csv(
                filepath,
                encoding=encoding,
                na_values=[''],
                keep_default_na=True,
                dtype=str,  # Keep everything as strings initially
                low_memory=False
            )
        except UnicodeDecodeError as e:
            print(f"Encoding error: {e}")
            if auto_detect_encoding:
                # The detected encoding was already tried; retrying would
                # detect the same encoding again and recurse forever.
                raise
            print("Attempting with auto-detected encoding...")
            return self.load_from_csv(filepath, auto_detect_encoding=True)

        print(f"Successfully loaded {len(df):,} rows × {len(df.columns)} columns")
        print(f"Columns: {list(df.columns)[:5]}..." if len(df.columns) > 5 else f"   Columns: {list(df.columns)}")

        return df

    def load_from_sqlite(self, db_path: str, table_name: str) -> pd.DataFrame:
        """Load a whole table from a SQLite database.

        Args:
            db_path: Path of the SQLite file.
            table_name: Table to read; it must already exist.

        Returns:
            The table contents as a DataFrame.

        Raises:
            ValueError: If ``table_name`` is not a table in the database.
        """
        print(f"Loading data from SQLite: {db_path}")
        print(f"Table: {table_name}")

        # sqlite3's connection context manager only commits/rolls back; it does
        # not close the connection, so close it explicitly.
        conn = sqlite3.connect(db_path)
        try:
            # Check if table exists
            tables = pd.read_sql("SELECT name FROM sqlite_master WHERE type='table'", conn)
            if table_name not in tables['name'].values:
                raise ValueError(f"Table '{table_name}' not found in database")

            df = pd.read_sql(f"SELECT * FROM [{table_name}]", conn)
        finally:
            conn.close()

        print(f"Successfully loaded {len(df):,} rows × {len(df.columns)} columns")

        return df

    def create_working_copy(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Create a working copy while preserving the original.

        Returns:
            ``(original_df, working_df)``: two independent deep copies of ``df``.
        """
        original_df = df.copy(deep=True)
        working_df = df.copy(deep=True)

        print("Created working copy of data")

        return original_df, working_df

## Step 3: Intermediate Save Functions

In [62]:
class DataSaver:
    """Handles saving data at various stages with encoding protection."""

    def __init__(self, working_dir: str = WORKING_DIR, encoding: str = DEFAULT_ENCODING):
        """
        Args:
            working_dir: Output directory for CSV/Excel files; created if missing.
            encoding: Encoding used when writing CSV files.
        """
        self.working_dir = working_dir
        self.encoding = encoding
        os.makedirs(working_dir, exist_ok=True)

    def save_to_csv(self, df: pd.DataFrame, filename: str, add_timestamp: bool = True) -> str:
        """Save DataFrame to CSV with encoding protection.

        Args:
            df: Data to write (index is not written; NaN becomes an empty field).
            filename: File name relative to ``working_dir``.
            add_timestamp: Append ``_YYYYmmdd_HHMMSS`` before the ``.csv``
                extension.

        Returns:
            Path of the written file.
        """
        if add_timestamp:
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            # Strip only a trailing '.csv'; str.replace would also remove
            # '.csv' occurring in the middle of the name.
            base_name = filename[:-len('.csv')] if filename.endswith('.csv') else filename
            filename = f"{base_name}_{timestamp}.csv"

        filepath = os.path.join(self.working_dir, filename)

        df.to_csv(
            filepath,
            index=False,
            encoding=self.encoding,
            na_rep='',  # Explicit NA representation
            quoting=csv.QUOTE_MINIMAL
        )

        print(f"Saved to CSV: {filepath}")
        return filepath

    def save_checkpoint_to_sqlite(self, df: pd.DataFrame, checkpoint_name: str, conn: sqlite3.Connection) -> None:
        """Save checkpoint to SQLite (only for critical stages).

        The data goes to ``checkpoint_<name>_<YYYYmmdd>`` (replacing a table of
        the same name, i.e. an earlier checkpoint from the same day) and one
        row describing it is appended to ``checkpoint_metadata``.

        Args:
            df: Data to checkpoint.
            checkpoint_name: Logical checkpoint name.
            conn: Open SQLite connection to write to.
        """
        table_name = f"checkpoint_{checkpoint_name}_{datetime.now().strftime('%Y%m%d')}"

        df.to_sql(
            table_name,
            conn,
            if_exists='replace',
            index=False,
            dtype='text'  # Store everything as text to preserve formatting
        )

        print(f"Checkpoint saved to SQLite: {table_name}")

        # Also save metadata
        metadata = pd.DataFrame([{
            'checkpoint_name': checkpoint_name,
            'table_name': table_name,
            'timestamp': datetime.now().isoformat(),
            'row_count': len(df),
            'column_count': len(df.columns),
            # str() so non-string column labels do not break the join
            'columns': ','.join(map(str, df.columns))
        }])

        metadata.to_sql(
            'checkpoint_metadata',
            conn,
            if_exists='append',
            index=False
        )

    def save_to_excel(self, data_dict: Dict[str, pd.DataFrame], filename: str) -> str:
        """Save multiple DataFrames to one Excel file, one sheet per entry.

        Args:
            data_dict: Mapping of sheet name to DataFrame.
            filename: File name relative to ``working_dir``.

        Returns:
            Path of the written file.
        """
        filepath = os.path.join(self.working_dir, filename)

        with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
            for sheet_name, df in data_dict.items():
                # Excel sheet name limit is 31 characters
                clean_sheet_name = sheet_name[:31]
                df.to_excel(writer, sheet_name=clean_sheet_name, index=False)

        print(f"Saved to Excel: {filepath}")
        return filepath

## Step 4: Database Initialization and Setup

In [63]:
class DatabaseManager:
    """Manages the SQLite database connection and operations.

    Constructing an instance opens (and if necessary creates) the database,
    ensures the metadata tables exist and prints an overview of its contents.
    """

    def __init__(self, db_path: str = DB_PATH):
        """
        Args:
            db_path: Path of the SQLite file; parent directories are created.
        """
        self.db_path = db_path
        self.conn = None  # sqlite3.Connection once initialized; None after close()
        self._initialize_database()

    def _initialize_database(self):
        """Initialize database and create necessary tables."""
        # Create directory if it doesn't exist
        db_dir = os.path.dirname(self.db_path)
        if db_dir:
            os.makedirs(db_dir, exist_ok=True)

        # Connect to database
        self.conn = sqlite3.connect(self.db_path)

        print(f"Database initialized: {self.db_path}")

        # Create metadata tables if they don't exist
        self._create_metadata_tables()

        # Show existing tables
        self._show_database_info()

    def _create_metadata_tables(self):
        """Create the ``checkpoint_metadata`` and ``processing_log`` tables.

        Raises:
            ValueError: If the connection has not been opened.
        """
        if self.conn is None:
            raise ValueError("Database connection is not initialized.")

        # Checkpoint metadata table
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS checkpoint_metadata (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            checkpoint_name TEXT,
            table_name TEXT,
            timestamp TEXT,
            row_count INTEGER,
            column_count INTEGER,
            columns TEXT
            )
        """)

        # Processing log table
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS processing_log (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                stage TEXT,
                status TEXT,
                timestamp TEXT,
                duration_seconds REAL,
                rows_processed INTEGER,
                notes TEXT
            )
        """)

        self.conn.commit()

    def import_initial_dataset(self, csv_path: str, table_name: str = 'WHP_EarlyNahuatl_Data',
                              encoding: str = DEFAULT_ENCODING, replace: bool = False) -> bool:
        """
        Import initial CSV dataset into SQLite as the base reference.
        This is a ONE-TIME operation to establish the source data in the database.

        Args:
            csv_path: CSV file to import.
            table_name: Destination table.
            encoding: Encoding used to read the CSV.
            replace: Overwrite an existing table without asking. When False and
                the table exists, the user is prompted interactively.

        Returns:
            True when the import succeeded and the row count was verified,
            False when it was cancelled, failed, or the row count mismatched.

        Raises:
            ValueError: If the database connection is not initialized.
        """
        if self.conn is None:
            raise ValueError("Database connection is not initialized.")

        print("\n" + "=" * 70)
        print("INITIAL DATASET IMPORT TO SQLITE")
        print("=" * 70)

        # Check if table already exists
        existing_tables = pd.read_sql(
            "SELECT name FROM sqlite_master WHERE type='table'",
            self.conn
        )

        # Whether to_sql may overwrite: requested by the caller or confirmed
        # at the prompt below.
        overwrite = replace

        if table_name in existing_tables['name'].values:
            if not replace:
                print(f"Table '{table_name}' already exists!")
                response = input("Do you want to replace it? (yes/no): ").strip().lower()
                if response != 'yes':
                    print("Import cancelled")
                    return False
                # Honour the confirmation; otherwise to_sql(if_exists='fail')
                # would reject the import the user just approved.
                overwrite = True
            else:
                print(f" Replacing existing table '{table_name}'")

        try:
            # Load CSV with encoding protection
            print(f"Reading CSV file: {csv_path}")
            print(f"   Encoding: {encoding}")

            # Report what chardet thinks (informational only; `encoding` is used)
            with open(csv_path, 'rb') as file:
                raw_data = file.read(10000)
                detected = chardet.detect(raw_data)
                print(f"   Detected encoding: {detected['encoding']} (confidence: {detected['confidence']:.2%})")

            # Load the CSV
            df = pd.read_csv(
                csv_path,
                encoding=encoding,
                na_values=[''],
                keep_default_na=True,
                dtype=str,  # Import everything as text to preserve formatting
                low_memory=False
            )

            print(f"Loaded {len(df):,} rows × {len(df.columns)} columns")

            # Show sample of data
            print("\nData Sample (first 3 rows):")
            print(df.head(3).to_string(max_cols=5))

            # Import to SQLite
            print(f"\nImporting to SQLite table: {table_name}")

            df.to_sql(
                table_name,
                self.conn,
                if_exists='replace' if overwrite else 'fail',
                index=False,
                dtype='text',  # Store as text to preserve all characters
                chunksize=5000  # Process in chunks for large datasets
            )

            # Create indexes for common query columns
            print("Creating indexes...")

            # Identifiers are bracket-quoted so table names containing spaces
            # or punctuation do not break the statements.
            # Add index on Ref column if it exists
            if 'Ref' in df.columns:
                self.conn.execute(
                    f"CREATE INDEX IF NOT EXISTS [idx_{table_name}_ref] ON [{table_name}](Ref)"
                )

            # Add index on Headword column if it exists
            if 'Headword' in df.columns:
                self.conn.execute(
                    f"CREATE INDEX IF NOT EXISTS [idx_{table_name}_headword] ON [{table_name}](Headword)"
                )

            self.conn.commit()

            # Verify import
            verify_count = pd.read_sql(f"SELECT COUNT(*) as cnt FROM [{table_name}]", self.conn).iloc[0, 0]

            if verify_count == len(df):
                print(f"Successfully imported {verify_count:,} rows")

                # Log the import
                import_metadata = pd.DataFrame([{
                    'operation': 'initial_import',
                    'source_file': csv_path,
                    'table_name': table_name,
                    'timestamp': datetime.now().isoformat(),
                    'row_count': verify_count,
                    'column_count': len(df.columns),
                    'columns': ','.join(df.columns),
                    'encoding_used': encoding
                }])

                import_metadata.to_sql(
                    'import_history',
                    self.conn,
                    if_exists='append',
                    index=False
                )

                print("Import logged to metadata")
                return True
            else:
                print(f"Row count mismatch! Expected {len(df)}, got {verify_count}")
                return False

        except Exception as e:
            # Deliberate best-effort: report the failure and signal it via the
            # return value instead of aborting the notebook run.
            print(f"Import failed: {e}")
            return False

    def _show_database_info(self):
        """Print the tables (with row counts) and views present in the database."""
        tables = pd.read_sql(
            "SELECT name, type FROM sqlite_master WHERE type IN ('table', 'view')",
            self.conn
        )

        if not tables.empty:
            print(f"Existing database objects: {len(tables)}")
            for _, row in tables.iterrows():
                # Get row count for tables
                if row['type'] == 'table':
                    try:
                        count = pd.read_sql(f"SELECT COUNT(*) as cnt FROM [{row['name']}]", self.conn).iloc[0, 0]
                        print(f"   - {row['name']} ({row['type']}): {count:,} rows")
                    except Exception:
                        # Counting is informational; still list the table.
                        print(f"   - {row['name']} ({row['type']})")
                else:
                    # Views are included in the object count above, so list them too.
                    print(f"   - {row['name']} ({row['type']})")
        else:
            print("No existing tables in database")

    def log_processing_stage(self, stage: str, status: str,
                        duration: Optional[float] = None,
                        rows_processed: Optional[int] = None,
                        notes: Optional[str] = None):
        """Append one row to the ``processing_log`` table.

        Args:
            stage: Name of the processing stage.
            status: Status label for the stage.
            duration: Stored as ``duration_seconds``.
            rows_processed: Number of rows handled, if known.
            notes: Free-text remarks.
        """
        log_entry = pd.DataFrame([{
            'stage': stage,
            'status': status,
            'timestamp': datetime.now().isoformat(),
            'duration_seconds': duration,
            'rows_processed': rows_processed,
            'notes': notes
        }])

        log_entry.to_sql('processing_log', self.conn, if_exists='append', index=False)

    def verify_base_dataset(self, table_name: str = 'WHP_EarlyNahuatl_Data') -> bool:
        """Verify that the base dataset table exists and can be queried.

        Returns:
            True when the table exists and its row/column counts could be read;
            False otherwise (the reason is printed).
        """
        try:
            # Check if table exists (parameterized: the name is data here)
            tables = pd.read_sql(
                "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
                self.conn,
                params=[table_name]
            )

            if tables.empty:
                print(f"Base dataset table '{table_name}' not found")
                return False

            # Get basic info
            count = pd.read_sql(f"SELECT COUNT(*) as cnt FROM [{table_name}]", self.conn).iloc[0, 0]
            cols = pd.read_sql(f"PRAGMA table_info([{table_name}])", self.conn)

            print(f"   Base dataset verified:")
            print(f"   Table: {table_name}")
            print(f"   Rows: {count:,}")
            print(f"   Columns: {len(cols)}")

            return True

        except Exception as e:
            print(f"Verification failed: {e}")
            return False

    def get_checkpoint_list(self) -> pd.DataFrame:
        """Get list of available checkpoints, newest first.

        Returns:
            The ``checkpoint_metadata`` rows, or an empty DataFrame when the
            query fails.
        """
        try:
            checkpoints = pd.read_sql(
                "SELECT * FROM checkpoint_metadata ORDER BY timestamp DESC",
                self.conn
            )
            return checkpoints
        except Exception:
            # Best-effort listing; KeyboardInterrupt/SystemExit still propagate.
            return pd.DataFrame()

    def load_checkpoint(self, checkpoint_name: Optional[str] = None,
                   table_name: Optional[str] = None) -> pd.DataFrame:
        """Load a specific checkpoint.

        Args:
            checkpoint_name: Logical name; the most recent matching entry in
                ``checkpoint_metadata`` is loaded.
            table_name: Exact table to load; takes precedence over
                ``checkpoint_name``.

        Returns:
            The checkpoint contents.

        Raises:
            ValueError: If neither argument is given or no checkpoint matches.
        """
        if table_name:
            return pd.read_sql(f"SELECT * FROM [{table_name}]", self.conn)
        elif checkpoint_name:
            # Get most recent checkpoint with this name
            query = """
                SELECT table_name 
                FROM checkpoint_metadata 
                WHERE checkpoint_name = ? 
                ORDER BY timestamp DESC 
                LIMIT 1
            """
            result = pd.read_sql(query, self.conn, params=[checkpoint_name])
            if not result.empty:
                table_name = str(result.iloc[0, 0])
                return pd.read_sql(f"SELECT * FROM [{table_name}]", self.conn)

        raise ValueError("No checkpoint found")

    def close(self):
        """Close database connection (safe to call more than once)."""
        if self.conn:
            self.conn.close()
            # Drop the closed handle so later calls are no-ops and other
            # methods do not try to use it.
            self.conn = None
            print("Database connection closed")

## Step 5: HTML Tag Handler (Detection + Repair)

In [64]:
class HTMLTagHandler:
    """Handles HTML tag detection and repair."""

    def __init__(self):
        # Tag names treated as legitimate markup by classify_tag
        self.valid_tags = {
            'p', 'br', 'div', 'span', 'a', 'b', 'i', 'u', 'strong', 'em',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'li', 'table',
            'tr', 'td', 'th', 'img', 'link', 'meta', 'head', 'body', 'html',
            'bibl', 'title', 'sup', 'sub', 'del'
        }

        # Known malformations and their fixes (applied in this order by repair_text)
        self.malformed_fixes = {
            '</p</bibl>': '</p></bibl>',       
            '<bibl<': '<bibl>',
            '</bibbl>': '</bibl>',
            '<bibbl>': '<bibl>',
            '<bobl>': '<bibl>',
            '</bobl>': '</bibl>',
            '<b9bl>': '<bibl>',
            '<bibi>': '<bibl>',
            '<bibl></p>': '</bibl></p>',
        }

    def find_all_tags(self, text: str) -> List[str]:
        """Find all angle bracket patterns.

        Returns:
            Every ``<...>`` / ``</...>`` span that contains no nested angle
            bracket, in order of appearance; ``[]`` for missing/empty input.
        """
        if pd.isna(text) or not text:
            return []
        return re.findall(r'</?[^<>]+/?>', str(text))

    def classify_tag(self, tag: str) -> Tuple[str, Optional[str]]:
        """
        Classify a tag as: valid_html, malformed_html, or non_html
        Returns: (tag_type, suggested_fix)

        ``suggested_fix`` is None for valid and non-HTML tags. For a malformed
        tag it is the replacement, or the tag itself when no automatic fix is
        known.
        """
        # Check if it's a known malformation
        if tag in self.malformed_fixes:
            return 'malformed_html', self.malformed_fixes[tag]

        # Extract tag name
        match = re.match(r'^</?([^>\s/]+)', tag)
        if match:
            tag_name = match.group(1).lower()
            if tag_name in self.valid_tags:
                return 'valid_html', None

            # Check if it looks like HTML but misspelled (substring heuristic)
            if any(valid in tag_name for valid in ['bibl', 'p', 'br', 'div']):
                # Suggest a fix based on what it looks like
                if 'bibl' in tag_name:
                    return 'malformed_html', '</bibl>' if tag.startswith('</') else '<bibl>'
                return 'malformed_html', tag  # Can't auto-fix

        # Not HTML at all
        return 'non_html', None

    def repair_text(self, text: str) -> Tuple[str, int]:
        """
        Repair all malformed HTML tags in text
        Returns: (repaired_text, number_of_repairs)

        The count covers both the literal replacements from
        ``malformed_fixes`` and the regex repairs of doubled angle brackets.
        Missing or empty input is returned unchanged with a count of 0.
        """
        if pd.isna(text) or not text:
            return text, 0

        text_str = str(text)
        repairs_made = 0

        # Apply known fixes
        for malformed, fix in self.malformed_fixes.items():
            count = text_str.count(malformed)
            if count:
                text_str = text_str.replace(malformed, fix)
                repairs_made += count

        # Apply pattern-based fixes; subn reports how many substitutions were
        # made so these repairs are included in the returned count.
        pattern_fixes = (
            (r'<<(\w+)>', r'<\1>'),  # Double opening
            (r'<(\w+)>>', r'<\1>'),  # Double closing
        )
        for pattern, replacement in pattern_fixes:
            text_str, substitutions = re.subn(pattern, replacement, text_str)
            repairs_made += substitutions

        return text_str, repairs_made

    def encode_non_html(self, text: str) -> str:
        """Encode non-HTML angle brackets.

        Every bracketed span that ``classify_tag`` labels ``non_html`` has its
        ``<``/``>`` replaced by ``&lt;``/``&gt;``; other tags are left alone.
        """
        if pd.isna(text) or not text:
            return text

        text_str = str(text)
        tags = self.find_all_tags(text_str)

        for tag in tags:
            tag_type, _ = self.classify_tag(tag)
            if tag_type == 'non_html':
                encoded = tag.replace('<', '&lt;').replace('>', '&gt;')
                text_str = text_str.replace(tag, encoded)

        return text_str

### Step 5.1 HTML Malformed Tag Repair

In [65]:
class MalformedTagRepairer:
    """Specialized handler for malformed HTML tag repair"""
    
    def __init__(self):
        """Set up the tag vocabularies and repair tables used by this class."""
        # Helper instance of HTMLTagHandler (not used by the methods visible in this section)
        self.html_analyzer = HTMLTagHandler() 
        # Tag names whose open/close counts detect_mismatches compares
        self.html_tags = {
            'p', 'br', 'div', 'span', 'a', 'b', 'i', 'u', 'strong', 'em',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'li', 'table',
            'tr', 'td', 'th', 'img', 'bibl', 'title', 'sup', 'sub', 'del'
        }
        # Tags without a closing partner; detect_mismatches skips them
        self.self_closing_tags = {'br', 'hr', 'img', 'input', 'meta', 'link'}
        # Specific patterns for actual HTML malformations
        # (literal substring -> replacement; searched for by analyze_malformed_tags
        # and applied by repair_tags)
        self.html_malformation_patterns = {
            '</bibbl>': '</bibl>',
            '<bibbl>': '<bibl>',
            '</bobl>': '</bibl>',
            '<bobl>': '<bibl>',
            '<b9bl>': '<bibl>',
            '<bibi>': '<bibl>',
            '</p</bibl>': '</p></bibl>',
            '<p<': '<p>',
            '</p>p>': '</p>',
        }
        # NOTE(review): presumably the tags eligible for automatic mismatch
        # repair; not referenced in the methods visible here. Confirm usage.
        self.auto_repairable_tags = {'bibl'}    
        # Per-tag regex rules, tried in list order by _repair_text_missing_closes;
        # a closing tag is inserted at the start of each match.
        self.tag_closing_heuristics = {
            'bibl': [
                r'</p>',           # End at paragraph close
                r'\n\s*<p',        # Before new paragraph  
                r'\.\s*(?=<|$)',   # After period at end of sentence
                r'$'               # End of text as fallback
            ]
        }
        # Regex pattern -> replacement template; analyze_malformed_tags reports
        # matches under a "REGEX:" prefix.
        self.regex_patterns = {
            r'<<(\w+)>': r'<\1>',        # Double opening
            r'<(\w+)>>': r'<\1>',        # Double closing
        }
        
    def detect_mismatches(self, text: str) -> list[dict[str, str]]:
        """Detect tag count mismatches"""
        if pd.isna(text) or text == '':
            return []
        
        mismatches = []
        text_str = str(text)
        
        for tag_name in self.html_tags:
            if tag_name in self.self_closing_tags:
                continue
            
            # Count opening and closing tags
            open_pattern = f"<{tag_name}(?:\\s+[^>]*)?>"
            close_pattern = f'</{tag_name}>'
            
            open_count = len(re.findall(open_pattern, text_str, re.IGNORECASE))
            close_count = len(re.findall(close_pattern, text_str, re.IGNORECASE))
            
            if open_count != close_count:
                mismatches.append({
                    'tag': tag_name,
                    'open_count': open_count,
                    'close_count': close_count,
                    'issue': f'Mismatch: {open_count} open, {close_count} closed'
                })
        
        return mismatches
    
    
    def analyze_malformed_tags(self, df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
        """Analyze both malformed patterns AND mismatches"""
        from collections import Counter
        
        results = {
            'malformed_by_row': [],
            'malformed_summary': []
        }
        
        for idx, row in df.iterrows():
            for col in df.columns:
                cell_value = row[col]
                if pd.notna(cell_value) and cell_value != '':
                    text_str = str(cell_value)
                    
                    # Check for literal malformed patterns
                    for pattern, replacement in self.html_malformation_patterns.items():
                        if pattern in text_str:
                            results['malformed_by_row'].append({
                                'Row': idx,
                                'Column': col,
                                'Malformed_Tag': pattern,
                                'Suggested_Repair': replacement,
                                'Context': text_str[:100]
                            })
                    
                    # ADD MISMATCH DETECTION
                    mismatches = self.detect_mismatches(text_str)
                    for mismatch in mismatches:
                        results['malformed_by_row'].append({
                            'Row': idx,
                            'Column': col,
                            'Malformed_Tag': f"<{mismatch['tag']}>",
                            'Suggested_Repair': mismatch['issue'],
                            'Context': text_str[:100]
                        })
                        
        for idx, row in df.iterrows():
            for col in df.columns:
                cell_value = row[col]
                if pd.notna(cell_value) and cell_value != '':
                    text_str = str(cell_value)
                    
                    # Check regex patterns
                    for pattern, replacement in self.regex_patterns.items():
                        if re.search(pattern, text_str):
                            results['malformed_by_row'].append({
                                'Row': idx,
                                'Column': col,
                                'Malformed_Tag': f"REGEX:{pattern}",
                                'Suggested_Repair': replacement,
                                'Context': text_str[:100]
                            })
        # Create summary
        if results['malformed_by_row']:
            pattern_counts = Counter(item['Malformed_Tag'] for item in results['malformed_by_row'])
            
            for pattern, count in pattern_counts.items():
                first_occurrence = next(item for item in results['malformed_by_row'] 
                                       if item['Malformed_Tag'] == pattern)
                
                results['malformed_summary'].append({
                    'Malformed_Tag': pattern,
                    'Count': count,
                    'Suggested_Repair': first_occurrence['Suggested_Repair'],
                    'Sample_Context': first_occurrence['Context']
                })
        
        malformed_by_row_df = pd.DataFrame(results['malformed_by_row'])
        malformed_summary_df = pd.DataFrame(results['malformed_summary'])
        
        if not malformed_summary_df.empty:
            malformed_summary_df = malformed_summary_df.sort_values('Count', ascending=False)
        
        return {
            'Malformed_Tags_by_Row': malformed_by_row_df,
            'Malformed_Tags_Summary': malformed_summary_df
        }
    
    def repair_tags(self, df: pd.DataFrame, malformed_analysis: Dict) -> pd.DataFrame:
        """Apply both malformed tag repairs AND mismatch repairs"""
        df_repaired = df.copy()
        
        if malformed_analysis['Malformed_Tags_Summary'].empty:
            return df_repaired
        
        # 1. Apply literal pattern replacements (existing logic)
        literal_repairs = 0
        for _, row in malformed_analysis['Malformed_Tags_Summary'].iterrows():
            pattern = row['Malformed_Tag']
            repair = row['Suggested_Repair']
            
            # Skip mismatches and regex patterns for now
            if "Mismatch:" in str(repair) or pattern.startswith("REGEX:"):
                continue
            
            # Apply literal pattern replacements
            if pattern in self.html_malformation_patterns:
                for col in df_repaired.columns:
                    if df_repaired[col].dtype == 'object':
                        before_count = df_repaired[col].astype(str).str.contains(
                            re.escape(pattern), regex=True
                        ).sum()
                        
                        df_repaired[col] = df_repaired[col].astype(str).str.replace(
                            pattern, repair, regex=False
                        )
                        
                        after_count = df_repaired[col].astype(str).str.contains(
                            re.escape(pattern), regex=True
                        ).sum()
                        
                        literal_repairs += (before_count - after_count)
        
        print(f"  Applied {literal_repairs} literal tag repairs")
        
        # 2. Apply mismatch repairs (NEW!)
        df_repaired = self.repair_tag_mismatches(df_repaired, malformed_analysis)
        
        return df_repaired

    def generate_mismatch_report(self, malformed_analysis: Dict) -> pd.DataFrame:
            """Generate a report of mismatches for manual review"""
            mismatch_rows = []
            
            for _, row in malformed_analysis['Malformed_Tags_Summary'].iterrows():
                if "Mismatch:" in str(row['Suggested_Repair']):
                    mismatch_rows.append({
                        'Tag': row['Malformed_Tag'],
                        'Issue': row['Suggested_Repair'],
                        'Occurrences': row['Count'],
                        'Sample_Context': row['Sample_Context'],
                        'Action_Required': 'Manual review needed'
                    })
            
            return pd.DataFrame(mismatch_rows)
    
    def _extract_mismatch_info(self, malformed_analysis: Dict) -> Dict:
        """Extract mismatch information from analysis"""
        mismatches = {}
        
        for _, row in malformed_analysis['Malformed_Tags_Summary'].iterrows():
            if "Mismatch:" in str(row['Suggested_Repair']):
                tag = row['Malformed_Tag'].replace('<', '').replace('>', '')
                issue = row['Suggested_Repair']
                
                # Parse "Mismatch: X open, Y closed"
                import re
                match = re.search(r'Mismatch: (\d+) open, (\d+) closed', issue)
                if match:
                    mismatches[tag] = {
                        'open_count': int(match.group(1)),
                        'close_count': int(match.group(2)),
                        'occurrences': row['Count']
                    }
        
        return mismatches
    
    def _add_missing_closing_tags(self, df: pd.DataFrame, tag_name: str, missing_count: int) -> tuple[pd.DataFrame, int]:
        """Add missing closing tags at logical boundaries"""
        df_repaired = df.copy()
        total_repairs = 0
        
        for col in df_repaired.columns:
            if df_repaired[col].dtype != 'object':
                continue
                
            for idx in df_repaired.index:
                text = df_repaired.at[idx, col]
                if pd.isna(text) or text == '':
                    continue
                
                text_str = str(text)
                # Check if this cell has the tag mismatch
                open_count = len(re.findall(f"<{tag_name}(?:\\s+[^>]*)?>" , text_str, re.IGNORECASE))
                close_count = len(re.findall(f'</{tag_name}>', text_str, re.IGNORECASE))
                
                if open_count > close_count:
                    cell_missing = open_count - close_count
                    repaired_text = self._repair_text_missing_closes(text_str, tag_name, cell_missing)
                    
                    if repaired_text != text_str:
                        df_repaired.at[idx, col] = repaired_text
                        total_repairs += cell_missing
        
        return df_repaired, total_repairs
    
    def _repair_text_missing_closes(self, text: str, tag_name: str, missing_count: int) -> str:
        """Apply smart heuristics to add missing closing tags"""
        if tag_name not in self.tag_closing_heuristics:
            return text
        
        rules = self.tag_closing_heuristics[tag_name]
        repaired_text = text
        remaining_to_fix = missing_count
        
        # Try each heuristic rule in order of preference
        for rule in rules:
            if remaining_to_fix <= 0:
                break
                
            # Find all potential closing positions
            matches = list(re.finditer(rule, repaired_text))
            
            # Insert closing tags (in reverse order to preserve positions)
            insert_count = min(len(matches), remaining_to_fix)
            for match in reversed(matches[:insert_count]):
                pos = match.start()
                repaired_text = (repaired_text[:pos] + 
                               f'</{tag_name}>' + 
                               repaired_text[pos:])
                remaining_to_fix -= 1
        
        return repaired_text
    
    def _add_missing_opening_tags(self, text: str, tag_name: str, missing_count: int) -> str:
        """Add opening tags before orphaned closing tags using smart heuristics"""
        
        # Define backward-looking heuristics for different tag types
        opening_heuristics = {
            'bibl': [
                r'([^<>\n]{1,100})(\s*</bibl>)', # Longer content for bibliography
            ],
        }
        
        if tag_name not in opening_heuristics:
            return text
        
        repaired_text = text
        rules = opening_heuristics[tag_name]
        remaining_to_fix = missing_count
        
        # Apply heuristics to add opening tags
        for rule in rules:
            if remaining_to_fix <= 0:
                break
                
            # Find matches and add opening tags (process in reverse to preserve positions)
            matches = list(re.finditer(rule, repaired_text))
            for match in reversed(matches[:remaining_to_fix]):
                content = match.group(1)
                closing_part = match.group(2)
                
                # Insert opening tag before the content
                replacement = f'<{tag_name}>{content}{closing_part}'
                start, end = match.span()
                repaired_text = repaired_text[:start] + replacement + repaired_text[end:]
                remaining_to_fix -= 1
        
        return repaired_text
    
    def _fix_orphaned_closing_tags(self, df: pd.DataFrame, tag_name: str, orphaned_count: int) -> tuple[pd.DataFrame, int]:
        """Add missing opening tags for orphaned closing tags"""
        df_repaired = df.copy()
        total_additions = 0
        
        for col in df_repaired.columns:
            if df_repaired[col].dtype != 'object':
                continue
                
            for idx in df_repaired.index:
                text = df_repaired.at[idx, col]
                if pd.isna(text) or text == '':
                    continue
                
                text_str = str(text)
                open_count = len(re.findall(f"<{tag_name}(?:\\s+[^>]*)?>" , text_str, re.IGNORECASE))
                close_count = len(re.findall(f'</{tag_name}>', text_str, re.IGNORECASE))
                
                if close_count > open_count:
                    excess_closes = close_count - open_count
                    repaired_text = self._add_missing_opening_tags(text_str, tag_name, excess_closes)
                    
                    if repaired_text != text_str:
                        df_repaired.at[idx, col] = repaired_text
                        total_additions += excess_closes
        
        return df_repaired, total_additions
    
    def repair_tag_mismatches(self, df: pd.DataFrame, malformed_analysis: Dict) -> pd.DataFrame:
        """Repair ONLY auto-repairable tag mismatches (currently just bibl)"""
        df_repaired = df.copy()
        repairs_made = 0
        manual_review_needed = []
        
        # Extract mismatch information from analysis
        mismatch_info = self._extract_mismatch_info(malformed_analysis)
        
        for tag_name, details in mismatch_info.items():
            open_count = details['open_count']
            close_count = details['close_count']
            
            if tag_name in self.auto_repairable_tags:
                # AUTO-REPAIR: Only bibl tags
                if open_count > close_count:
                    missing_closes = open_count - close_count
                    df_repaired, fixed_count = self._add_missing_closing_tags(
                        df_repaired, tag_name, missing_closes
                    )
                    repairs_made += fixed_count
                    print(f"  ✓ AUTO-FIXED: Added {fixed_count} missing </{tag_name}> tags")
                    
                elif close_count > open_count:
                    orphaned_closes = close_count - open_count
                    df_repaired, added_count = self._fix_orphaned_closing_tags(
                        df_repaired, tag_name, orphaned_closes
                    )
                    repairs_made += added_count
                    print(f"  ✓ AUTO-FIXED: Added {added_count} missing <{tag_name}> opening tags")
            else:
                # MANUAL REVIEW: All other tags
                manual_review_needed.append({
                    'tag': tag_name,
                    'open_count': open_count,
                    'close_count': close_count,
                    'occurrences': details['occurrences']
                })
                print(f"MANUAL REVIEW: <{tag_name}> has {open_count} open, {close_count} closed ({details['occurrences']} occurrences)")
        
        if manual_review_needed:
            print(f"\nSUMMARY: {len(manual_review_needed)} tag types need manual review:")
            for item in manual_review_needed:
                print(f"   - <{item['tag']}>: {item['open_count']} open, {item['close_count']} closed")
        
        print(f"Total automatic repairs: {repairs_made}")
        return df_repaired
        
    def find_mismatch_cells(self, df: pd.DataFrame, target_tags: list) -> pd.DataFrame:
        """Find specific cells that contain tag mismatches"""
        problem_cells = []
        
        for col in df.columns:
            if df[col].dtype != 'object':
                continue
                
            for idx in df.index:
                text = df.at[idx, col]
                if pd.isna(text) or text == '':
                    continue
                    
                text_str = str(text)
                
                for tag_name in target_tags:
                    if tag_name in self.self_closing_tags:
                        continue
                        
                    # Count tags
                    open_count = len(re.findall(f"<{tag_name}(?:\\s+[^>]*)?>" , text_str, re.IGNORECASE))
                    close_count = len(re.findall(f'</{tag_name}>', text_str, re.IGNORECASE))
                    
                    if open_count != close_count:
                        problem_cells.append({
                            'Row': idx + 1,  # +1 for Excel-style numbering
                            'Column': col,
                            'Tag': tag_name,
                            'Open_Count': open_count,
                            'Close_Count': close_count,
                            'Content_Preview': text_str[:200] + '...' if len(text_str) > 200 else text_str
                        })
        
        return pd.DataFrame(problem_cells).sort_values(['Tag', 'Row'])  
    
    def get_repair_statistics(self, malformed_analysis: Dict) -> Dict:
            """Enhanced statistics showing auto vs manual repair breakdown"""
            stats = {
                'total_issues': len(malformed_analysis['Malformed_Tags_by_Row']),
                'unique_patterns': len(malformed_analysis['Malformed_Tags_Summary']),
                'auto_fixable': 0,
                'manual_review_needed': 0,
                'literal_malformations': 0,
                'tag_mismatches': 0,
                'bibl_mismatches': 0,  # Track bibl specifically
                'other_mismatches': 0
            }
            
            for _, row in malformed_analysis['Malformed_Tags_Summary'].iterrows():
                if "Mismatch:" in str(row['Suggested_Repair']):
                    stats['tag_mismatches'] += 1
                    tag_name = row['Malformed_Tag'].replace('<', '').replace('>', '')
                    
                    if tag_name in self.auto_repairable_tags:
                        stats['auto_fixable'] += row['Count']
                        stats['bibl_mismatches'] += row['Count']
                    else:
                        stats['manual_review_needed'] += row['Count'] 
                        stats['other_mismatches'] += row['Count']
                else:
                    stats['auto_fixable'] += row['Count']
                    stats['literal_malformations'] += 1
            
            return stats

    def generate_manual_review_report(self, malformed_analysis: Dict) -> pd.DataFrame:
        """Generate a focused report for manual review (non-bibl tags)"""
        manual_rows = []
        
        for _, row in malformed_analysis['Malformed_Tags_Summary'].iterrows():
            if "Mismatch:" in str(row['Suggested_Repair']):
                tag_name = row['Malformed_Tag'].replace('<', '').replace('>', '')
                
                if tag_name not in self.auto_repairable_tags:
                    manual_rows.append({
                        'Tag': row['Malformed_Tag'],
                        'Issue': row['Suggested_Repair'],
                        'Occurrences': row['Count'],
                        'Sample_Context': row['Sample_Context'],
                        'Action_Required': 'Manual review needed',
                        'Issue_Type': 'Tag Mismatch'
                    })
        
        return pd.DataFrame(manual_rows)

### Step 5.2 NonHTMLTagProcessor

In [66]:
class NonHTMLTagProcessor:
    """Finds angle-bracket spans that are not HTML tags and entity-encodes them.

    Classification of each ``<...>`` span is delegated to
    ``HTMLTagHandler.classify_tag``; spans it labels ``'non_html'`` are the
    ones reported and encoded.
    """

    def __init__(self):
        self.html_analyzer = HTMLTagHandler()

        # Known HTML tags that should never be encoded
        self.valid_html_tags = {
            'p', 'br', 'div', 'span', 'a', 'b', 'i', 'u', 'strong', 'em',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'li', 'table',
            'tr', 'td', 'th', 'img', 'bibl', 'title', 'sup', 'sub', 'del'
        }

    def find_non_html_tags(self, text: str) -> List[str]:
        """Return every ``<...>`` span in ``text`` that is classified as non-HTML.

        Args:
            text: Cell content; missing values and empty strings yield ``[]``.

        Returns:
            The matching spans in order of appearance (duplicates kept).
        """
        if pd.isna(text) or text == '':
            return []

        # Find all < ... > patterns that contain no nested angle brackets
        candidates = re.findall(r'<[^<>]*>', str(text))

        non_html_tags = []
        for tag in candidates:
            tag_type, _ = self.html_analyzer.classify_tag(tag)
            if tag_type == 'non_html':
                non_html_tags.append(tag)

        return non_html_tags

    def analyze_non_html_tags(self, df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
        """Collect every non-HTML tag in ``df`` together with where it occurs.

        Args:
            df: Frame whose cells are scanned row by row, column by column.

        Returns:
            Dict with two frames: ``Non_HTML_Tags_by_Row`` (one row per
            occurrence, grouped by tag) and ``Non_HTML_Tags_Summary`` (one row
            per distinct tag with its count and entity-encoded form). Both
            list tags in the order they were first encountered, so the output
            is identical from run to run.
        """
        # Insertion-ordered mapping: tag -> list of occurrences.
        tag_locations = defaultdict(list)

        # NOTE(review): 'Row' is the index label as-is, whereas
        # LinkValidator.analyze_link_issues reports idx + 1 - confirm which
        # numbering the unified report should use.
        for idx, row in df.iterrows():
            for col in df.columns:
                cell_value = row[col]
                if pd.notna(cell_value) and cell_value != '':
                    for tag in self.find_non_html_tags(cell_value):
                        tag_locations[tag].append({
                            'Row': idx,
                            'Column': col,
                            'Context': str(cell_value)[:100]
                        })

        by_row_records = [
            {
                'Row': loc['Row'],
                'Column': loc['Column'],
                'Non_HTML_Tag': tag,
                'Context': loc['Context']
            }
            for tag, locations in tag_locations.items()
            for loc in locations
        ]

        # Built from the ordered mapping rather than from a set, whose
        # iteration order for strings changes between interpreter runs.
        summary_records = [
            {
                'Tag': tag,
                'Count': len(locations),
                'Encoded_Version': tag.replace('<', '&lt;').replace('>', '&gt;')
            }
            for tag, locations in tag_locations.items()
        ]

        return {
            'Non_HTML_Tags_by_Row': pd.DataFrame(by_row_records),
            'Non_HTML_Tags_Summary': pd.DataFrame(summary_records)
        }

    def encode_brackets(self, df: pd.DataFrame, non_html_analysis: Dict) -> pd.DataFrame:
        """Replace each non-HTML tag with its entity-encoded form.

        Args:
            df: Frame to encode. It is copied; the input is not modified.
            non_html_analysis: Result of ``analyze_non_html_tags``; its
                ``Non_HTML_Tags_Summary`` frame supplies the tag/encoding pairs.

        Returns:
            A copy of ``df`` in which, for every object-dtype column, each
            listed tag is replaced literally by its ``Encoded_Version``.
        """
        df_encoded = df.copy()

        summary = non_html_analysis['Non_HTML_Tags_Summary']
        if summary.empty:
            return df_encoded

        # Only object-dtype columns are rewritten.
        text_columns = [col for col in df_encoded.columns if df_encoded[col].dtype == 'object']

        for _, entry in summary.iterrows():
            tag = entry['Tag']
            encoded_tag = entry['Encoded_Version']

            for col in text_columns:
                # Literal containment test; only matching cells are rewritten
                # so that untouched cells (including missing values) keep
                # their original objects.
                mask = df_encoded[col].astype(str).str.contains(tag, regex=False, na=False)
                if mask.any():
                    df_encoded.loc[mask, col] = df_encoded.loc[mask, col].astype(str).str.replace(
                        tag, encoded_tag, regex=False
                    )

        return df_encoded

### Step 5.3 Link Validator `<a>` tags

In [67]:
class LinkValidator:
    """Detects and repairs link problems: raw ``<http://...>`` URLs and faulty ``<a>`` tags."""

    def __init__(self):
        # regex for raw <http://...> or <https://...>
        self.raw_link_pattern = re.compile(r'<(https?://[^>\s]+)>')
        # regex for opening <a> tags. The tag name must be followed by
        # whitespace or ">" so that other tags starting with "a" (<abbr>,
        # <article>, <address>, ...) are not mistaken for anchors. The single
        # group captures the attribute text ('' when there is none).
        self.anchor_open_pattern = re.compile(r'<a((?:\s[^>]*)?)>', flags=re.IGNORECASE)
        # regex for closing </a>
        self.anchor_close_pattern = re.compile(r'</a>', flags=re.IGNORECASE)

    def has_href_attribute(self, attrs_string):
        """Return True when ``attrs_string`` contains a quoted ``href`` attribute."""
        # Look for href attribute with optional whitespace around =
        href_pattern = r'href\s*=\s*["\'][^"\']*["\']'
        return bool(re.search(href_pattern, attrs_string, re.IGNORECASE))

    def validate_links(self, text: str) -> List[Dict]:
        """Validate the links inside one text value.

        Three checks are made, in this order:
        - raw URLs in angle brackets that should be ``<a href="">`` links,
        - ``<a>`` tags without a quoted ``href``,
        - unequal numbers of ``<a ...>`` and ``</a>`` tags.

        Args:
            text: Cell content; missing values and empty strings yield ``[]``.

        Returns:
            A list of issue dicts, each with an ``Issue`` label, a
            ``Suggested_Fix`` and issue-specific detail keys.
        """
        if pd.isna(text) or text == '':
            return []
        issues = []
        text_str = str(text)

        # 1. Detect raw <http://...>
        for url in self.raw_link_pattern.findall(text_str):
            issues.append({
                "Issue": "Raw URL in angle brackets",
                "URL": url,
                "Suggested_Fix": f'<a href="{url}">{url}</a>'
            })

        # 2. Detect <a> tags without href (one attribute string per opening tag)
        anchor_attrs = self.anchor_open_pattern.findall(text_str)
        for attrs in anchor_attrs:
            if not self.has_href_attribute(attrs):
                issues.append({
                    "Issue": "Anchor tag missing href",
                    "Tag": f"<a{attrs}>",
                    "Suggested_Fix": '<a href="URL_HERE">'
                })

        # 3. Check open/close balance
        open_count = len(anchor_attrs)
        close_count = len(self.anchor_close_pattern.findall(text_str))
        if open_count != close_count:
            issues.append({
                "Issue": "Anchor tag mismatch",
                "Opens": open_count,
                "Closes": close_count,
                "Suggested_Fix": "Ensure each <a> has matching </a>"
            })

        return issues

    def repair_a_tags(self, df_columns: pd.DataFrame) -> pd.DataFrame:
        """Convert raw URLs in angle brackets into ``<a href="">`` links.

        This is the only link issue repaired automatically; missing ``href``
        attributes and unbalanced anchors are left untouched.

        Args:
            df_columns: Columns to repair. The frame is copied; the input is
                not modified.

        Returns:
            The repaired copy. The number of changed cells is printed.
        """
        repaired_df = df_columns.copy()
        repairs_made = 0

        def replace_raw_url(match):
            """Build the anchor element for one raw-URL match."""
            url = match.group(1)
            return f'<a href="{url}">{url}</a>'

        for col in repaired_df.columns:
            for idx in repaired_df.index:
                value = repaired_df.at[idx, col]

                if pd.isna(value) or value == '':
                    continue

                original_text = str(value)
                repaired_text = self.raw_link_pattern.sub(replace_raw_url, original_text)

                # Write back only when something changed, so untouched cells
                # keep their original objects.
                if repaired_text != original_text:
                    repaired_df.at[idx, col] = repaired_text
                    repairs_made += 1

        print(f"  Repaired {repairs_made} raw URL issues")
        return repaired_df

    def analyze_link_issues(self, df_columns: pd.DataFrame) -> Dict:
        """Run ``validate_links`` on every cell and collect the results.

        Args:
            df_columns: Columns to analyse. ``Row`` is reported as
                ``index label + 1``, so the index is expected to hold integers.

        Returns:
            Dict with ``Link_Issues_by_Row`` (one row per issue, empty frame
            when there are none), ``Total_Issues`` and ``Repairable_Issues``
            (the number of raw-URL issues).
        """
        all_issues = []

        for col in df_columns.columns:
            for idx in df_columns.index:
                for issue in self.validate_links(df_columns.at[idx, col]):
                    issue["Row"] = idx + 1  # 1-based row number
                    issue["Column"] = col
                    all_issues.append(issue)

        issues_df = pd.DataFrame(all_issues) if all_issues else pd.DataFrame()

        return {
            'Link_Issues_by_Row': issues_df,
            'Total_Issues': len(all_issues),
            'Repairable_Issues': len([i for i in all_issues if i['Issue'] == 'Raw URL in angle brackets'])
        }

## Step 6 HTML Processor

In [68]:
class HTMLProcessor:
    """Coordinates the HTML clean-up pipeline over the dictionary's content columns.

    The pipeline runs three stages in a fixed order - malformed-tag repair,
    non-HTML bracket encoding, link validation/repair - and gathers the issues
    found by each stage into one unified report. Every stage operates only on
    ``self.content_columns``.
    """
    
    def __init__(self):
        # Collaborators defined elsewhere in this notebook
        self.tag_handler = HTMLTagHandler()
        self.malformed_repairer = MalformedTagRepairer()  # analyses and repairs malformed tags
        self.non_html_processor = NonHTMLTagProcessor()  # finds and encodes non-HTML <...> spans
        self.link_validator = LinkValidator()
        self.stats = {}
        # Columns that hold HTML-bearing text. process_dataframe selects all of
        # them with df[...], so each one has to be present in the input frame.
        self.content_columns = [
            'Principal English Translation',
            'Attestations from sources in English',
            'Attestations from sources in Spanish',
            'Alonso de Molina',
            'Frances Karttunen', 
            'Horacio Carochi / English',
            'Andrés de Olmos',
            "Lockhart’s Nahuatl as Written",
            
        ]
    
    
    def process_dataframe(
        self, df: pd.DataFrame, 
        repair_malformed: bool = True,
        validate_links: bool = True,
        extract_citations: bool = True,
        extract_crossrefs: bool = True,
        encode_non_html: bool = True
    ) -> Tuple[pd.DataFrame, Dict]:
        
        """
        Run the HTML pipeline on a copy of ``df`` and report what was found.

        Stages, in order (each can be switched off with its flag):
        - Malformed HTML repair (``repair_malformed``)
        - Non-HTML tag encoding (``encode_non_html``)
        - <a> link validation and raw-URL repair (``validate_links``)

        Args:
            df: Input frame; it must contain every column listed in
                ``self.content_columns``. The input is not modified.
            repair_malformed: Analyse and repair malformed tags.
            validate_links: Analyse link issues and repair raw URLs.
            extract_citations: Adds empty ``Citations`` and
                ``Number_of_Citations`` columns. No extraction is performed
                in this method.
            extract_crossrefs: Adds empty ``Cross_References`` and
                ``Number_of_Cross_References`` columns. No extraction is
                performed in this method.
            encode_non_html: Entity-encode angle-bracket spans that are not HTML.

        Returns: (processed_df, report) where ``report['analysis']`` holds the
        counters and per-stage analysis results and ``report['mismatch_report']``
        is the concatenation of the issue tables collected from all stages.
        """
        print("=" * 70)
        print("HTML PROCESSING PIPELINE")
        print("=" * 70)
        
        result_df = df.copy()
        
        # Add extraction columns if needed (placeholders only; nothing below fills them)
        if extract_citations:
            result_df['Citations'] = ''
            result_df['Number_of_Citations'] = 0
        if extract_crossrefs:
            result_df['Cross_References'] = ''
            result_df['Number_of_Cross_References'] = 0
        
        # Counters default to zero; the stages below overwrite the ones they own.
        # 'citations_extracted' and 'crossrefs_extracted' are never updated here.
        analysis = {
            'repairs_made': 0,
            'citations_extracted': 0,
            'crossrefs_extracted': 0,
            'non_html_encoded': 0,
            'tag_analysis': [],
            'processing_details': []
        }
        report = {}
        reports = []  # collect all issues for unified mismatch report
        total_rows = len(result_df)
        print(f"\nProcessing {total_rows:,} rows...")

        # -------------------------------
        # STEP 1: Malformed HTML tags
        # -------------------------------
        if repair_malformed:
            print("  Analyzing malformed tags...")
            malformed_analysis = self.malformed_repairer.analyze_malformed_tags(result_df[self.content_columns])

            if not malformed_analysis['Malformed_Tags_Summary'].empty:
                stats = self.malformed_repairer.get_repair_statistics(malformed_analysis)
                print(f"  Found {stats['total_issues']} malformed tag instances")
                print(f"  Found {stats['unique_patterns']} unique malformed patterns")
                print(f"    - Auto-fixable: {stats['auto_fixable']}")
                print(f"    - Manual review needed: {stats['manual_review_needed']}")

                mismatch_report = self.malformed_repairer.generate_mismatch_report(malformed_analysis)
                if not mismatch_report.empty:
                    # Copy before tagging so the repairer's own frame is left untouched
                    mismatch_report = mismatch_report.copy()
                    mismatch_report["Issue_Type"] = "Malformed Tag"
                    reports.append(mismatch_report)

                print("  Repairing malformed tags...")
                result_df[self.content_columns] = self.malformed_repairer.repair_tags(
                    result_df[self.content_columns],
                    malformed_analysis
                )

                # NOTE(review): 'repairs_made' is the auto-fixable count from the
                # analysis, taken before the repair ran - not a count of repairs
                # that repair_tags actually applied.
                analysis['repairs_made'] = stats['auto_fixable']
                analysis['bibl_repairs'] = stats.get('bibl_mismatches', 0)
                analysis['repair_stats'] = stats
                # optional raw malformed data
                analysis['malformed_repairs'] = malformed_analysis
            else:
                print("  No malformed tags found")

        # -------------------------------
        # STEP 2: Non-HTML tags
        # -------------------------------
        if encode_non_html:
            print("  Analyzing non-HTML tags...")
            non_html_analysis = self.non_html_processor.analyze_non_html_tags(result_df[self.content_columns])

            if not non_html_analysis['Non_HTML_Tags_Summary'].empty:
                print(f"  Found {len(non_html_analysis['Non_HTML_Tags_by_Row'])} non-HTML tag instances")
                print(f"  Found {len(non_html_analysis['Non_HTML_Tags_Summary'])} unique non-HTML tags")

                # Encode brackets in the dataframe
                result_df[self.content_columns] = self.non_html_processor.encode_brackets(
                    result_df[self.content_columns],
                    non_html_analysis
                )

                analysis['non_html_encoded'] = len(non_html_analysis['Non_HTML_Tags_by_Row'])
                analysis['non_html_encoding'] = non_html_analysis

                # Prepare report entries: one row per non-HTML tag occurrence
                non_html_rows = []
                non_html_df = non_html_analysis['Non_HTML_Tags_by_Row']
                for _, row in non_html_df.iterrows():
                    non_html_rows.append({
                        "Row": row['Row'],
                        "Column": row['Column'],
                        "Issue": row['Non_HTML_Tag'],
                        "Issue_Type": "Non-HTML Tag"
                    })
                if non_html_rows:
                    reports.append(pd.DataFrame(non_html_rows))
            else:
                print("  No non-HTML tags found")

        # -------------------------------
        # STEP 3: Link validation
        # -------------------------------
        if validate_links:
            print("  Analyzing link issues...")
            link_analysis = self.link_validator.analyze_link_issues(result_df[self.content_columns])
            
            if link_analysis['Total_Issues'] > 0:
                print(f"  Found {link_analysis['Total_Issues']} link issues")
                print(f"  Repairable issues: {link_analysis['Repairable_Issues']}")
                
                # Repair what we can
                if link_analysis['Repairable_Issues'] > 0:
                    print("  Repairing link issues...")
                    result_df[self.content_columns] = self.link_validator.repair_a_tags(
                        result_df[self.content_columns]
                    )
                
                # Add every detected link issue to the report. The analysis ran
                # before the repair above, so repaired raw URLs are listed too.
                if not link_analysis['Link_Issues_by_Row'].empty:
                    link_report = link_analysis['Link_Issues_by_Row'].copy()
                    link_report["Issue_Type"] = "Link Issue"
                    reports.append(link_report)
                    
                analysis['link_repairs'] = link_analysis['Repairable_Issues']
                analysis['link_analysis'] = link_analysis
            else:
                print("  No link issues found")

        # -------------------------------
        # STEP 4: Unified mismatch report
        # -------------------------------
        # The stage tables do not share one schema; concat keeps the union of
        # their columns and leaves missing values where a stage has no such column.
        if reports:
            unified_report = pd.concat(reports, ignore_index=True)
        else:
            unified_report = pd.DataFrame(columns=["Row", "Column", "Issue", "Issue_Type"])

        report['analysis'] = analysis
        report["mismatch_report"] = unified_report

        self._print_summary(analysis)
        return result_df, report

    def _print_summary(self, analysis: Dict):
        """Print the four headline counters from ``analysis`` to stdout."""
        print("\n" + "=" * 70)
        print("PROCESSING COMPLETE")
        print("=" * 70)
        print(f"Repairs made:        {analysis['repairs_made']:,}")
        print(f"Citations extracted: {analysis['citations_extracted']:,}")
        print(f"Cross-refs extracted: {analysis['crossrefs_extracted']:,}")
        print(f"Non-HTML encoded:    {analysis['non_html_encoded']:,}")

## Main Processor Class

In [69]:

class NahuatlProcessor:
    """Main processor class that coordinates all operations.

    Owns one instance each of ``HTMLProcessor``, ``DataLoader``, ``DataSaver``
    and ``DatabaseManager`` and delegates loading, HTML processing,
    checkpointing and export to them.

    Attributes:
        html_processor: ``HTMLProcessor`` used by ``process_html_tags``.
        loader: ``DataLoader`` used by ``load_data``.
        saver: ``DataSaver`` used for checkpoints, CSV and Excel output.
        db: ``DatabaseManager`` wrapping the SQLite database.
        original_df: First frame returned by ``create_working_copy``
            (``None`` until data is loaded).
        working_df: Second frame returned by ``create_working_copy``; the one
            every processing step reads and replaces (``None`` until loaded).
        current_stage: Free-text marker of the last completed stage.
        processing_history: One dict per successfully saved checkpoint.
    """

    def __init__(self, db_path: str = DB_PATH, working_dir: str = WORKING_DIR):
        """Initialize the processor with all necessary components.

        Args:
            db_path: Path passed to ``DatabaseManager``.
            working_dir: Directory passed to ``DataSaver``.
        """

        print("=" * 70)
        print("NAHUATL DATA PROCESSOR - HYBRID APPROACH")
        print("=" * 70)

        # Initialize components
        self.html_processor = HTMLProcessor()
        self.loader = DataLoader()
        self.saver = DataSaver(working_dir)
        self.db = DatabaseManager(db_path)

        # Data containers
        self.original_df = None
        self.working_df = None

        # Processing state
        self.current_stage = 'initialized'
        self.processing_history = []

        # Verify base dataset exists.
        # The import hint belongs inside this branch: previously it was
        # dedented and printed even right after a successful verification.
        if not self.db.verify_base_dataset():
            print("\n No base dataset found in SQLite")
            print("   Run processor.initial_import() to import your CSV data first")
        print("Processor initialized and ready")
        print("=" * 70)

    def initial_import(self, csv_path: str, table_name: str = 'WHP_EarlyNahuatl_Data',
                      encoding: str = DEFAULT_ENCODING, replace: bool = False):
        """
        Perform initial import of CSV data into SQLite.
        This establishes the base dataset in the database.

        Args:
            csv_path: Path of the CSV file to import.
            table_name: Name of the destination table.
            encoding: Encoding forwarded to the database manager.
            replace: Forwarded to ``import_initial_dataset`` unchanged.

        Returns:
            Whatever ``DatabaseManager.import_initial_dataset`` returns; it is
            treated here as a truthy/falsy success flag.
        """

        success = self.db.import_initial_dataset(
            csv_path=csv_path,
            table_name=table_name,
            encoding=encoding,
            replace=replace
        )

        if success:
            print("\nInitial import complete!")
            print("   You can now proceed with data processing")
        else:
            print("\nInitial import failed")

        return success

    def load_data(self, source: str, source_type: str = 'auto', **kwargs) -> pd.DataFrame:
        """Load data from any source.

        Args:
            source: Path to a CSV file or to a SQLite database.
            source_type: ``'csv'``, ``'sqlite'`` or ``'auto'``. With ``'auto'``
                a source whose name ends in ``.csv`` (any letter case) is
                treated as CSV and everything else as SQLite.
            **kwargs: For CSV sources, forwarded to ``DataLoader.load_from_csv``.
                For SQLite sources only ``table_name`` is read
                (default ``'WHP_EarlyNahuatl_Data'``).

        Returns:
            The working copy of the loaded data (``self.working_df``).

        Raises:
            ValueError: If ``source_type`` is not one of the supported values.
        """

        start_time = datetime.now()

        # Auto-detect source type if needed. str() tolerates path-like
        # sources and lower() makes the extension check case-insensitive.
        if source_type == 'auto':
            source_type = 'csv' if str(source).lower().endswith('.csv') else 'sqlite'

        # Load based on source type
        if source_type == 'csv':
            df = self.loader.load_from_csv(source, **kwargs)
        elif source_type == 'sqlite':
            table_name = kwargs.get('table_name', 'WHP_EarlyNahuatl_Data')
            df = self.loader.load_from_sqlite(source, table_name)
        else:
            raise ValueError(f"Unknown source type: {source_type}")

        # Create working copies
        self.original_df, self.working_df = self.loader.create_working_copy(df)

        # Save initial checkpoint
        # NOTE(review): this runs on every load, including when the source is
        # itself a later checkpoint table - confirm that storing such data
        # under the 'initial' name is intended.
        self.save_checkpoint('initial')

        # Log the operation
        duration = (datetime.now() - start_time).total_seconds()
        self.db.log_processing_stage(
            stage='data_loading',
            status='completed',
            duration=duration,
            rows_processed=len(df),
            notes=f"Loaded from {source_type}: {source}"
        )

        self.current_stage = 'data_loaded'

        return self.working_df

    def save_checkpoint(
        self,
        checkpoint_name: str,
        additional_data: Optional[Dict[str, pd.DataFrame]] = None
    ):
        """Enhanced checkpoint saving with additional reports.

        Saves ``working_df`` through the ``DataSaver`` and, optionally, extra
        DataFrames as their own tables. Only prints a warning (and saves
        nothing) when there is no working data or no open connection.

        Args:
            checkpoint_name: Label of the checkpoint.
            additional_data: Optional mapping of report name to DataFrame.
                Each non-empty frame is written to a table named
                ``{name}_{checkpoint_name}_{YYYYMMDD}``, replacing any table
                that already has that name.
        """
        if self.working_df is not None and self.db.conn is not None:
            # Save main checkpoint
            self.saver.save_checkpoint_to_sqlite(
                self.working_df,
                checkpoint_name,
                self.db.conn
            )

            # Save additional data if provided (like mismatch reports)
            if additional_data:
                for data_name, data_df in additional_data.items():
                    if isinstance(data_df, pd.DataFrame) and not data_df.empty:
                        table_name = f'{data_name}_{checkpoint_name}_{datetime.now().strftime("%Y%m%d")}'
                        data_df.to_sql(
                            table_name,
                            self.db.conn,
                            if_exists='replace',
                            index=False
                        )
                        print(f"  Saved {data_name} to {table_name}")

            self.processing_history.append({
                'checkpoint': checkpoint_name,
                'timestamp': datetime.now(),
                'rows': len(self.working_df),
                'columns': len(self.working_df.columns)
            })
        else:
            print(f"Warning: Cannot save checkpoint '{checkpoint_name}' - no data or connection available")

    def export_final_results(self, filename_base: str = 'final_nahuatl_data'):
        """Export final results to CSV with encoding protection.

        Writes ``working_df`` to ``{filename_base}.csv``, saves a ``'final'``
        checkpoint and writes a two-sheet summary (processing history and
        row/column counts) to a dated Excel file. Prints an error and returns
        without writing anything when no working data is loaded.

        Args:
            filename_base: File name stem used for the CSV and the summary.
        """

        if self.working_df is None:
            print("Error: No working data to export")
            return

        if self.original_df is None:
            print("Warning: No original data for comparison")
            original_rows = 0
            original_cols = 0
        else:
            original_rows = len(self.original_df)
            original_cols = len(self.original_df.columns)

        # Save to CSV
        csv_path = self.saver.save_to_csv(self.working_df, f"{filename_base}.csv")

        # Save final checkpoint (also appends an entry to processing_history,
        # so it is counted in 'stages_completed' below)
        self.save_checkpoint('final')

        # Create summary report
        summary = {
            'processing_summary': pd.DataFrame(self.processing_history),
            'data_info': pd.DataFrame([{
                'original_rows': original_rows,
                'final_rows': len(self.working_df),
                'original_columns': original_cols,
                'final_columns': len(self.working_df.columns),
                'stages_completed': len(self.processing_history)
            }])
        }

        # Save summary to Excel
        excel_path = self.saver.save_to_excel(
            summary,
            f"{filename_base}_summary_{datetime.now().strftime('%Y%m%d')}.xlsx"
        )

        print("\n" + "=" * 70)
        print("EXPORT COMPLETE")
        print(f"CSV: {csv_path}")
        print(f"Summary: {excel_path}")
        print("=" * 70)

    def load_data_from_base(self, table_name: str = 'WHP_EarlyNahuatl_Data') -> pd.DataFrame:
        """Convenience method to load from the base SQLite dataset.

        Args:
            table_name: Table to verify and then load.

        Returns:
            The working copy returned by ``load_data``.

        Raises:
            ValueError: If ``verify_base_dataset`` reports the table missing.
        """

        if not self.db.verify_base_dataset(table_name):
            raise ValueError(f"Base dataset '{table_name}' not found. Run initial_import() first.")

        return self.load_data(
            source=self.db.db_path,
            source_type='sqlite',
            table_name=table_name
        )

    def process_html_tags(self, save_checkpoint: bool = True):
        """Step 4-5: HTML tag processing.

        Runs ``HTMLProcessor.process_dataframe`` on ``working_df`` with
        malformed-tag repair, non-HTML encoding and link validation enabled,
        and replaces ``working_df`` with the result.

        Args:
            save_checkpoint: When true, print a per-issue-type summary and save
                an ``'after_html_processing'`` checkpoint together with the
                mismatch report (if it is non-empty).

        Returns:
            The report dict produced by ``process_dataframe``.

        Raises:
            ValueError: If no data has been loaded yet.
        """
        if self.working_df is None:
            raise ValueError("No data loaded. Run load_data() first.")

        # Delegate to HTML processor
        self.working_df, report = self.html_processor.process_dataframe(
            self.working_df,
            repair_malformed=True,
            encode_non_html=True,
            validate_links=True,
        )

        if save_checkpoint:
            additional_data = {}

            # The HTMLProcessor already creates a unified mismatch_report
            # that includes ALL issues (malformed tags, non-HTML tags, link issues)
            # Each row already has an 'Issue_Type' column
            if 'mismatch_report' in report and not report['mismatch_report'].empty:
                additional_data['mismatch_report'] = report['mismatch_report']

                # Print summary by issue type
                print("\nIssue Summary:")
                issue_counts = report['mismatch_report']['Issue_Type'].value_counts()
                for issue_type, count in issue_counts.items():
                    print(f"  - {issue_type}: {count} issues")

                print(f"\nTotal issues found: {len(report['mismatch_report'])}")
            else:
                print("\nNo issues found - data is clean!")

            # Save checkpoint with the unified report
            self.save_checkpoint('after_html_processing', additional_data)

        return report

    def cleanup(self):
        """Clean up resources by closing the database manager."""
        self.db.close()

In [70]:
# ============================================================================
# USAGE EXAMPLE
# ============================================================================

if __name__ == "__main__":
    # Build the coordinator with its default database path and working directory.
    processor = NahuatlProcessor()

    # One-time CSV -> SQLite import of the raw export (disabled here):
    # processor.initial_import(csv_path='../../data/raw/WHP_EarlyNahuatl_data_2024-03-26T17-22-58.csv')

    # Load the saved post-HTML-processing checkpoint table, then run the
    # HTML tag pass on it.
    processor.load_data(
        source="../../data/sqLiteDb/nahuatl_processing.db",
        source_type='sqlite',
        table_name='checkpoint_after_html_processing_20250914',
    )
    processor.process_html_tags()

    # Alternative ways to load data (disabled):
    #
    #   From an existing SQLite database
    #     processor.load_data(
    #         source='../../data/sqLiteDb/Whp_Raw_Dataset.db',
    #         source_type='sqlite',
    #         table_name='WHP_EarlyNahuatl_Data'
    #     )
    #
    #   From a CSV file
    #     processor.load_data(
    #         source='data/nahuatl_data.csv',
    #         source_type='csv',
    #         auto_detect_encoding=True
    #     )

    # Placeholder for further processing steps, each followed by a checkpoint:
    #     processor.working_df = repair_malformed_characters(processor.working_df)
    #     processor.save_checkpoint('after_repair')
    #
    #     processor.working_df = extract_citations(processor.working_df)
    #     processor.save_checkpoint('after_citations')

    # Write the CSV, the 'final' checkpoint and the Excel summary.
    processor.export_final_results()

    # Closing the database is left disabled in this cell.
    # NOTE(review): presumably so `processor` stays usable in later cells - confirm.
    # processor.cleanup()

NAHUATL DATA PROCESSOR - HYBRID APPROACH
Database initialized: ../../data/sqLiteDb/nahuatl_processing.db
Existing database objects: 8
   - checkpoint_metadata (table): 31 rows
   - sqlite_sequence (table): 2 rows
   - processing_log (table): 17 rows
   - WHP_EarlyNahuatl_Data (table): 31,806 rows
   - import_history (table): 1 rows
   - checkpoint_initial_20250914 (table): 31,806 rows
   - checkpoint_after_html_processing_20250914 (table): 31,806 rows
   - mismatch_report_after_html_processing_20250914 (table): 97 rows
   Base dataset verified:
   Table: WHP_EarlyNahuatl_Data
   Rows: 31,806
   Columns: 13
   Run processor.initial_import() to import your CSV data first
Processor initialized and ready
Loading data from SQLite: ../../data/sqLiteDb/nahuatl_processing.db
Table: checkpoint_after_html_processing_20250914
Successfully loaded 31,806 rows × 17 columns
Created working copy of data
Checkpoint saved to SQLite: checkpoint_initial_20250914
HTML PROCESSING PIPELINE

Processing 31,806

At this point the automated analysis has been thorough enough. For the sake of correctness, the remaining issues will be corrected manually in the checkpoint table shown in the log above. Once those corrections are made, we will begin extracting citations and cross-references.