# Nahuatl Notebook for the WHP_EarlyNahuatl_Dataset

This notebook processes Nahuatl dictionary data, analyzing HTML tags, repairing malformed tags, and extracting citations and cross-references. This is a merged version of Todd's version and I where there is a SQLite-based data management approach.

## Setup and Imports

In [1]:
import pandas as pd
import re
import os
import numpy as np
import hashlib
import glob
import csv
import sqlite3
from collections import defaultdict, Counter
from typing import List, Dict, Tuple, Optional, Union
from inscriptis import get_text
from bs4 import BeautifulSoup
from pathlib import Path
from datetime import datetime

In [2]:
# Create working directory
os.makedirs('working_files', exist_ok=True)

# load in the SQLite database holding the WHP Dataset
conn = sqlite3.connect('../../data/sqLiteDb/Whp_Raw_Dataset.db')
table_name = "WHP_EarlyNahuatl_Data"

tables_query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = pd.read_sql(tables_query, conn)
tables


# If there's issues check the following
# Possible solutions:
# 1. Ensure the db file is in the correct directory
# 2. Check the exact filename
# 3. Verify the file extension

Unnamed: 0,name
0,WHP_EarlyNahuatl_Data


In [3]:
def save_to_excel(data_dict: Dict[str, pd.DataFrame], filename: str, directory: str = 'working_files'):
    """Save multiple DataFrames as sheets in an Excel file"""
    filepath = os.path.join(directory, filename)
    with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
        for sheet_name, df in data_dict.items():
            # Truncate sheet name if too long (Excel limit is 31 characters)
            clean_sheet_name = sheet_name[:31] if len(sheet_name) > 31 else sheet_name
            df.to_excel(writer, sheet_name=clean_sheet_name, index=False)
    print(f"Saved to: {filepath}")

In [4]:
def save_dataframe(df: pd.DataFrame, filename: str, directory: str = 'working_files'):
    """Save a single DataFrame to CSV"""
    filepath = os.path.join(directory, filename)
    df.to_csv(filepath, index=False)
    print(f"Saved to: {filepath}")

In [5]:
def save_to_sqlite(df: pd.DataFrame, table_name: str, conn: sqlite3.Connection, if_exists: str = 'replace'):
    """Save DataFrame to SQLite table"""
    df.to_sql(table_name, conn, if_exists=if_exists, index=False)
    print(f"Saved to SQLite table: {table_name}")

## Step 1: Import Data and Create Working Copy

In [6]:
def load_data_from_csv(filename: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load data and create a working copy"""
    print(f"Loading data from: {filename}")

    # Read the original data
    original_df = pd.read_csv(filename)

    # Create working copy
    working_df = original_df.copy()

    print(f"Data loaded successfully:")
    print(f"- Shape: {original_df.shape}")
    print(f"- Columns: {list(original_df.columns)}")

    return original_df, working_df

In [7]:
def load_data_from_sqlite(db_path: str, table_name: str = "WHP_EarlyNahuatl_Data") -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load data from SQLite and create a working copy"""
    print(f"Loading data from: {db_path}")
    
    conn = sqlite3.connect(db_path)
    original_df = pd.read_sql(f"SELECT * FROM {table_name}", conn)
    working_df = original_df.copy()
    
    print(f"Data loaded successfully:")
    print(f"- Shape: {original_df.shape}")
    print(f"- Columns: {list(original_df.columns)}")
    
    # Don't close connection yet - return it for later use
    return original_df, working_df, conn

In [8]:
# Load your data

original_df = pd.read_sql("SELECT * FROM WHP_EarlyNahuatl_Data", conn)
df = original_df.copy(deep=True)

query = "SELECT * FROM WHP_EarlyNahuatl_Data LIMIT 3;"
whp_dataset = pd.read_sql(query, conn)
display(whp_dataset)

cursor = conn.execute(f"PRAGMA table_info({table_name})")
columns_info = cursor.fetchall()
column_names = [col[1] for col in columns_info]

print(column_names)

Unnamed: 0,Ref,Headword,Orthographic Variants,Principal English Translation,Attestations from sources in English,Attestations from sources in Spanish,Alonso de Molina,Frances Karttunen,Horacio Carochi / English,Andrés de Olmos,Lockhart’s Nahuatl as Written,themes,Spanish Loanword
0,WHP-171879,acazomo.,"accaçomo, acaçomo",<p>perhaps not (adverb) (see Molina)</p>,<p>acaçomo iuhqui yez yn anoço yuhquiez = whet...,,<p>Acaçomo. quiça no. Aduerbio.<br /> <bibl> A...,<p>AHCAZOMŌ perhaps not / quizá no (M). In on...,<p>àcaçomō = perhaps not<br /> <bibl>Horacio C...,,,,No
1,WHP-171881,ayac.,aiaac,"<p>no one; nobody; or, for someone to be absen...",<p>aiaac mic in mexica = None of the Mexica di...,<p>ayac guincuiliz = no se la quite nadie (Tla...,"<p>Ayac. ninguno, o nadie o estar alguno ausen...","<p>AYĀC no one / ninguno, o nadie (M) See AH-,...","<p>ayāc = no one<br /> <bibl>Horacio Carochi, ...",,"<p>no one; nobody; or, for someone to be absen...",,No
2,WHP-171882,acan.,,"<p>nowhere, no place (see Molina, Karttunen, L...",,,<p>acan. en ninguna parte o lugar. aduerbio.<b...,<p>AHCĀN nowhere / en ninguna parte o lugar (M...,"<p>àcān = nowhere<br /> <bibl>Horacio Carochi,...","<p>en ningun lugar, por, de, etc.<br /> <bibl>...",<p>ahcān = (particle) nowhere<br /> <bibl>Jame...,"Cardinal Directions, Cosmos",No


['Ref', 'Headword', 'Orthographic Variants', 'Principal English Translation', 'Attestations from sources in English', 'Attestations from sources in Spanish', 'Alonso de Molina', 'Frances Karttunen', 'Horacio Carochi / English', 'Andrés de Olmos', 'Lockhart’s Nahuatl as Written', 'themes', 'Spanish Loanword']


## Step 2: Save Intermediate Stages

In [9]:
def save_intermediate_stage(df: pd.DataFrame, stage_name: str):
    """Save intermediate processing stage"""
    filename = f"{stage_name}_stage.csv"
    save_dataframe(df, filename)
    return df

def save_intermediate_stage_sqlite(df: pd.DataFrame, stage_name: str, conn: sqlite3.Connection):
    """Save intermediate processing stage to SQLite"""
    table_name = f"{stage_name}_stage"
    save_to_sqlite(df, table_name, conn)
    return df

In [10]:
# Save initial stage
# save_intermediate_stage_sqlite(df, "01_initial", conn)

## Step 3: HTML Tag Analysis

In [11]:
class HTMLTagAnalyzer:
    def __init__(self):
        # HTML tags
        self.html_tags = {
            'p', 'br', 'div', 'span', 'a', 'b', 'i', 'u', 'strong', 'em',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'li', 'table',
            'tr', 'td', 'th', 'img', 'link', 'meta', 'head', 'body', 'html',
            'bibl', 'title', 'sup', 'sub', 'del'
        }
        
        # Define columns that should contain HTML content
        self.content_columns = [
            'Principal English Translation',
            'Attestations from sources in English',
            'Attestations from sources in Spanish',
            'Alonso de Molina',
            'Frances Karttunen', 
            'Horacio Carochi / English',
            'Andrés de Olmos',
            "Lockhart's Nahuatl as Written",
            'Full Original Entry'
        ]
        
        # Known malformed patterns to fix
        self.malformed_patterns = {
            r'</p</bibl>': '</p></bibl>',
            r'<bibl<': '<bibl>',
            r'</bibbl>': '</bibl>',
            r'<bibbl>': '<bibl>',
            r'<bobl>': '<bibl>',
            r'</bobl>': '</bibl>',
            r'<b9bl>': '<bibl>',
            r'<bibi>': '<bibl>',
            r'<bibl></p>': '</bibl></p>',
        }
    
    def detect_malformed_tags(self, text: str) -> List[tuple]:
        """Detect specific malformed tag patterns"""
        if pd.isna(text) or text == '':
            return []
        
        malformed_found = []
        text_str = str(text)
        
        # Check for known malformed patterns
        for pattern, replacement in self.malformed_patterns.items():
            if re.search(pattern, text_str):
                malformed_found.append((pattern, replacement))
        
        # Define self-closing tags that shouldn't be counted in pair matching
        self_closing_tags = {'br', 'hr', 'img', 'input', 'meta', 'link'}
        
        # Better tag counting using regex
        for tag_name in self.html_tags:
            if tag_name in self_closing_tags:
                continue  # Skip self-closing tags
            
            # Use regex to properly count opening tags (with or without attributes)
            # Matches <tag> or <tag attr="...">~
            open_pattern = f"<{tag_name}(?:\\s+[^>]*)?>"
            close_pattern = f'</{tag_name}>'
            
            open_count = len(re.findall(open_pattern, text_str, re.IGNORECASE))
            close_count = len(re.findall(close_pattern, text_str, re.IGNORECASE))
            
            if open_count != close_count:
                malformed_found.append((f'<{tag_name}>', f'Mismatch: {open_count} open, {close_count} closed'))
        return malformed_found
    
    def find_html_tags(self, text: str) -> List[str]:
        """Find all HTML-like tags in text with better handling of malformed tags"""
        if pd.isna(text) or text == '':
            return []
        
        # First fix known malformed patterns
        text_str = str(text)
        for pattern, replacement in self.malformed_patterns.items():
            text_str = re.sub(pattern, replacement, text_str)
        
        # Then find tags
        pattern = r'</?[^<>]+/?>'
        matches = re.findall(pattern, text_str)
        return matches
    
    def analyze_html_tags_in_dataframe(self, df: pd.DataFrame, 
                                      columns_to_check: List[str] = None) -> Dict[str, pd.DataFrame]:
        """Analyze HTML tags only in relevant columns"""
        results = {
            'tag_by_row': [],
            'tag_summary': [],
            'malformed_tags': []
        }
        
        # Use specified columns or default to content columns
        if columns_to_check is None:
            columns_to_check = [col for col in self.content_columns if col in df.columns]
        
        # Track tags by row - only in relevant columns
        for idx, row in df.iterrows():
            for col in columns_to_check:
                if col not in df.columns:
                    continue
                    
                cell_value = row[col]
                if pd.notna(cell_value) and cell_value != '':
                    # Check for malformed tags first
                    malformed = self.detect_malformed_tags(cell_value)
                    if malformed:
                        for pattern, fix in malformed:
                            results['malformed_tags'].append({
                                'Row': idx,
                                'Column': col,
                                'Pattern': pattern,
                                'Suggested_Fix': fix,
                                'Context': str(cell_value)[:100] + '...' if len(str(cell_value)) > 100 else str(cell_value)
                            })
                    
                    # Find tags
                    tags = self.find_html_tags(cell_value)
                    for tag in tags:
                        is_valid = self.is_valid_html_tag(tag)
                        context = self.get_tag_context(cell_value, tag)
                        results['tag_by_row'].append({
                            'Row': idx,
                            'Column': col,
                            'Tag': tag,
                            'Is_Valid_HTML': is_valid,
                            'Context': context
                        })
        
        # Create summaries
        if results['tag_by_row']:
            tag_by_row_df = pd.DataFrame(results['tag_by_row'])
            
            # Tag summary
            tag_counts = Counter([item['Tag'] for item in results['tag_by_row']])
            tag_locations = defaultdict(list)
            
            for item in results['tag_by_row']:
                tag_locations[item['Tag']].append(f"Row {item['Row']}, Col {item['Column']}")
            
            for tag, count in tag_counts.items():
                first_occurrence = next(item for item in results['tag_by_row'] if item['Tag'] == tag)
                results['tag_summary'].append({
                    'Tag': tag,
                    'Count': count,
                    'Is_Valid_HTML': first_occurrence['Is_Valid_HTML'],
                    'Locations': '; '.join(tag_locations[tag][:5]) + ('...' if len(tag_locations[tag]) > 5 else ''),
                    'Sample_Context': first_occurrence['Context']
                })
            
            tag_summary_df = pd.DataFrame(results['tag_summary']).sort_values('Count', ascending=False)
        else:
            tag_by_row_df = pd.DataFrame()
            tag_summary_df = pd.DataFrame()
        
        malformed_df = pd.DataFrame(results['malformed_tags']) if results['malformed_tags'] else pd.DataFrame()
        
        return {
            'HTML_Tags_by_Row': tag_by_row_df,
            'HTML_Tags_Summary': tag_summary_df,
            'Malformed_Tags': malformed_df
        }
    
    def is_valid_html_tag(self, tag: str) -> bool:
        """Check if a tag is a valid HTML tag with better error handling"""
        try:
            # Handle malformed tags better
            if '<//' in tag or '><' in tag:  # Clearly malformed
                return False
            
            # Remove < > and any attributes, get just the tag name
            clean_tag = re.sub(r'^</?([^>\s/]+).*>$', r'\1', tag).lower()
            
            # Additional check for malformed tags
            if '/' in clean_tag or '<' in clean_tag or '>' in clean_tag:
                return False
                
            return clean_tag in self.html_tags
        except:
            return False
    
    def get_tag_context(self, text: str, tag: str, context_chars: int = 50) -> str:
        """Get context around a tag occurrence"""
        if pd.isna(text) or text == '':
            return ''
        
        text_str = str(text)
        tag_pos = text_str.find(tag)
        if tag_pos == -1:
            return ''
        
        start = max(0, tag_pos - context_chars)
        end = min(len(text_str), tag_pos + len(tag) + context_chars)
        context = text_str[start:end]
        
        # Mark the tag in the context
        tag_in_context = context.replace(tag, f"[[[{tag}]]]")
        return tag_in_context

In [12]:
html_analyzer = HTMLTagAnalyzer()
html_results = html_analyzer.analyze_html_tags_in_dataframe(df)

html_results['HTML_Tags_Summary'].to_sql('html_tag_analysis', conn, if_exists='replace', index=False)
html_results['Malformed_Tags'].to_sql('malformed_tags', conn, if_exists='replace', index=False)

# save_intermediate_stage_sqlite(df, "02_htmltag_analysis", conn)
save_to_excel(html_results, "02_html_tag_analysis.xlsx")

Saved to: working_files\02_html_tag_analysis.xlsx


## Step 4: Malformed Tag Detection and Repair

In [13]:
class MalformedTagRepairer:
    def __init__(self):
        self.html_analyzer = HTMLTagAnalyzer()

    def find_malformed_tags(self, text: str) -> List[str]:
        """Find tags that look like HTML but are malformed"""
        if pd.isna(text) or text == '':
            return []

        text_str = str(text)
        all_tags = self.html_analyzer.find_html_tags(text_str)
        malformed_tags = []

        for tag in all_tags:
            if not self.html_analyzer.is_valid_html_tag(tag):
                if self.is_close_to_html(tag):
                    malformed_tags.append(tag)

        return malformed_tags

    def is_close_to_html(self, tag: str) -> bool:
        """Check if a malformed tag is close to valid HTML"""
        malformed_patterns = [
            r'<[^>]*</[^>]*>',  # Mixed opening/closing
            r'</[^>]*<[^>]*>',  # Reversed brackets
            r'<[^>]*<[^>]*>',   # Double opening
            r'<[^/>][^>]*[^/]>$', # Missing closing slash or improper format
        ]

        for pattern in malformed_patterns:
            if re.search(pattern, tag):
                return True
        return False

    def suggest_repair(self, tag: str) -> str:
        """Suggest a repair for a malformed tag"""
        tag = tag.strip()

        # Common repairs
        if tag.endswith('</bibbl>'):
            return '</bibl>'
        elif tag.endswith('</p</bibl>'):
            return '</p>'
        elif 'bibl' in tag.lower() and not tag.startswith('<bibl'):
            return '<bibl>' if not tag.startswith('</') else '</bibl>'
        elif re.match(r'</?p[^>]*>$', tag, re.IGNORECASE):
            return '<p>' if not tag.startswith('</') else '</p>'

        # Fix multiple brackets
        if tag.count('<') > 1 or tag.count('>') > 1:
            cleaned = re.sub(r'<+', '<', tag)
            cleaned = re.sub(r'>+', '>', cleaned)
            return cleaned

        return tag

    def analyze_malformed_tags(self, df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
        """Analyze malformed tags across DataFrame"""
        results = {
            'malformed_by_row': [],
            'malformed_summary': []
        }

        # Track malformed tags by row
        for idx, row in df.iterrows():
            for col in df.columns:
                cell_value = row[col]
                if pd.notna(cell_value) and cell_value != '':
                    malformed_tags = self.find_malformed_tags(cell_value)
                    for tag in malformed_tags:
                        context = self.html_analyzer.get_tag_context(cell_value, tag)
                        suggested_repair = self.suggest_repair(tag)
                        results['malformed_by_row'].append({
                            'Row': idx,
                            'Column': col,
                            'Malformed_Tag': tag,
                            'Suggested_Repair': suggested_repair,
                            'Context': context
                        })

        # Create summary
        tag_counts = Counter([item['Malformed_Tag'] for item in results['malformed_by_row']])
        tag_locations = defaultdict(list)

        for item in results['malformed_by_row']:
            tag_locations[item['Malformed_Tag']].append(f"Row {item['Row']}, Col {item['Column']}")

        for tag, count in tag_counts.items():
            first_occurrence = next(item for item in results['malformed_by_row'] if item['Malformed_Tag'] == tag)
            results['malformed_summary'].append({
                'Malformed_Tag': tag,
                'Count': count,
                'Suggested_Repair': first_occurrence['Suggested_Repair'],
                'Locations': '; '.join(tag_locations[tag][:5]) + ('...' if len(tag_locations[tag]) > 5 else ''),
                'Sample_Context': first_occurrence['Context']
            })

        # Convert to DataFrames
        malformed_by_row_df = pd.DataFrame(results['malformed_by_row'])
        malformed_summary_df = pd.DataFrame(results['malformed_summary']).sort_values('Count', ascending=False)

        return {
            'Malformed_Tags_by_Row': malformed_by_row_df,
            'Malformed_Tags_Summary': malformed_summary_df
        }

    def repair_tags(self, df: pd.DataFrame, tag_to_repair: str, replacement: str,
               scope: str = 'global', specific_column: str = None,
               specific_row: int = None) -> pd.DataFrame:
        """Repair malformed tags in DataFrame"""
        df_repaired = df.copy()
        
        print("." * 40)
        print(f"Repairing: '{tag_to_repair}' → '{replacement}'")
        print(">" * 40)
        
        if scope == 'global':
            replacements_made = 0
            for col in df_repaired.columns:
                # Count before
                before_count = df_repaired[col].astype(str).str.contains(
                    re.escape(tag_to_repair), regex=True
                ).sum()
                
                # Make replacement - DON'T escape the replacement
                df_repaired[col] = df_repaired[col].astype(str).str.replace(
                    tag_to_repair, replacement, regex=False  # Use literal replacement
                )
                
                # Count after
                after_count = df_repaired[col].astype(str).str.contains(
                    re.escape(tag_to_repair), regex=True
                ).sum()
                
                column_replacements = before_count - after_count
                if column_replacements > 0:
                    print(f"\tColumn '{col}': {column_replacements} replacements")
                    # Show sample context with actual replacement
                    sample_rows = df_repaired[col].astype(str).str.contains(
                        re.escape(replacement), regex=True
                    )
                    if sample_rows.any():
                        sample_idx = sample_rows.idxmax()
                        context = self.html_analyzer.get_tag_context(
                            df_repaired.loc[sample_idx, col], replacement
                        )
                        print(f"\t\tSample context: {context[:100]}...")
                
                replacements_made += column_replacements
                
            print(f"Total replacements made: {replacements_made}")
            
        elif scope == 'column' and specific_column:
            if specific_column in df_repaired.columns:
                df_repaired[specific_column] = df_repaired[specific_column].astype(str).str.replace(
                    tag_to_repair, replacement, regex=False
                )
        elif scope == 'row' and specific_row is not None:
            for col in df_repaired.columns:
                if pd.notna(df_repaired.loc[specific_row, col]):
                    cell_value = str(df_repaired.loc[specific_row, col])
                    df_repaired.loc[specific_row, col] = cell_value.replace(tag_to_repair, replacement)
        elif scope == 'cell' and specific_column and specific_row is not None:
            if specific_column in df_repaired.columns and pd.notna(df_repaired.loc[specific_row, specific_column]):
                cell_value = str(df_repaired.loc[specific_row, specific_column])
                df_repaired.loc[specific_row, specific_column] = cell_value.replace(tag_to_repair, replacement)
        
        return df_repaired

In [14]:
print("Step 4: Analyzing malformed tags...")

malformed_repairer = MalformedTagRepairer()
malformed_results = malformed_repairer.analyze_malformed_tags(df)

print("Malformed Tags Summary:")
if not malformed_results['Malformed_Tags_Summary'].empty:
    display(malformed_results['Malformed_Tags_Summary'])
    
    # Save to SQLite
    malformed_results['Malformed_Tags_Summary'].to_sql(
        'malformed_tags_summary', conn, if_exists='replace', index=False
    )
    malformed_results['Malformed_Tags_by_Row'].to_sql(
        'malformed_tags_by_row', conn, if_exists='replace', index=False
    )
    
    # Apply repairs automatically
    print("\nApplying repairs...")
    for _, row in malformed_results['Malformed_Tags_Summary'].iterrows():
        malformed_tag = row['Malformed_Tag']
        suggested_repair = row['Suggested_Repair']
        if malformed_tag != suggested_repair:
            print(f"Repairing '{malformed_tag}' -> '{suggested_repair}'")
            df = malformed_repairer.repair_tags(
                df, malformed_tag, suggested_repair, scope='global'
            )
else:
    print("No malformed tags found!")

# Save to Excel and SQLite
save_to_excel(malformed_results, "04_malformed_tag_analysis.xlsx")
save_intermediate_stage_sqlite(df, "04_malformed_repair", conn)

Step 4: Analyzing malformed tags...
Malformed Tags Summary:


Unnamed: 0,Malformed_Tag,Count,Suggested_Repair,Locations,Sample_Context
3,<http://www2.potsdam.edu/schwaljf/Nahuatl/flor...,17,<http://www2.potsdam.edu/schwaljf/Nahuatl/flor...,"Row 1429, Col Principal English Translation; R...",". Joe Campbell, Florentine Codex Vocabulary, 1..."
11,"<TLACOOCELUTL: [...] yoan qujtocaiotia, tlacom...",3,"<TLACOOCELUTL: [...] yoan qujtocaiotia, tlacom...","Row 5139, Col Attestations from sources in Eng...","tl*, as the following passage (FC 11: 3) impli..."
5,<Concanauhtli>,3,<Concanauhtli>,"Row 2158, Col Attestations from sources in Eng...",nauhtli (Zoquicanauhtli) is the same as the go...
20,<ue>,3,<ue>,"Row 10787, Col Alonso de Molina; Row 16924, Co...",ret. onitemiquiztlapopolhui.) perdonar la muer...
37,"<with synonyms acoyotl, atotlin>",2,"<with synonyms acoyotl, atotlin>","Row 31289, Col Attestations from sources in En...",onym atapalcatl> is contrasted with “Ateponazt...
...,...,...,...,...,...
60,"<Canyon towhee, Ilamatototl>",1,"<Canyon towhee, Ilamatototl>","Row 31361, Col Attestations from sources in En...",latvicicitli] “It is the same as the brown tow...
61,<XIUH-TŌTŌ-TL>,1,<XIUH-TŌTŌ-TL>,"Row 31373, Col Attestations from sources in En...","is entirely, completely light blue like a coti..."
62,"<ĀCAL-LI, ""boat"">",1,"<ĀCAL-LI, ""boat"">","Row 31375, Col Attestations from sources in En...","ion of ""tenalcaltic""]; rather, ""boat-shaped bi..."
63,<boat-shaped>,1,<boat-shaped>,"Row 31375, Col Attestations from sources in En...",k mingled with white. The Bill widens; it beco...



Applying repairs...
Repairing '<?bibl>' -> '<bibl>'
........................................
Repairing: '<?bibl>' → '<bibl>'
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
	Column 'Attestations from sources in Spanish': 1 replacements
		Sample context: liz = no se la quite nadie (Tlaxcala, 1609)<br /> [[[<bibl>]]]Vidas y bienes olvidados: Testamentos ...
Total replacements made: 1
Repairing '<Prairie>' -> '<p>'
........................................
Repairing: '<Prairie>' → '<p>'
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
	Column 'Attestations from sources in English': 1 replacements
		Sample context: [[[<p>]]]acaçomo iuhqui yez yn anoço yuhquiez = whether it ...
Total replacements made: 1
Saved to: working_files\04_malformed_tag_analysis.xlsx
Saved to SQLite table: 04_malformed_repair_stage


Unnamed: 0,Ref,Headword,Orthographic Variants,Principal English Translation,Attestations from sources in English,Attestations from sources in Spanish,Alonso de Molina,Frances Karttunen,Horacio Carochi / English,Andrés de Olmos,Lockhart’s Nahuatl as Written,themes,Spanish Loanword
0,WHP-171879,acazomo.,"accaçomo, acaçomo",<p>perhaps not (adverb) (see Molina)</p>,<p>acaçomo iuhqui yez yn anoço yuhquiez = whet...,,<p>Acaçomo. quiça no. Aduerbio.<br /> <bibl> A...,<p>AHCAZOMŌ perhaps not / quizá no (M). In on...,<p>àcaçomō = perhaps not<br /> <bibl>Horacio C...,,,,No
1,WHP-171881,ayac.,aiaac,"<p>no one; nobody; or, for someone to be absen...",<p>aiaac mic in mexica = None of the Mexica di...,<p>ayac guincuiliz = no se la quite nadie (Tla...,"<p>Ayac. ninguno, o nadie o estar alguno ausen...","<p>AYĀC no one / ninguno, o nadie (M) See AH-,...","<p>ayāc = no one<br /> <bibl>Horacio Carochi, ...",,"<p>no one; nobody; or, for someone to be absen...",,No
2,WHP-171882,acan.,,"<p>nowhere, no place (see Molina, Karttunen, L...",,,<p>acan. en ninguna parte o lugar. aduerbio.<b...,<p>AHCĀN nowhere / en ninguna parte o lugar (M...,"<p>àcān = nowhere<br /> <bibl>Horacio Carochi,...","<p>en ningun lugar, por, de, etc.<br /> <bibl>...",<p>ahcān = (particle) nowhere<br /> <bibl>Jame...,"Cardinal Directions, Cosmos",No
3,WHP-171883,acampa.,,<p>from nowhere; in no way; neither from one p...,,,"<p>Acampa. de ninguna parte, o ni a vna parte ...",,<p>àcāmpa = nowhere<br /> <bibl>Horacio Caroch...,,,"Cardinal Directions, Cosmos",No
4,WHP-171884,acatto.,"acattopa, yacatopa",<p>first (see Karttunen)</p>,"<p>YACATTO, YACATTOPA = first / primero<br /> ...",,,,,,,"Numbers, Math",No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31801,WHP-211066,Teyahualco.,,"<p>a place name; e.g., the town of Santiago Te...",<p>The place name components: <em>te</em>- sto...,,,,,,,,No
31802,WHP-211067,chinancocol.,,<p>a type of agricultural field--wavy? with ri...,,,,,,,,,No
31803,WHP-211068,poyahuallotl.,,<p>a feather that covers the bird's tail feath...,,"<p>""La pluma que tienen las aues cerca de la c...",,,,,,,No
31804,WHP-211069,Cuauhquiyahuacatl.,,<p>a title given to a principal for distinguis...,,"<p>""Mecatzin. Uno de los valerosos ""soldados c...",,,,,,,No


## Step 5: Non-HTML Tag Detection

## Step 6: Citation Extraction

## Step 7: Cross-Reference Extraction

## Step 8: Complete Workflow Example

## Individual Processing Functions

## Usage Examples

## DIY Data Clean-Up