In [1]:
import pandas as pd
import numpy as np
import hashlib
from typing import List, Dict
from pathlib import Path
from datetime import datetime
import os
import glob
import csv
import re
from inscriptis import get_text
from bs4 import BeautifulSoup
import sqlite3

In [2]:
# Load the SQLite database holding the WHP dataset.
#
# sqlite3.connect() silently creates a brand-new, empty database when the file
# does not exist, which would make the table listing below come back empty
# instead of failing. Check the path explicitly so a wrong location is reported
# immediately.
#
# If this raises, check the following:
# 1. Ensure the db file is in the correct directory
# 2. Check the exact filename
# 3. Verify the file extension
db_path = Path('../../data/sqLiteDb/Whp_Raw_Dataset.db')
if not db_path.is_file():
    raise FileNotFoundError(
        f"SQLite database not found at {db_path.resolve()}"
    )

conn = sqlite3.connect(str(db_path))
table_name = "WHP_EarlyNahuatl_Data"

# List every table stored in the database as a quick sanity check
tables_query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = pd.read_sql(tables_query, conn)
tables

Unnamed: 0,name
0,WHP_EarlyNahuatl_Data
1,WHP_EarlyNahuatl_Deduplicated


In [16]:
# Preview query: fetch only the first 3 rows of the raw table and render them.
query = "SELECT * FROM WHP_EarlyNahuatl_Data LIMIT 3;"  # Limiting to 3 rows for demonstration
whp_dataset = pd.read_sql(query, conn)
display(whp_dataset)

Unnamed: 0,Ref,Headword,Orthographic Variants,Principal English Translation,Attestations from sources in English,Attestations from sources in Spanish,Alonso de Molina,Frances Karttunen,Horacio Carochi / English,Andrés de Olmos,Lockhart’s Nahuatl as Written,themes,Spanish Loanword
0,WHP-171879,acazomo.,"accaçomo, acaçomo",<p>perhaps not (adverb) (see Molina)</p>,<p>acaçomo iuhqui yez yn anoço yuhquiez = whet...,,<p>Acaçomo. quiça no. Aduerbio.<br /> <bibl> A...,<p>AHCAZOMŌ perhaps not / quizá no (M). In on...,<p>àcaçomō = perhaps not<br /> <bibl>Horacio C...,,,,No
1,WHP-171881,ayac.,aiaac,"<p>no one; nobody; or, for someone to be absen...",<p>aiaac mic in mexica = None of the Mexica di...,<p>ayac guincuiliz = no se la quite nadie (Tla...,"<p>Ayac. ninguno, o nadie o estar alguno ausen...","<p>AYĀC no one / ninguno, o nadie (M) See AH-,...","<p>ayāc = no one<br /> <bibl>Horacio Carochi, ...",,"<p>no one; nobody; or, for someone to be absen...",,No
2,WHP-171882,acan.,,"<p>nowhere, no place (see Molina, Karttunen, L...",,,<p>acan. en ninguna parte o lugar. aduerbio.<b...,<p>AHCĀN nowhere / en ninguna parte o lugar (M...,"<p>àcān = nowhere<br /> <bibl>Horacio Carochi,...","<p>en ningun lugar, por, de, etc.<br /> <bibl>...",<p>ahcān = (particle) nowhere<br /> <bibl>Jame...,"Cardinal Directions, Cosmos",No


In [27]:
# Ask SQLite for the table schema; each returned tuple describes one column,
# with the column name in the second position.
cursor = conn.execute(f"PRAGMA table_info({table_name})")
columns_info = cursor.fetchall()
column_names = [name for _cid, name, *_rest in columns_info]

print(column_names)

['Ref', 'Headword', 'Orthographic Variants', 'Principal English Translation', 'Attestations from sources in English', 'Attestations from sources in Spanish', 'Alonso de Molina', 'Frances Karttunen', 'Horacio Carochi / English', 'Andrés de Olmos', 'Lockhart’s Nahuatl as Written', 'themes', 'Spanish Loanword']


Working with SQLite and pandas together can be a bit tricky, but there are several ways to leverage both tools.

Option 1: Load from SQLite into a DataFrame (Minimal changes)
```python
# Connect to SQLite and load the table
conn = sqlite3.connect("your_database.db")
df = pd.read_sql("SELECT * FROM WHP_EarlyNahuatl_Data", conn)
# Now your existing code works exactly the same
```

Option 2: Push logic into SQL (depending on the use case)


In [28]:
# Pull the complete raw table into memory through the open connection
df = pd.read_sql(sql="SELECT * FROM WHP_EarlyNahuatl_Data", con=conn)

def compare_attestations(attestation_value, original_entry_value, match_type='substring'):
    """
    Find which ';'-separated attestations also occur in an original entry.

    Args:
        attestation_value (str): Cell value from an attestation column; it is
            split on ';' and each piece is stripped of surrounding whitespace.
        original_entry_value (str): Cell value from an original entry column;
            it is stripped before comparison.
        match_type (str): 'exact' keeps pieces equal to the stripped entry;
            any other value (default 'substring') keeps pieces contained in it.

    Returns:
        list: The matching attestation pieces, in their original order.
            Empty when either input is missing (NaN/None).
    """
    # Missing data on either side means there is nothing to compare
    if pd.isna(attestation_value) or pd.isna(original_entry_value):
        return []

    target = str(original_entry_value).strip()
    use_exact = match_type == 'exact'

    matching_attestations = []
    for raw_piece in str(attestation_value).split(';'):
        piece = raw_piece.strip()
        if not piece:
            # blank fragments (e.g. produced by a trailing ';') are ignored
            continue
        matched = (piece == target) if use_exact else (piece in target)
        if matched:
            matching_attestations.append(piece)

    return matching_attestations

# Matching mode handed to compare_attestations: 'exact' or 'substring'
MATCH_TYPE = 'exact'

# Columns whose attestations are compared against the source entries
attestation_columns = ['Attestations from sources in English', 'Attestations from sources in Spanish']

# Source columns holding the original entries
original_entry_columns = ['Alonso de Molina', 'Frances Karttunen', 'Horacio Carochi / English', 'Andrés de Olmos', 'Lockhart’s Nahuatl as Written']

# Result containers, both keyed by attestation column name
validation_results = dict()
detailed_matching = dict()

# For every attestation column, compare each non-empty attestation cell with
# every original entry column of the same row, and record both summary counts
# (validation_results) and the per-row match details (detailed_matching).
for attestation_column in attestation_columns:
    column_results = {
        'source_matching': {}
    }
    column_detailed_matching = {}
   
    # total rows in the dataset
    total_rows = len(df)
   
    # rows with non-empty attestations
    non_empty_attestations = df[df[attestation_column].notna() & (df[attestation_column] != '')]
   
    # store overall attestation statistics
    column_results['total_rows'] = total_rows
    column_results['non_empty_attestations'] = len(non_empty_attestations)
   
    # check matching for each original entry column
    for original_entry_column in original_entry_columns:
        # Create a list to store detailed matching information
        detailed_match_rows = []
        
        # Iterate through each row with non-empty attestations
        for _, row in non_empty_attestations.iterrows():
            # Compare attestations
            matching_attestations = compare_attestations(
                row[attestation_column], 
                row[original_entry_column], 
                match_type=MATCH_TYPE
            )
            
            # If any attestations match, store the details
            if matching_attestations:
                detailed_match_rows.append({
                    'Ref': row['Ref'],
                    'Headword': row['Headword'],
                    'Attestation Column': attestation_column,
                    'Original Entry Column': original_entry_column,
                    'Matching Attestations': matching_attestations,
                    'Attestation Value': row[attestation_column],
                    'Original Entry Value': row[original_entry_column]
                })
        
        # collect matching results for this source
        # NOTE: 'matching_attestations' counts rows with at least one match, and
        # the percentage is relative to all rows (total_rows), not only to the
        # rows that have a non-empty attestation.
        column_results['source_matching'][original_entry_column] = {
            'matching_attestations': len(detailed_match_rows),
            'matching_percentage': len(detailed_match_rows) / total_rows * 100
        }
        
        # store detailed matching rows
        column_detailed_matching[original_entry_column] = detailed_match_rows
   
    # calculate overall matching across all sources
    # (a row that matches in several source columns is counted once per source)
    total_source_matches = sum(
        results['matching_attestations']
        for results in column_results['source_matching'].values()
    )
   
    column_results['total_source_matches'] = total_source_matches
    column_results['total_source_matching_percentage'] = total_source_matches / total_rows * 100
   
    # store results for this attestation column
    validation_results[attestation_column] = column_results
    detailed_matching[attestation_column] = column_detailed_matching

# Report, per attestation column and per source, every matching row that was found
print("\nDetailed Matching:")
for attestation_col, sources in detailed_matching.items():
    print(f"\n--- {attestation_col} ---")
    for source, matches in sources.items():
        print(f"\n{source}:")
        if not matches:
            print("No matches found")
            continue
        for match in matches:
            print("\nMatching Row:")
            for key, value in match.items():
                print(f"{key}: {value}")


Detailed Matching:

--- Attestations from sources in English ---

Alonso de Molina:
No matches found

Frances Karttunen:
No matches found

Horacio Carochi / English:
No matches found

Andrés de Olmos:
No matches found

Lockhart’s Nahuatl as Written:
No matches found

--- Attestations from sources in Spanish ---

Alonso de Molina:
No matches found

Frances Karttunen:
No matches found

Horacio Carochi / English:
No matches found

Andrés de Olmos:
No matches found

Lockhart’s Nahuatl as Written:
No matches found


There are no exact matches between the attestation columns and the later source columns; only substring matches occur.

In [29]:
# Maps a Ref to the number of populated source columns (only when more than one)
ref_numrowsduped = {}
# Maps the same Refs to a {source column: cell value} dict of the populated cells
ref_nonempty_values = {}

for index, row in df.iterrows():

    # A cell counts as empty when it is missing (None / NaN / pd.NA) or ''.
    # pd.isna is used instead of `value in [np.nan, ...]` because list
    # membership only recognises NaN by object identity (NaN != NaN), so a NaN
    # that is not the very same object as np.nan would be treated as data.
    non_empty_values = {
        col: row[col] for col in original_entry_columns
        if not (pd.isna(row[col]) or row[col] == '')
    }

    if len(non_empty_values) > 1:
        # track the number of non-empty values
        ref_numrowsduped[row["Ref"]] = len(non_empty_values)

        # track the specific non-empty values
        ref_nonempty_values[row["Ref"]] = non_empty_values

print(f"Total references needing duplication: {len(ref_numrowsduped)}")


Total references needing duplication: 2873


In [30]:
# Expand the dataset: every Ref flagged in ref_numrowsduped yields its parent row
# plus one extra row per populated source column; all other rows are kept once.
new_rows = []
for index, row in df.iterrows():
    original_ref = row['Ref']

    if original_ref not in ref_numrowsduped:
        # not flagged for duplication: keep a single copy with both helper columns blank
        row_copy = row.copy()
        row_copy['Original Ref'] = ''
        row_copy['Full Original Entry'] = ''
        new_rows.append(row_copy)
        continue

    # parent row: blank 'Full Original Entry', but remember which Ref it belongs to
    original_row = row.copy()
    original_row['Full Original Entry'] = ''
    original_row['Original Ref'] = original_ref
    new_rows.append(original_row)

    # one child row per populated source; its Ref gets a suffix derived from the
    # source column name (spaces -> underscores, lower-cased)
    non_empty_values = ref_nonempty_values[original_ref]
    for col, value in non_empty_values.items():
        new_row = row.copy()
        new_row['Ref'] = f"{original_ref}_{col.replace(' ', '_').lower()}"
        new_row['Original Ref'] = original_ref
        new_row['Full Original Entry'] = value
        new_rows.append(new_row)

print(f"Length of the rows:{len(new_rows)}")


Length of the rows:38480


In [31]:
# The "#NAME?" issue should be resolved with SQLite now, so upload this dataset to SQLite.
#
# Persist the expanded rows built in `new_rows`. Writing `df` here would only
# store an unchanged copy of the raw table and discard the row expansion.
expanded_df = pd.DataFrame(new_rows).reset_index(drop=True)

try:
    # Write to a new table
    # if_exists='replace' will overwrite any existing table with the same name
    expanded_df.to_sql("WHP_EarlyNahuatl_Deduplicated", conn, if_exists='replace', index=False)
finally:
    # Close the connection even if the write fails
    conn.close()

CHECKED THE TABLE IN DB_BROWSER AND NO ISSUE

---

From this section onwards we will be working on doing a sanity check on the content of the WHP dataset. The dataset has html tags in some columns and our task will be to identify what tags can be removed or if we can convert the html format to plain text.

Additionally, we need to see whether we can extract the `<bibl>` tags into a separate dataset that records those `<bibl>` tags as sources. If there are page 
numbers in the cells, we would like to add a "Page Number" column to the WHP dataset, as well as a "Source" column.

In [31]:
# Locate <project root>/data/interim relative to the notebook's working directory
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
search_paths = os.path.join(project_root, 'data', 'interim')

# Build the file path with os.path.join rather than concatenating "\<name>":
# "\W" is an invalid escape sequence and a backslash separator only works on Windows.
interim_path = os.path.join(search_paths, "WHP_EarlyNahuatl_data_rowsduplicatedv2.csv")
# utf-8-sig strips a leading byte-order mark if the CSV has one
interim_df = pd.read_csv(interim_path, encoding='utf-8-sig')

display(interim_df.head())



Unnamed: 0,Ref,Headword,Orthographic Variants,Principal English Translation,Attestations from sources in English,Attestations from sources in Spanish,Alonso de Molina,Frances Karttunen,Horacio Carochi / English,Andrés de Olmos,Lockhart’s Nahuatl as Written,themes,Spanish Loanword,Full Original Entry,Original Ref
0,WHP-171879,acazomo.,"accaçomo, acaçomo",<p>perhaps not (adverb) (see Molina)</p>,<p>acaçomo iuhqui yez yn anoço yuhquiez = whet...,,<p>Acaçomo. quiça no. Aduerbio.<br /> <bibl> A...,<p>AHCAZOMŌ perhaps not / quizá no (M). In on...,<p>àcaçomō = perhaps not<br /> <bibl>Horacio C...,,,,No,,WHP-171879
1,WHP-171879_alonso_de_molina,acazomo.,"accaçomo, acaçomo",<p>perhaps not (adverb) (see Molina)</p>,<p>acaçomo iuhqui yez yn anoço yuhquiez = whet...,,<p>Acaçomo. quiça no. Aduerbio.<br /> <bibl> A...,<p>AHCAZOMŌ perhaps not / quizá no (M). In on...,<p>àcaçomō = perhaps not<br /> <bibl>Horacio C...,,,,No,<p>Acaçomo. quiça no. Aduerbio.<br /> <bibl> A...,WHP-171879
2,WHP-171879_frances_karttunen,acazomo.,"accaçomo, acaçomo",<p>perhaps not (adverb) (see Molina)</p>,<p>acaçomo iuhqui yez yn anoço yuhquiez = whet...,,<p>Acaçomo. quiça no. Aduerbio.<br /> <bibl> A...,<p>AHCAZOMŌ perhaps not / quizá no (M). In on...,<p>àcaçomō = perhaps not<br /> <bibl>Horacio C...,,,,No,<p>AHCAZOMŌ perhaps not / quizá no (M). In on...,WHP-171879
3,WHP-171879_horacio_carochi_/_english,acazomo.,"accaçomo, acaçomo",<p>perhaps not (adverb) (see Molina)</p>,<p>acaçomo iuhqui yez yn anoço yuhquiez = whet...,,<p>Acaçomo. quiça no. Aduerbio.<br /> <bibl> A...,<p>AHCAZOMŌ perhaps not / quizá no (M). In on...,<p>àcaçomō = perhaps not<br /> <bibl>Horacio C...,,,,No,<p>àcaçomō = perhaps not<br /> <bibl>Horacio C...,WHP-171879
4,WHP-171881,ayac.,aiaac,"<p>no one; nobody; or, for someone to be absen...",<p>aiaac mic in mexica = None of the Mexica di...,<p>ayac guincuiliz = no se la quite nadie (Tla...,"<p>Ayac. ninguno, o nadie o estar alguno ausen...","<p>AYĀC no one / ninguno, o nadie (M) See AH-,...","<p>ayāc = no one<br /> <bibl>Horacio Carochi, ...",,"<p>no one; nobody; or, for someone to be absen...",,No,,WHP-171881


Also, I would drop the rows whose "Full Original Entry" value is empty, since these are the rows we duplicated from and we do not need to modify 
that data. The columns we will be working with from now on are:

*THE ABOVE WAS A MISTAKE: BY DROPPING ROWS WHOSE "FULL ORIGINAL ENTRY" VALUE IS
EMPTY, I ALSO REMOVED ROWS THAT DID NOT NEED DUPLICATION, SO WE LOST DATA.* 

"Ref", "Headword", "Orthographic Variants", "Principal English Translation", "Attestations from sources in English", "Attestations from sources in Spanish", "themes", "Spanish Loanword", "Full Original Entry", "Original Ref"

In [28]:
# The per-source columns are no longer needed now that "Full Original Entry" exists
source_columns = [
    'Alonso de Molina', 'Frances Karttunen', 'Horacio Carochi / English',
    'Andrés de Olmos', "Lockhart’s Nahuatl as Written",
]

# drop() returns a new frame, so interim_df itself is left untouched
df_processed = interim_df.drop(columns=source_columns)

# create a "Source" column to identify which dictionary the entry came from
def extract_source_from_ref(ref, known_sources=None):
    """
    Extract the source name from a duplicated reference ID.

    Duplicated references look like 'WHP-<id>_<source slug>', where the slug is
    the source column name with spaces replaced by underscores and lower-cased.

    Args:
        ref (str): Reference ID.
        known_sources (iterable of str, optional): Canonical source names used
            to turn a slug back into its exact spelling. Defaults to the five
            source columns of the WHP dataset.

    Returns:
        str: The canonical source name for a known slug; a title-cased version
            of the slug for an unknown one; "Original Entry" when `ref` has no
            source suffix (or does not start with 'WHP-').
    """
    if known_sources is None:
        known_sources = (
            'Alonso de Molina',
            'Frances Karttunen',
            'Horacio Carochi / English',
            'Andrés de Olmos',
            'Lockhart’s Nahuatl as Written',
        )

    if '_' in ref and ref.startswith('WHP-'):
        # duplicated row, extract source from the suffix
        source_part = ref.split('_', 1)[1]  # Get everything after first underscore

        # Prefer the exact original name: str.title() would turn
        # "alonso de molina" into "Alonso De Molina" and
        # "lockhart’s ..." into "Lockhart’S ...".
        slug_to_name = {
            name.replace(' ', '_').lower(): name for name in known_sources
        }
        canonical = slug_to_name.get(source_part.lower())
        if canonical is not None:
            return canonical

        # unknown slug: fall back to the generic title-cased form
        return source_part.replace('_', ' ').title()

    # original row that wasn't duplicated
    return "Original Entry"

# Derive the source label of every row from its Ref
df_processed['Source'] = df_processed['Ref'].apply(extract_source_from_ref)

# Rows carrying text in "Full Original Entry" are tagged 'Source_Entry';
# rows where it is missing or blank are tagged 'Original_Entry'
df_processed['Entry_Type'] = df_processed['Full Original Entry'].apply(
    lambda x: 'Original_Entry' if pd.isna(x) or x.strip() == '' else 'Source_Entry'
)

entry_type_counts = df_processed['Entry_Type'].value_counts()
print(f"Total rows preserved: {len(df_processed)}")
print(f"Original entries (no duplication needed): {entry_type_counts.get('Original_Entry', 0)}")
print(f"Source entries (from duplication): {entry_type_counts.get('Source_Entry', 0)}")

Total rows preserved: 38480
Original entries (no duplication needed): 31806
Source entries (from duplication): 6674


In [29]:
def identify_html_tags_based_on_column(column_name, df):
    """
    Collect the distinct tag names the HTML parser finds in one column.

    Args:
        column_name (str): Column of `df` to scan.
        df (pd.DataFrame): Frame whose cells may contain HTML markup.

    Returns:
        set: Names of all tags found in the non-missing cells of the column.
    """
    html_tags = set()
    for cell in df[column_name]:
        if pd.isna(cell):
            # missing cells carry no markup
            continue
        parsed = BeautifulSoup(cell, "html.parser")
        html_tags.update(tag.name for tag in parsed.find_all())
    return html_tags

In [30]:
# Columns expected to contain HTML markup
columns_to_check_html = [
    "Principal English Translation",
    "Attestations from sources in English",
    "Attestations from sources in Spanish",
    "Full Original Entry",
]

# Print the set of tag names found in each of those columns
for col in columns_to_check_html:
    found_tags = identify_html_tags_based_on_column(str(col), df_processed)
    print(found_tags)


{'p', 'em', 'del', 'alo-', 'br', 'a', 'bibl<', 'bibl?http:', 'span', 'b', 'john', 'http:', 'ross', 'bibl', 'strong'}
{'p', 'unlike', 'wup', 'tzanatl', 'sandhill', 'zoquiazolin,', 'tlhotli', 'prairie', 'slender-billed', 'canauh-tli', 'b', 'common', 'shorebirds', 'xiuh-tōtō-tl', 'boat-shaped', 'when', 'http:', 'late', 'in', 'fr.', 'em', 'concanauhtli', 'tzoniaiauhqui,', 'bibi', 'a', 'susp', 'with', 'tenitztli', 'tlavitequjnj', 'zōl-in', 'the', 'each', 'amazon', 'bibl', 'canauhtli', 'tlacoocelutl:', 'sup', 'totolin', 'del', 'chīcua-tli', 'bibbl', 'br', 'zoquicanauhtli,', 'tzoniaiauhquj,', 'cozca-cuauh-tli,', 'which', 'for', 'toznene', 'b9bl', 'black-bellied', 'strong', 'sip', 'american', 'aztec', 'that', 'canyon', 'house', 'span', 'of', 'wood', 'duck'}
{'p', 'em', 'rémi', 'sup', 'br', 'a', 'miguel', 'span', 'bibl<em', 'bibl', 'strong'}
{'p', 'em', 'mahuizzoh', 'br', 'a', 'mat(i).', 'te-tl‘stone,’', 'bobl', 'mo-', 'tlāhu(i)-tl', 'tla', 'bibl', 'strong'}


In [None]:
'''
only valid tags that are html are 'em', 'strong', 'a', 'br', 'bibl', 'sup', 'span', and 'p', the rest seem to be formatting errors
in the source code once I looked at the websites source code. The one thing that I feel is a typo is the 'bobl' to 'bibl'. So I changed those
in the original dataset

'''

# I think from this point on I want to extract bibl tags only for the time being.

# function to extract text from bibl tags in a string
def extract_bibl_tags(text):
    """
    Pull the text of all <bibl> tags out of an HTML string.

    Args:
        text (str): HTML fragment; may be missing (None/NaN).

    Returns:
        tuple: (bibl_content, cleaned_text).
            bibl_content is the text of every <bibl> tag joined with " | ",
            or None when the input is missing or contains no <bibl> tag.
            cleaned_text is the HTML with the <bibl> tags removed; the input
            is returned unchanged when nothing was removed.
    """
    # Always return a 2-tuple so callers can unpack the result unconditionally
    # (a bare None for missing input would break `a, b = extract_bibl_tags(x)`).
    if pd.isna(text):
        return None, text

    soup = BeautifulSoup(text, "html.parser")
    bibl_tags = soup.find_all('bibl')

    if not bibl_tags:
        return None, text

    # capture the citation text before the tags are removed from the tree
    bibl_content = " | ".join(tag.get_text(strip=True) for tag in bibl_tags)
    for tag in bibl_tags:
        tag.replace_with("")

    # return both the extracted bibl content and the modified text
    return bibl_content, str(soup)

# apply the function to each column and combine results
def add_bibl_column(df, columns_to_check):
    """
    Move <bibl> citations out of the given columns into a 'bibl_content' column.

    The frame is modified in place: each checked cell is overwritten with the
    cleaned text returned by extract_bibl_tags, and 'bibl_content' receives
    "<column>: <citations>" pieces joined with " || " (None when no citation
    was found in the row).

    Args:
        df (pd.DataFrame): Frame to update.
        columns_to_check (list): Column names to scan; names that are not in
            the frame and missing cells are skipped.

    Returns:
        pd.DataFrame: The same `df` object that was passed in.
    """
    df['bibl_content'] = None

    for idx, row in df.iterrows():
        # only columns that exist and hold a value in this row are processed
        populated = [
            name for name in columns_to_check
            if name in row.index and not pd.isna(row[name])
        ]

        found = []
        for col in populated:
            bibl_text, cleaned_text = extract_bibl_tags(row[col])
            df.at[idx, col] = cleaned_text
            if bibl_text:
                found.append(f"{col}: {bibl_text}")

        # write the combined citations only when at least one was extracted
        if found:
            df.at[idx, 'bibl_content'] = " || ".join(found)

    return df

# Apply the <bibl> extraction to the HTML-bearing columns of df_processed
columns_to_check_html = [
    "Principal English Translation", 
    "Attestations from sources in English",
    "Attestations from sources in Spanish", 
    "Full Original Entry"
]

# NOTE: add_bibl_column modifies the frame it receives and returns that same
# object, so df_temp and df_processed refer to the same (already cleaned) data.
df_temp = add_bibl_column(df_processed, columns_to_check_html)    

display(df_temp.head())
# Written to the current working directory; the DataFrame index is included as
# the first CSV column because index=False is not passed.
df_temp.to_csv("output.csv", encoding='utf-8-sig')


Unnamed: 0,Ref,Headword,Orthographic Variants,Principal English Translation,Attestations from sources in English,Attestations from sources in Spanish,themes,Spanish Loanword,Full Original Entry,Original Ref,bibl_content
1,WHP-171879_alonso_de_molina,acazomo.,"accaçomo, acaçomo",<p>perhaps not (adverb) (see Molina)</p>,<p>acaçomo iuhqui yez yn anoço yuhquiez = whet...,,,No,<p>Acaçomo. quiça no. Aduerbio.<br/> </p>,WHP-171879,Attestations from sources in English: Beyond t...
2,WHP-171879_frances_karttunen,acazomo.,"accaçomo, acaçomo",<p>perhaps not (adverb) (see Molina)</p>,<p>acaçomo iuhqui yez yn anoço yuhquiez = whet...,,,No,<p>AHCAZOMŌ perhaps not / quizá no (M). In on...,WHP-171879,Attestations from sources in English: Beyond t...
3,WHP-171879_horacio_carochi_/_english,acazomo.,"accaçomo, acaçomo",<p>perhaps not (adverb) (see Molina)</p>,<p>acaçomo iuhqui yez yn anoço yuhquiez = whet...,,,No,<p>àcaçomō = perhaps not<br/> </p>,WHP-171879,Attestations from sources in English: Beyond t...
5,WHP-171881_alonso_de_molina,ayac.,aiaac,"<p>no one; nobody; or, for someone to be absen...",<p>aiaac mic in mexica = None of the Mexica di...,<p>ayac guincuiliz = no se la quite nadie (Tla...,,No,"<p>Ayac. ninguno, o nadie o estar alguno ausen...",WHP-171881,Attestations from sources in English: James Lo...
6,WHP-171881_frances_karttunen,ayac.,aiaac,"<p>no one; nobody; or, for someone to be absen...",<p>aiaac mic in mexica = None of the Mexica di...,<p>ayac guincuiliz = no se la quite nadie (Tla...,,No,"<p>AYĀC no one / ninguno, o nadie (M) See AH-,...",WHP-171881,Attestations from sources in English: James Lo...


So far we have removed the `<bibl>` tags from the columns that contain HTML content; next we will attempt to remove the remaining HTML structure or convert it to plain text.