In [53]:
# Nahuatl Dictionary Data Analysis Notebook
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from bs4 import BeautifulSoup


In [54]:
csv_file = r"C:\Users\Kevin\Downloads\NahuatLEX\data\initial_dictionaries\WHP_EarlyNahuatl_data_2024-03-26T17-22-58.csv"
df = pd.read_csv(csv_file)

In [55]:
# 1. Basic Data Exploration
print("Dataset Basic Information:")
print("-" * 50)
print(f"Total number of entries: {len(df)}")
print("\nColumn Types:")
print(df.dtypes)

Dataset Basic Information:
--------------------------------------------------
Total number of entries: 31806

Column Types:
Ref                                     object
Headword                                object
Orthographic Variants                   object
Principal English Translation           object
Attestations from sources in English    object
Attestations from sources in Spanish    object
Alonso de Molina                        object
Frances Karttunen                       object
Horacio Carochi / English               object
Andrés de Olmos                         object
Lockhart’s Nahuatl as Written           object
themes                                  object
Spanish Loanword                        object
dtype: object


In [56]:
# 2. Missing Values Analysis
print("\nMissing Values per Column:")
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])


Missing Values per Column:
Orthographic Variants                   17398
Principal English Translation            7751
Attestations from sources in English    25938
Attestations from sources in Spanish    28594
Alonso de Molina                         9291
Frances Karttunen                       25380
Horacio Carochi / English               31396
Andrés de Olmos                         31700
Lockhart’s Nahuatl as Written           30552
themes                                  18995
dtype: int64


In [57]:
# 3. Source Richness Analysis
source_columns = [
    'Attestations from sources in English', 
    'Attestations from sources in Spanish', 
    'Alonso de Molina', 
    'Frances Karttunen', 
    'Horacio Carochi / English', 
    'Andrés de Olmos', 
    'Lockhart’s Nahuatl as Written'
]

# Count of non-empty entries for each source
source_richness = {col: df[col].notna().sum() for col in source_columns}
print("\nSource Richness (Number of Non-Empty Entries):")
for source, count in source_richness.items():
    print(f"{source}: {count} entries")



Source Richness (Number of Non-Empty Entries):
Attestations from sources in English: 5868 entries
Attestations from sources in Spanish: 3212 entries
Alonso de Molina: 22515 entries
Frances Karttunen: 6426 entries
Horacio Carochi / English: 410 entries
Andrés de Olmos: 106 entries
Lockhart’s Nahuatl as Written: 1254 entries


In [58]:
# 4. Themes Analysis
print("\nThemes Analysis:")
# Split themes and count their occurrences
df['themes_list'] = df['themes'].fillna('').str.split(';')
all_themes = [theme.strip() for themes in df['themes_list'] for theme in themes if theme.strip()]
theme_counts = pd.Series(all_themes).value_counts()


print("Top 10 Themes:")
print(theme_counts.head(10))


Themes Analysis:
Top 10 Themes:
Health, Sickness, Wellness, Medicine            523
Numbers, Math                                   518
Time, Calendar, Frequency                       454
Anatomy                                         437
Animals, Bugs, Insects, Reptiles, Fish, etc.    408
Religion (Christian)                            365
Emotions                                        365
Behavior, Personality                           301
Food, Eating, Cooking                           295
Plants, Trees, Flora                            273
Name: count, dtype: int64


In [59]:
# 5. Spanish Loanwords
print("\nSpanish Loanwords:")
loanword_counts = df['Spanish Loanword'].value_counts(normalize=True) * 100
print(loanword_counts)



Spanish Loanwords:
Spanish Loanword
No     95.469408
Yes     4.530592
Name: proportion, dtype: float64


In [60]:
# 6. Orthographic Variants
df['num_variants'] = df['Orthographic Variants'].fillna('').str.split(';').apply(len)
print("\nOrthographic Variants Distribution:")
print(df['num_variants'].value_counts().sort_index())



Orthographic Variants Distribution:
num_variants
1    31797
2        8
3        1
Name: count, dtype: int64


In [61]:

# Visualization Function
def create_visualizations():
    plt.figure(figsize=(20, 15))
    
    # Source Attestations
    plt.subplot(2, 2, 1)
    plt.bar(source_richness.keys(), source_richness.values())
    plt.title('Source Attestations')
    plt.xticks(rotation=45, ha='right')
    
    # Themes
    plt.subplot(2, 2, 2)
    theme_counts.head(10).plot(kind='bar')
    plt.title('Top 10 Themes')
    plt.xticks(rotation=45, ha='right')
    
    # Loanwords Pie Chart
    plt.subplot(2, 2, 3)
    loanword_counts.plot(kind='pie', autopct='%1.1f%%')
    plt.title('Spanish Loanwords')
    
    # Orthographic Variants
    plt.subplot(2, 2, 4)
    df['num_variants'].value_counts().sort_index().plot(kind='bar')
    plt.title('Orthographic Variants per Entry')
    
    plt.tight_layout()
    plt.savefig('nahuatl_dictionary_overview.png')
    plt.close()

# Run visualizations
create_visualizations()
print("\nVisualization saved as 'nahuatl_dictionary_overview.png'")



Visualization saved as 'nahuatl_dictionary_overview.png'
