# ትግርኛ - Enhanced Tigrigna Spell Checker

This notebook provides an enhanced real-time spell checker for the Tigrigna language with improved suggestion features and dictionary management.

## 1. Setup and Imports

In [1]:
# Import necessary libraries
import heapq
import html
import os
import re
import sys
from collections import Counter
from pathlib import Path

import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

# Append the current working directory to sys.path (only if it is not already
# listed) so that Python modules located there can be imported by later cells.
current_dir = Path.cwd()
if str(current_dir) not in sys.path:
    sys.path.append(str(current_dir))

print("Setup complete.")

Setup complete.


## 2. Load Tigrigna Dictionary

In [2]:
def load_dictionary():
    """Load the Tigrigna dictionary from every readable candidate word list.

    Each candidate path is expected to hold one word per line. Words from all
    files that can be read are merged into a single set; blank lines are
    ignored. Candidate paths are relative to the current working directory.

    Returns:
        set[str]: The merged word set. When no candidate file yields any word,
        a small built-in set of common Tigrigna words is returned instead.
    """
    # Try multiple potential locations for the dictionary
    potential_paths = [
        'tigrigna_dictionary.txt',
        'data/tigrigna_words.txt',
        'data/tigrigna_dictionary.txt',
        os.path.join(os.path.dirname(os.getcwd()), 'data', 'tigrigna_words.txt')
    ]

    dictionary = set()
    for path in potential_paths:
        try:
            # 'utf-8-sig' decodes plain UTF-8 and also drops a leading byte
            # order mark. str.strip() does not remove U+FEFF, so with plain
            # 'utf-8' the first word of a BOM-prefixed file would be stored
            # with an invisible prefix and never match.
            with open(path, 'r', encoding='utf-8-sig') as file:
                words = {line.strip() for line in file if line.strip()}
        except FileNotFoundError:
            # A missing candidate is expected; just try the next location.
            continue
        except (OSError, UnicodeDecodeError) as error:
            # Unreadable or wrongly encoded file: report it and keep going so
            # one bad file cannot prevent the other candidates from loading.
            print(f"Warning: Could not read {path}: {error}")
            continue

        if words:
            dictionary.update(words)
            print(f"Loaded {len(words)} words from {path}")

    if not dictionary:
        # Fallback to a small set of common Tigrigna words
        dictionary = {'ሰላም', 'ከመይ', 'ጽቡቕ', 'ሓጺር', 'ነዊሕ', 'ዓቢ', 'ንእሽቶ'}
        print("Warning: Could not find dictionary file. Using a minimal set of words.")

    print(f"Total dictionary size: {len(dictionary)} words")
    return dictionary

# Load the dictionary once; later cells read (and the add-word feature mutates)
# this notebook-level set.
tigrigna_dictionary = load_dictionary()

Loaded 294 words from tigrigna_dictionary.txt
Loaded 294 words from data/tigrigna_words.txt
Total dictionary size: 294 words


## 3. Spell Checking Functions

In [3]:
def check_word(word: str, dictionary) -> bool:
    """Return True when ``word`` is contained in ``dictionary``.

    This is an exact membership test: the word is compared as given, without
    any trimming or other normalisation.
    """
    return word in dictionary

def tokenize_text(text):
    """Break Tigrigna text into a list of word tokens.

    Ethiopic punctuation marks act as separators in the same way whitespace
    does. Empty tokens (from leading, trailing or repeated separators) are
    discarded, so an empty or separator-only input gives an empty list.
    """
    # Blank out Ethiopic punctuation so a single whitespace split handles both.
    without_punctuation = re.sub(r'[፡።፣፤፥፧፦፨፠፟]', ' ', text)
    pieces = re.split(r'\s+', without_punctuation)
    # filter(None, ...) drops the empty strings produced at the edges.
    return list(filter(None, pieces))

def edit_distance(s1, s2):
    """Return the Levenshtein distance between ``s1`` and ``s2``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions that turn one string into the other. Only two
    rows of the dynamic-programming table are kept at any time.
    """
    # Put the longer string on the outer loop so each row has the length of
    # the shorter one.
    if len(s1) >= len(s2):
        longer, shorter = s1, s2
    else:
        longer, shorter = s2, s1

    if len(shorter) == 0:
        return len(longer)

    above = list(range(len(shorter) + 1))
    for row_number, outer_char in enumerate(longer, start=1):
        row = [row_number]
        for col, inner_char in enumerate(shorter):
            substitute = above[col] + (0 if outer_char == inner_char else 1)
            insert = above[col + 1] + 1
            delete = row[col] + 1
            row.append(min(insert, delete, substitute))
        above = row

    return above[-1]

def get_suggestions(word, dictionary, max_distance=2, max_suggestions=5):
    """Get spelling suggestions for a word using edit distance.

    Args:
        word: The word to find suggestions for.
        dictionary: Iterable collection of known words.
        max_distance: Largest Levenshtein distance a suggestion may have.
        max_suggestions: Maximum number of suggestions to return.

    Returns:
        list[str]: Up to ``max_suggestions`` dictionary words, closest first.
        Words at the same distance are ordered alphabetically so the result
        does not depend on the iteration order of ``dictionary``. Empty when
        ``word`` is empty or already in the dictionary.
    """
    if not word or check_word(word, dictionary):
        return []

    word_length = len(word)
    scored = []
    for dict_word in dictionary:
        # The length difference is a lower bound on the edit distance, so
        # words that differ too much in length can be skipped cheaply.
        if abs(len(dict_word) - word_length) > max_distance:
            continue
        distance = edit_distance(word, dict_word)
        if distance <= max_distance:
            scored.append((distance, dict_word))

    # Tuples sort by distance first, then by the word itself (stable ties).
    scored.sort()
    return [dict_word for _, dict_word in scored[:max_suggestions]]

# Smoke-test the functions: look up a word that is expected to be known ...
test_word = "ሰላም"
print(f"Is '{test_word}' in dictionary? {check_word(test_word, tigrigna_dictionary)}")

# ... and ask for suggestions for the same word with a doubled final character.
test_misspelled = "ሰላምም"
suggestions = get_suggestions(test_misspelled, tigrigna_dictionary)
print(f"Suggestions for '{test_misspelled}': {suggestions}")

Is 'ሰላም' in dictionary? True
Suggestions for 'ሰላምም': ['ሰላም', 'ሰላማት']


## 4. Advanced Suggestion Algorithms

In [4]:
def character_ngrams(word, n=2):
    """List every run of ``n`` consecutive characters in ``word``.

    The n-grams are returned in order of their start position. A word shorter
    than ``n`` yields an empty list.
    """
    grams = []
    last_start = len(word) - n
    for start in range(last_start + 1):
        grams.append(word[start:start + n])
    return grams

def build_ngram_index(dictionary, n=2):
    """Map each character n-gram to the set of dictionary words containing it.

    The resulting plain dict lets callers look up candidate words by n-gram
    instead of scanning the whole dictionary.
    """
    index = {}
    for entry in dictionary:
        for gram in character_ngrams(entry, n):
            # setdefault creates the bucket the first time a gram is seen.
            index.setdefault(gram, set()).add(entry)
    return index

def get_enhanced_suggestions(word, dictionary, ngram_index, max_suggestions=5, max_distance=None):
    """Get suggestions using n-gram similarity, ranked by edit distance.

    Candidate words are those sharing at least one character bigram with
    ``word`` according to ``ngram_index``. The ``2 * max_suggestions``
    candidates with the most shared bigrams are then ranked by Levenshtein
    distance.

    Args:
        word: The word to find suggestions for.
        dictionary: Collection of known words.
        ngram_index: Mapping of bigram -> set of words containing it.
        max_suggestions: Maximum number of suggestions to return.
        max_distance: Optional ceiling on the edit distance of a suggestion.
            ``None`` (the default) applies no ceiling to n-gram candidates.

    Returns:
        list[str]: Suggestions, closest first. Ties are broken by the number
        of shared bigrams (more first) and then alphabetically, so the result
        does not depend on set iteration order. Empty when ``word`` is empty
        or already in the dictionary.
    """
    if not word or check_word(word, dictionary):
        return []

    # Count, per candidate, how many of the word's bigrams it shares.
    shared_counts = Counter()
    for ngram in character_ngrams(word):
        for candidate in ngram_index.get(ngram, ()):
            shared_counts[candidate] += 1

    # No candidates found (e.g. single-character word): fall back to the
    # plain edit-distance scan over the whole dictionary.
    if not shared_counts:
        fallback_options = {'max_suggestions': max_suggestions}
        if max_distance is not None:
            fallback_options['max_distance'] = max_distance
        return get_suggestions(word, dictionary, **fallback_options)

    # Shortlist by overlap; the word itself is the secondary key so equal
    # counts are cut off deterministically.
    shortlist = sorted(
        shared_counts.items(),
        key=lambda item: (-item[1], item[0])
    )[:max_suggestions * 2]

    ranked = []
    for candidate, overlap in shortlist:
        distance = edit_distance(word, candidate)
        if max_distance is None or distance <= max_distance:
            ranked.append((distance, -overlap, candidate))

    # Closest first, then most shared bigrams, then alphabetical.
    ranked.sort()
    return [candidate for _, _, candidate in ranked[:max_suggestions]]

# Build the n-gram index (default bigrams) once; it is kept as a notebook-level
# global and reused by the real-time checker and the add-word feature below.
print("Building n-gram index...")
ngram_index = build_ngram_index(tigrigna_dictionary)
print(f"Built n-gram index with {len(ngram_index)} n-grams")

# Test enhanced suggestions with the same misspelling used for the basic test
test_misspelled = "ሰላምም"
suggestions = get_enhanced_suggestions(test_misspelled, tigrigna_dictionary, ngram_index)
print(f"Enhanced suggestions for '{test_misspelled}': {suggestions}")

Building n-gram index...
Built n-gram index with 564 n-grams
Enhanced suggestions for 'ሰላምም': ['ሰላም', 'ሰላማት', 'ምምሕዳር']


## 5. Real-time Spell Checker UI

In [5]:
def highlight_misspelled(text, dictionary, ngram_index):
    """Render ``text`` as an HTML fragment with misspelled words highlighted.

    The text is tokenised with ``tokenize_text``; tokens are re-joined with
    single spaces. Words missing from ``dictionary`` are wrapped in a red,
    underlined ``<span>`` whose ``title`` (tooltip) lists the suggestions.

    All user-supplied text is HTML-escaped before being placed in the markup,
    so typed characters such as ``<``, ``&`` or ``"`` are shown literally
    instead of being interpreted as HTML or breaking the ``title`` attribute.

    Args:
        text: Raw text typed by the user.
        dictionary: Collection of known words.
        ngram_index: Mapping of bigram -> set of words, used for suggestions.

    Returns:
        str: The HTML fragment, or ``""`` when ``text`` is blank.
    """
    if not text.strip():
        return ""

    rendered = []
    for word in tokenize_text(text):
        safe_word = html.escape(word)
        if check_word(word, dictionary):
            rendered.append(safe_word)
            continue

        suggestions = get_enhanced_suggestions(word, dictionary, ngram_index)
        suggestions_text = ", ".join(suggestions) if suggestions else "No suggestions"
        # quote=True also escapes quotes, which matters inside title="...".
        safe_title = html.escape(suggestions_text, quote=True)
        rendered.append(
            f'<span style="color: red; text-decoration: underline;" title="{safe_title}">{safe_word}</span>'
        )

    # Add spaces between words
    return " ".join(rendered)

def on_text_change(change):
    """Handle text changes for real-time spell checking.

    Registered as an observer on the text area's ``value``; ``change['new']``
    holds the current text. Re-renders the highlighted text and a small
    statistics box inside ``output_area``, or a placeholder message when the
    text is blank.
    """
    text = change['new']
    with output_area:
        # wait=True postpones clearing until the new output is ready, so the
        # area does not blank out and flicker on every keystroke.
        clear_output(wait=True)

        if not text.strip():
            display(HTML("<p>Spell check results will appear here as you type...</p>"))
            return

        highlighted = highlight_misspelled(text, tigrigna_dictionary, ngram_index)
        display(HTML(f"<div style='font-size: 16px; line-height: 1.5;'>{highlighted}</div>"))

        # Show statistics
        words = tokenize_text(text)
        misspelled_count = sum(
            1 for word in words if not check_word(word, tigrigna_dictionary)
        )
        stats_html = (
            "<div style='margin-top: 15px; padding: 10px; background: #f5f5f5; border-radius: 5px;'>"
            f"<b>Total words:</b> {len(words)} | "
            f"<b>Misspelled:</b> {misspelled_count} | "
            f"<b>Correct:</b> {len(words) - misspelled_count}"
            "</div>"
        )
        display(HTML(stats_html))

# Create the UI components: a heading, a one-line instruction, the text area
# the user types into, and an Output widget that receives the results.
title = widgets.HTML("<h2 style='color:#3f51b5;text-align:center;'>ትግርኛ - Enhanced Tigrigna Spell Checker</h2>")
instructions = widgets.HTML("<p style='text-align:center;'>Type or paste Tigrigna text below. Spelling suggestions appear instantly!</p>")

text_input = widgets.Textarea(
    placeholder='ኣብዚ ትግርኛ ጽሑፍካ ጽሓፍ...',
    layout=widgets.Layout(width='100%', height='150px')
)

output_area = widgets.Output()

# Register the callback: on_text_change runs whenever the text area's value changes
text_input.observe(on_text_change, names='value')

# Display the UI
display(title)
display(instructions)
display(text_input)
display(output_area)

# Initialize the output area with the same placeholder message that
# on_text_change shows for blank input
with output_area:
    display(HTML("<p>Spell check results will appear here as you type...</p>"))

HTML(value="<h2 style='color:#3f51b5;text-align:center;'>ትግርኛ - Enhanced Tigrigna Spell Checker</h2>")

HTML(value="<p style='text-align:center;'>Type or paste Tigrigna text below. Spelling suggestions appear insta…

Textarea(value='', layout=Layout(height='150px', width='100%'), placeholder='ኣብዚ ትግርኛ ጽሑፍካ ጽሓፍ...')

Output()

## 6. Word Addition Feature

In [6]:
def add_word_to_dictionary(word):
    """Add a word to the Tigrigna dictionary.

    The word is added to the notebook-level ``tigrigna_dictionary`` set and to
    ``ngram_index`` (both mutated in place), then appended to the word-list
    file on a best-effort basis.

    Args:
        word: The word entered by the user; surrounding whitespace is ignored.

    Returns:
        str: A human-readable status message. The entry is rejected when it is
        blank, already known, or not a single token (it contains whitespace or
        Ethiopic punctuation) - such an entry could never match a word
        produced by ``tokenize_text``.
    """
    if not word.strip():
        return "Please enter a word to add."

    word = word.strip()
    if tokenize_text(word) != [word]:
        return "Please enter a single word without spaces or punctuation."

    if word in tigrigna_dictionary:
        return f"'{word}' is already in the dictionary."

    # Add to in-memory dictionary
    tigrigna_dictionary.add(word)

    # Update n-gram index
    for ngram in character_ngrams(word):
        ngram_index.setdefault(ngram, set()).add(word)

    # Try to save to file. The newline is written *before* the word so the new
    # entry starts on its own line even if the file lacks a trailing newline.
    save_path = 'data/tigrigna_words.txt'
    try:
        with open(save_path, 'a', encoding='utf-8') as f:
            f.write(f"\n{word}")
        return f"Added '{word}' to the dictionary and saved to {save_path}."
    except (OSError, UnicodeError) as e:
        # Persisting is optional: the word stays usable for this session.
        return f"Added '{word}' to the in-memory dictionary, but could not save to file: {str(e)}"

def on_add_button_click(b):
    """Handle the add word button click.

    Passes the current content of ``add_word_input`` to
    ``add_word_to_dictionary``, shows the returned status message in
    ``add_output`` and clears the input field.

    Args:
        b: The button that was clicked (unused).
    """
    with add_output:
        clear_output()
        result = add_word_to_dictionary(add_word_input.value)
        # The message echoes what the user typed, so escape it before it is
        # rendered as HTML.
        display(widgets.HTML(f"<p>{html.escape(result, quote=False)}</p>"))
        # Clear the input field
        add_word_input.value = ""

# Create UI components for adding words: a heading, a single-line text input,
# the button that triggers the addition, and an Output widget for the status
# message.
add_title = widgets.HTML("<h3 style='margin-top:30px;'>Add Words to Dictionary</h3>")
add_word_input = widgets.Text(placeholder="Enter a Tigrigna word to add")
add_button = widgets.Button(
    description="Add to Dictionary",
    button_style="primary",
    icon="plus"
)
add_output = widgets.Output()

# Register the callback
add_button.on_click(on_add_button_click)

# Display the UI (input and button side by side)
display(add_title)
display(widgets.HBox([add_word_input, add_button]))
display(add_output)

HTML(value="<h3 style='margin-top:30px;'>Add Words to Dictionary</h3>")

HBox(children=(Text(value='', placeholder='Enter a Tigrigna word to add'), Button(button_style='primary', desc…

Output()

## 7. Tigrigna Keyboard Layout

In [7]:
# Define the Tigrigna keyboard layout in rows. Each entry becomes one button:
# fidel characters, Ethiopic punctuation marks, and the special label 'Space'
# for the space bar.
# NOTE(review): the ቐ family is not on the keyboard although the fallback
# dictionary contains 'ጽቡቕ' - confirm whether the omission is intended.
keyboard_rows = [
    ['ሀ', 'ለ', 'ሐ', 'መ', 'ሠ', 'ረ', 'ሰ', 'ሸ', 'ቀ', 'በ', 'ተ', 'ቸ', 'ኀ', 'ነ'],
    ['ኈ', 'አ', 'ከ', 'ኸ', 'ወ', 'ዐ', 'ዘ', 'ዠ', 'የ', 'ደ', 'ጀ', 'ገ', 'ጠ', 'ጨ'],
    ['ጰ', 'ጸ', 'ፀ', 'ፈ', 'ፐ', '፡', '።', '፣', '፤', '፥', '፦', '፧', '፨', 'Space']
]

def create_fidel_variations(base_char):
    """Create variations of a Tigrigna fidel based on vowel form.

    Args:
        base_char: A first-form fidel, an Ethiopic punctuation mark, or the
            special label ``'Space'``.

    Returns:
        dict[str, str]: Mapping of vowel label to character. ``'Space'`` and
        punctuation map only ``'ä'`` (to a space or the mark itself). Regular
        fidels get all seven forms ('ä', 'u', 'i', 'a', 'e', 'ə', 'o').
        Labialised fidels such as 'ኈ' only have five encoded forms, so their
        result omits 'u' and 'o'.
    """
    # Special case for space
    if base_char == 'Space':
        return {'ä': ' '}

    # Special cases for punctuation
    if base_char in ['፡', '።', '፣', '፤', '፥', '፦', '፧', '፨']:
        return {'ä': base_char}

    # Get the Unicode value of the base character
    base_value = ord(base_char)

    # Offset of each vowel form from the first form within the fidel's row.
    form_offsets = {
        'ä': 0,  # 1st form (base)
        'u': 1,  # 2nd form
        'i': 2,  # 3rd form
        'a': 3,  # 4th form
        'e': 4,  # 5th form
        'ə': 5,  # 6th form
        'o': 6   # 7th form
    }

    # First forms of the labialised series: ቈ ቘ ኈ ኰ ዀ ጐ. In these rows the
    # code points at offsets 1 and 6 are unassigned, so the 'u' and 'o' forms
    # do not exist and must not be generated.
    labialised_bases = {0x1248, 0x1258, 0x1288, 0x12B0, 0x12C0, 0x1310}
    if base_value in labialised_bases:
        del form_offsets['u']
        del form_offsets['o']

    return {
        vowel: chr(base_value + offset)
        for vowel, offset in form_offsets.items()
    }

def on_key_click(b):
    """Handle keyboard key clicks.

    Appends the clicked key's character to ``text_input`` (a blank for the
    'Space' key) and then tries to return keyboard focus to the text area.

    Args:
        b: The clicked button; its ``description`` is the key label.
    """
    char = b.description
    text_input.value += ' ' if char == 'Space' else char

    # Set focus back to text area. Assigning a private attribute has no effect
    # on the front end; use the widget's focus() method instead, which only
    # newer ipywidgets releases provide - hence the guarded lookup.
    focus = getattr(text_input, 'focus', None)
    if callable(focus):
        focus()

def build_keyboard():
    """Render the on-screen Tigrigna keyboard.

    Shows a heading, one horizontal row of buttons per entry in
    ``keyboard_rows`` (every button wired to ``on_key_click``), and a usage
    note underneath.
    """
    display(widgets.HTML("<h3 style='margin-top:30px;'>Tigrigna Keyboard</h3>"))

    def make_key(label):
        # The space bar is twice as wide as a regular key.
        key_width = '80px' if label == 'Space' else '40px'
        key = widgets.Button(description=label, layout=widgets.Layout(width=key_width))
        key.on_click(on_key_click)
        return key

    # One HBox per keyboard row, holding the base-form (first form) characters.
    for key_labels in keyboard_rows:
        display(widgets.HBox([make_key(label) for label in key_labels]))

    # Usage hint shown below the keys.
    display(widgets.HTML("<p style='margin-top:10px;'><i>Click on a key to add it to the text box. For other Tigrigna character forms, type directly in the text box.</i></p>"))

# Build and display the keyboard; its keys append to the spell-checker text area
build_keyboard()

HTML(value="<h3 style='margin-top:30px;'>Tigrigna Keyboard</h3>")

HBox(children=(Button(description='ሀ', layout=Layout(width='40px'), style=ButtonStyle()), Button(description='…

HBox(children=(Button(description='ኈ', layout=Layout(width='40px'), style=ButtonStyle()), Button(description='…

HBox(children=(Button(description='ጰ', layout=Layout(width='40px'), style=ButtonStyle()), Button(description='…

HTML(value="<p style='margin-top:10px;'><i>Click on a key to add it to the text box. For other Tigrigna charac…

## 8. Dictionary Information and Statistics

In [8]:
def analyze_dictionary():
    """Display summary statistics about the loaded dictionary.

    Renders an HTML panel with the total word count, the number of words per
    word length, and the ten most frequent character pairs (bigrams) across
    all words in ``tigrigna_dictionary``.
    """
    # How many words exist for each word length
    words_per_length = Counter(map(len, tigrigna_dictionary))

    # Frequency of every character pair, keeping the ten most common
    bigram_counts = Counter(
        bigram
        for entry in tigrigna_dictionary
        for bigram in character_ngrams(entry, 2)
    )
    top_bigrams = bigram_counts.most_common(10)

    # Collect the markup piece by piece and join once at the end
    parts = [
        "<div style='background:#f8f8f8; padding:15px; border-radius:5px;'>",
        "<h3>Dictionary Statistics</h3>",
        f"<p><b>Total words:</b> {len(tigrigna_dictionary)}</p>",
    ]

    # Word length distribution, shortest first
    parts.append("<p><b>Word length distribution:</b></p>")
    parts.append("<ul style='columns: 2;'>")
    parts.extend(
        f"<li>{length} characters: {count} words</li>"
        for length, count in sorted(words_per_length.items())
    )
    parts.append("</ul>")

    # Common bigrams, most frequent first
    parts.append("<p><b>Most common character pairs:</b></p>")
    parts.append("<ul style='columns: 2;'>")
    parts.extend(
        f"<li>'{bigram}': {count} occurrences</li>"
        for bigram, count in top_bigrams
    )
    parts.append("</ul>")

    parts.append("</div>")
    display(HTML("".join(parts)))

# Create a button to show dictionary statistics
stats_button = widgets.Button(
    description="Show Dictionary Statistics",
    button_style="info",
    icon="bar-chart",
    layout=widgets.Layout(margin='30px 0 0 0')
)

# The statistics panel is rendered into this Output widget
stats_output = widgets.Output()

def on_stats_button_click(b):
    """Replace the contents of ``stats_output`` with freshly computed statistics.

    Args:
        b: The button that was clicked (unused).
    """
    with stats_output:
        clear_output()
        analyze_dictionary()

# Register the callback
stats_button.on_click(on_stats_button_click)

# Display the button with its (initially empty) output area below it
display(stats_button)
display(stats_output)

Button(button_style='info', description='Show Dictionary Statistics', icon='bar-chart', layout=Layout(margin='…

Output()