# ትግርኛ - Real-time Tigrigna Autocomplete Spell Checker

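This notebook provides real-time, prefix-based autocomplete suggestions for Tigrigna text as you type, together with live word statistics (total, recognized, and unknown words) checked against the loaded dictionary.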

In [1]:
import html
import json
import os
import re
import threading
import time
from collections import defaultdict

import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

# Confirmation message so the reader can see that the import cell has run.
print("Tigrigna Real-time Autocomplete Spell Checker initialized.")

Tigrigna Real-time Autocomplete Spell Checker initialized.


In [2]:
def load_tigrigna_dictionary(paths=None):
    """Load the Tigrigna word list from the first usable dictionary file.

    Candidate files are tried in order. The first one that yields at least
    one word wins and the remaining candidates are not read. Every non-blank
    line of the file is one word, with surrounding whitespace stripped. When
    no candidate yields any words, a small built-in sample vocabulary is
    returned instead.

    Args:
        paths: Optional iterable of file paths to try, in priority order.
            When omitted, the notebook's standard dictionary locations
            (relative to the current working directory) are used.

    Returns:
        set: The loaded words, or the built-in sample words.
    """
    if paths is None:
        paths = [
            'data/tigrigna_words.txt',
            'tigrigna_dictionary.txt',
            'data/tigrigna_dictionary.txt'
        ]

    words = set()

    for path in paths:
        try:
            # 'utf-8-sig' reads ordinary UTF-8 and also drops a leading byte
            # order mark, which would otherwise stay attached to the first
            # word and make it unmatchable.
            with open(path, 'r', encoding='utf-8-sig') as f:
                loaded_words = {line.strip() for line in f if line.strip()}
        except FileNotFoundError:
            # Expected for most candidates: just try the next location.
            continue
        except (OSError, UnicodeDecodeError) as exc:
            # Unreadable or mis-encoded file: report it and keep looking
            # instead of aborting the whole cell.
            print(f"Skipping {path}: {exc}")
            continue

        if not loaded_words:
            # An empty file must not block the remaining candidates.
            print(f"Skipping {path}: no words found")
            continue

        words = loaded_words
        print(f"Loaded {len(loaded_words)} words from {path}")
        break

    # If no file found, use sample words
    if not words:
        words = {
            'ሰላም', 'ሰላሳ', 'ሰላዮ', 'ሰላሞት', 'ሰላሲ', 'ሰላሳይ', 'ሰላዳ', 'ሰላም', 'ሰላምታ', 'ሰላማዊ',
            'ከመይ', 'ከመይካ', 'ከምዚ', 'ከምዚ', 'ከምዚይ', 
            'ሓጺር', 'ሓጺርካ', 'ሓጺራ', 'ሓጺሮም', 'ሓጺረ',
            'ነዊሕ', 'ነዊሑ', 'ነዊሖም', 'ነዊሓት',
            'ዓቢ', 'ዓቢያ', 'ዓቢያት', 'ዓባይ', 'ዓባያት',
            'ንእሽቶ', 'ንእሽተይ', 'ንእሽቱ', 'ንእሽተይቲ',
            'ጽቡቕ', 'ጽቡቓት', 'ጽቡቅ', 'ጽቡቅነት',
            'ሕማም', 'ሕማማት', 'ሕሙም', 'ሕሙማት',
            'ሰብ', 'ሰባት', 'ሰብነት', 'ሰብዓይ',
            'ቤት', 'ቤታት', 'ቤተሰብ', 'ቤተክርስትያን',
            'መኪና', 'መኪናታት', 'መኪኖች', 'መኪኖታት',
            'ማይ', 'ማያት', 'ማይቲ', 'ማይኮን',
            'እዋን', 'እዋናት', 'እዋናዊ', 'እዋንን',
            'ግዜ', 'ግዜታት', 'ግዜያዊ', 'ግዜ',
            'ዓመት', 'ዓመታት', 'ዓመታዊ', 'ዓመተ',
            'ወርሒ', 'ወርሓት', 'ወርሓዊ', 'ወርሓይ',
            'መዓልቲ', 'መዓልታት', 'መዓልታዊ', 'መዓልትን',
            'ዕለት', 'ዕለታት', 'ዕለታዊ', 'ዕለተ',
            'ኢትዮጵያ', 'ኢትዮጵያዊ', 'ኢትዮጵያውያን', 'ኢትዮጵያዊት',
            'ኤርትራ', 'ኤርትራዊ', 'ኤርትራውያን', 'ኤርትራዊት',
            'ትግራይ', 'ትግራዋይ', 'ትግራውያን', 'ትግራዊት',
            'ትግርኛ', 'ትግርኛዊ', 'ትግርኛውያን', 'ትግርኛዊት'
        }
        print(f"Using sample dictionary with {len(words)} words")

    print(f"Total dictionary size: {len(words)} words")
    return words

# Load the dictionary
# `tigrigna_words` is a module-level set read by later cells: it is the input
# to build_prefix_tree and the reference vocabulary for update_stats.
tigrigna_words = load_tigrigna_dictionary()

Loaded 294 words from data/tigrigna_words.txt
Total dictionary size: 294 words


In [3]:
def build_prefix_tree(words):
    """Index every word under each of its non-empty prefixes.

    Despite the name, the result is a flat mapping rather than a nested trie:
    each key is a prefix string and each value is the de-duplicated list of
    words starting with it, ordered shortest first and then by string
    comparison.

    Args:
        words: Iterable of word strings.

    Returns:
        defaultdict(list): prefix -> ordered list of matching words.
    """
    # Collect into sets first so duplicate input words collapse naturally.
    members_by_prefix = defaultdict(set)
    for entry in words:
        for cut in range(1, len(entry) + 1):
            members_by_prefix[entry[:cut]].add(entry)

    # Freeze each bucket into its final suggestion order.
    index = defaultdict(list)
    for stem, members in members_by_prefix.items():
        index[stem] = sorted(members, key=lambda w: (len(w), w))

    return index

def get_word_suggestions(prefix, prefix_tree, max_suggestions=10):
    """Return up to `max_suggestions` words that start with `prefix`.

    The lookup relies only on its arguments. An index produced by
    build_prefix_tree has a key for every prefix of every word, so a missing
    key already means that no indexed word starts with `prefix`; no second
    scan over a global word list is needed.

    Args:
        prefix: Text typed so far; surrounding whitespace is ignored.
        prefix_tree: Mapping of prefix -> ordered list of words.
        max_suggestions: Maximum number of words to return.

    Returns:
        list: A new list of matching words in the index's order, or an empty
        list when `prefix` is empty/blank or has no match.
    """
    if not prefix:
        return []

    prefix = prefix.strip()
    if not prefix:
        return []

    # .get() avoids inserting an empty bucket when prefix_tree is a
    # defaultdict; slicing returns a copy so callers cannot mutate the index.
    return prefix_tree.get(prefix, [])[:max_suggestions]

# Build the prefix tree
# `prefix_tree` is a module-level index (prefix -> matching words) that
# show_suggestions passes to get_word_suggestions on every refresh.
print("Building prefix tree for fast suggestions...")
prefix_tree = build_prefix_tree(tigrigna_words)
print(f"Prefix tree built with {len(prefix_tree)} prefixes")

# Test the suggestion system
# Smoke test only: the printed result depends on which dictionary was loaded.
test_prefix = "ሰላ"
test_suggestions = get_word_suggestions(test_prefix, prefix_tree)
print(f"Test suggestions for '{test_prefix}': {test_suggestions}")

Building prefix tree for fast suggestions...
Prefix tree built with 720 prefixes
Test suggestions for 'ሰላ': ['ሰላም', 'ሰላማት']


In [4]:
def tokenize_tigrigna_text(text):
    """Break Tigrigna text into a list of word tokens.

    Any run of the listed Ethiopic marks and/or whitespace is treated as a
    single separator. Empty fragments are never returned.
    """
    # Collapse every separator run to one space, then let str.split() drop
    # leading/trailing gaps and produce the non-empty tokens.
    spaced = re.sub(r'[፡።፣፤፥፧፦፨፠፟\s]+', ' ', text)
    return spaced.split()

def get_current_word_and_position(text, cursor_pos=None):
    """Return the word under the cursor together with its span in `text`.

    Args:
        text: The full input text.
        cursor_pos: Character offset of the cursor. Defaults to the end of
            `text`. Out-of-range values are clamped into [0, len(text)]
            instead of raising IndexError or yielding negative offsets.

    Returns:
        tuple: (current_word, start, end) where text[start:end] is
        current_word. The word is empty when the cursor sits between
        separators.
    """
    # Explicit separators: space, the Ethiopic marks, newline and tab.
    delimiters = ' ፡።፣፤፥፧፦፨፠፟\n\t'

    def is_boundary(ch):
        # Also stop at any other whitespace (e.g. '\r'), so word boundaries
        # agree with the `\s` class used by tokenize_tigrigna_text.
        return ch in delimiters or ch.isspace()

    if cursor_pos is None:
        cursor_pos = len(text)
    cursor_pos = max(0, min(cursor_pos, len(text)))

    # Grow the span outwards from the cursor until a separator is reached.
    start = end = cursor_pos
    while start > 0 and not is_boundary(text[start - 1]):
        start -= 1
    while end < len(text) and not is_boundary(text[end]):
        end += 1

    return text[start:end], start, end

# Test the word extraction
# With no cursor position given, the word at the end of the text is returned.
# Note: this unpacking also creates module-level `current_word`, `start` and `end`.
test_text = "ሰላም ከመይ ሓዲርኩም ሰላ"
current_word, start, end = get_current_word_and_position(test_text)
print(f"Current word in '{test_text}': '{current_word}' (position {start}-{end})")

Current word in 'ሰላም ከመይ ሓዲርኩም ሰላ': 'ሰላ' (position 14-16)


In [5]:
# Global variables for debouncing
# `suggestion_timer` holds the pending threading.Timer (or None) and
# `last_input_time` the time.time() stamp of the latest keystroke; both are
# written by on_text_change.
suggestion_timer = None
last_input_time = 0

def show_suggestions_delayed():
    """Refresh the suggestions unless newer input arrived in the meantime.

    This is the callback of the 0.3 s timer started by on_text_change. The
    quiet period is measured with time.time(), a wall clock that is not
    guaranteed to agree exactly with the timer's own wait. Requiring the full
    0.3 s could therefore miss by a fraction of a millisecond and silently
    drop the final refresh, so a small tolerance is allowed. Input that
    arrived just before the timer fired is still rejected, because its
    elapsed time is far below the threshold.
    """
    debounce_seconds = 0.3
    clock_tolerance = 0.05

    # Only read here, so no `global` declaration is needed.
    elapsed = time.time() - last_input_time

    if elapsed >= debounce_seconds - clock_tolerance:
        show_suggestions()

# Client-side handler referenced by the onclick attribute of every suggestion
# row. It locates the textarea by its placeholder text, replaces the
# [startPos, endPos) span with the chosen word and fires an 'input' event.
# NOTE(review): whether a <script> inside displayed HTML is executed depends
# on the notebook front end -- confirm in the target environment.
_SELECT_SUGGESTION_JS = """
            <script>
            function selectSuggestion(word, startPos, endPos) {
                // Find the textarea element
                var textareas = document.querySelectorAll('textarea');
                var textarea = null;
                for (var i = 0; i < textareas.length; i++) {
                    if (textareas[i].placeholder && textareas[i].placeholder.includes('ትግርኛ')) {
                        textarea = textareas[i];
                        break;
                    }
                }
                
                if (textarea) {
                    var currentText = textarea.value;
                    var newText = currentText.substring(0, startPos) + word + currentText.substring(endPos);
                    textarea.value = newText;
                    
                    // Trigger input event to update the widget value
                    var event = new Event('input', { bubbles: true });
                    textarea.dispatchEvent(event);
                    
                    // Set cursor position after the inserted word
                    var newCursorPos = startPos + word.length;
                    textarea.setSelectionRange(newCursorPos, newCursorPos);
                    textarea.focus();
                }
            }
            </script>
            """


def _build_suggestion_panel(current_word, suggestions, start_pos, end_pos):
    """Return the HTML for the clickable suggestion list (without the script)."""
    # The typed word is arbitrary user input: escape it before it reaches HTML.
    safe_word = html.escape(current_word)

    parts = [
        "<div style='border: 1px solid #ddd; border-radius: 5px; background: white;'>",
        "<div style='background: #f5f5f5; padding: 8px; border-bottom: 1px solid #ddd; "
        f"font-weight: bold;'>Suggestions for '{safe_word}':</div>",
    ]

    for i, suggestion in enumerate(suggestions, 1):
        # Highlight the matching prefix
        if suggestion.startswith(current_word):
            remainder = html.escape(suggestion[len(current_word):])
            highlighted = f"<strong>{safe_word}</strong>{remainder}"
        else:
            highlighted = html.escape(suggestion)

        # json.dumps yields a valid JavaScript string literal for any word;
        # html.escape then makes the whole call safe inside the single-quoted
        # onclick attribute (quotes become character references).
        click_js = html.escape(
            f"selectSuggestion({json.dumps(suggestion)}, {start_pos}, {end_pos})"
        )

        parts.append(
            "<div style='padding: 6px 12px; border-bottom: 1px solid #eee; cursor: pointer;' "
            "onmouseover='this.style.backgroundColor=\"#e3f2fd\"' "
            "onmouseout='this.style.backgroundColor=\"white\"' "
            f"onclick='{click_js}'>"
            f"<span style='color: #1976d2; margin-right: 8px;'>{i}.</span>{highlighted}"
            "</div>"
        )

    parts.append("</div>")
    return "".join(parts)


def show_suggestions():
    """Display suggestions for the current input.

    Reads the textarea value, takes the word at the end of the text (the
    widget does not expose a cursor position here) and renders up to ten
    matching dictionary words into `suggestions_output`. Typed text and
    dictionary words are escaped before being placed into HTML or into the
    inline JavaScript call, so characters such as <, ' or " cannot break the
    markup or inject script.

    NOTE(review): this runs on the threading.Timer thread started by
    on_text_change; capturing output with `with suggestions_output:` from a
    background thread may not be reliable -- confirm against the ipywidgets
    Output documentation.
    """
    text = text_input.value
    if not text.strip():
        with suggestions_output:
            clear_output()
            display(HTML("<div style='color: #666; padding: 10px;'>Start typing to see suggestions...</div>"))
        return

    # Get the current word being typed
    current_word, start_pos, end_pos = get_current_word_and_position(text)

    if len(current_word) == 0:
        # The text ends with a separator, so there is no partial word yet.
        with suggestions_output:
            clear_output()
            display(HTML("<div style='color: #666; padding: 10px;'>Continue typing to see suggestions...</div>"))
        return

    # Get suggestions for the current word
    suggestions = get_word_suggestions(current_word, prefix_tree, max_suggestions=10)

    with suggestions_output:
        clear_output()

        if suggestions:
            panel = _build_suggestion_panel(current_word, suggestions, start_pos, end_pos)
            display(HTML(panel + _SELECT_SUGGESTION_JS))
        else:
            safe_word = html.escape(current_word)
            display(HTML(f"<div style='color: #999; padding: 10px; font-style: italic;'>No suggestions found for '{safe_word}'</div>"))

def on_text_change(change):
    """Debounce suggestion refreshes triggered by edits to the textarea.

    Records the time of this edit, cancels the refresh scheduled by the
    previous edit (if any) and schedules a new one 0.3 s from now. `change`
    is the notification passed by the widget's observer and is not inspected.
    """
    global suggestion_timer, last_input_time

    last_input_time = time.time()

    # Swap in the new timer, then retire the one it replaces.
    superseded = suggestion_timer
    suggestion_timer = threading.Timer(0.3, show_suggestions_delayed)
    if superseded:
        superseded.cancel()
    suggestion_timer.start()

print("Real-time suggestion system ready.")

Real-time suggestion system ready.


In [6]:
# Build the widgets for the autocomplete UI and render them in this cell.
# The names `text_input`, `suggestions_output` and `stats_output` are read by
# the handler functions defined in other cells.
title_html = """
<div style='text-align: center; margin: 20px 0;'>
    <h1 style='color: #1976d2; margin: 0;'>ትግርኛ</h1>
    <h2 style='color: #424242; margin: 5px 0;'>Real-time Tigrigna Autocomplete</h2>
    <p style='color: #666; margin: 10px 0;'>Type Tigrigna text below and get instant word suggestions</p>
</div>
"""

# Free-text entry box; its placeholder is also how the click handler's
# JavaScript finds this textarea in the page.
text_input = widgets.Textarea(
    value='',
    placeholder='ኣብዚ ትግርኛ ጽሑፍካ ጽሓፍ... (Type your Tigrigna text here...)',
    description='',
    layout=widgets.Layout(width='100%', height='120px', margin='10px 0'),
    style={'font_size': '16px'},
)

# Panel that receives the suggestion list.
suggestions_output = widgets.Output(
    layout=widgets.Layout(width='100%', min_height='200px', margin='10px 0'),
)

# Panel that receives the word statistics.
stats_output = widgets.Output(
    layout=widgets.Layout(width='100%', margin='10px 0'),
)

# React to every change of the textarea's value.
text_input.observe(on_text_change, names='value')

# Render title, input and both output panels, top to bottom.
display(HTML(title_html), text_input, suggestions_output, stats_output)

# Seed the suggestion panel with the initial hint.
with suggestions_output:
    display(HTML("<div style='color: #666; padding: 10px;'>Start typing to see suggestions...</div>"))

print("Tigrigna Real-time Autocomplete is ready! Start typing to see suggestions.")

Textarea(value='', layout=Layout(height='120px', margin='10px 0', width='100%'), placeholder='ኣብዚ ትግርኛ ጽሑፍካ ጽሓ…

Output(layout=Layout(margin='10px 0', min_height='200px', width='100%'))

Output(layout=Layout(margin='10px 0', width='100%'))

Tigrigna Real-time Autocomplete is ready! Start typing to see suggestions.


In [7]:
def update_stats():
    """Update and display text statistics.

    Tokenizes the textarea content and renders four counters into
    `stats_output`: total words, words found in `tigrigna_words`, words not
    found, and the total number of characters (separators included). Blank
    input just clears the panel.

    NOTE(review): on_text_change_complete runs this on a threading.Timer
    thread; capturing output with `with stats_output:` from a background
    thread may not be reliable -- confirm against the ipywidgets Output
    documentation.
    """
    text = text_input.value
    if not text.strip():
        with stats_output:
            clear_output()
        return

    words = tokenize_tigrigna_text(text)
    word_count = len(words)
    char_count = len(text)

    # Count recognized vs unrecognized words
    recognized_words = sum(1 for word in words if word in tigrigna_words)
    unrecognized_words = word_count - recognized_words

    with stats_output:
        clear_output()
        stats_html = f"""
        <div style='background: #f5f5f5; padding: 12px; border-radius: 5px; margin-top: 10px;'>
            <div style='display: flex; justify-content: space-around; text-align: center;'>
                <div>
                    <div style='font-size: 20px; font-weight: bold; color: #1976d2;'>{word_count}</div>
                    <div style='color: #666; font-size: 12px;'>Words</div>
                </div>
                <div>
                    <div style='font-size: 20px; font-weight: bold; color: #388e3c;'>{recognized_words}</div>
                    <div style='color: #666; font-size: 12px;'>Recognized</div>
                </div>
                <div>
                    <div style='font-size: 20px; font-weight: bold; color: #d32f2f;'>{unrecognized_words}</div>
                    <div style='color: #666; font-size: 12px;'>Unknown</div>
                </div>
                <div>
                    <div style='font-size: 20px; font-weight: bold; color: #7b1fa2;'>{char_count}</div>
                    <div style='color: #666; font-size: 12px;'>Characters</div>
                </div>
            </div>
        </div>
        """
        display(HTML(stats_html))

# Create a more comprehensive text change handler
# Pending statistics refresh (a threading.Timer or None). Kept at module level
# so that the next keystroke can cancel it.
_stats_timer = None

def on_text_change_complete(change):
    """Handle text changes with both suggestions and stats.

    The suggestion refresh is delegated to on_text_change. The statistics
    refresh is debounced: any refresh still pending from an earlier keystroke
    is cancelled before a new one is scheduled 0.5 s from now, so a burst of
    typing produces one redraw instead of one per keystroke.

    Args:
        change: The change notification passed by the widget's observer; it
            is forwarded to on_text_change unchanged.
    """
    global _stats_timer

    on_text_change(change)  # Handle suggestions

    # Drop the refresh queued by the previous keystroke, if it has not run yet.
    if _stats_timer is not None:
        _stats_timer.cancel()

    # Update stats after a short delay
    _stats_timer = threading.Timer(0.5, update_stats)
    _stats_timer.start()

# Replace the text change handler
# unobserve_all() is called without arguments, so it removes the observers
# registered on the textarea -- including the on_text_change observer attached
# when the widget was created. The combined handler then becomes the only one,
# and re-running this cell does not stack duplicate handlers.
text_input.unobserve_all()
text_input.observe(on_text_change_complete, names='value')

print("Enhanced real-time autocomplete with statistics is ready!")

Enhanced real-time autocomplete with statistics is ready!
