# Enhanced Tigrigna Real-time Spell Checker

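This notebook builds a Tigrigna spell checker that suggests corrections as you type. Candidate words are pre-filtered with character n-gram indexes and ranked by a combination of weighted edit distance, position-aware edit distance, and n-gram (Jaccard) similarity. An on-screen Tigrigna keyboard with vowel-variant buttons is included.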

In [1]:
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output
import re
import os
import math
from collections import Counter
import heapq

## 1. Load the Tigrigna Dictionary

In [2]:
def load_dictionary(file_path='tigrigna_dictionary.txt'):
    """Load the Tigrigna dictionary from a one-word-per-line text file.

    Args:
        file_path: Path of the word list to read.

    Returns:
        A set containing every non-blank line with surrounding whitespace
        removed. An empty set is returned (and a message printed) when the
        file is missing or cannot be read.
    """
    try:
        # 'utf-8-sig' decodes plain UTF-8 and also drops a leading byte-order
        # mark; with plain 'utf-8' the BOM stays attached to the first word
        # (str.strip() does not remove it), so that word could never match.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            dictionary = {line.strip() for line in file if line.strip()}
        return dictionary
    except FileNotFoundError:
        print(f"Error: Dictionary file '{file_path}' not found.")
        return set()
    except Exception as e:
        # Deliberately best-effort: the notebook keeps running with an empty
        # dictionary instead of aborting the cell.
        print(f"Error loading dictionary: {e}")
        return set()

# Read the word list from the default path and report how many entries it holds
dictionary = load_dictionary()
print("Loaded {} words from the dictionary".format(len(dictionary)))

Loaded 294 words from the dictionary


## 2. Advanced Word Suggestion Algorithms

In [3]:
# Build character n-grams for all dictionary words
def build_character_ngrams(dictionary, n=2):
    """Index dictionary words by the character n-grams they contain.

    Args:
        dictionary: Iterable of words to index.
        n: Length of each character n-gram.

    Returns:
        A dict mapping every n-gram to the set of words that contain it.
        Words shorter than ``n`` contribute no entries.
    """
    index = {}
    for entry in dictionary:
        # Each distinct n-gram of the word is registered once
        pieces = {entry[pos:pos + n] for pos in range(len(entry) - n + 1)}
        for piece in pieces:
            index.setdefault(piece, set()).add(entry)
    return index

# Pre-compute the bigram and trigram lookup tables used for candidate filtering
bigrams_index, trigrams_index = (
    build_character_ngrams(dictionary, n=size) for size in (2, 3)
)

# Advanced edit distance with weighted operations
def weighted_edit_distance(s1, s2, insert_cost=1.0, delete_cost=1.0, substitute_cost=1.0):
    """Calculate edit distance with weighted operations.

    The result is the cheapest cost of an edit script that turns ``s2`` into
    ``s1``. The costs are honoured regardless of which string is longer
    (previously they were silently reset to 1.0 whenever ``s1`` was the
    shorter string).

    Args:
        s1: Target string.
        s2: String being edited.
        insert_cost: Cost of a character that has to be added to ``s2``
            (i.e. a character present in ``s1`` but missing from ``s2``).
        delete_cost: Cost of a character that has to be removed from ``s2``.
        substitute_cost: Cost of replacing one character by a different one.

    Returns:
        The total cost as a float.
    """
    # Row 0: turning each prefix of s2 into the empty string needs only deletions
    previous_row = [float(j) * delete_cost for j in range(len(s2) + 1)]

    for i, c1 in enumerate(s1):
        # Column 0: building s1[:i + 1] from the empty string needs only insertions
        current_row = [float(i + 1) * insert_cost]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + insert_cost
            deletions = current_row[j] + delete_cost
            substitutions = previous_row[j] + (substitute_cost if c1 != c2 else 0)

            current_row.append(min(insertions, deletions, substitutions))

        previous_row = current_row

    return previous_row[-1]

# Context-aware edit distance that considers character positions
def context_aware_edit_distance(s1, s2):
    """Edit distance whose substitution cost depends on the character position.

    Insertions and deletions always cost 1. A substitution costs 1.5 in the
    first third of the shorter string, 1.2 in its last third and 1.0 in
    between. The longer string always drives the outer loop, so the result
    does not depend on argument order when the lengths differ.
    """
    if len(s1) >= len(s2):
        longer, shorter = s1, s2
    else:
        longer, shorter = s2, s1

    width = len(shorter)
    if width == 0:
        return len(longer)

    # Substitution weight for every column, computed once up front
    column_weights = []
    for col in range(width):
        if col < width / 3:
            column_weights.append(1.5)   # beginning of word
        elif col > (2 * width) / 3:
            column_weights.append(1.2)   # end of word
        else:
            column_weights.append(1.0)   # middle of word

    above = list(range(width + 1))
    for row_no, long_char in enumerate(longer):
        row = [row_no + 1]
        for col, short_char in enumerate(shorter):
            mismatch_cost = 0 if long_char == short_char else column_weights[col]
            row.append(min(above[col + 1] + 1,
                           row[col] + 1,
                           above[col] + mismatch_cost))
        above = row

    return above[-1]

# Ngram Jaccard similarity for quick candidate filtering
def ngram_similarity(word1, word2, n=2):
    """Jaccard similarity of the character n-gram sets of two words.

    Returns 0.0 when either word is shorter than ``n``; otherwise the size of
    the intersection of the two n-gram sets divided by the size of their union.
    """
    def grams(text):
        return {text[start:start + n] for start in range(len(text) - n + 1)}

    if min(len(word1), len(word2)) < n:
        return 0.0

    first, second = grams(word1), grams(word2)
    combined = first | second
    if not combined:
        return 0.0
    return len(first & second) / len(combined)

# Advanced suggestion algorithm that combines multiple metrics
def get_enhanced_suggestions(word, dictionary, bigrams_index, trigrams_index, max_suggestions=5):
    """Get enhanced word suggestions combining multiple metrics.

    Args:
        word: The word typed by the user.
        dictionary: Collection of known words.
        bigrams_index: Mapping from character bigram to the words containing it.
        trigrams_index: Mapping from character trigram to the words containing it.
        max_suggestions: Maximum number of suggestions to return.

    Returns:
        ``[]`` for input shorter than two characters, ``[word]`` when the word
        is already in the dictionary, otherwise up to ``max_suggestions``
        dictionary words ordered from best to worst score. Equal scores are
        ordered by the candidate string so the output is reproducible between
        runs (set iteration order is not).
    """
    if not word or len(word) < 2:
        return []

    # If the word is already correct, no need for suggestions
    if word in dictionary:
        return [word]

    # Fast pre-filtering: every dictionary word sharing a bigram or trigram
    # with the input becomes a candidate.
    candidates = set()
    for size, index in ((2, bigrams_index), (3, trigrams_index)):
        for start in range(len(word) - size + 1):
            gram = word[start:start + size]
            if gram in index:
                candidates.update(index[gram])

    # If no candidates found through n-grams, fall back to prefix matching
    if not candidates:
        prefix = word[:2]
        candidates = {dict_word for dict_word in dictionary if dict_word.startswith(prefix)}

    # If still no candidates, consider the whole dictionary
    if not candidates:
        candidates = dictionary

    max_length_gap = min(len(word), 3)
    scored_candidates = []

    for candidate in candidates:
        # Skip candidates that are too different in length
        if abs(len(candidate) - len(word)) > max_length_gap:
            continue

        edit_dist = weighted_edit_distance(word, candidate,
                                           insert_cost=1.0,
                                           delete_cost=1.2,
                                           substitute_cost=1.5)
        context_dist = context_aware_edit_distance(word, candidate)
        ngram_sim = ngram_similarity(word, candidate, n=2)

        # Combined score (lower is better): distances add, similarity subtracts
        combined_score = (0.4 * edit_dist) + (0.4 * context_dist) - (0.2 * ngram_sim * 10)

        # Length of the common prefix of the input and the candidate
        prefix_match_len = 0
        for typed_char, candidate_char in zip(word, candidate):
            if typed_char != candidate_char:
                break
            prefix_match_len += 1

        # Bonus for sharing at least two leading characters
        if prefix_match_len >= 2:
            combined_score -= 0.5 * prefix_match_len / len(word)

        scored_candidates.append((candidate, combined_score))

    # Sort by score (lower is better); the candidate itself breaks ties
    scored_candidates.sort(key=lambda item: (item[1], item[0]))

    return [candidate for candidate, _ in scored_candidates[:max_suggestions]]

def generate_variants(base_char):
    """Generate vowel variants for a Tigrigna character.

    Ethiopic syllables in U+1200..U+135A are laid out in rows of eight code
    points, one row per consonant. The variants are the assigned syllables of
    the row that ``base_char`` belongs to, so any vowel form of a consonant
    (not only the first one) yields that consonant's full series.

    Args:
        base_char: A single character.

    Returns:
        The list of variants in code-point order. ``[base_char]`` is returned
        unchanged for a space, an empty/None value, a multi-character string,
        or a character outside the Ethiopic syllable range.
    """
    import unicodedata  # stdlib; used to skip unassigned code points

    if base_char == ' ' or not base_char or len(base_char) != 1:
        return [base_char]

    try:
        base_code = ord(base_char)
    except TypeError:
        # Not a one-character string (e.g. a one-element list): nothing to expand
        return [base_char]

    if not (0x1200 <= base_code <= 0x135A):
        return [base_char]

    # Align to the start of the 8-code-point row so that non-first-order
    # characters do not spill into the next consonant's row.
    row_start = base_code - (base_code % 8)

    variants = []
    for variant_code in range(row_start, row_start + 8):
        variant = chr(variant_code)
        # Some rows have gaps (no 8th form, labialised rows); unassigned code
        # points have no Unicode name and would render as empty boxes.
        if unicodedata.name(variant, '').startswith('ETHIOPIC SYLLABLE'):
            variants.append(variant)

    return variants if variants else [base_char]

## 3. Create Enhanced Real-time Spell Checker UI

In [4]:
def create_enhanced_realtime_ui():
    """Create an enhanced real-time spell checker UI.

    Builds a text area, a suggestion bar, a vowel-variant bar, a results panel
    and an on-screen Tigrigna keyboard, wires their event handlers together and
    returns the enclosing ``VBox``.

    Cursor handling: the ``Textarea`` widget does not expose the caret position
    to Python, so the position is tracked here. Edits made through the
    on-screen keyboard, the variant buttons and the suggestion buttons keep it
    up to date; whenever the user edits the text area directly the cursor is
    assumed to be at the end of the text.

    Relies on the module-level ``dictionary``, ``bigrams_index`` and
    ``trigrams_index`` as well as ``get_enhanced_suggestions`` and
    ``generate_variants``.
    """
    import html  # stdlib; escapes text interpolated into the results markup

    # Runs of Ethiopic characters are treated as words
    word_pattern = re.compile(r'[\u1200-\u137F\u1380-\u139F\u2D80-\u2DDF]+')

    def placeholder_html(message):
        """Return the results panel markup showing a single grey message."""
        return ('<div id="spell-check-results" style="padding: 15px; border: 1px solid #eaeaea; border-radius: 5px; margin-top: 10px; min-height: 100px;">' +
                f'<p style="color: #6c757d;">{message}</p></div>')

    # Main container with styling
    main_container = widgets.VBox(layout=widgets.Layout(width='100%'))

    # Create title and header
    header = widgets.HTML(
        value='<h1 style="text-align:center; color:#4d66eb;">ትግርኛ - Enhanced Tigrigna Spell Checker</h1>'
        '<p style="text-align:center;">Type or paste Tigrigna text below. Spelling suggestions appear instantly!</p>'
    )

    # Create text area for input
    text_area = widgets.Textarea(
        placeholder='ኣብዚ ይጻሓፉ... (Type Tigrigna here...)',
        layout=widgets.Layout(width='100%', height='100px')
    )

    # NOTE(review): border_radius and background_color are passed to Layout
    # below exactly as before; they do not appear to be documented Layout
    # properties - confirm whether they have any visual effect.

    # Create word suggestions bar
    suggestions_container = widgets.HBox(
        [],
        layout=widgets.Layout(
            width='100%',
            padding='10px',
            margin='10px 0',
            border='1px solid #eaeaea',
            border_radius='5px',
            background_color='#f8f9fa',
            min_height='50px'
        )
    )

    # Create the results display area
    results_display = widgets.HTML(
        value=placeholder_html('Spell check results will appear here as you type...'),
        layout=widgets.Layout(width='100%')
    )

    # Create container for vowel variants
    variants_container = widgets.HBox(
        [],
        layout=widgets.Layout(
            width='100%',
            padding='10px',
            margin='10px 0',
            border='1px solid #eaeaea',
            border_radius='5px',
            background_color='#f8f9fa',
            justify_content='center',
            display='flex'
        )
    )

    # Create keyboard layout
    keyboard_container = widgets.VBox(
        [],
        layout=widgets.Layout(
            width='100%',
            padding='10px',
            margin='10px 0',
            border='1px solid #eaeaea',
            border_radius='5px',
            background_color='#ffffff'
        )
    )

    # Define rows of Tigrigna keyboard
    row1 = ['ሀ', 'ለ', 'ሐ', 'መ', 'ሠ', 'ረ', 'ሰ', 'ሸ', 'ቀ', 'በ']
    row2 = ['ተ', 'ቸ', 'ኀ', 'ነ', 'ኘ', 'አ', 'ከ', 'ኸ', 'ወ', 'ዐ']
    row3 = ['ዘ', 'ዠ', 'የ', 'ደ', 'ጀ', 'ገ', 'ጠ', 'ጨ', 'ጰ', 'ጸ']
    row4 = ['ፀ', 'ፈ', 'ፐ', 'ቨ', 'ቦ', 'ቱ', 'ሙ', 'ሉ', 'ኢ', 'ኣ']

    # Current state tracking (one-element lists so nested handlers can rebind)
    current_word = ['']
    current_cursor_pos = [0]
    current_base_char = [None]
    internal_edit = [False]  # True while this code itself assigns text_area.value

    def get_cursor_pos():
        """Return the tracked cursor position, clamped to the current text."""
        return max(0, min(current_cursor_pos[0], len(text_area.value)))

    def set_text(new_text, new_cursor_pos):
        """Replace the text programmatically, move the cursor and refresh once."""
        current_cursor_pos[0] = max(0, min(new_cursor_pos, len(new_text)))
        internal_edit[0] = True
        try:
            text_area.value = new_text
        finally:
            internal_edit[0] = False
        # The observer is muted above, so refresh exactly once here
        update_suggestions_and_results()

    # Create keyboard buttons
    def create_keyboard_buttons(row_chars):
        buttons = []
        for char in row_chars:
            btn = widgets.Button(
                description=char,
                layout=widgets.Layout(width='auto', min_width='40px', margin='3px'),
                style=widgets.ButtonStyle(button_color='#f8f9fa')
            )

            # Bind the character as a default argument so each button keeps its own
            btn.on_click(lambda b, c=char: on_keyboard_button_click(c))
            buttons.append(btn)

        return widgets.HBox(buttons, layout=widgets.Layout(justify_content='center'))

    # Create special buttons (space, backspace, etc.)
    def create_special_buttons():
        special_buttons = []

        # Backspace button
        backspace_btn = widgets.Button(
            description='⌫',
            layout=widgets.Layout(width='60px', margin='3px'),
            style=widgets.ButtonStyle(button_color='#e9ecef')
        )
        backspace_btn.on_click(lambda b: on_backspace_click())

        # Space button
        space_btn = widgets.Button(
            description='Space',
            layout=widgets.Layout(width='200px', margin='3px'),
            style=widgets.ButtonStyle(button_color='#e9ecef')
        )
        space_btn.on_click(lambda b: on_space_click())

        # Punctuation buttons
        punct1_btn = widgets.Button(
            description='፡',
            layout=widgets.Layout(width='40px', margin='3px'),
            style=widgets.ButtonStyle(button_color='#e9ecef')
        )
        punct1_btn.on_click(lambda b: on_keyboard_button_click('፡'))

        punct2_btn = widgets.Button(
            description='።',
            layout=widgets.Layout(width='40px', margin='3px'),
            style=widgets.ButtonStyle(button_color='#e9ecef')
        )
        punct2_btn.on_click(lambda b: on_keyboard_button_click('።'))

        special_buttons.extend([backspace_btn, punct1_btn, space_btn, punct2_btn])
        return widgets.HBox(special_buttons, layout=widgets.Layout(justify_content='center'))

    # Handle keyboard button clicks
    def on_keyboard_button_click(char):
        if char in ['፡', '።']:
            # Punctuation has no variants; forget the previous base character
            # so a later variant click cannot overwrite the punctuation mark.
            current_base_char[0] = None
            insert_text(char)
        else:
            # For Tigrigna characters, show variants and insert the character
            current_base_char[0] = char
            show_variants(char)
            insert_text(char)

    # Handle backspace button click
    def on_backspace_click():
        cursor_pos = get_cursor_pos()
        text = text_area.value

        if cursor_pos > 0:
            # The character a variant click would replace may be the one removed
            current_base_char[0] = None
            # Delete one character before cursor
            set_text(text[:cursor_pos - 1] + text[cursor_pos:], cursor_pos - 1)

    # Handle space button click
    def on_space_click():
        current_base_char[0] = None
        insert_text(' ')

    # Insert text at cursor position
    def insert_text(text):
        cursor_pos = get_cursor_pos()
        current_text = text_area.value
        set_text(current_text[:cursor_pos] + text + current_text[cursor_pos:],
                 cursor_pos + len(text))

    # Show vowel variants for a base character
    def show_variants(base_char):
        variant_buttons = []

        for variant in generate_variants(base_char):
            btn = widgets.Button(
                description=variant,
                layout=widgets.Layout(width='auto', min_width='40px', margin='3px'),
                style=widgets.ButtonStyle(button_color='#4CAF50', text_color='white')
            )

            # Add click handler for variant
            btn.on_click(lambda b, v=variant: on_variant_click(v))
            variant_buttons.append(btn)

        variants_container.children = tuple(variant_buttons)

    # Handle variant button click
    def on_variant_click(variant):
        cursor_pos = get_cursor_pos()
        text = text_area.value

        if cursor_pos > 0 and current_base_char[0] is not None:
            # The character before the cursor was typed via the keyboard:
            # replace it with the chosen variant, leaving the cursor in place.
            set_text(text[:cursor_pos - 1] + variant + text[cursor_pos:], cursor_pos)
        else:
            # Otherwise just insert the variant
            insert_text(variant)

    # Create a suggestion button
    def create_suggestion_button(word):
        btn = widgets.Button(
            description=word,
            layout=widgets.Layout(margin='3px'),
            style=widgets.ButtonStyle(button_color='#e9ecef')
        )

        def on_suggestion_click(b):
            cursor_pos = get_cursor_pos()
            text = text_area.value

            # The word being replaced is the last Ethiopic run before the cursor
            matches = list(word_pattern.finditer(text[:cursor_pos]))

            if matches:
                last_match = matches[-1]
                start_pos = last_match.start()
                end_pos = last_match.end()

                current_base_char[0] = None
                # Replace the current word and put the cursor right after it
                set_text(text[:start_pos] + word + text[end_pos:], start_pos + len(word))

        btn.on_click(on_suggestion_click)
        return btn

    # Update the suggestions and spell check results
    def update_suggestions_and_results():
        text = text_area.value

        # If text is empty, clear suggestions and results
        if not text.strip():
            suggestions_container.children = ()
            results_display.value = placeholder_html('Spell check results will appear here as you type...')
            return

        # Extract the current word at cursor
        matches = list(word_pattern.finditer(text[:get_cursor_pos()]))

        # Update suggestions if there's a current word
        if matches:
            current_word[0] = matches[-1].group(0)

            suggestions = get_enhanced_suggestions(current_word[0], dictionary, bigrams_index, trigrams_index)

            suggestions_container.children = tuple(
                create_suggestion_button(suggestion) for suggestion in suggestions
            )
        else:
            current_word[0] = ''
            suggestions_container.children = ()

        # Update spell check results
        update_spell_check_results(text)

    # Update the spell check results display
    def update_spell_check_results(text):
        # Tokenize the text
        words = word_pattern.findall(text)

        if not words:
            results_display.value = placeholder_html('No Tigrigna words detected...')
            return

        results_html = '<div id="spell-check-results" style="padding: 15px; border: 1px solid #eaeaea; border-radius: 5px; margin-top: 10px;">' + \
                      '<h3>Spell Check Results:</h3>' + \
                      '<div style="display: flex; flex-wrap: wrap; gap: 10px;">'  # Results container

        processed_results = []

        for word in words:
            shown_word = html.escape(word)

            if word in dictionary:
                word_html = f'<div style="padding: 8px; background-color: #d4edda; color: #155724; border-radius: 4px; display: inline-block;">{shown_word} ✓</div>'
            else:
                suggestions = get_enhanced_suggestions(word, dictionary, bigrams_index, trigrams_index, max_suggestions=3)
                suggestions_text = ', '.join(suggestions) if suggestions else 'None available'
                # Suggestions come from the dictionary file; escape them (quotes
                # included) because they are placed inside an HTML attribute.
                shown_suggestions = html.escape(suggestions_text, quote=True)

                word_html = f'<div style="padding: 8px; background-color: #f8d7da; color: #721c24; border-radius: 4px; display: inline-block;" title="Suggestions: {shown_suggestions}">{shown_word} ✗</div>'

            processed_results.append(word_html)

        results_html += ''.join(processed_results)
        results_html += '</div></div>'

        results_display.value = results_html

    # Add keyboard rows
    keyboard_rows = [create_keyboard_buttons(row) for row in (row1, row2, row3, row4)]
    keyboard_rows.append(create_special_buttons())

    # Update keyboard container
    keyboard_container.children = tuple(keyboard_rows)

    # Add observer for text area changes
    def on_text_change(change):
        if change['name'] != 'value' or internal_edit[0]:
            # Programmatic edits refresh the display themselves (see set_text)
            return

        # The user typed or pasted directly: the real caret position is not
        # available, so assume it sits at the end of the text.
        current_cursor_pos[0] = len(change['new'])
        current_base_char[0] = None
        update_suggestions_and_results()

    text_area.observe(on_text_change, names='value')

    # Assemble the main UI
    main_container.children = [
        header,
        text_area,
        suggestions_container,
        variants_container,
        results_display,
        keyboard_container
    ]

    return main_container

## 4. Create and Display the Enhanced UI

In [6]:
# Build the widget tree once and render it in this cell's output area.
# The reference is kept in `enhanced_ui` so the same widget can be displayed
# again from another cell.
enhanced_ui = create_enhanced_realtime_ui()
display(enhanced_ui)

VBox(children=(HTML(value='<h1 style="text-align:center; color:#4d66eb;">ትግርኛ - Enhanced Tigrigna Spell Checke…