# Tigrigna Dictionary Builder

This notebook processes a large Tigrigna corpus file and adds new words to the dictionary. It includes improved tokenization for handling punctuation and special characters.

In [None]:
import re

def load_dictionary(file_path='tigrigna_dictionary.txt'):
    """
    Read a one-word-per-line dictionary file into a set.

    Surrounding whitespace is stripped from every line and lines that are
    empty after stripping are skipped. Any failure is reported on stdout
    and yields an empty set instead of raising.

    Args:
        file_path (str): Path to the dictionary file

    Returns:
        set: The distinct words found in the file (empty on any error)
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            entries = set()
            for raw_line in handle:
                entry = raw_line.strip()
                if entry:
                    entries.add(entry)
    except FileNotFoundError:
        print(f"Error: Dictionary file '{file_path}' not found.")
        return set()
    except Exception as err:
        # Best-effort loader: report the problem and fall back to "no words"
        print(f"Error loading dictionary: {err}")
        return set()
    return entries

# Load the current dictionary from the default path ('tigrigna_dictionary.txt')
# and report its size. load_dictionary returns an empty set when the file is
# missing or unreadable, so the count printed here is 0 in that case.
existing_dictionary = load_dictionary()
print(f"Loaded {len(existing_dictionary)} words from existing dictionary")

In [None]:
def tokenize_text(text):
    """
    Tokenizes Tigrigna text into words, dropping punctuation and special characters.

    A word is a maximal run of Ethiopic-script characters. Ethiopic
    punctuation (U+1360-U+1368: section mark, wordspace, full stop, comma,
    semicolon, colon, preface colon, question mark, paragraph separator)
    lives inside the main Ethiopic block, so it is excluded explicitly;
    otherwise a sentence-final word would keep its full stop attached and a
    run of punctuation marks would be reported as a word. Every other
    character (Latin letters, ASCII punctuation, whitespace, ...) also acts
    as a separator.

    Args:
        text (str): The input Tigrigna text

    Returns:
        list: The words in order of appearance (duplicates kept); runs of a
            single character are discarded as isolated letters
    """
    word_pattern = (
        r'['
        r'\u1200-\u135F'  # Ethiopic syllables and combining marks
        r'\u1369-\u137F'  # rest of the Ethiopic block (numerals), as before
        r'\u1380-\u139F'  # Ethiopic Supplement
        r'\u2D80-\u2DDF'  # Ethiopic Extended
        r'\uAB00-\uAB2F'  # Ethiopic Extended-A
        r']+'
    )

    # The character class already guarantees every matched character is in
    # range, so only the minimum-length filter remains.
    return [word for word in re.findall(word_pattern, text) if len(word) > 1]

# Test the tokenization function with a sample text.
# The sample mixes ordinary words, words followed by the Ethiopic full stop,
# and a trailing run of Ethiopic/ASCII punctuation; the printed token list
# makes it easy to eyeball how each of those is handled.
sample_text = "ሰላም ዓለም። ኣነ ኣብ ገዛ ኣለኹ። ፡፣!፤?፥"
tokens = tokenize_text(sample_text)
print(f"Sample text: {sample_text}")
print(f"Tokenized words: {tokens}")

In [None]:
def process_small_corpus(corpus_file, dictionary_file):
    """
    Read a whole corpus file at once and append its unseen words to the dictionary.

    The corpus is tokenized with tokenize_text; every token longer than one
    character that is not already in the dictionary is appended to the
    dictionary file, one word per line, in sorted order. Progress and
    errors are reported on stdout.

    Args:
        corpus_file (str): Path to the corpus file
        dictionary_file (str): Path to the dictionary file

    Returns:
        tuple: (total_words, new_words_added, new_words_list), or
            (0, 0, []) if anything goes wrong
    """
    known_words = load_dictionary(dictionary_file)
    print(f"Loaded {len(known_words)} words from existing dictionary")

    try:
        with open(corpus_file, 'r', encoding='utf-8') as corpus:
            tokens = tokenize_text(corpus.read())

        # De-duplicate through a set, then sort for stable output
        additions = sorted(
            {token for token in tokens
             if len(token) > 1 and token not in known_words}
        )

        if not additions:
            print("No new words to add to the dictionary")
        else:
            # A newline is written before each word, so appending works even
            # when the file does not end with one
            with open(dictionary_file, 'a', encoding='utf-8') as out:
                out.writelines(f"\n{token}" for token in additions)

            print(f"Added {len(additions)} new words to the dictionary")

        return len(tokens), len(additions), additions

    except FileNotFoundError:
        print(f"Error: Corpus file '{corpus_file}' not found.")
        return 0, 0, []
    except Exception as err:
        print(f"Error processing corpus: {err}")
        return 0, 0, []

In [None]:
# Process the example corpus file.
# The paths defined here (corpus_file, dictionary_file) and the results
# (total_words, new_words_count, new_words_list) are module-level names that
# later cells read.
corpus_file = "tigrigna_corpus.txt"
dictionary_file = "tigrigna_dictionary.txt"

print(f"Processing corpus file: {corpus_file}")
total_words, new_words_count, new_words_list = process_small_corpus(corpus_file, dictionary_file)

print(f"\nSummary:")
print(f"Total words processed: {total_words}")
print(f"New words added: {new_words_count}")

# Output the updated dictionary size by re-reading the file from disk, so the
# figure reflects what was actually written rather than an in-memory count
updated_dict = load_dictionary(dictionary_file)
print(f"Updated dictionary now contains {len(updated_dict)} words")

In [None]:
# List every word the previous cell appended to the dictionary
if not new_words_list:
    print("No new words were added to the dictionary.")
else:
    print("New words added to the dictionary:")
    for word in new_words_list:
        print(f"- {word}")

## Processing a Large Corpus

The function below handles very large corpus files (millions of words) by reading the file in fixed-size chunks of characters (`chunk_size`, 1,000,000 by default) instead of loading the whole file into memory at once.

In [None]:
def process_large_corpus(corpus_file, dictionary_file, chunk_size=1000000):
    """
    Process a large corpus file in chunks and add new words to the dictionary

    The file is read chunk_size characters at a time. A chunk boundary can
    fall in the middle of a word, so the text after the last whitespace of
    each chunk is held back and prepended to the next chunk; without this a
    word straddling the boundary would be tokenized as two bogus fragments
    and both would be written to the dictionary. The held-back remainder is
    tokenized once the end of the file is reached.

    Args:
        corpus_file (str): Path to the corpus file
        dictionary_file (str): Path to the dictionary file
        chunk_size (int): Number of characters to read at once

    Returns:
        tuple: (total_words, new_words_added), or (0, 0) if anything goes
            wrong
    """
    # Load existing dictionary
    existing_dictionary = load_dictionary(dictionary_file)
    print(f"Loaded {len(existing_dictionary)} words from existing dictionary")

    # New words to add
    new_words = set()
    total_words = 0
    chunk_count = 0
    # Tail of the previous chunk that may be the first part of a split word
    carry = ''

    def collect(text):
        """Tokenize text, record unseen words, and return the token count."""
        tokens = tokenize_text(text)
        new_words.update(
            word for word in tokens
            if len(word) > 1 and word not in existing_dictionary
        )
        return len(tokens)

    try:
        with open(corpus_file, 'r', encoding='utf-8') as file:
            while True:
                chunk = file.read(chunk_size)
                if not chunk:
                    break

                chunk_count += 1
                print(f"Processing chunk {chunk_count}...")

                text = carry + chunk
                if text[-1].isspace():
                    # Chunk ends on a separator: nothing can be split
                    carry = ''
                else:
                    # Whitespace is never part of a token, so cutting at the
                    # last whitespace is always a safe boundary. If there is
                    # no whitespace at all, hold everything back; the carry
                    # is bounded by the longest whitespace-free stretch
                    # (in practice, a line).
                    parts = text.rsplit(None, 1)
                    carry = parts[-1]
                    text = parts[0] if len(parts) == 2 else ''

                total_words += collect(text)

                print(f"  Found {len(new_words)} unique new words so far...")

        # Flush whatever was held back from the final chunk
        if carry:
            total_words += collect(carry)

        # Add new words to dictionary file
        if new_words:
            print(f"Adding {len(new_words)} new words to the dictionary...")
            with open(dictionary_file, 'a', encoding='utf-8') as file:
                # A newline is written before each word, so appending works
                # even when the file does not end with one
                file.writelines(f"\n{word}" for word in sorted(new_words))

            print(f"Successfully added {len(new_words)} new words to the dictionary")
        else:
            print("No new words to add to the dictionary")

        return total_words, len(new_words)

    except FileNotFoundError:
        print(f"Error: Corpus file '{corpus_file}' not found.")
        return 0, 0
    except Exception as e:
        print(f"Error processing corpus: {e}")
        return 0, 0

In [None]:
# To process your large corpus file, replace "your_large_corpus_file.txt" with your file path
# and uncomment the code below

# large_corpus_file = "your_large_corpus_file.txt"  # Replace with your large corpus file path
# print(f"Processing large corpus file: {large_corpus_file}")
# total_words, new_words = process_large_corpus(large_corpus_file, dictionary_file)
# 
# print(f"\nSummary:")
# print(f"Total words processed: {total_words}")
# print(f"New words added: {new_words}")
# 
# # Output the updated dictionary size
# updated_dict = load_dictionary(dictionary_file)
# print(f"Updated dictionary now contains {len(updated_dict)} words")

## Advanced Options: Filtering and Statistics

In [None]:
def analyze_dictionary(dictionary_file):
    """
    Print summary statistics for a dictionary file.

    Reports the number of words, the average/minimum/maximum word length
    (in characters), and how many words there are of each length. Prints a
    notice and returns early when the dictionary is empty or cannot be
    loaded.

    Args:
        dictionary_file (str): Path to the dictionary file
    """
    words = load_dictionary(dictionary_file)

    # load_dictionary yields an empty set both for an empty file and on error
    if not words:
        print("Dictionary is empty or could not be loaded.")
        return

    print(f"Dictionary contains {len(words)} words")

    # Word length distribution
    lengths = list(map(len, words))
    mean_length = sum(lengths) / len(lengths)

    print("Word length statistics:")
    print(f"  - Average length: {mean_length:.2f} characters")
    print(f"  - Minimum length: {min(lengths)} characters")
    print(f"  - Maximum length: {max(lengths)} characters")

    # Histogram: word length -> number of words with that length
    histogram = {}
    for size in lengths:
        histogram[size] = histogram.get(size, 0) + 1

    print("\nWord count by length:")
    for size, count in sorted(histogram.items()):
        print(f"  - {size} characters: {count} words")

In [None]:
# Analyze the current dictionary: prints the word count and the word-length
# statistics for 'tigrigna_dictionary.txt' (or a notice if it is empty/missing)
analyze_dictionary("tigrigna_dictionary.txt")