In [None]:
# Amharic Text Preprocessing for E-commerce NER

This notebook handles text cleaning, tokenization, and normalization for Amharic e-commerce data, and exports the resulting tokens in a CoNLL-style format for manual NER labeling.


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import os
from pathlib import Path

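# For Amharic text processing: unicodedata is part of the standard library.
# The optional packages below are not needed by the code in this section;
# install them only if you use them: pip install ethiopic-calendar ethiopic-numbers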
import unicodedata


In [None]:
## 1. Data Loading


In [None]:
# Locations of the raw Telegram data and of the files prepared for labeling.
# Both are relative paths, so they resolve against the notebook's working directory.
data_path = '../data/raw/'
labeled_path = '../data/labeled/'

# Load your raw data files here. The line below is a template: uncomment it
# and adjust the file name to match your own export.
# raw_data = pd.read_csv(os.path.join(data_path, 'telegram_messages.csv'))


In [None]:
## 2. Text Cleaning Functions


In [None]:
def clean_amharic_text(text):
    """
    Strip web noise from a raw message and return a tidy, single-spaced string.

    URLs and e-mail-like tokens are deleted, every run of whitespace is
    collapsed to one space, the result is Unicode-normalized to NFC and
    trimmed at both ends. Any non-string input returns an empty string.
    """
    # Anything that is not a string (None, NaN, numbers, ...) counts as empty text.
    if not isinstance(text, str):
        return ""

    # Applied in order: URLs first, then e-mail-like tokens, and finally
    # whitespace runs (including the gaps the removals leave behind).
    rewrite_rules = (
        (r'http\S+|www\S+|https\S+', ''),
        (r'\S+@\S+', ''),
        (r'\s+', ' '),
    )
    result = text
    for pattern, replacement in rewrite_rules:
        result = re.sub(pattern, replacement, result)

    # Compose characters into NFC form, then drop leading/trailing spaces.
    return unicodedata.normalize('NFC', result).strip()

def tokenize_amharic(text):
    """
    Break text into word tokens and Ethiopic punctuation tokens.

    A token is either a maximal run of word characters or a single mark
    from the listed Ethiopic punctuation set. Every other character
    (spaces, ASCII punctuation, ...) only separates tokens and is dropped.
    """
    token_pattern = re.compile(r'[\w]+|[።፣፤፥፦፧፨]')
    return [match.group(0) for match in token_pattern.finditer(text)]


In [None]:
## 3. Text Preprocessing Pipeline


In [None]:
def preprocess_text(text):
    """
    Run the full preprocessing pipeline on a single text.

    The text is cleaned with clean_amharic_text and the cleaned string is
    then split by tokenize_amharic; the resulting token list is returned.
    """
    return tokenize_amharic(clean_amharic_text(text))

# Example usage: run the pipeline on one Amharic sentence and print the
# input next to the token list it produces.
sample_text = "ሰላም! ይህ ስልክ በ1000 ብር ይሸጣል። አዲስ አበባ ውስጥ ይገኛል።"
processed = preprocess_text(sample_text)
print("Original:", sample_text)
print("Processed:", processed)


In [None]:
def export_for_labeling(data, output_path):
    """
    Export preprocessed data in a format suitable for manual labeling.

    Each text in ``data`` is run through ``preprocess_text`` and written as
    one sentence: one ``token<TAB>O`` line per token, followed by a blank
    line that separates it from the next sentence.

    Args:
        data: Iterable of raw texts (for example a list of messages).
        output_path: Destination file. It is written as UTF-8 and
            overwritten if it already exists; missing parent directories
            are created.

    Texts for which ``preprocess_text`` returns no tokens are skipped, so
    the output never contains empty sentences (consecutive blank lines).
    """
    # open() does not create folders; make sure the target directory exists
    # so a fresh checkout without the output folder does not fail here.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        for text in data:
            tokens = preprocess_text(text)
            if not tokens:
                # Nothing to label; writing only the separator would emit
                # an empty sentence.
                continue
            # Default every token to the 'O' (Outside) label.
            f.writelines(f"{token}\tO\n" for token in tokens)
            f.write("\n")  # Empty line between sentences

# Example export
# export_for_labeling(raw_messages, '../data/labeled/unlabeled_data.conll')
