# In [6]:
import pandas as pd
import regex as re

# Read the cleaned dataset from disk
df = pd.read_csv('clean_data.csv')

# Pull the 'Message' column into a plain Python list, skipping missing (NaN) entries
message_column = df['Message']
messages = message_column.dropna().tolist()

# Manually curated keyword lists used for rule-based labeling.
# NOTE: every entry must be followed by a comma. Python silently concatenates
# adjacent string literals, so a missing comma merges two keywords into a
# single string that no token will ever match. One entry per line (with a
# trailing comma) makes such an omission easy to spot.
product_keywords = [
    "መነሻዬ",
    "በቀለማት",
    "ያሸበረቁ",
    "የእንጨት",
    "ብሎኮች",
    "ታብሌት",
    "ለልጆች",
    "የሚሆኑ",
    "ብረትን",
    "እንደ",
    "ወረቀት",
    "እያጣጠፉ",
    "ውብ",
    "ዲዛይኖችን",
    "የሚሰሩባቸው",
    "ትምህርታዊ",
    "መጫዎቻዎችን",
    "የሪሞት",
    "ኮንትሮል",
    "መኪኖች",
    "ሩቢክስ",
    "ኪዩብን",
]
location_keywords = [
    "አዲስ",
    "አበባ",
    "ቦሌ",
    "ጉርድ",
    "ሾላ",
    "ሆሊ",
    "ሲቲ",
    "ሴንተር",
    "ፒያሳ",
    "ካንትሪ",
    "ታወር",
    "ጀሞ",
    "ደሊና",
    "ህንፃ",
    "ርጂ",
    "መብራት",
    "ሃይል",
]
# Only these exact tokens are treated as price-related; other numeric
# amounts are not matched by this list.
price_keywords = [
    "ዋጋ",
    "ብር",
    "1000",
]


# Regex-based tokenizer for Amharic text
def amharic_tokenizer(text):
    """Split *text* into a list of tokens.

    A token is one of, tried in this order at each position:
      * a run of characters in the Ethiopic script,
      * a run of digits,
      * a single character that is neither a word character nor whitespace.

    Anything that matches none of these alternatives (for example Latin
    letters or underscores) is skipped and does not appear in the result.

    Note: ``\\p{Ethiopic}`` is a Unicode script property; this file imports
    the ``regex`` package under the name ``re`` to get support for it.
    """
    token_pattern = r'\p{Ethiopic}+|\d+|[^\w\s]'
    return re.findall(token_pattern, text)

# Assign a BIO tag to every token using the keyword lists
def label_tokens(tokens):
    """Return one ``"<token> <tag>"`` string per entry of *tokens*.

    Each token is looked up in the module-level keyword lists in a fixed
    order: product first, then location, then price. The first list that
    contains the token decides its entity type. The tag is ``B-<type>``
    when the token is the first one or the preceding token is not in that
    same list, and ``I-<type>`` otherwise. Tokens found in no list are
    tagged ``O``.

    Args:
        tokens: Indexable sequence of token strings.

    Returns:
        A list of strings, in input order, each formatted as
        ``"<token> <tag>"``.
    """
    tagged = []
    for position, current in enumerate(tokens):
        # Pick the entity type; the order of these checks sets the priority.
        if current in product_keywords:
            entity, vocabulary = "Product", product_keywords
        elif current in location_keywords:
            entity, vocabulary = "LOC", location_keywords
        elif current in price_keywords:
            entity, vocabulary = "PRICE", price_keywords
        else:
            tagged.append(f"{current} O")
            continue

        # The token continues an entity only if its left neighbour belongs
        # to the same keyword list; otherwise it begins a new one.
        continues_entity = position > 0 and tokens[position - 1] in vocabulary
        prefix = "I" if continues_entity else "B"
        tagged.append(f"{current} {prefix}-{entity}")
    return tagged

# Turn raw messages into a single CoNLL-style string
def process_messages(messages):
    """Tokenize and label every message and join the results into one string.

    Each message contributes its labeled tokens, one per line, followed by
    an empty entry so that consecutive messages are separated by a blank
    line in the joined output.

    Args:
        messages: Iterable of message texts.

    Returns:
        The combined text for all messages; an empty string when
        *messages* is empty.
    """
    chunks = []
    for text in messages:
        token_lines = label_tokens(amharic_tokenizer(text))
        # The trailing "" becomes the blank separator line after joining.
        chunks.extend(("\n".join(token_lines), ""))
    return "\n".join(chunks)

# Label every message and collect the result as one CoNLL-formatted string
conll_data = process_messages(messages)

# Write the labeled text to disk as UTF-8 so the Ethiopic characters are preserved
with open("labeled_data.conll", mode="w", encoding="utf-8") as output_file:
    output_file.write(conll_data)

print("Labeled data saved in CoNLL format.")


# Output: Labeled data saved in CoNLL format.
