# Imports

In [1]:
import sys
import os
sys.path.append(os.path.abspath('../src'))

import pandas as pd
from tokenizer import AmharicTokenizer
from rule_labeler import RuleBasedNER
from labeler import CoNLLLabeler

# Load cleaned messages

In [2]:
# Read the cleaned, scraped messages into a DataFrame
clean_csv_path = '../Data/processed/clean_scraped.csv'
df = pd.read_csv(clean_csv_path)

# Sample 50 unique messages

In [3]:
# Draw up to 50 distinct messages with a fixed seed for reproducibility.
# The request is capped at the number of unique texts because Series.sample
# raises ValueError when asked for more rows than exist (without
# replacement). With 50 or more unique texts the sample is unchanged.
unique_texts = df['text'].drop_duplicates()
sample_size = min(50, len(unique_texts))
sample_texts = unique_texts.sample(sample_size, random_state=42).tolist()

# Initialize components

In [4]:
# Tokenizer whose tokenize() is applied to each sampled message below
tokenizer = AmharicTokenizer()

In [5]:
# Rule-based labeler; label_tokens() is called on each message's token list below
ner = RuleBasedNER()

In [6]:
# Receives each message's tokens and labels via add_labeled_message() and is
# later written out with save_to_file()
labeler = CoNLLLabeler()

# Loop through each message, tokenize and label

In [7]:
# For every sampled message: tokenize it, label the tokens, and hand the
# (tokens, labels) pair to the labeler.
for message in sample_texts:
    message_tokens = tokenizer.tokenize(message)
    message_labels = ner.label_tokens(message_tokens)
    labeler.add_labeled_message(message_tokens, message_labels)

# Save all labeled messages to CoNLL format

In [8]:
# Save the labeled messages to output_path and report how many messages were
# actually processed, rather than a hard-coded count that can drift from the
# real sample size.
output_path = '../Data/processed/ner_dataset.conll'
labeler.save_to_file(output_path)
print(f" => {len(sample_texts)} messages labeled and saved to {output_path} ")

 => 50 messages labeled and saved to ../Data/processed/ner_dataset.conll 


# *******  Verify the Output   *******

In [9]:
# Tally how often each label occurs across the sampled messages
from collections import Counter

# Flatten the per-message label lists into one list, in message order
all_labels = [
    label
    for message in sample_texts
    for label in ner.label_tokens(tokenizer.tokenize(message))
]

Counter(all_labels)

Counter({'O': 1280,
         'B-LOC': 211,
         'I-LOC': 132,
         'B-Product': 80,
         'I-Product': 15})