In [3]:
import pandas as pd

# Load the dataset
def load_dataset(file_path):
    """Read the CSV at *file_path* and return it as a pandas DataFrame.

    The argument is handed straight to ``pd.read_csv`` with default options.
    """
    return pd.read_csv(file_path)

# Function to tokenize the Amharic text (a simple split based on space)
def tokenize_message(message):
    """Split *message* on whitespace and return the resulting tokens.

    Anything that is not a ``str`` (for example the float NaN pandas uses
    for missing cells) produces an empty list instead of raising.
    """
    return message.split() if isinstance(message, str) else []

# Manual annotation function
def annotate_message(tokens):
    """Interactively collect one entity label per token from standard input.

    Each token is printed and the annotator is prompted for a label. The
    answer is stripped of surrounding whitespace and matched
    case-insensitively against the tag set shown in the prompt; anything
    else (typos, empty input) is rejected and the prompt is repeated, so
    only well-formed labels can reach the output file.

    Args:
        tokens: Iterable of token strings to label.

    Returns:
        A list of ``(token, label)`` tuples in the same order as *tokens*,
        where every label is spelled exactly as in the tag set.

    Raises:
        EOFError: Propagated from ``input`` if standard input is exhausted.
    """
    valid_labels = (
        "B-Product", "I-Product",
        "B-LOC", "I-LOC",
        "B-PRICE", "I-PRICE",
        "O",
    )
    # Case-folded lookup so "b-loc" or "o" map back to the canonical spelling.
    canonical = {name.casefold(): name for name in valid_labels}
    prompt = "Enter label (B-Product, I-Product, B-LOC, I-LOC, B-PRICE, I-PRICE, O): "

    labeled_tokens = []
    print("\nStart labeling each token:")
    for token in tokens:
        print(f"Token: {token}")
        while True:
            label = canonical.get(input(prompt).strip().casefold())
            if label is not None:
                break
            # Re-prompt instead of recording a label no downstream tool knows.
            print(f"Invalid label. Choose one of: {', '.join(valid_labels)}")
        labeled_tokens.append((token, label))
    return labeled_tokens

# Save the labeled data in CoNLL format
def save_to_conll(labeled_data, output_file):
    """Write labeled messages to *output_file* in a CoNLL-style layout.

    Every ``(token, label)`` pair becomes one ``token label`` line, and each
    message is terminated by a blank line. The file is opened in write mode
    with UTF-8 encoding, so any existing content is replaced.
    """
    with open(output_file, 'w', encoding='utf-8') as out:
        for sentence in labeled_data:
            out.writelines(f"{token} {label}\n" for token, label in sentence)
            # Empty line marks the end of the current message.
            out.write("\n")

# Main function to annotate dataset
def main(file_path, output_file, start_row, num_rows):
    """Run an interactive labeling session over a slice of the dataset.

    Loads the CSV at *file_path*, takes the positional row range
    ``[start_row, start_row + num_rows)``, and for each row reads its
    'Message' column. Rows whose message yields no tokens are skipped; the
    rest are shown to the annotator and labeled token by token. All labeled
    messages are written to *output_file* once the loop finishes.
    """
    frame = load_dataset(file_path)
    stop_row = start_row + num_rows

    annotated = []
    for idx, record in frame.iloc[start_row:stop_row].iterrows():
        text = record['Message']
        pieces = tokenize_message(text)

        # Nothing to label for empty or non-string messages.
        if not pieces:
            continue

        print(f"\nMessage {idx + 1}: {text}")
        annotated.append(annotate_message(pieces))

    # Results are persisted only after the whole slice has been labeled.
    save_to_conll(annotated, output_file)
    print(f"\nAnnotated data saved to {output_file}")

if __name__ == "__main__":
    # Input CSV and destination for the CoNLL-formatted labels.
    dataset_path = "../data/cleaned_dataset.csv"
    output_file = "../data/labeled_data.txt"

    # Ask the annotator which slice of the dataset to work on.
    start_row = int(input("Enter the starting row: "))
    num_rows = int(input("Enter the number of rows to label: "))

    main(
        file_path=dataset_path,
        output_file=output_file,
        start_row=start_row,
        num_rows=num_rows,
    )



Message 2: Saachi Electric Kettle 
 የውሀ ማፍያ ቦይለር 
 1.8ሊትር የሆነ 
 2200W 
 Automatic switch off 
 ውስጡ Stainless steel የሆነ 
  ለአጠቃቀም በጣም ቀላል 

                 በ 
            2700 


 

 
                      

 


 

አዲስ አበባ ዉስጥ ከ100ብር እስከ 200ብር ብቻ በማስከፈል ያሉበት ድረስ በፈጣን ሞተረኞቻችን እንልክልዏታለን። 

አድራሻ=ቁጥር 1 = ጉርድሾላ ከሴንቸሪ ሞል ትንሽ ዝቅ እንዳሉ ሆሊሲቲ ሴንተር ላይ እንደገቡ ፊትለፊት ከሊፍቱ በግራ በኩል  ሚዛን ላይ M06
           ቁጥር 2 = ጀሞ መስታወትፋብሪካ ፊትለፊት ራሐ ሞል ግራዉንድ ፍሎር ከደረጃዉ ጎን። 

          በሞደርን እቃወዏች ሂወትዎን
                   ሞደርናይዝ ያድርጉ

Start labeling each token:
Token: Saachi
Token: Electric
Token: Kettle
Token: የውሀ
Token: ማፍያ
Token: ቦይለር
Token: 1.8ሊትር
Token: የሆነ
Token: 2200W
Token: Automatic
Token: switch
Token: off
Token: ውስጡ
Token: Stainless
Token: steel
Token: የሆነ
Token: ለአጠቃቀም
Token: በጣም
Token: ቀላል
Token: በ
Token: 2700
Token: አዲስ
Token: አበባ
Token: ዉስጥ
Token: ከ100ብር
Token: እስከ
Token: 200ብር
Token: ብቻ
Token: በማስከፈል
Token: ያሉበት
Token: ድረስ
Token: በፈጣን
Token: ሞተረኞቻችን
Token: እንልክልዏታለን።
Token: አድራሻ=ቁጥር
Token: 1
Token: =
Tok