# Step 1: Import necessary libraries

In [28]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
from seqeval.metrics import classification_report
from transformers import DataCollatorForTokenClassification

# Step 2: Load the dataset from CoNLL format


In [40]:
def load_conll(file_path):
    """Read a CoNLL-style file into parallel lists of sentences and tags.

    Each non-blank line describes one token as whitespace-separated columns:
    the first column is the word and the last column is its tag. Lines that
    are empty or contain only whitespace separate sentences.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences[i]`` is the list of words
        of the i-th sentence and ``labels[i]`` is the list of tags aligned
        with it, so both lists always have the same shape.

    Raises:
        ValueError: If a non-blank line has fewer than two columns. The
            message names the file and the 1-based line number.
    """
    sentences = []
    labels = []
    # 'utf-8-sig' reads plain UTF-8 and additionally drops a leading byte-order
    # mark, which would otherwise end up glued onto the first word.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        sentence, label = [], []
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Blank line: close the current sentence. Consecutive blank
                # lines must not produce empty sentences.
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
                continue
            if len(fields) < 2:
                raise ValueError(
                    f"{file_path}: line {line_no}: expected a word and a tag, "
                    f"got {line.strip()!r}"
                )
            # Taking the first and last column accepts both the two-column
            # layout and wider layouts that put extra columns in between.
            sentence.append(fields[0])
            label.append(fields[-1])
        if sentence:  # The file may not end with a blank line
            sentences.append(sentence)
            labels.append(label)
    return sentences, labels

In [41]:
# Parse the .conll file into two parallel lists: `sentences` (one list of
# words per sentence) and `labels` (the matching list of tags per sentence).
# NOTE: the path is relative, so it is resolved against the kernel's current
# working directory.
file_path = "../data/merged_amharic_ner_data.conll"
sentences, labels = load_conll(file_path)

# Step 3: Convert data into a DataFrame


In [42]:
# One row per sentence: "Sentence" holds the list of words and "Tags" holds
# the list of tags aligned with those words.
df = pd.DataFrame({"Sentence": sentences, "Tags": labels})

# Step 4: Tokenization and Data Preparation


In [43]:
# Tokenizer for the 'xlm-roberta-base' checkpoint. tokenize_and_align_labels
# reads this module-level name each time it is called.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')



In [44]:
def tokenize_and_align_labels(batch):
    """Tokenize pre-split sentences and give every token its source word's tag.

    Args:
        batch: Mapping with a "Sentence" entry (one list of words per example)
            and a "Tags" entry (one list of per-word tags per example).

    Returns:
        The output of the module-level ``tokenizer`` for the batch, with an
        added "labels" entry: one list per example that holds, for every
        token, the tag of the word the token belongs to, or -100 for tokens
        that belong to no word. Tag values are copied through unchanged.
    """
    encodings = tokenizer(batch["Sentence"], truncation=True, is_split_into_words=True)

    def _tags_per_token(example_index, word_tags):
        # word_ids() yields one entry per token: the index of the word it came
        # from, or None (presumably special tokens - confirm against the
        # tokenizer in use).
        aligned = []
        for source_word in encodings.word_ids(batch_index=example_index):
            aligned.append(-100 if source_word is None else word_tags[source_word])
        return aligned

    encodings["labels"] = [
        _tags_per_token(position, tags) for position, tags in enumerate(batch["Tags"])
    ]
    return encodings

# Step 5: Convert DataFrame to Hugging Face Dataset


In [None]:
# The alignment step writes each word's tag next to the integer ignore index
# -100 in one "labels" column. A dataset column has a single type, and a
# token-classification model expects integer class ids, so the string tags are
# encoded as integers before the dataset is built.
label_list = sorted({tag for tags in df["Tags"] for tag in tags})
label2id = {tag: idx for idx, tag in enumerate(label_list)}
id2label = {idx: tag for tag, idx in label2id.items()}

# `assign` returns a new frame, so `df` keeps the original string tags and
# this cell can be re-run safely.
encoded_df = df.assign(
    Tags=df["Tags"].map(lambda tags: [label2id[tag] for tag in tags])
)

dataset = Dataset.from_pandas(encoded_df)
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

In [46]:
from datasets import load_dataset

# NOTE(review): this rebinds `dataset`, replacing the Dataset that an earlier
# cell built from `df`; `tokenized_dataset` is a separate name and is not
# affected. Confirm the reuse of the name is intentional.
dataset = load_dataset("yelp_review_full")
# Last expression of the cell: displays one example of the "train" split.
dataset["train"][100]


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 650000/650000 [00:05<00:00, 123528.01 examples/s]
Generating test split: 100%|██████████| 50000/50000 [00:00<00:00, 176311.42 examples/s]


{'label': 0,
 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. 

In [47]:
from transformers import AutoTokenizer

# NOTE(review): this rebinds the module-level `tokenizer`, which an earlier
# cell set to the 'xlm-roberta-base' tokenizer. tokenize_and_align_labels looks
# the name up at call time, so running it after this cell would tokenize with
# "bert-base-cased" instead. Confirm this is intended.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    """Tokenize the "text" entry of a batch with the module-level tokenizer.

    Args:
        examples: Mapping whose "text" entry holds the texts to tokenize.

    Returns:
        Whatever ``tokenizer`` returns when called on those texts with
        ``padding="max_length"`` and ``truncation=True``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Apply tokenize_function to `dataset` in batches. In the logged run below the
# 650,000-example split alone took about 20 minutes.
tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map: 100%|██████████| 650000/650000 [20:08<00:00, 538.00 examples/s]
Map: 100%|██████████| 50000/50000 [01:51<00:00, 448.54 examples/s]
