# Downloading necessary libraries

In [1]:
# Install the libraries used below: transformers (tokenizer, model, Trainer),
# datasets (Dataset / DatasetDict) and seqeval (entity-level classification report).
# NOTE(review): versions are unpinned; the captured log shows datasets 3.2.0 and
# seqeval 1.2.2 being resolved — consider pinning them for reproducibility.
!pip install transformers datasets seqeval
3 fr

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.3 MB/s[0m eta [3

# Loading dataset

In [6]:
import pandas as pd

def parse_conll(file_path):
    """Read a CoNLL-style file into parallel lists of tokens and tags.

    Every non-blank line must contain a token followed by whitespace and its
    tag; the tag is the last whitespace-separated field, so a token may itself
    contain spaces. Blank (or whitespace-only) lines separate sentences.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A tuple ``(sentences, labels)`` where ``sentences[i]`` is the list of
        tokens of the i-th sentence and ``labels[i]`` the list of tags aligned
        with it.

    Raises:
        ValueError: If a non-blank line has fewer than two whitespace-separated
            fields.
    """
    sentences, labels = [], []
    sentence, label = [], []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()

            if not stripped:
                # Sentence boundary: store the current sentence (if any) and start a new one.
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                sentence, label = [], []
                continue

            # Split from the right so only the final field is treated as the tag.
            parts = stripped.rsplit(maxsplit=1)
            if len(parts) != 2:
                raise ValueError(f"Unexpected line format: {stripped}")

            word, tag = parts
            sentence.append(word)
            label.append(tag)

    # A file that does not end with a blank line still has a pending sentence;
    # without this flush the last sentence would be silently dropped.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# Parse the labelled CoNLL export into parallel token / tag lists.
# The path is relative, so the file must sit in the kernel's working directory.
sentences, labels = parse_conll("@mertteka_labeled_data.conll")



# Tokenization

In [16]:
from transformers import AutoTokenizer

# Checkpoint identifier; used here to load the tokenizer and again later to load the model.
model_name = "rasyosef/bert-tiny-amharic"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(sentences, labels):
    """Tokenize pre-split sentences and attach one label per produced token.

    Uses the module-level ``tokenizer``. Each token receives the label of the
    word it came from (``labels[i][word_id]``); positions whose word id is
    ``None`` receive ``-100``.

    Args:
        sentences: List of sentences, each a list of words.
        labels: List of label sequences aligned word-for-word with ``sentences``.

    Returns:
        The tokenizer output with an added ``'labels'`` entry holding the
        aligned label lists.
    """
    encodings = tokenizer(
        sentences,
        is_split_into_words=True,
        truncation=True,
        padding=True,
    )

    def _labels_for(batch_idx, word_tags):
        # Map every token position back to its source word's tag.
        aligned = []
        for word_id in encodings.word_ids(batch_index=batch_idx):
            aligned.append(-100 if word_id is None else word_tags[word_id])
        return aligned

    encodings['labels'] = [
        _labels_for(batch_idx, word_tags)
        for batch_idx, word_tags in enumerate(labels)
    ]
    return encodings

# Tokenize the whole parsed corpus in one call.
# NOTE(review): no later visible cell reads `tokenized_dataset`; the cells below use
# tokenized_train / tokenized_val / tokenized_test, whose creation is not shown —
# confirm where the train/val/test split is made.
tokenized_dataset = tokenize_and_align_labels(sentences, labels)


tokenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/274k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/725k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

# Setting up training arguments

In [17]:
from transformers import TrainingArguments

# Hyper-parameters handed to the Trainer further down.
training_args = TrainingArguments(
    output_dir="./ner_model",
    # NOTE(review): newer transformers releases name this argument `eval_strategy`
    # and deprecate `evaluation_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
)




In [78]:
def replace_product_labels(tokenized_data):
    """Rewrite the misspelled tag 'B_PRODUCT' as 'B-PRODUCT' in every label row.

    Only string labels are touched; non-string entries (e.g. -100) are kept
    as they are. The rows under ``tokenized_data['labels']`` are replaced in
    place and the same container is returned.
    """
    def _fix(tag):
        # str.replace leaves strings without the substring unchanged.
        return tag.replace('B_PRODUCT', 'B-PRODUCT') if isinstance(tag, str) else tag

    for row_idx, row in enumerate(tokenized_data['labels']):
        tokenized_data['labels'][row_idx] = list(map(_fix, row))
    return tokenized_data

def convert_o_to_O(tokenized_data):
    """Normalise the lowercase outside tag 'o' to 'O' in every label row.

    Only labels that are exactly the string 'o' change; every other entry is
    kept. Rows under ``tokenized_data['labels']`` are replaced in place and the
    same container is returned.
    """
    for row_idx, row in enumerate(tokenized_data['labels']):
        normalised = []
        for tag in row:
            if isinstance(tag, str) and tag == 'o':
                normalised.append('O')
            else:
                normalised.append(tag)
        tokenized_data['labels'][row_idx] = normalised
    return tokenized_data

def convert_zero_to_O(tokenized_data):
    """Replace labels that compare equal to 0 with the outside tag 'O'.

    The check is ``label == 0``, so the integer 0 is converted (as are 0.0 and
    False); the string '0' and all other labels are kept. Rows under
    ``tokenized_data['labels']`` are replaced in place and the same container
    is returned.
    """
    def _as_tag(tag):
        if tag == 0:
            return 'O'
        return tag

    for row_idx, row in enumerate(tokenized_data['labels']):
        tokenized_data['labels'][row_idx] = [_as_tag(tag) for tag in row]
    return tokenized_data

# Run the three label clean-up passes, in the same order, over every split.
# NOTE(review): tokenized_train / tokenized_val / tokenized_test are not created
# in any visible cell — confirm the split cell exists and runs before this one.
tokenized_train, tokenized_val, tokenized_test = [
    convert_zero_to_O(convert_o_to_O(replace_product_labels(split)))
    for split in (tokenized_train, tokenized_val, tokenized_test)
]

# Show which distinct labels remain in the training split after clean-up.
unique_labels = {tag for row in tokenized_train['labels'] for tag in row}
print("Unique labels in the dataset:", unique_labels)


Unique labels in the dataset: {'B-PRODUCT', 'I-PRICE', 'B-LOC', 'O', 'I-PRODUCT', 'B-PRICE', 'I-LOC', -100}


In [80]:
# BIO tag inventory; a tag's position in this list is its integer id.
label_list = [
    "O",          # outside any entity
    "B-PRODUCT",  # first token of a product
    "I-PRODUCT",  # continuation of a product
    "B-LOC",      # first token of a location
    "I-LOC",      # continuation of a location
    "B-PRICE",    # first token of a price
    "I-PRICE",    # continuation of a price
]

num_labels = len(label_list)
# Forward (id -> tag) and reverse (tag -> id) lookups derived from the list order.
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}



In [82]:
# Define the label2id mapping.
# It is derived from label_list, which is the single source of truth for tag ids
# (num_labels and id2label come from it as well). A hand-written mapping with a
# different order (e.g. PRICE before LOC) would make the encoded ids decode to the
# wrong tag names wherever label_list / id2label is used to turn ids back into tags.
label2id = {label: idx for idx, label in enumerate(label_list)}

def convert_labels_to_int(tokenized_data, label2id):
    """Encode string labels as integer ids using ``label2id``.

    String labels present in ``label2id`` are replaced by their id. Anything
    else (non-string entries such as -100, already-encoded ints, or strings
    missing from the mapping) is kept unchanged. Rows under
    ``tokenized_data['labels']`` are replaced in place and the same container
    is returned.
    """
    def _encode(tag):
        if isinstance(tag, str):
            # Unknown strings fall through untouched.
            return label2id.get(tag, tag)
        return tag

    for row_idx, row in enumerate(tokenized_data['labels']):
        tokenized_data['labels'][row_idx] = [_encode(tag) for tag in row]
    return tokenized_data

# Encode the string tags of every split as integer ids.
tokenized_train, tokenized_val, tokenized_test = [
    convert_labels_to_int(split, label2id)
    for split in (tokenized_train, tokenized_val, tokenized_test)
]

# Now the labels are integers, and we can proceed with training

# Show the distinct label values left in the training split after encoding.
unique_labels = {tag for row in tokenized_train['labels'] for tag in row}
print("Unique labels in the dataset:", unique_labels)


Unique labels in the dataset: {0, 1, 2, 3, 4, 5, 6, -100}


In [83]:

# `Dataset` is not imported by any visible cell, so import it here; without this
# the cell raises NameError on a fresh kernel.
from datasets import Dataset

# Convert the tokenized splits (column name -> list of rows) to Dataset objects
train_dataset = Dataset.from_dict(tokenized_train)
val_dataset = Dataset.from_dict(tokenized_val)
test_dataset = Dataset.from_dict(tokenized_test)

# Fine tuning the model

In [84]:

# Apply the label-to-id conversion to the train and validation splits
# (the test split is not touched in this cell).
# NOTE(review): both splits were already converted in an earlier cell;
# convert_labels_to_int leaves integer labels unchanged, so this repeat looks
# redundant — confirm and consider removing it.
tokenized_train = convert_labels_to_int(tokenized_train, label2id)
tokenized_val = convert_labels_to_int(tokenized_val, label2id)


# Convert to Dataset objects
# NOTE(review): this rebuilds train_dataset / val_dataset exactly as the previous
# cell does; `Dataset` must already be imported for this to run.
train_dataset = Dataset.from_dict(tokenized_train)
val_dataset = Dataset.from_dict(tokenized_val)


In [None]:
from transformers import AutoModelForTokenClassification, Trainer
from datasets import DatasetDict

# Define model.
# id2label is the inverse of label2id, the mapping that encoded the training labels,
# so the tag names stored in the model config match the ids the model is trained on.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label={idx: tag for tag, idx in label2id.items()},
    label2id=label2id,
)

# Define dataset in DatasetDict format.
# Use the Dataset objects built with Dataset.from_dict, not the raw tokenized
# mappings: the Trainer indexes its datasets by example, which the raw
# column-name -> rows mappings do not support.
dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
})

# Trainer API
# NOTE(review): recent transformers releases deprecate `tokenizer=` in favour of
# `processing_class=`; kept as-is for compatibility — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
)
trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at rasyosef/bert-tiny-amharic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


<IPython.core.display.Javascript object>

# Evaluate the model

In [None]:
import numpy as np  # needed for argmax below; not imported by any visible cell
from seqeval.metrics import classification_report

# Label id that tokenize_and_align_labels assigns to positions without a word
# (word_id is None). These positions carry no tag and must be skipped when decoding.
IGNORE_LABEL_ID = -100

# Decode with the inverse of label2id, the mapping that produced the integer labels,
# so every id is turned back into the tag it was encoded from.
id_to_tag = {idx: tag for tag, idx in label2id.items()}

# Predict
# NOTE: `labels` now holds the gold label-id array returned by the Trainer and
# shadows the tag lists returned earlier by parse_conll.
predictions, labels, _ = trainer.predict(dataset["validation"])
# The last axis holds the per-tag scores; keep the highest-scoring tag id per token.
predictions = np.argmax(predictions, axis=2)

# Map predictions and labels to entities, dropping ignored positions from both
# sequences so they stay aligned (indexing a tag list with -100 would raise IndexError).
true_labels = [
    [id_to_tag[int(gold)] for gold in gold_row if gold != IGNORE_LABEL_ID]
    for gold_row in labels
]
predicted_labels = [
    [
        id_to_tag[int(pred)]
        for pred, gold in zip(pred_row, gold_row)
        if gold != IGNORE_LABEL_ID
    ]
    for pred_row, gold_row in zip(predictions, labels)
]

# Evaluate
print(classification_report(true_labels, predicted_labels))


# Save the fine-tuned model and tokenizer

In [None]:
# Write the model and the tokenizer to the same directory so both can be
# loaded back from a single path.
model.save_pretrained("./fine_tuned_ner_model")
tokenizer.save_pretrained("./fine_tuned_ner_model")
