In [38]:
import torch


import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from transformers import BertTokenizer, BertForTokenClassification
from transformers import AdamW
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

In [3]:
ENTITY_TYPES = ["O", "B-Person", "I-Person", "B-Organization", "I-Organization", "B-Location", "I-Location", "B-Date", "I-Date", "B-Event", "I-Event"]

In [40]:
def prepare_dataset(file_name):

    sentences = []
    tags = []
    full_sentences = []

    with open(file_name, "r") as f:

        current_sentence = []
        current_tags = []

        for line in f:
            line = line.strip()

            # Check for empty line, indicating the end of a sentence
            if not line:
                if current_sentence:
                    sentences.append(current_sentence)
                    tags.append(current_tags)
                    full_sentences.append(" ".join(current_sentence))
                    current_sentence = []
                    current_tags = []
            else:
                word, tag = line.split()
                current_sentence.append(word)
                current_tags.append(ENTITY_TYPES.index(tag))

        # Append the last sentence if the file does not end with a newline
        if current_sentence:
            sentences.append(current_sentence)
            tags.append(current_tags)

    formatted_data = {"sentence": sentences, "tags": tags}
    return Dataset.from_dict(formatted_data)


# df = pd.DataFrame({'full_sentence': full_sentences,'sentence': sentences, 'tags': tags})


In [58]:
def subset_to_huggingface_dataset(subset):
    # Extract data from the subset
    data = [subset.dataset[idx] for idx in subset.indices]

    # Convert the list of dictionaries to a Hugging Face Dataset
    return Dataset.from_list(data)

In [65]:
train_dataset = prepare_dataset("EverestNER-train-bio.txt")

test_dataset = prepare_dataset("EverestNER-test-bio.txt")


from torch.utils.data import random_split

# Split the train dataset into train and validation datasets
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_subset, validation_subset = random_split(train_dataset, [train_size, val_size])

train_dataset = subset_to_huggingface_dataset(train_subset)
validation_dataset = subset_to_huggingface_dataset(validation_subset)

torch.save(train_dataset, 'train_dataset.pt')
torch.save(validation_dataset, 'validation_dataset.pt')
torch.save(test_dataset, 'test_dataset.pt')





In [None]:
train_dataset = torch.load("train_dataset.pt")
validatioin_dataset = torch.load("validation_dataset.pt")
test_dataset = torch.load("test_dataset.pt")

In [None]:
ner_datasets = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset,
})

In [45]:
def tokenize_and_align_tags(samples):
    tokenized_inputs = tokenizer(
        samples["sentence"], truncation=True, is_split_into_words=True, padding=True
    )
    labels = []
    for i, label in enumerate(samples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [80]:
nep_model = AutoModelForTokenClassification.from_pretrained("NepBERTa/NepBERTa", from_tf=True, num_labels=len(ENTITY_TYPES))
nep_tokenizer = AutoTokenizer.from_pretrained("NepBERTa/NepBERTa", model_max_length=512)


multi_tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
multi_model = BertForTokenClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(ENTITY_TYPES))

All TF 2.0 model weights were used when initializing BertForTokenClassification.

All the weights of BertForTokenClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForTokenClassification for predictions without further training.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [71]:
tokenizer = multi_tokenizer
model = multi_model

tokenized_datasets = ner_datasets.map(tokenize_and_align_tags, batched=True)

Map:   0%|          | 0/11078 [00:00<?, ? examples/s]

Map:   0%|          | 0/2770 [00:00<?, ? examples/s]

Map:   0%|          | 0/1950 [00:00<?, ? examples/s]

In [74]:
print(len(tokenized_datasets['train'][0]['input_ids']))

print(len(tokenized_datasets['train'][0]['attention_mask']))

474
474


In [89]:
tokenizer = nep_tokenizer
model = nep_model

tokenized_datasets = ner_datasets.map(tokenize_and_align_tags, batched=True)

Map:   0%|          | 0/11078 [00:00<?, ? examples/s]

Map:   0%|          | 0/2770 [00:00<?, ? examples/s]

Map:   0%|          | 0/1950 [00:00<?, ? examples/s]

In [1]:



def compute_metrics(eval_prediction):
    predictions, labels = eval_prediction
    predictions = np.argmax(predictions, axis=2)


    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]


    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "classification_report": classification_report(true_labels, true_predictions),
    }


def data_collator(data):
    input_ids = [torch.tensor(item["input_ids"]) for item in data]
    attention_mask = [torch.tensor(item["attention_mask"]) for item in data]
    labels = [torch.tensor(item["labels"]) for item in data]


    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)


    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }
    return data


def main(model, tokenized_datasets, n_epochs, learning_rate):


    training_args = TrainingArguments(
        output_dir="./saved_model_nepali",
        eval_strategy="steps",
        eval_steps=500,
        save_steps=500,
        num_train_epochs=n_epochs,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        logging_steps=100,
        learning_rate=learning_rate,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
    )


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    print("Using current device:", training_args.device)

    trainer.train()


In [2]:
tokenizer = nep_tokenizer
model = nep_model

tokenized_datasets = ner_datasets.map(tokenize_and_align_tags, batched=True)
main(model, tokenized_datasets, 5, 5e-5)

NameError: name 'nep_tokenizer' is not defined

In [10]:
learning_rate = 1e-5
batch_size = 32
num_epochs = 1

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


model = multi_model.to(device)
tokenizer = multi_tokenizer

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

optimizer = AdamW(model.parameters(), lr=learning_rate)
clip_value = 1.0

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc="Training"):
        inputs, labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Unpack the tuple
        outputs = model(inputs, labels=labels)
        loss =  outputs.loss

        if torch.isnan(loss).any():
          print("NaN value detected in loss!")
          break

        loss.backward()

        # Clip gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)

        optimizer.step()
        optimizer.zero_grad()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {avg_loss:.4f}")


model.save_pretrained('fine_tuned_ner_model_nepberta')
tokenizer.save_pretrained('fine_tuned_ner_model_nepberta')



Using device: cpu


Training:   0%|          | 0/433 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


: 