# Made by Kyrylo Krocha

### Some imports

In [67]:
#!pip install -r "D:\\InterviewProject\\Task1\\requirements.TXT"

In [3]:
import torch, os
import pandas as pd
from transformers import pipeline, BertForSequenceClassification, BertTokenizerFast
from torch.utils.data import Dataset
import accelerate

In [4]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

### Loading dataset from bio format

In [8]:
def load_bio_dataset_to_dataframe(file_path):
    data = []
    sentence_id = 0

    with open(file_path, "r") as file:
        for line in file:
            line = line.strip()
            if not line:  # End of a sentence
                sentence_id += 1
            else:
                token, label = line.split()
                data.append({"sentence_id": sentence_id, "token": token, "label": label})

    # Convert to DataFrame
    df = pd.DataFrame(data)
    return df

# Load dataset into a DataFrame
file_path = "D:\\InterviewProject\\Task1\\dataset\\improved_data.txt"  # Replace with your file path
df = load_bio_dataset_to_dataframe(file_path)

# Display the DataFrame
print(df)

      sentence_id     token       label
0               0     Mount  B-MOUNTAIN
1               0   Everest  I-MOUNTAIN
2               0        is           O
3               0       the           O
4               0   highest           O
...           ...       ...         ...
1660          104  Northern           O
1661          104       and           O
1662          104  Southern           O
1663          104     India  B-MOUNTAIN
1664          104         .           O

[1665 rows x 3 columns]


### Giving labels an integer representation 

In [11]:
labels = df['label'].unique().tolist()
labels = [s.strip() for s in labels ]
labels

['B-MOUNTAIN', 'I-MOUNTAIN', 'O']

In [13]:
for key, value in enumerate(labels):
    print(value, key)

B-MOUNTAIN 0
I-MOUNTAIN 1
O 2


In [15]:
NUM_LABELS= len(labels)

id2label={id:label for id,label in enumerate(labels)}

label2id={label:id for id,label in enumerate(labels)}


In [17]:
label2id

{'B-MOUNTAIN': 0, 'I-MOUNTAIN': 1, 'O': 2}

In [19]:
id2label

{0: 'B-MOUNTAIN', 1: 'I-MOUNTAIN', 2: 'O'}

In [21]:
df.head()

Unnamed: 0,sentence_id,token,label
0,0,Mount,B-MOUNTAIN
1,0,Everest,I-MOUNTAIN
2,0,is,O
3,0,the,O
4,0,highest,O


In [23]:
df["ids"]=df.label.map(lambda x: label2id[x.strip()])
df

Unnamed: 0,sentence_id,token,label,ids
0,0,Mount,B-MOUNTAIN,0
1,0,Everest,I-MOUNTAIN,1
2,0,is,O,2
3,0,the,O,2
4,0,highest,O,2
...,...,...,...,...
1660,104,Northern,O,2
1661,104,and,O,2
1662,104,Southern,O,2
1663,104,India,B-MOUNTAIN,0


### Tokenize and allign labels

In [26]:
from sklearn.model_selection import train_test_split

def prepare_data(df):
    sentences = df.groupby("sentence_id")["token"].apply(list).tolist()
    labels = df.groupby("sentence_id")["label"].apply(list).tolist()
    return sentences, labels

# Prepare tokens and labels
sentences, labels = prepare_data(df)

# Split the dataset into train and test sets
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    sentences, labels, test_size=0.2, random_state=42
)


In [28]:
from transformers import AutoTokenizer

# Load BERT tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and align labels
def tokenize_and_align_labels(sentences, labels):
    tokenized_inputs = tokenizer(
        sentences,
        is_split_into_words=True,
        truncation=True,
        padding=True,
        return_tensors="pt"
    )
    aligned_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to word indices
        label_ids = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)  # Ignore special tokens
            elif word_id != previous_word_id:
                label_ids.append(label2id[label[word_id]])  # Assign label to first subword
            else:
                label_ids.append(-100)  # Ignore other subword parts
            previous_word_id = word_id
        aligned_labels.append(label_ids)
    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

# Tokenize train and test datasets
train_inputs = tokenize_and_align_labels(train_sentences, train_labels)
test_inputs = tokenize_and_align_labels(test_sentences, test_labels)

### Loading pre-trained BERT model

In [31]:
from transformers import AutoModelForTokenClassification

# Load pre-trained BERT model
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2id)  # Number of unique labels
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Setting up trainning of the model

In [34]:
from transformers import TrainingArguments, Trainer
from torch.utils.data import DataLoader, Dataset
import accelerate
import transformers



class NERDataset(Dataset):
    def __init__(self, inputs):
        self.inputs = inputs

    def __len__(self):
        return len(self.inputs["input_ids"])

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.inputs.items()}

# Create DataLoader-compatible datasets
train_dataset = NERDataset(train_inputs)
test_dataset = NERDataset(test_inputs)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.316835
2,0.526200,0.206461
3,0.526200,0.171234


TrainOutput(global_step=18, training_loss=0.40284520387649536, metrics={'train_runtime': 63.5266, 'train_samples_per_second': 3.967, 'train_steps_per_second': 0.283, 'total_flos': 3601028485152.0, 'train_loss': 0.40284520387649536, 'epoch': 3.0})

### Save the model

In [37]:
model.save_pretrained("./fine_tuned_bert_ner")
tokenizer.save_pretrained("./fine_tuned_bert_ner")

('./fine_tuned_bert_ner\\tokenizer_config.json',
 './fine_tuned_bert_ner\\special_tokens_map.json',
 './fine_tuned_bert_ner\\vocab.txt',
 './fine_tuned_bert_ner\\added_tokens.json',
 './fine_tuned_bert_ner\\tokenizer.json')

### Evaluate the model

In [40]:
import numpy as np
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()


# Predictions
predictions, labels, _ = trainer.predict(test_dataset)
predictions = np.argmax(predictions, axis=2)

# Convert IDs to tags
true_tags = [[id2label[label_id] for label_id in sentence if label_id != -100] for sentence in labels]
pred_tags = [[id2label[pred_id] for pred_id, label_id in zip(sentence, labels[i]) if label_id != -100] for i, sentence in enumerate(predictions)]

# Flatten the lists of true and predicted tags
flat_true_tags = [tag for sentence in true_tags for tag in sentence]
flat_pred_tags = [tag for sentence in pred_tags for tag in sentence]

# Print the classification report
print(classification_report(flat_true_tags, flat_pred_tags,zero_division=0))

              precision    recall  f1-score   support

  B-MOUNTAIN       0.84      0.92      0.88        39
  I-MOUNTAIN       0.91      0.62      0.74        16
           O       0.98      0.99      0.98       282

    accuracy                           0.96       337
   macro avg       0.91      0.84      0.87       337
weighted avg       0.96      0.96      0.96       337



### Perform inference

In [43]:
from transformers import pipeline

# Load the fine-tuned model and tokenizer
ner_pipeline = pipeline("ner", model=model, device=device,tokenizer=tokenizer, aggregation_strategy="simple")

# Test the pipeline
text = "Mount Everest is one of the tallest peaks in the world."
result = ner_pipeline(text)
print(result)

Device set to use cuda


[{'entity_group': 'LABEL_0', 'score': 0.4456879, 'word': 'mount everest', 'start': 0, 'end': 13}, {'entity_group': 'LABEL_2', 'score': 0.9871081, 'word': 'is one of the tallest peaks in the world.', 'start': 14, 'end': 55}]
