# Named Entity Recognition and Classification (NERC) - BERT

In [1]:
import utils

import pandas as pd

from typing import List, Dict, Union
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from torch.utils.data import Dataset
from datasets import Dataset as hf_Dataset

In [2]:
train_data_ner_file: str = r"C:\Users\jayde\OneDrive\School\Text Mining for AI\final_project_tm\data\train_data\NER-train.tsv"
test_data_ner_file: str = r"C:\Users\jayde\OneDrive\School\Text Mining for AI\final_project_tm\data\test_data\NER-test.tsv"

In [3]:
df = pd.read_csv(train_data_ner_file, sep="\t")

In [4]:
X, y = utils.gather_tokens_and_tags(df=df)

train_data: List[Dict[str,str]] = []
for tokens, ner_tags in zip(X, y):
    tokens = [str(token) for token in tokens]
    ner_tags = [str(ner_tag) for ner_tag in ner_tags]
    
    train_data.append({
        "tokens": tokens,
        "ner_tags": ner_tags
    })
    
dataset = hf_Dataset.from_list(train_data)

In [5]:
TOKENIZER = AutoTokenizer.from_pretrained("bert-base-cased")

In [6]:
label_list = sorted(set(label for seq in y for label in seq))
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

In [7]:
def preprocess_function(examples):
    tokenized_inputs = TOKENIZER(
        examples["tokens"],
        is_split_into_words=True,
        return_offsets_mapping=True,
        padding="max_length",
        truncation=True
    )

    all_labels = []
    for i, labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label_to_id[labels[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        all_labels.append(label_ids)

    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs

In [8]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/8128 [00:00<?, ? examples/s]

In [9]:
tokenized_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "token_type_ids", "labels"]
)

In [10]:
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(label_list))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
output_dir = r"C:\Users\jayde\OneDrive\School\Text Mining for AI\project_text_mining\final_project_tm\models\bert_model"

In [None]:
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="no",
    eval_steps=250,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.4,
    logging_steps=100,
    save_steps=250,
    fp16=True
)

In [21]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=TOKENIZER
)

  trainer = Trainer(


In [22]:
def start_finetuning(trainer: Trainer):
    print("Starting fine-tuning...")
    trainer.train()
    print("Fine-tuning complete!")
    
start_finetuning(trainer=trainer)

Starting fine-tuning...


KeyboardInterrupt: 