<a href="https://colab.research.google.com/github/jkama4/project_text_mining/blob/jayden-branch/final_project_tm/nerc_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Named Entity Recognition and Classification (NERC) - BERT

In [None]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=780571429eced5e16811384f22ed132af373e563531ea255be3579999f135a1c
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
import csv
import spacy
import re
import nltk
import pandas as pd

from typing import Set, List, Dict, Tuple

from datasets import load_dataset, load_metric

# Shared spaCy pipeline, loaded once; nerc_data_to_file() runs it to obtain entity spans.
NLP = spacy.load("en_core_web_sm")


def word_shape(word: str):
    """Return a coarse orthographic "shape" for ``word``.

    ASCII uppercase letters become ``X``, ASCII lowercase letters ``x``,
    ASCII digits ``d`` and every non-word character ``w``. Anything else
    (for instance ``_``) is kept unchanged.
    """
    # Applied in this order so that a placeholder inserted by one step is
    # never rewritten by a later pattern.
    substitutions = (
        ("[A-Z]", "X"),
        ("[a-z]", "x"),
        ("[0-9]", "d"),
        (r"\W", "w"),
    )
    for pattern, placeholder in substitutions:
        word = re.sub(pattern, placeholder, word)
    return word


def gather_test_bio_ner_tags(file_name: str) -> Set[str]:
    """Collect the distinct values of the ``bio_ner_tag`` column of a TSV file.

    Args:
        file_name: Path to a tab-separated file whose header row contains a
            ``bio_ner_tag`` column.

    Returns:
        The set of tags found in that column (empty if there are no data rows).

    Raises:
        KeyError: If a row has no ``bio_ner_tag`` field.
        OSError: If the file cannot be opened.
    """
    # newline="" is what the csv module expects from its input file, and the
    # explicit UTF-8 matches the encoding the TSV writers in this file use
    # instead of depending on the platform default.
    with open(file_name, "r", newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f, delimiter="\t")
        return {row["bio_ner_tag"] for row in reader}


def nerc_data_to_file(raw_data: "Dataset", file_name: str):
    """Tag sentences with spaCy entities and write them as a BIO-labelled TSV.

    Each item of ``raw_data`` must provide a ``"tokens"`` sequence. The tokens
    are joined with spaces and re-processed by the shared ``NLP`` pipeline, so
    the ``token_id``/``token`` columns follow spaCy's tokenization of the
    joined sentence. Every entity span gets ``B-<label>`` on its first token
    and ``I-<label>`` on the rest; all other tokens are tagged ``O``.

    Args:
        raw_data: Iterable of mappings with a ``"tokens"`` entry. The
            annotation is a string so that defining this function does not
            require ``Dataset`` to be imported first.
        file_name: Destination path of the TSV file (overwritten).

    Returns:
        ``None`` on success, or ``{"error": <message>}`` if anything failed.
    """
    try:
        # The context manager closes the file; no explicit close() is needed.
        with open(file_name, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f, delimiter="\t")
            writer.writerow(["sentence_id", "token_id", "token", "bio_ner_tag"])

            for idx, sent in enumerate(raw_data):
                doc = NLP(" ".join(sent["tokens"]))

                bio_tags = ["O"] * len(doc)
                for ent in doc.ents:
                    bio_tags[ent.start] = f"B-{ent.label_}"
                    for i in range(ent.start + 1, ent.end):
                        bio_tags[i] = f"I-{ent.label_}"

                writer.writerows(
                    [idx, token_id, token.text, bio_tags[token_id]]
                    for token_id, token in enumerate(doc)
                )
    except Exception as e:
        # Best-effort conversion: report the failure to the caller instead of raising.
        return {"error": str(e)}
    else:
        print("Converted successfully!")


def gather_tokens_and_tags(df: pd.DataFrame) -> Tuple[List[List[str]], List[List[str]]]:
    """Group the ``token`` / ``bio_ner_tag`` columns of ``df`` into sentences.

    A sentence ends after a token equal to ``"."``, ``"!"`` or ``"?"``; only
    those two columns are read, so any sentence id column is ignored. Tokens
    left over after the last terminator form a final sentence.

    Args:
        df: Table with ``token`` and ``bio_ner_tag`` columns in reading order.

    Returns:
        ``(X, y)`` where ``X[k]`` is the token list of sentence ``k`` and
        ``y[k]`` the parallel tag list. Values are passed through as found in
        ``df`` (no conversion to ``str`` happens here).
    """
    sentence_end = (".", "!", "?")

    X, y = [], []
    sent_tokens, sent_tags = [], []

    for token, tag in zip(df["token"], df["bio_ner_tag"]):
        sent_tokens.append(token)
        sent_tags.append(tag)

        if token in sentence_end:
            X.append(sent_tokens)
            y.append(sent_tags)
            sent_tokens, sent_tags = [], []

    # Flush a trailing sentence that has no terminating punctuation.
    if sent_tokens:
        X.append(sent_tokens)
        y.append(sent_tags)

    return X, y



def sentiment_data_to_file(raw_data: "Dataset", file_name: str):
    """Write sentiment examples to a TSV file.

    The output has the columns ``sentence_id``, ``sentence`` and
    ``sentiment``; a ``label`` equal to ``1`` is written as ``"positive"``
    and any other value as ``"negative"``.

    Args:
        raw_data: Iterable of mappings with ``"sentence"`` and ``"label"``
            entries. The annotation is a string so that defining this function
            does not require ``Dataset`` to be imported first.
        file_name: Destination path of the TSV file (overwritten).

    Returns:
        ``None`` on success, or ``{"message": <message>}`` if anything failed.
    """
    try:
        # The context manager closes the file; no explicit close() is needed.
        with open(file_name, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f, delimiter="\t")
            writer.writerow(["sentence_id", "sentence", "sentiment"])

            for idx, elem in enumerate(raw_data):
                label = "positive" if elem["label"] == 1 else "negative"
                writer.writerow([idx, elem["sentence"], label])
    except Exception as e:
        # Best-effort conversion: report the failure to the caller instead of raising.
        return {"message": str(e)}
    else:
        print("Converted successfully!")


def topic_data_to_file(raw_data: "Dataset", file_path: str):
    """Write topic-classification examples to a TSV file.

    The output has the columns ``id``, ``question`` and ``category``. The
    plural categories ``"movies"`` and ``"books"`` are written in singular
    form; every other category is written unchanged.

    Args:
        raw_data: Iterable of mappings with ``"id"``, ``"question"`` and
            ``"category"`` entries. The annotation is a string so that
            defining this function does not require ``Dataset`` to be
            imported first.
        file_path: Destination path of the TSV file (overwritten).

    Raises:
        OSError: If the file cannot be opened for writing.
        KeyError: If an entry lacks one of the expected keys.
    """
    singular = {"movies": "movie", "books": "book"}

    # The context manager closes the file; no explicit close() is needed.
    with open(file_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, delimiter="\t")
        writer.writerow(["id", "question", "category"])
        for entry in raw_data:
            category = entry["category"]
            writer.writerow([entry["id"], entry["question"], singular.get(category, category)])


def _neighbor_features(prefix, word, pos):
    """Build the features of a neighbouring token, with keys prefixed by ``prefix``."""
    if not isinstance(word, str):
        word = str(word)

    return {
        f"{prefix}:word.lower()": word.lower(),
        f"{prefix}:word.istitle()": word.istitle(),
        f"{prefix}:word.isupper()": word.isupper(),
        f"{prefix}:pos": pos,
        f"{prefix}:word.shape": word_shape(word=word),
    }


def extract_features(sentence, pos_tags, i):
    """Build the feature dict for the token at position ``i`` of ``sentence``.

    The result describes the token itself (lower-cased form, 2- and
    3-character suffixes, casing/digit flags, POS tag, word shape) and, when
    they exist, the previous (``-1:``) and next (``+1:``) tokens. ``BOS`` is
    set for the first token and ``EOS`` for the last one.

    Args:
        sentence: Sequence of tokens; non-string tokens are converted with ``str``.
        pos_tags: POS tags parallel to ``sentence``.
        i: Index of the token to describe.

    Returns:
        A dict mapping feature names to values.
    """
    word = sentence[i]
    if not isinstance(word, str):
        word = str(word)

    features = {
        "bias": 1.0,
        "word.lower()": word.lower(),
        "word[-3:]": word[-3:],
        "word[-2:]": word[-2:],
        "word.isupper()": word.isupper(),
        "word.istitle()": word.istitle(),
        "word.isdigit()": word.isdigit(),
        "pos": pos_tags[i],
        "word.shape": word_shape(word=word),
    }

    if i > 0:
        features.update(_neighbor_features("-1", sentence[i - 1], pos_tags[i - 1]))
    else:
        features["BOS"] = True

    if i < len(sentence) - 1:
        # The next token's shape is stored under "+1:word.shape" so that it
        # no longer overwrites the "+1:pos" entry.
        features.update(_neighbor_features("+1", sentence[i + 1], pos_tags[i + 1]))
    else:
        features["EOS"] = True

    return features

def sentence_to_features(sentence):
    """Return one feature dict per token of ``sentence``.

    Non-string tokens are converted with ``str`` first; POS tags come from
    ``nltk.pos_tag`` applied to the converted tokens.
    """
    cleaned_sentence = [token if isinstance(token, str) else str(token) for token in sentence]
    tagged = nltk.pos_tag(cleaned_sentence)
    pos_tags = [tag for _word, tag in tagged]
    return [
        extract_features(cleaned_sentence, pos_tags, position)
        for position in range(len(sentence))
    ]



In [None]:
import pandas as pd
import numpy as np
import torch
import seqeval

from typing import List, Dict, Union
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from torch.utils.data import Dataset
from datasets import Dataset as hf_Dataset

In [None]:
# Location of the tab-separated NER training data (read with pandas further down).
train_data_ner_file: str = r"./NER-train.tsv"

In [None]:
# Read tokens and tags as plain strings: without this, pandas turns tokens such
# as "NA" or "null" into NaN and may parse numeric-looking tokens as numbers,
# which changes their text once they are converted back with str().
df = pd.read_csv(
    train_data_ner_file,
    sep="\t",
    dtype={"token": str, "bio_ner_tag": str},
    keep_default_na=False,
)

In [None]:
X, y = gather_tokens_and_tags(df=df)

# One example per sentence: tokens and tags are coerced to str and stored as
# NumPy arrays before being handed to the Hugging Face dataset.
train_data: List[Dict[str,str]] = [
    {
        "tokens": np.asarray([str(token) for token in sent_tokens]),
        "ner_tags": np.asarray([str(ner_tag) for ner_tag in sent_tags]),
    }
    for sent_tokens, sent_tags in zip(X, y)
]

dataset = hf_Dataset.from_list(train_data)

In [None]:
# Tokenizer of the cased BERT checkpoint; both preprocessing functions call it.
TOKENIZER = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
# Labels are sorted so that the label <-> id assignment does not depend on set order.
label_list = sorted({label for seq in y for label in seq})
label_to_id = dict(zip(label_list, range(len(label_list))))
id_to_label = dict(enumerate(label_list))

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of pre-split sentences and align their NER tags.

    ``examples["tokens"]`` holds one word list per sentence and
    ``examples["ner_tags"]`` the parallel tag lists. Only the first word piece
    of each word receives the id of the word's tag; special/padding positions
    and continuation pieces receive ``-100``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.

    Raises:
        KeyError: If a tag is missing from ``label_to_id``.
    """
    tokenized_inputs = TOKENIZER(
        examples["tokens"],
        is_split_into_words=True,
        return_offsets_mapping=True,
        padding="max_length",
        truncation=True
    )

    def align(batch_index, labels):
        # Walk the word-piece -> word mapping of one sentence.
        aligned = []
        previous_word_idx = None
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_index):
            starts_new_word = word_idx is not None and word_idx != previous_word_idx
            aligned.append(label_to_id[labels[word_idx]] if starts_new_word else -100)
            previous_word_idx = word_idx
        return aligned

    tokenized_inputs["labels"] = [
        align(i, labels) for i, labels in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs

In [None]:
# Tokenize and label-align the training set; batched=True hands preprocess_function whole batches.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/8128 [00:00<?, ? examples/s]

In [None]:
# tokenized_dataset.set_format(
#     type="torch",
#     columns=["input_ids", "attention_mask", "token_type_ids", "labels"]
# )

In [None]:
# The checkpoint must match TOKENIZER ("bert-base-cased"): input ids produced
# by the cased vocabulary point at unrelated embeddings in the uncased model.
model = AutoModelForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(label_list))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Directory passed to TrainingArguments as output_dir.
output_dir = r"./bert_model"

In [None]:
training_args = TrainingArguments(
    output_dir=output_dir,
    # NOTE(review): no evaluation strategy is configured here, so eval_steps
    # is presumably inert — confirm before relying on it.
    eval_steps=250,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.4,
    logging_steps=100,
    save_steps=250,
    # Mixed precision needs a CUDA device; fall back to full precision on
    # CPU-only runtimes instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
)

In [None]:
# Trainer used for fine-tuning; no eval dataset or metric function is attached here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    processing_class=TOKENIZER
)

In [None]:
# Sanity check: show the first preprocessed training example.
print(tokenized_dataset[0])

{'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'ner_tags': ['B-ORG', 'O', 'B-NORP', 'O', 'O', 'O', 'B-NORP', 'O', 'O'], 'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
def start_finetuning(trainer: Trainer):
    """Run ``trainer.train()`` and print a message before and after it."""
    print("Starting fine-tuning...")
    trainer.train()
    print("Fine-tuning complete!")

# Kick off fine-tuning with the Trainer configured above.
start_finetuning(trainer=trainer)

Starting fine-tuning...


Step,Training Loss
100,1.4403
200,1.1191
300,0.9921
400,0.8789
500,0.8398
600,0.7852
700,0.7716
800,0.7262
900,0.6868
1000,0.6763


Fine-tuning complete!


In [None]:
# Display the entries the trainer logged during training.
trainer.state.log_history

[{'loss': 1.4403,
  'grad_norm': 2.0926363468170166,
  'learning_rate': 1.937007874015748e-05,
  'epoch': 0.0984251968503937,
  'step': 100},
 {'loss': 1.1191,
  'grad_norm': 9.712570190429688,
  'learning_rate': 1.8713910761154856e-05,
  'epoch': 0.1968503937007874,
  'step': 200},
 {'loss': 0.9921,
  'grad_norm': 4.720882892608643,
  'learning_rate': 1.8057742782152232e-05,
  'epoch': 0.2952755905511811,
  'step': 300},
 {'loss': 0.8789,
  'grad_norm': 4.758727073669434,
  'learning_rate': 1.740157480314961e-05,
  'epoch': 0.3937007874015748,
  'step': 400},
 {'loss': 0.8398,
  'grad_norm': 4.782834053039551,
  'learning_rate': 1.6745406824146985e-05,
  'epoch': 0.4921259842519685,
  'step': 500},
 {'loss': 0.7852,
  'grad_norm': 3.4453015327453613,
  'learning_rate': 1.608923884514436e-05,
  'epoch': 0.5905511811023622,
  'step': 600},
 {'loss': 0.7716,
  'grad_norm': 4.399016857147217,
  'learning_rate': 1.5433070866141734e-05,
  'epoch': 0.6889763779527559,
  'step': 700},
 {'loss

In [None]:
# Location of the tab-separated NER test data.
test_data_path = "./NER-test.tsv"

In [None]:
# Read tokens and tags as plain strings: without this, pandas turns tokens such
# as "NA" or "null" into NaN and may parse numeric-looking tokens as numbers,
# which changes their text once they are converted back with str().
test_df: pd.DataFrame = pd.read_csv(
    test_data_path,
    sep="\t",
    dtype={"token": str, "bio_ner_tag": str},
    keep_default_na=False,
)

In [None]:
# Preview the first rows of the test table.
test_df.head()

Unnamed: 0,sentence_id,token_id,token,bio_ner_tag
0,0,0,If,O
1,0,1,you're,O
2,0,2,visiting,O
3,0,3,Paris,B-LOCATION
4,0,4,",",O


In [None]:
X_test, y_test = gather_tokens_and_tags(df=test_df)

# One example per sentence: tokens and tags are coerced to str and stored as
# NumPy arrays before being handed to the Hugging Face dataset.
test_data: List[Dict[str,str]] = [
    {
        "tokens": np.asarray([str(token) for token in sent_tokens]),
        "ner_tags": np.asarray([str(ner_tag) for ner_tag in sent_tags]),
    }
    for sent_tokens, sent_tags in zip(X_test, y_test)
]

test_dataset = hf_Dataset.from_list(test_data)

In [None]:
def test_preprocess_function(examples):
    """Tokenize a batch of test sentences and align their NER tags.

    Sentences are padded/truncated to 128 word pieces. Only the first word
    piece of each word receives a label id; special/padding positions and
    continuation pieces receive ``-100``. A tag that is missing from
    ``label_to_id`` is also mapped to ``-100``, so that position is dropped
    wherever ``-100`` labels are filtered out.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    tokenized_inputs = TOKENIZER(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
        max_length=128
    )

    def align(batch_index, label_seq):
        # Walk the word-piece -> word mapping of one sentence.
        aligned = []
        previous_word_idx = None
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_index):
            if word_idx is not None and word_idx != previous_word_idx:
                # Unknown tags fall back to -100 instead of raising.
                aligned.append(label_to_id.get(label_seq[word_idx], -100))
            else:
                aligned.append(-100)
            previous_word_idx = word_idx
        return aligned

    tokenized_inputs["labels"] = [
        align(i, label_seq) for i, label_seq in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs


In [None]:
# Tokenize and label-align the test set, one batch at a time.
tokenized_test_dataset = test_dataset.map(test_preprocess_function, batched=True)


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

In [None]:
# Display the columns and row count of the preprocessed test set.
tokenized_test_dataset

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 15
})

In [None]:
# seqeval metric object used by compute_metrics().
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets` releases — confirm the installed version still provides it.
metric = load_metric("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into seqeval precision/recall/F1/accuracy.

    ``p`` unpacks into ``(predictions, labels)``. The highest-scoring class is
    taken along axis 2, positions whose gold label is ``-100`` are dropped,
    and the remaining ids are mapped back to tag strings via ``id_to_label``
    before being scored by the module-level ``metric``.
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions, true_labels = [], []
    for pred_seq, label_seq in zip(predicted_ids, label_ids):
        kept = [(pred, label) for pred, label in zip(pred_seq, label_seq) if label != -100]
        true_predictions.append([id_to_label[pred] for pred, _ in kept])
        true_labels.append([id_to_label[label] for _, label in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        name: results[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }


In [None]:
# Inverse of label_to_id (id -> tag string); compute_metrics() looks tags up through it.
id_to_label = {v: k for k, v in label_to_id.items()}

In [None]:
# Evaluation-only Trainer around the fine-tuned model. The tokenizer is passed
# as `processing_class`, like for the training Trainer, instead of the
# deprecated `tokenizer` keyword.
eval_trainer = Trainer(
    model=model,
    processing_class=TOKENIZER,
    compute_metrics=compute_metrics
)

  eval_trainer = Trainer(


In [None]:
# Evaluate on the preprocessed test set and print the resulting metrics dict.
results = eval_trainer.evaluate(tokenized_test_dataset)
print(results)

{'eval_loss': 1.0500532388687134, 'eval_model_preparation_time': 0.0155, 'eval_precision': 0.21052631578947367, 'eval_recall': 0.15384615384615385, 'eval_f1': 0.17777777777777778, 'eval_accuracy': 0.7914691943127962, 'eval_runtime': 0.1613, 'eval_samples_per_second': 93.004, 'eval_steps_per_second': 12.401}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
predictions, labels, _ = eval_trainer.predict(tokenized_test_dataset)
pred_labels = np.argmax(predictions, axis=2)

for i in range(5):  # show 5 examples
    tokens = test_dataset[i]["tokens"]
    # Keep only positions with a real gold label; -100 marks everything that
    # test_preprocess_function masked out.
    kept = [(pred, label) for pred, label in zip(pred_labels[i], labels[i]) if label != -100]
    preds = [id_to_label[pred] for pred, _ in kept]
    golds = [id_to_label[label] for _, label in kept]

    print(
        f"TOKENS : {tokens}",
        f"PRED   : {preds}",
        f"GOLD   : {golds}",
        "-" * 50,
        sep="\n",
    )


TOKENS : ['If', "you're", 'visiting', 'Paris', ',', 'make', 'sure', 'to', 'see', 'the', 'Louvre', ',', 'as', 'they', 'exhibit', 'the', 'Mona', 'Lisa', '!']
PRED   : ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
GOLD   : ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'O']
--------------------------------------------------
TOKENS : ['Amazon', ',', 'Google', 'and', 'Meta', 'control', 'a', 'huge', 'share', 'of', 'the', 'technology', 'market', 'globally', '.']
PRED   : ['O', 'O', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
GOLD   : ['B-ORG', 'O', 'B-ORG', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
--------------------------------------------------
TOKENS : ['Did', 'you', 'hear', 'Pharoah', 'Sanders', 'recorded', 'an', 'album', 'with', 'Floating', 'Points', '?']
PRED   : ['O', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 