In [None]:
# Install the packages this notebook imports into the kernel's environment.
# NOTE(review): no versions are pinned, so a fresh run pulls the latest releases;
# later cells rely on `evaluation_strategy` and `datasets.load_metric`, which may
# be deprecated or removed there — confirm and consider pinning.
%pip install torch torchvision torchaudio
%pip install transformers datasets tokenizers seqeval -q
%pip install --upgrade pip
%pip install transformers[torch]
%pip install accelerate -U
%pip install pandas

In [None]:
import datasets
import json
import csv
import numpy as np 
import pandas as pd
from transformers import BertTokenizerFast 
from transformers import DataCollatorForTokenClassification 
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
from transformers import pipeline
from statistics import mode
from time import time
from tqdm import tqdm

In [None]:
# Read in data. Blank lines are kept (skip_blank_lines=False) because they act
# as sentence separators: each one becomes a row whose 'id' is NaN.
# NOTE(review): pandas' default NA parsing also turns literal words such as
# "NA" or "null" into NaN — confirm the 'word' column contains none.
train_raw = pd.read_csv("train.csv", skip_blank_lines=False)
validation_raw = pd.read_csv("validation.csv", skip_blank_lines=False)
test_raw = pd.read_csv("test.csv", skip_blank_lines=False)


def _build_example(sentence_index, words, tags, with_labels):
    """Package one sentence as a single-element batch for the tokenizer function."""
    example = {"id": str(sentence_index), "tokens": [words]}
    if with_labels:
        example["ner_tags"] = [tags]
    return example


def _group_sentences(frame, with_labels=True):
    """
    Group the word-per-row frame into one example per sentence.

    Parameters:
    frame (pd.DataFrame): Rows with 'id' and 'word' columns (plus 'label' when
                          with_labels is True); a row whose 'id' is NaN ends
                          the current sentence.
    with_labels (bool): Whether to collect the integer 'label' column into
                        "ner_tags".

    Returns:
    list[dict]: One dict per non-empty sentence with keys "id" (the number of
                separator rows seen before the sentence, as a string),
                "tokens" ([list of words]) and, if requested, "ner_tags"
                ([list of ints]). The extra list level is the batch dimension
                expected downstream.
    """
    examples = []
    words, tags = [], []
    sentence_index = 0
    for _, row in frame.iterrows():
        if pd.isna(row['id']):
            # Separator row: close the running sentence. Empty sentences
            # (leading, repeated or trailing blank lines) are skipped instead
            # of being emitted with a stale or undefined id.
            if words:
                examples.append(_build_example(sentence_index, words, tags, with_labels))
            sentence_index += 1
            words, tags = [], []
            continue
        words.append(row['word'])
        if with_labels:
            tags.append(int(row['label']))
    # The last sentence is not followed by a separator row.
    if words:
        examples.append(_build_example(sentence_index, words, tags, with_labels))
    return examples


# Format training / validation / testing input for BERT
train_ready = _group_sentences(train_raw)
validation_ready = _group_sentences(validation_raw)
test_ready = _group_sentences(test_raw, with_labels=False)

In [None]:
# Fast (Rust-backed) tokenizer matching the cased BERT-large checkpoint used for the model.
tokenizer = BertTokenizerFast.from_pretrained("bert-large-cased")

In [None]:
# Tokenizer function from the tutorial
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """
    Tokenize pre-split sentences and expand their word-level NER tags to token level.

    Parameters:
    examples (dict): Batch with
                     - "tokens": list of sentences, each a list of words.
                     - "ner_tags": list of tag-id lists, parallel to "tokens".
    label_all_tokens (bool): When True every sub-word token carries its word's tag.
                             When False only the first sub-word of a word is tagged
                             and the remaining ones receive -100.

    Returns:
    The tokenizer output for the batch with an added "labels" entry holding one
    aligned label list per sentence.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_tags, word_ids):
        # word_ids maps each token to the index of the word it came from;
        # special tokens map to None.
        aligned = []
        previous_word = None
        for word_id in word_ids:
            if word_id is None:
                # -100 is ignored by the loss function.
                aligned.append(-100)
            elif word_id == previous_word and not label_all_tokens:
                # Continuation sub-word that should be masked out.
                aligned.append(-100)
            else:
                # First sub-word of a word, or a continuation when all tokens are labelled.
                aligned.append(word_tags[word_id])
            previous_word = word_id
        return aligned

    encoded["labels"] = [
        _align(word_tags, encoded.word_ids(batch_index=batch_idx))
        for batch_idx, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

In [None]:
def _tokenize_sentences(sentences):
    """
    Run tokenize_and_align_labels on each single-sentence example, then replace
    every batched field with its only element so each item is one flat example.
    """
    encoded = [tokenize_and_align_labels(sentence) for sentence in sentences]
    for encoding in encoded:
        for field in ('input_ids', 'attention_mask', 'token_type_ids', 'labels'):
            encoding[field] = encoding[field][0]
    return encoded


# Tokenize training data
tokenized_datasets = _tokenize_sentences(train_ready)

# Tokenize validation data
tokenized_datasets_validation = _tokenize_sentences(validation_ready)

In [None]:
# Pretrained encoder with a fresh token-classification head; 21 output classes.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-large-cased",
    num_labels=21,
)

In [None]:
# Training hyper-parameters; checkpoints and logs go to ./test-ner and the
# validation set is scored after every epoch.
args = TrainingArguments(
    output_dir="test-ner",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
)

In [None]:
# Collator that batches the tokenized examples for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# seqeval scorer; compute_metrics calls metric.compute(...) on it.
# NOTE(review): newer `datasets` releases reportedly drop load_metric in favour
# of the separate `evaluate` package — confirm against the installed version.
metric = datasets.load_metric("seqeval") 

In [None]:
# Tag vocabulary: the list position is the integer label id. All B- tags come
# first, then all I- tags (same entity order), with 'O' last.
ENTITY_TYPES = (
    'company', 'facility', 'geo-loc', 'movie', 'musicartist',
    'other', 'person', 'product', 'sportsteam', 'tvshow',
)
label_list = [
    f"{prefix}-{entity}" for prefix in ('B', 'I') for entity in ENTITY_TYPES
] + ['O']

In [None]:
# Metrics computing function from the tutorial
def compute_metrics(eval_preds):
    """
    Score token-classification predictions with the seqeval metric.

    Parameters:
    eval_preds (tuple): (logits, labels). The logits are reduced with an argmax
                        over axis 2 to get one predicted label id per token.

    Returns:
    dict with "precision", "recall", "f1" and "accuracy", taken from the
    metric's overall_* results.
    """
    logits, gold_ids = eval_preds

    # argmax over raw logits picks the same class as argmax over softmax
    # probabilities, so no softmax is applied.
    predicted_ids = np.argmax(logits, axis=2)

    predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_references = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Positions labelled -100 are excluded from scoring.
            if gold_id == -100:
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_references.append(label_list[gold_id])
        predictions.append(kept_predictions)
        true_labels.append(kept_references)

    results = metric.compute(predictions=predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
# Bundle model, hyper-parameters, data, collator and metric function into one Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets_validation,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning, recording wall-clock timestamps (seconds) before and after
# so the elapsed training time can be reported.
start_train = time()
trainer.train()
end_train = time()

In [None]:
# Training time (seconds converted to minutes)
train_minutes = (end_train - start_train) / 60
print(train_minutes, "mins")

In [None]:
# Write the fine-tuned model to ./ner_model; its config.json is edited and the
# model is re-loaded from this directory further down.
model.save_pretrained("ner_model")

In [None]:
# Write the tokenizer files to ./tokenizer.
tokenizer.save_pretrained("tokenizer")

In [None]:
# Lookup tables between integer label ids and tag names, in the shape stored
# in a Hugging Face config.json.
# id2label keys are strings because JSON object keys are always strings.
# label2id values are integers: the config convention is label -> int id, and
# the original string values ("0", "1", ...) did not follow it.
id2label = {str(idx): label for idx, label in enumerate(label_list)}
label2id = {label: idx for idx, label in enumerate(label_list)}

In [None]:
# Add the human-readable label names to the saved model config so predictions
# come back as tag strings. Both file handles are opened in `with` blocks so
# they are closed (and the write is flushed) before the model is re-loaded.
config_path = "ner_model/config.json"
with open(config_path, encoding="utf-8") as config_file:
    config = json.load(config_file)
config["id2label"] = id2label
config["label2id"] = label2id
with open(config_path, "w", encoding="utf-8") as config_file:
    json.dump(config, config_file)

In [None]:
# Re-load the model from ./ner_model, i.e. with the config.json that was
# rewritten above to include id2label / label2id.
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model")

In [None]:
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)

tag_words = {}  # not used in this cell; kept so the name stays defined


def _word_spans(words):
    """Return the (start, end) character span of each word once the words are joined by single spaces."""
    spans = []
    cursor = 0
    for current in words:
        spans.append((cursor, cursor + len(current)))
        cursor += len(current) + 1  # +1 for the joining space
    return spans


def _majority_word_tags(spans, ner_results):
    """
    Collapse sub-word predictions into one tag per word.

    Each prediction (a dict with 'start', 'end' and 'entity') votes for the word
    whose span fully contains it; a word gets the most common tag among its
    votes, or 'O' when it received none.
    """
    votes = [[] for _ in spans]
    for prediction in ner_results:
        for pos, (span_start, span_end) in enumerate(spans):
            if prediction['start'] >= span_start and prediction['end'] <= span_end:
                votes[pos].append(prediction['entity'])
    return [mode(word_votes) if word_votes else 'O' for word_votes in votes]


global_id = 0

# newline='' is what the csv module expects for files handed to csv.writer
# (otherwise rows are separated by blank lines on Windows); the `with` block
# closes the file even if prediction fails midway.
with open('ans.csv', 'w', newline='') as out:
    submit = csv.writer(out)
    submit.writerow(['id', 'label'])

    start_test = time()
    for row in tqdm(test_ready):
        words = row['tokens'][0]
        # Join every test sentence into a string and feed it to the model
        sent = " ".join(words)
        ner_results = nlp(sent)
        # One output row per word: running word id and integer label id
        for tag in _majority_word_tags(_word_spans(words), ner_results):
            submit.writerow([global_id, label_list.index(tag)])
            global_id += 1
end_test = time()

In [None]:
# Testing time (seconds converted to minutes)
test_minutes = (end_test - start_test) / 60
print(test_minutes, "mins")