# Load Packages

In [None]:
 #install packages
!pip install transformers
!pip install datasets
!pip install seqeval
!pip install transformers[torch]
!pip install accelerate -U`

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m67.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m63.3 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
from transformers import pipeline
from transformers import BertTokenizer
from transformers import DataCollatorForTokenClassification
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer
import json
from torch.utils.data import DataLoader
import re
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, BertTokenizerFast, BertForTokenClassification
from datasets import load_dataset, load_metric, Dataset, DatasetDict
import numpy as np
from tqdm import tqdm
from torch.optim import SGD

**READ AND PROCESS DATA**


# Read file and preprocess data

In [None]:
#read dataset and separate training set from testing set
def read_dataset(file):
  """Load the labeled JSON file and return its train and test splits.

  Args:
    file: Path to a JSON file whose top-level object has 'train' and
      'test' keys.

  Returns:
    A (train, test) tuple with the values stored under those two keys.

  Raises:
    KeyError: If the 'train' or 'test' key is missing.
  """
  # Decode explicitly as UTF-8 so non-ASCII names in the texts do not depend
  # on the platform's default locale encoding.
  with open(file, "r", encoding="utf-8") as json_file:
    data = json.load(json_file)
  return data['train'], data['test']

# Labeled corpus; read_dataset reads its 'train' and 'test' keys.
file = "label_data_train_test.json"
train, test = read_dataset(file)

**Map each labeled index range to the corresponding words in the training text**

In [None]:
# Recover the surface text of every annotated span so the annotations can be
# inspected. Each train item is unpacked as (text, labels), and each label as
# an (ner, start, end) triple.
index_to_words = []
for text, labels in train:
    result = []
    for ner, start, end in labels:
        word = text[start:end]  # start/end are used as slice offsets into text
        result.append((ner, word))
    index_to_words.append(result)
# NOTE(review): this prints the spans of the whole training set at once;
# consider previewing a slice instead.
print(index_to_words)

[[('ORG', 'GRI Club'), ('ORG', 'CANATUR'), ('ORG', 'the National Chamber of Tourism of Peru'), ('ORG', 'CANATUR')], [('ORG', 'TETRON COMMERCIAL LTD'), ('ORG', 'Tetron Commercial Ltd'), ('ORG', 'Tetron Commercial Ltd'), ('ORG', 'Tetron Commercial Ltd'), ('PERSON', 'Anubhav Poddar'), ('PERSON', 'Ashish Singhania'), ('ORG', 'Tetron Commercial Ltd'), ('ORG', 'Tetron Commercial Ltd'), ('ORG', 'Tetron Commercial Ltd')], [('PERSON', 'Bush'), ('PERSON', 'Haji Asad Khan Zarkari Mohammadhasni'), ('PERSON', 'Hermagoras Gonzalez Polanco'), ('PERSON', 'Cumhur Yakut'), ('PERSON', 'Marcos Arturo Beltran Leyva'), ('ORG', 'Beltran Leyva Organization'), ('ORG', 'ndrangheta'), ('ORG', 'PKK'), ('ORG', 'KGK'), ('ORG', 'Kongra-Gel'), ('ORG', "Kurdistan Workers' Party"), ('ORG', 'PKK'), ('PERSON', 'Gordon Johndroe'), ('ORG', 'the National Security Council'), ('ORG', 'PKK'), ('ORG', 'The Associated Press')], [('ORG', 'IST')], [('ORG', 'Bahamas Realty'), ('PERSON', 'George Damianos'), ('ORG', "Damianos Sotheby

# Distinguish B- and I- labels (word position) for the labeled words

In [None]:
#distinguish B- and I- label (the position) for the labeled words
def identify_word_position(lst):
    """Split every labeled entity into words and tag each with B-/I- prefixes.

    Args:
        lst: A list with one entry per text; each entry is a sequence of
            (entity_type, entity_text) pairs.

    Returns:
        A list parallel to ``lst``. Each entry is a flat list of
        ``[tag, word]`` pairs where the first word of an entity is tagged
        ``'B-' + entity_type`` and the remaining words ``'I-' + entity_type``.
        Pairs whose text is empty or whitespace-only contribute nothing.
    """
    result = []
    for item in lst:
        item_result = []
        for pair in item:
            entity_type = pair[0]
            words = pair[1].split()
            if not words:
                # Empty / whitespace-only span: there is no first word to tag
                # (indexing words[0] would raise IndexError).
                continue
            item_result.append(['B-' + entity_type, words[0]])
            item_result.extend(['I-' + entity_type, word] for word in words[1:])
        result.append(item_result)
    return result



# Label All Training Texts

In [None]:
def standardize_label(dataset):
  """Rename person-type entity tags to 'PER', modifying the dataset in place.

  Args:
    dataset: Iterable of (text, label) items where ``label`` is a list of
      mutable spans whose first element is the entity type.

  Returns:
    The same ``dataset`` object, with every span typed 'PERSON' or 'Person'
    rewritten to 'PER'.
  """
  for text, label in dataset:
    for span in label:
      # Compare the tag (span[0]) in both spellings; the original compared the
      # whole span to 'Person', which could never match.
      if span[0] in ('PERSON', 'Person'):
        span[0] = 'PER'
  return dataset

# standardize_label rewrites the tags inside the nested label lists and returns
# the dataset it was given, so these rebindings keep the same objects.
train = standardize_label(train)
test = standardize_label(test)

In [None]:
def label_all_words(text, label_info):
    """Assign a BIO tag to every whitespace-separated word of ``text``.

    Args:
        text: Raw text; the words are the pieces produced by ``text.split()``.
        label_info: Iterable of ``(label, start, end)`` triples, where
            ``start``/``end`` are character offsets into ``text`` (``end``
            exclusive, i.e. the entity is ``text[start:end]``).

    Returns:
        A list with one tag per word: ``'O'`` by default, ``'B-' + label`` for
        the first word covered by a span and ``'I-' + label`` for the other
        covered words. Later spans overwrite earlier ones on shared words.
    """
    words = text.split()
    labeled_text = ['O'] * len(words)
    if not words:
        # Nothing to tag (indexing into an empty tag list would raise).
        return labeled_text

    # Exclusive end offset of each word, located in the original text, so that
    # runs of spaces, tabs, newlines or leading whitespace do not shift the
    # alignment (the old code assumed exactly one separator character).
    word_ends = []
    cursor = 0
    for word in words:
        cursor = text.index(word, cursor) + len(word)
        word_ends.append(cursor)
    last_word_index = len(words) - 1

    for label, start, end in label_info:
        # First word that ends after the span starts.
        start_word_index = next(
            (idx for idx, word_end in enumerate(word_ends) if word_end > start),
            None,
        )
        if start_word_index is None:
            # The span begins after the last word: nothing to tag.
            continue

        # First word that reaches the span's end; a span running past the last
        # word (e.g. into trailing whitespace) is clamped to the last word.
        end_word_index = next(
            (idx for idx, word_end in enumerate(word_ends) if word_end >= end),
            last_word_index,
        )
        # Degenerate spans (start >= end) still tag a single word, as before.
        start_word_index = min(start_word_index, end_word_index)

        labeled_text[start_word_index] = 'B-' + label
        for idx in range(start_word_index + 1, end_word_index + 1):
            labeled_text[idx] = 'I-' + label

    return labeled_text

#apply the function to all training text
# Each entry of training_labels is a list of (word, tag) pairs for one text;
# the words come from text.split(), the same split label_all_words uses.
training_labels = []
for text, label_info in train:
    training_label = label_all_words(text, label_info)
    training_labels.append(list(zip(text.split(), training_label)))

# Same (word, tag) pairing for the test split.
testing_labels = []
for text, label_info in test:
    testing_label = label_all_words(text, label_info)
    testing_labels.append(list(zip(text.split(), testing_label)))

In [None]:
def format_data(data):
    """Convert labeled sentences into single-row DataFrames.

    Args:
        data: Iterable of sentences, each a sequence of (token, entity) pairs.

    Returns:
        A list with one DataFrame per sentence. Every frame has exactly one
        row whose 'token' cell holds the sentence's token list and whose
        'entity' cell holds the matching tag list.
    """
    return [
        pd.DataFrame(
            {
                # Wrap each list in another list so it lands in a single cell.
                'token': [[pair[0] for pair in sentence]],
                'entity': [[pair[1] for pair in sentence]],
            }
        )
        for sentence in data
    ]

# Stack the one-row frames from format_data into one frame per split (one row
# per text, with list-valued 'token' and 'entity' columns), then wrap each
# frame as a datasets.Dataset.
train_df = pd.concat(format_data(training_labels), ignore_index=True)
test_df = pd.concat(format_data(testing_labels), ignore_index=True)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

**Define unique labels and match labels with ids**

In [None]:
# Tag string -> class id, looked up by tokenize_and_align_labels.
# NOTE(review): the keys 'I-PRG', 'I-I-MISC', 'I-OR', 'I-' and 'VMISC' are not
# in label_list; presumably they absorb misspelled tags in the annotations --
# confirm. In particular 'I-PRG' maps to 2 (the id of 'I-MISC') while 'I-OR'
# maps to 6 (the id of 'I-ORG'); verify that 'I-PRG' is not meant to be 6.
label_encoding_dict = {'I-PRG': 2,'I-I-MISC': 2, 'I-OR': 6, 'O': 0, 'I-': 0, 'VMISC': 0, 'B-PER': 3,  'I-PER': 4,  'B-ORG': 5, 'I-ORG': 6, 'B-LOC': 7, 'I-LOC': 8, 'B-MISC': 1, 'I-MISC': 2}
# label_list[i] is the tag name of class id i; compute_metrics uses it to
# decode predicted and reference ids.
label_list = ['O','B-MISC','I-MISC','B-PER','I-PER','B-ORG','I-ORG','B-LOC','I-LOC']

# Build Tokenizer and Model

In [None]:
#load model (using bert-base-NER)
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Tokenizer of the dslim/bert-base-NER checkpoint; tokenize_and_align_labels
# and the Trainer setup read this global.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

Downloading (…)okenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize one pre-split example and attach a label id to every token.

    Args:
        examples: Mapping with a 'token' entry (list of words) and an 'entity'
            entry (one tag string per word).

    Returns:
        The tokenizer output (padded/truncated to 512 positions) with an extra
        'labels' entry: -100 for positions that belong to no word, 0 for words
        tagged 'O', and ``label_encoding_dict[tag]`` otherwise.
    """
    # When False, only the first sub-token of a word keeps its label and the
    # remaining sub-tokens get -100.
    label_all_tokens = True

    tokenized_inputs = tokenizer(
        list(examples["token"]),
        padding='max_length',
        max_length=512,
        truncation=True,
        is_split_into_words=True,
    )
    word_tags = examples["entity"]

    def _label_id(word_idx, continues_previous_word):
        """Return the label id for one token position."""
        if word_idx is None:
            # Position not mapped to any input word.
            return -100
        tag = word_tags[word_idx]
        if tag == 'O':
            return 0
        if continues_previous_word and not label_all_tokens:
            return -100
        return label_encoding_dict[tag]

    aligned_ids = []
    last_word_idx = None
    for word_idx in tokenized_inputs.word_ids():
        aligned_ids.append(_label_id(word_idx, word_idx == last_word_idx))
        last_word_idx = word_idx

    tokenized_inputs["labels"] = aligned_ids
    return tokenized_inputs





# map() is called without batched=True, so tokenize_and_align_labels is
# presumably invoked once per example (one text at a time) -- confirm against
# the installed datasets version.
train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels)
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels)


Map:   0%|          | 0/221 [00:00<?, ? examples/s]

Map:   0%|          | 0/39 [00:00<?, ? examples/s]

# Train Model

In [None]:
# Start from the pretrained NER checkpoint; num_labels (9) equals the number
# of entries in label_list.
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER", num_labels= 9)
batch_size = 10  # used for both training and evaluation batches below
args = TrainingArguments(
    f"test-{'ner'}",  # evaluates to "test-ner"; first positional argument of TrainingArguments
    evaluation_strategy = "epoch",  # run evaluation at the end of every epoch
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=1e-5,
)

data_collator = DataCollatorForTokenClassification(tokenizer)
# compute_metrics reads this global.
# NOTE(review): the captured cell output shows a warning pointing at this
# load_metric call -- presumably a deprecation notice; confirm.
metric = load_metric("seqeval")


def compute_metrics(p):
    """Score predictions with the global seqeval ``metric``.

    Args:
        p: A (predictions, labels) pair; predictions carry per-class scores on
            axis 2 and labels hold class ids, with -100 marking positions to
            skip.

    Returns:
        Dict with the overall 'precision', 'recall', 'f1' and 'accuracy'
        reported by the metric.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Drop positions whose reference id is -100, then decode ids to tags.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# NOTE(review): eval_dataset is the tokenized test split, so the per-epoch
# "validation" numbers are measured on the test data.
trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=test_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
# The return value of evaluate() is not stored or displayed here.
trainer.evaluate()
# Saved under 'un-ner.model'; the test cell reloads the model from this path.
trainer.save_model('un-ner.model')

Downloading model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.190055,0.672251,0.565639,0.614354,0.941805
2,No log,0.168657,0.64565,0.712775,0.677554,0.950109
3,No log,0.161758,0.690784,0.706608,0.698606,0.954261


  _warn_prf(average, modifier, msg_start, len(result))


# Test Model

In [None]:


# Load your trained model
# (the path written by trainer.save_model in the training cell)
model = AutoModelForTokenClassification.from_pretrained("un-ner.model")

# Define your evaluation arguments
# batch_size is the global defined in the training cell.
eval_args = TrainingArguments(
    f"test-ner",
    per_device_eval_batch_size=batch_size,
)

# Reuse the already tokenized test split (nothing is loaded or tokenized here).
# NOTE(review): this rebinds test_dataset from the raw Dataset to the tokenized
# one, so re-running the earlier test_dataset.map(...) cell afterwards would
# operate on tokenized data.
test_dataset = test_tokenized_datasets

# Define the data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Load the "seqeval" metric
metric = load_metric("seqeval")

# compute_metrics is reused from the training cell; it reads the global
# `metric` rebound just above.

# Create a Trainer for evaluation
eval_trainer = Trainer(
    model=model,
    args=eval_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Evaluate the model on the test dataset
eval_results = eval_trainer.evaluate(eval_dataset=test_dataset)

# Print the evaluation results
print(eval_results)


  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.16175831854343414, 'eval_precision': 0.6907838070628768, 'eval_recall': 0.7066079295154185, 'eval_f1': 0.6986062717770035, 'eval_accuracy': 0.9542608581032096, 'eval_runtime': 1.675, 'eval_samples_per_second': 23.283, 'eval_steps_per_second': 2.388}
