# Imports

In [None]:
!pip install accelerate -U
!pip install transformers
!pip install datasets
!pip install seqeval 
import pandas as pd
from datasets import Dataset
from datasets import load_metric
import numpy as np
from transformers import TrainingArguments, DataCollatorForTokenClassification, Trainer, AutoModelForTokenClassificationAutoTokenizer

# Functions

In [1]:
def read_sent(path):
    ents = []
    curEnts = []
    for line in open(path):
        line = line.strip()
        if line == '':
            ents.append(curEnts)
            curEnts = []
        elif line[0] == '#' and len(line.split('\t')) == 1:
            continue
        else:
            curEnts.append(line.split('\t')[1])
    return(ents)

def read_labels(path):
    ents = []
    curEnts = []
    for line in open(path):
        line = line.strip()
        if line == '':
            ents.append(curEnts)
            curEnts = []
        elif line[0] == '#' and len(line.split('\t')) == 1:
            continue
        else:
            curEnts.append(line.split('\t')[2])
    return(ents)

def read_index(path):
    ents = []
    curEnts = []
    for line in open(path):
        line = line.strip()
        if line == '':
            ents.append(curEnts)
            curEnts = []
        elif line[0] == '#' and len(line.split('\t')) == 1:
            continue
        else:
            curEnts.append(line.split('\t')[0])
    return(ents)

In [4]:
def column_to_list(df, column_name):
    """
    Convert a column in a DataFrame to a list of lists.

    Parameters:
    - df: DataFrame
        The DataFrame containing the column to be converted.
    - column_name: str
        The name of the column to be converted to a list.

    Returns:
    - lists: list
        A list of lists where each inner list corresponds to a row in the specified column.
    """
    column_values = df[column_name].tolist()
    lists = [list(arr) for arr in column_values]
    return lists


In [None]:
def tokenize_and_align_labels(sentences, tags, tokenizer):
    tokenized_inputs = tokenizer(sentences, truncation=True, is_split_into_words=True, padding=True)

    aligned_labels = []
    for i, label in enumerate(pd.Series(tags)):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs.data

In [None]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Convert labels to a list of lists if it's a set
    if isinstance(labels, set):
        labels = [labels]

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

# Training

In [6]:
id2label = {0: '0',
            1: 'B-PER', 
            2: 'I-PER',
            3: 'B-ORG',
            4: 'I-ORG',
            5: 'B-LOC',
            6: 'I-LOC'
           }

In [7]:
label2id = {label: id for id, label in id2label.items()}

In [9]:
#Training data
trainin_data = pd.read_parquet('data/train-english.parquet')
training_labels_num = column_to_list(trainin_data, 'ner_tags')
training_labels = [[id2label[label_id] for label_id in sequence] for sequence in training_labels_num]
training_sent =  column_to_list(trainin_data, 'tokens')

#flatten to one list to be able to use myutils
train_flat_labels = sum(training_labels, [])
train_flat_sent = sum(training_sent, [])

In [11]:
#test data
test_data = pd.read_parquet('data/test-english.parquet')
test_labels_num = column_to_list(test_data, 'ner_tags')
test_labels = [[id2label[label_id] for label_id in sequence] for sequence in test_labels_num]

test_sent =  column_to_list(test_data, 'tokens')
test_index = [[i for i, _ in enumerate(sublist)] for sublist in test_labels]

#flatten to one list to be able to use myutils
test_flat_labels = sum(test_labels, [])
test_flat_sent = sum(test_sent, [])
test_flat_index = sum(test_index, [])

In [13]:
#validation data
validation_data = pd.read_parquet('data/validation-english.parquet')
dev_labels_num = column_to_list(validation_data, 'ner_tags')
dev_labels = [[id2label[label_id] for label_id in sequence] for sequence in dev_labels_num]

dev_sent =  column_to_list(validation_data, 'tokens')

#flatten to one list to be able to use myutils
dev_flat_labels = sum(dev_labels, [])
dev_flat_sent = sum(dev_sent, [])

In [15]:
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [23]:
label_all_tokens = True

In [25]:
dev_labels_num = [[label2id.get(label, label2id) for label in sublist] for sublist in dev_labels]
training_labels_num = [[label2id.get(label, label2id) for label in sublist] for sublist in training_labels]
test_labels_num = [[label2id.get(label, label2id) for label in sublist] for sublist in test_labels]

In [27]:
tokenized_dev_data = tokenize_and_align_labels(dev_sent, dev_labels_num, tokenizer)

In [28]:
tokenized_training_data = tokenize_and_align_labels(training_sent, training_labels_num, tokenizer)

In [29]:
tokenized_test_data = tokenize_and_align_labels(test_sent, test_labels_num, tokenizer)

In [38]:
train_dataset = Dataset.from_dict({
    'id': range(len(tokenized_training_data['input_ids'])),
    'input_ids': tokenized_training_data['input_ids'],
    'attention_mask': tokenized_training_data['attention_mask'],
    'labels': tokenized_training_data['labels']
})

dev_dataset = Dataset.from_dict({
    'id': range(len(tokenized_dev_data['input_ids'])),
    'input_ids': tokenized_dev_data['input_ids'],
    'attention_mask': tokenized_dev_data['attention_mask'],
    'labels': tokenized_dev_data['labels']
})

test_dataset = Dataset.from_dict({
    'id': range(len(tokenized_test_data['input_ids'])),
    'input_ids': tokenized_test_data['input_ids'],
    'attention_mask': tokenized_test_data['attention_mask'],
    'labels': tokenized_test_data['labels']
})

In [41]:
metric = load_metric("seqeval")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [43]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label = id2label
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
label_names = list(label2id.keys())

In [50]:
args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=False,
)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = load_metric("seqeval")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [51]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [53]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics = compute_metrics
)
trainer.train()

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.33,0.278731,0.820904,0.832729,0.826774,0.915561
2,0.2191,0.276202,0.837642,0.848885,0.843226,0.920944
3,0.1462,0.29202,0.835842,0.857946,0.84675,0.922233
4,0.1033,0.325498,0.842002,0.860157,0.850983,0.924214
5,0.0741,0.363941,0.847728,0.8577,0.852685,0.923784




TrainOutput(global_step=6250, training_loss=0.19308006744384765, metrics={'train_runtime': 3410.5799, 'train_samples_per_second': 29.321, 'train_steps_per_second': 1.833, 'total_flos': 2.61308561752064e+16, 'train_loss': 0.19308006744384765, 'epoch': 5.0})

In [73]:
model.save_pretrained('eng_base')

In [None]:
trainer.evaluate()

# Evaluation

In [81]:
model.to("cuda:0")

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [83]:
predictions, labels, metrics = trainer.predict(test_dataset)
predictions = np.argmax(predictions, axis=2)

true_predictions = [
    [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'LOC': {'precision': 0.8695652173913043,
  'recall': 0.9010294567322393,
  'f1': 0.8850177704360014,
  'number': 9811},
 'ORG': {'precision': 0.762987012987013,
  'recall': 0.7519518750799948,
  'f1': 0.7574292528846773,
  'number': 7813},
 'PER': {'precision': 0.8793546544831525,
  'recall': 0.9042725003670533,
  'f1': 0.8916395222584148,
  'number': 6811},
 '_': {'precision': 0.8886533665835411,
  'recall': 0.899079096757916,
  'f1': 0.8938358311908197,
  'number': 7927},
 'overall_precision': 0.8513529948312557,
 'overall_recall': 0.8652431864532476,
 'overall_f1': 0.8582418929687978,
 'overall_accuracy': 0.9273197231227548}

In [84]:
for i in range(len(true_predictions)):
    print("Example", i+1)
    print("Predicted:", true_predictions[i])
    print("Real:", true_labels[i])
    print()

Example 1
Predicted: ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', 'B-LOC', '0', '0', '0', '0', 'B-LOC', 'B-LOC', '0', '0', '0']
Real: ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', 'B-LOC', '0', '0', '0', '0', 'B-LOC', 'B-LOC', '0', '0', '0']

Example 2
Predicted: ['0', 'B-PER', 'B-PER', 'B-PER', 'I-PER', '0', 'B-PER', 'I-PER', 'I-PER', '0', '0', '0', 'B-ORG', 'I-ORG', 'I-ORG', '0', '0', '0', '0', '0']
Real: ['0', 'B-PER', 'B-PER', 'B-PER', 'I-PER', '0', 'B-PER', 'I-PER', 'I-PER', '0', '0', '0', 'B-ORG', 'I-ORG', 'I-ORG', '0', '0', '0', '0', '0']

Example 3
Predicted: ['B-ORG', 'B-ORG', 'I-ORG', 'I-ORG']
Real: ['B-ORG', 'B-ORG', 'I-ORG', 'I-ORG']

Example 4
Predicted: ['0', '0', 'B-LOC', 'B-LOC', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']
Real: ['0', '0', 'B-LOC', 'B-LOC', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']

Example 5
Pr

Example 5389
Predicted: ['0', '0', '0', '0', '0', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', '0']
Real: ['0', '0', '0', '0', '0', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', '0']

Example 5390
Predicted: ['0', '0', '0', 'B-LOC', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']
Real: ['0', '0', '0', 'B-LOC', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']

Example 5391
Predicted: ['B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC']
Real: ['B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC']

Example 5392
Predicted: ['0', '0', '0', '0', '0', '0', '0', 'B-PER', 'I-PER']
Real: ['0', '0', '0', '0', '0', '0', '0', 'B-PER', 'I-PER']

Example 5393
Predicted: ['B-PER', 'B-PER', 'I-PER', 'I-PER', '0', '0', '0', 'B-ORG', 'I-ORG', 'I-ORG', '0', '0']
Real: ['B-PER', 'B-PER', 'I-PER', 'I-PER', '0', '0', '0', 'B-ORG', 'I-ORG', 'I-ORG', '0', '0']

Example 5394
Predicted: ['B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', '0', 'B-LOC', 'I-LOC']
Real: ['B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', '0', 'B-LOC', 'I-LOC