# Reading in the data

In [1]:
def read_sent(path):

    """
    Reads a file containing sentences and returns a list of lists, where each inner list represents a sentence and contains the tokens (words or symbols) present in that sentence.

    Args:
    path (str): The path to the file containing the sentences.

    Returns:
    list: A list of lists, where each inner list represents a sentence and contains its tokens.
    """

    ents = []
    curEnts = []
    for line in open(path):
        line = line.strip()
        if line == '':
            ents.append(curEnts)
            curEnts = []
        elif line[0] == '#' and len(line.split('\t')) == 1:
            continue
        else:
            curEnts.append(line.split('\t')[1])
    return(ents)

def read_labels(path):
    ents = []
    curEnts = []
    for line in open(path):
        line = line.strip()
        if line == '':
            ents.append(curEnts)
            curEnts = []
        elif line[0] == '#' and len(line.split('\t')) == 1:
            continue
        else:
            curEnts.append(line.split('\t')[2])
    return(ents)

def read_index(path):
    ents = []
    curEnts = []
    for line in open(path):
        line = line.strip()
        if line == '':
            ents.append(curEnts)
            curEnts = []
        elif line[0] == '#' and len(line.split('\t')) == 1:
            continue
        else:
            curEnts.append(line.split('\t')[0])
    return(ents)

In [2]:
#Training data

#returns list of lists
training_labels = read_labels("baseline_data/en_ewt-ud-train.iob2")
training_sent = read_sent("baseline_data/en_ewt-ud-train.iob2")

#flatten to one list to be able to use myutils
train_flat_labels = sum(training_labels, [])
train_flat_sent = sum(training_sent, [])

In [3]:
#Evaluation data

dev_labels = read_labels("baseline_data/en_ewt-ud-dev.iob2")
dev_sent = read_sent("baseline_data/en_ewt-ud-dev.iob2")

dev_flat_labels = sum(dev_labels, [])
dev_flat_sent = sum(dev_sent, [])

In [4]:
#Test data
#Keeping track of indeces to save to required .iob2 format for model's predictions

test_labels = read_labels("baseline_data/en_ewt-ud-test.iob2")
test_sent = read_sent("baseline_data/en_ewt-ud-test.iob2")
test_index = read_index("baseline_data/en_ewt-ud-test.iob2")

test_flat_labels = sum(test_labels, [])
test_flat_sent = sum(test_sent, [])
test_flat_index = sum(test_index, [])

# tokenization

In [5]:
#!pip install transformers

In [6]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [7]:
inputs = []
for sentence in training_sent:
    inputt = tokenizer(sentence, is_split_into_words=True, padding = 'max_length', max_length = 512, truncation = True)
    inputs.append(inputt)

In [8]:
inputs_dev = []
for sentence in dev_sent:
    inputt = tokenizer(sentence, is_split_into_words=True, padding = 'max_length', max_length = 512, truncation = True)
    inputs_dev.append(inputt)

In [9]:
inputs_test = []
for sentence in test_sent:
    inputt = tokenizer(sentence, is_split_into_words=True, padding = 'max_length', max_length = 512, truncation = True)
    inputs_test.append(inputt)

In [10]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [11]:
def list_to_sentences(lst):
    """Converts list back to sentences
        Args:
        Returns:
    """
    sentences = []
    current_sentence = []
    
    for item in lst:
        if item == -100:
            if current_sentence:
                sentences.append(current_sentence)
                current_sentence = []
        else:
            current_sentence.append(item)
    
    if current_sentence:
        sentences.append(current_sentence)
    
    return sentences


In [12]:
import pandas as pd
label_all_tokens = True

In [13]:
def tokenize_and_align_labels(sentences, tags, tokenizer):
    tokenized_inputs = tokenizer(sentences, truncation=True, is_split_into_words=True, padding=True)

    aligned_labels = []
    for i, label in enumerate(pd.Series(tags)):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs.data

In [14]:
id2label = {0: 'O',
            1: 'B-LOC', 
            2: 'I-LOC',
            3: 'B-PER',
            4: 'B-ORG',
            5: 'I-ORG',
            6: 'I-PER'
           }

In [15]:
label2id = {label: id for id, label in id2label.items()}

In [16]:
#convert labels from bio to numerical values
dev_labels_num = [[label2id.get(label, label2id) for label in sublist] for sublist in dev_labels]
training_labels_num = [[label2id.get(label, label2id) for label in sublist] for sublist in training_labels]
test_labels_num = [[label2id.get(label, label2id) for label in sublist] for sublist in test_labels]

In [17]:
tokenized_dev_data = tokenize_and_align_labels(dev_sent, dev_labels_num, tokenizer)

In [18]:
tokenized_training_data = tokenize_and_align_labels(training_sent, training_labels_num, tokenizer)

In [19]:
tokenized_test_data = tokenize_and_align_labels(test_sent, test_labels_num, tokenizer)

In [20]:
#!pip install datasets

In [21]:
from datasets import Dataset

In [22]:
train_dataset = Dataset.from_dict({
    'id': range(len(tokenized_training_data['input_ids'])),
    'input_ids': tokenized_training_data['input_ids'],
    'attention_mask': tokenized_training_data['attention_mask'],
    'labels': tokenized_training_data['labels']
})

dev_dataset = Dataset.from_dict({
    'id': range(len(tokenized_dev_data['input_ids'])),
    'input_ids': tokenized_dev_data['input_ids'],
    'attention_mask': tokenized_dev_data['attention_mask'],
    'labels': tokenized_dev_data['labels']
})

test_dataset = Dataset.from_dict({
    'id': range(len(tokenized_test_data['input_ids'])),
    'input_ids': tokenized_test_data['input_ids'],
    'attention_mask': tokenized_test_data['attention_mask'],
    'labels': tokenized_test_data['labels']
})

In [23]:
import evaluate
from datasets import load_metric

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [24]:
metric = load_metric("seqeval")

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [25]:
#id2label = {v: k for k, v in label2id.items()}

In [26]:
#id2label

In [27]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label = id2label
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
label_names = list(label2id.keys())

In [29]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Convert labels to a list of lists if it's a set
    if isinstance(labels, set):
        labels = [labels]

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }


Initializing the trainger(?)

In [30]:
#!pip install accelerate -U

In [31]:
from transformers import TrainingArguments
from transformers import DataCollatorForTokenClassification


args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=False,
)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = load_metric("seqeval")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


# Training the model

In [32]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [34]:
import numpy as np

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics = compute_metrics
)
trainer.train()

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.028,0.092444,0.76411,0.766885,0.765495,0.978742
2,0.0171,0.092623,0.76681,0.778504,0.772613,0.979075


In [None]:
model.save_pretrained('baseline')