In [1]:
!pip install datasets
!pip install transformers
!pip install seqeval
!pip install evaluate
!pip install accelerate -U
!pip install optuna

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, huggingface-hub, datasets
Successfully installed datasets-2.1

In [2]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
import numpy as np
import evaluate
import seqeval

# Task 1
Convert the IOB data to the correct data structure for token classification in Huggingface

In [3]:
# Define the mapping from IOB tags to integers
id2label = {
    0: "O",
    1: "B-corporation",
    2: "I-corporation",
    3: "B-creative-work",
    4: "I-creative-work",
    5: "B-group",
    6: "I-group",
    7: "B-location",
    8: "I-location",
    9: "B-person",
    10: "I-person",
    11: "B-product",
    12: "I-product",
}
label2id = {
    "O": 0,
    "B-corporation": 1,
    "I-corporation": 2,
    "B-creative-work": 3,
    "I-creative-work": 4,
    "B-group": 5,
    "I-group": 6,
    "B-location": 7,
    "I-location": 8,
    "B-person": 9,
    "I-person": 10,
    "B-product": 11,
    "I-product": 12,
}

# Define a function to read and parse your IOB data
def read_iob_file(file_path):
    full_file_path = f"{file_path}"
    try:
        with open(full_file_path, "r") as file:
            lines = file.read().splitlines()

        tokens = []
        ner_tags = []
        current_tokens = []
        current_ner_tags = []

        for line in lines:
            if line.strip() == "":
                tokens.append(current_tokens)
                ner_tags.append(current_ner_tags)
                current_tokens = []
                current_ner_tags = []
            else:
                parts = line.split()
                if len(parts) == 2:
                    current_tokens.append(parts[0])
                    current_ner_tags.append(label2id[parts[1]])
                else:
                    print(f"Ignoring line: {line}")

        return {"tokens": tokens, "ner_tags": ner_tags}
    except FileNotFoundError:
        print(f"File not found: {full_file_path}")
        return None

# Paths to IOB files
train_path = "wnut17train.conll"
dev_path = "emerging.dev.conll"
test_path = "emerging.test.annotated"

# Read and process the data
train_data = read_iob_file(train_path)
dev_data = read_iob_file(dev_path)
test_data = read_iob_file(test_path)

# Create a DatasetDict with the train, validation, and test splits
custom_datasets = DatasetDict({
    'train': Dataset.from_dict(train_data),
    'validation': Dataset.from_dict(dev_data),
    'test': Dataset.from_dict(test_data),
})

# Display the custom dataset
print(custom_datasets)

Ignoring line: ﻿
DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1008
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1287
    })
})


## Some statistics from dataset:

In [4]:
# Example of one sentence
words, labels = train_data["tokens"][0], train_data["ner_tags"][0]
line1, line2 = "", ""
for word, label in zip(words, labels):
    full_label = id2label[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)
print(f"\nExample of one sentence:")
print(f"\n{line1}\n{line2}")

count_tags_train = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0,
    5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0}

for i in range(len(train_data['tokens'])):
  for tag in train_data['ner_tags'][i]:
    count_tags_train[tag] += 1

print("\nCount of each tag in train dataset:")
for tag in count_tags_train.keys():
  print(id2label[tag]+ ": " + str(count_tags_train[tag]))

count_tags_test = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0,
    5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0}

for i in range(len(test_data['tokens'])):
  for tag in test_data['ner_tags'][i]:
    count_tags_test[tag] += 1

print("\nCount of each tag in test dataset:")
for tag in count_tags_test.keys():
  print(id2label[tag]+ ": " + str(count_tags_test[tag]))

count_tags_val = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0,
    5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0}

for i in range(len(dev_data['tokens'])):
  for tag in dev_data['ner_tags'][i]:
    count_tags_val[tag] += 1

print("\nCount of each tag in validation dataset:")
for tag in count_tags_val.keys():
  print(id2label[tag]+ ": " + str(count_tags_val[tag]))


Example of one sentence:

@paulwalk It 's the view from where I 'm living for two weeks . Empire     State      Building   = ESB        . Pretty bad storm here last evening . 
O         O  O  O   O    O    O     O O  O      O   O   O     O B-location I-location I-location O B-location O O      O   O     O    O    O       O 

Count of each tag in train dataset:
O: 59570
B-corporation: 221
I-corporation: 46
B-creative-work: 140
I-creative-work: 206
B-group: 264
I-group: 150
B-location: 548
I-location: 245
B-person: 660
I-person: 335
B-product: 142
I-product: 203

Count of each tag in test dataset:
O: 21654
B-corporation: 66
I-corporation: 22
B-creative-work: 142
I-creative-work: 218
B-group: 165
I-group: 70
B-location: 150
I-location: 94
B-person: 429
I-person: 131
B-product: 127
I-product: 126

Count of each tag in validation dataset:
O: 14475
B-corporation: 33
I-corporation: 11
B-creative-work: 105
I-creative-work: 133
B-group: 39
I-group: 25
B-location: 74
I-location: 33
B-person: 47

## Alignment and tokenization:

In [5]:
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [6]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [7]:
tokenized_datasets = custom_datasets.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/3394 [00:00<?, ? examples/s]

Map:   0%|          | 0/1008 [00:00<?, ? examples/s]

Map:   0%|          | 0/1287 [00:00<?, ? examples/s]

# Task 2: Set up evaluation

In [8]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [9]:
import evaluate

metric = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

# Task 3: Baseline results
Fine-tune a model with the default hyperparameter settings on the train set and evaluate the model on the test set.

In [10]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

print(model.config.num_labels)

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


13


In [11]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
from transformers import TrainingArguments

# Fine-tune a model with the default hyperparameter settings
args = TrainingArguments(
    "baseline",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    num_train_epochs=3,
    weight_decay=0,
    push_to_hub=False,
)

In [13]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.265891,0.516693,0.301205,0.380562,0.942165
2,0.174100,0.302756,0.608939,0.303058,0.404703,0.945499
3,0.057000,0.322028,0.566929,0.333642,0.42007,0.946909


TrainOutput(global_step=1275, training_loss=0.09779203452315985, metrics={'train_runtime': 180.7756, 'train_samples_per_second': 56.324, 'train_steps_per_second': 7.053, 'total_flos': 289506808910040.0, 'train_loss': 0.09779203452315985, 'epoch': 3.0})

# Task 4:  Hyperparameter optimization
Set up hyperparameter optimization with the AdamW optimizer
Try 3 different learning rates and 3 different values for batch_size.

In [14]:
# Handling the optimiation manually:
for learning_rate in [1e-6, 1e-4, 5e-5]:
  for batch_size in [8, 16]:
    args = TrainingArguments(
    "lr"+str(learning_rate)+"batchsize"+str(batch_size)+"output",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    push_to_hub=True,
    )

    trainer = Trainer(
      model=model,
      args=args,
      train_dataset=tokenized_datasets["train"],
      eval_dataset=tokenized_datasets["validation"],
      data_collator=data_collator,
      compute_metrics=compute_metrics,
      tokenizer=tokenizer,
    )
    print("### Train and evaluation process for lr="+str(learning_rate)+" batch size="+str(batch_size)+"###")
    trainer.train()

### Train and evaluation process for lr=1e-06 batch size=8###


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.293403,0.688928,0.469461,0.558405,0.95338
2,0.016800,0.293166,0.678511,0.48024,0.562412,0.954144
3,0.011400,0.29281,0.673877,0.48503,0.564067,0.954271


### Train and evaluation process for lr=1e-06 batch size=16###


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.286111,0.655449,0.48982,0.560658,0.954271
2,No log,0.28772,0.657097,0.493413,0.563611,0.954334
3,0.015900,0.288135,0.653481,0.494611,0.563054,0.954398


### Train and evaluation process for lr=0.0001 batch size=8###


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.336529,0.588976,0.447904,0.508844,0.949564
2,0.046400,0.347065,0.677903,0.433533,0.528853,0.950582
3,0.025500,0.362345,0.638365,0.486228,0.552005,0.95249


### Train and evaluation process for lr=0.0001 batch size=16###


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.340988,0.644737,0.469461,0.543313,0.9516
2,No log,0.452683,0.647601,0.420359,0.509804,0.948292
3,0.014900,0.402796,0.633914,0.479042,0.545703,0.951154


### Train and evaluation process for lr=5e-05 batch size=8###


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.466265,0.663366,0.401198,0.5,0.948738
2,0.007400,0.456837,0.709804,0.433533,0.53829,0.951472
3,0.005900,0.457037,0.685767,0.444311,0.539244,0.951409


### Train and evaluation process for lr=5e-05 batch size=16###


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.447802,0.590909,0.467066,0.521739,0.951345
2,No log,0.51076,0.676208,0.452695,0.542324,0.951218
3,0.003300,0.452752,0.66041,0.463473,0.544687,0.951727


In [16]:
# Loading the best model weights
best_args = TrainingArguments(
    "lr1e-06batchsize8output",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-06,
    per_device_train_batch_size=8,
    push_to_hub=False,
    )

# Create and load the model with the best hyperparameters
best_model = AutoModelForTokenClassification.from_pretrained("lr1e-06batchsize8output")

# Create a Trainer for evaluation
best_trainer = Trainer(
    model=best_model,
    args=best_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Evaluate the optimized model on the test set
test_results = best_trainer.evaluate(tokenized_datasets["test"])

In [17]:
test_results

{'eval_loss': 0.3473083972930908,
 'eval_precision': 0.58,
 'eval_recall': 0.3493975903614458,
 'eval_f1': 0.4360902255639098,
 'eval_accuracy': 0.9478071300333419,
 'eval_runtime': 5.5249,
 'eval_samples_per_second': 232.945,
 'eval_steps_per_second': 29.141}

# Task 6: Extend evaluation
Extend the evaluation function so that it shows the Precision, Recall and F-score for each of the entity types (person, location, etc.) on the test set

In [21]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Convert predictions and labels to the correct format
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Calculate scores for each entity type
    detailed_score = classification_report(true_labels, true_predictions, output_dict=True)

    # Organize the scores by label
    per_label_scores = {label: {
                            "precision": score['precision'],
                            "recall": score['recall'],
                            "f1": score['f1-score'],
                            "number": score['support']
                          } for label, score in detailed_score.items() if label not in ['micro avg', 'macro avg', 'weighted avg']}

    # Return the metrics
    return  {
        "overall_precision": detailed_score['weighted avg']['precision'],
        "overall_recall": detailed_score['weighted avg']['recall'],
        "overall_f1": detailed_score['weighted avg']['f1-score'],
        "macro_f1": detailed_score['macro avg']['f1-score'],
        "micro_f1": detailed_score['micro avg']['f1-score'],
        "by_label": per_label_scores  # This is your per-label score
    }


In [27]:
# Create a Trainer for evaluation with new compute_metrics
best_trainer = Trainer(
    model=best_model,
    args=best_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Evaluating trainer on test data
eval_result = best_trainer.evaluate(tokenized_datasets["test"])

import pandas as pd

# Assume eval_result is the result from best_trainer.evaluate
per_label_scores = eval_result['eval_by_label']
micro_f1 = eval_result['eval_micro_f1']
macro_f1 = eval_result['eval_macro_f1']

# Create a DataFrame from the per-label scores
df = pd.DataFrame.from_dict(per_label_scores, orient='index')
df.reset_index(inplace=True)
df.columns = ['Label', 'Precision', 'Recall', 'F1', 'Support']

# Round the DataFrame's values for better readability
df = df.round(decimals=6)

# Print the DataFrame for per-label scores
print("Per-label scores:")
print(df.to_string(index=False))

# Print the micro and macro F1 scores
print(f"\nMicro F1: {micro_f1:.3f}")
print(f"Macro F1: {macro_f1:.3f}")


Trainer is attempting to log a value of "{'corporation': {'precision': 0.35185185185185186, 'recall': 0.2878787878787879, 'f1': 0.3166666666666667, 'number': 66}, 'creative-work': {'precision': 0.45, 'recall': 0.2535211267605634, 'f1': 0.3243243243243243, 'number': 142}, 'group': {'precision': 0.4852941176470588, 'recall': 0.2, 'f1': 0.2832618025751073, 'number': 165}, 'location': {'precision': 0.5384615384615384, 'recall': 0.4666666666666667, 'f1': 0.5, 'number': 150}, 'person': {'precision': 0.7695167286245354, 'recall': 0.4825174825174825, 'f1': 0.5931232091690545, 'number': 429}, 'product': {'precision': 0.24489795918367346, 'recall': 0.09448818897637795, 'f1': 0.13636363636363635, 'number': 127}}" of type <class 'dict'> for key "eval/by_label" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


Per-label scores:
        Label  Precision   Recall       F1  Support
  corporation   0.351852 0.287879 0.316667       66
creative-work   0.450000 0.253521 0.324324      142
        group   0.485294 0.200000 0.283262      165
     location   0.538462 0.466667 0.500000      150
       person   0.769517 0.482517 0.593123      429
      product   0.244898 0.094488 0.136364      127

Micro F1: 0.436
Macro F1: 0.359
