In [8]:
from pathlib import Path
import re

def read_wnut(file_path):
    """Read a WNUT-style CoNLL file into parallel token / tag lists.

    The file is expected to contain one ``token<TAB>tag`` pair per line, with
    documents separated by an empty line (which may hold a single tab).

    Args:
        file_path: Location of the file, as ``str`` or ``pathlib.Path``.

    Returns:
        A ``(token_docs, tag_docs)`` tuple. Both are lists with one inner list
        per document; ``token_docs[i][j]`` and ``tag_docs[i][j]`` come from the
        same line. Lines with fewer than two tab-separated fields are skipped.
        When a line has more than two fields, the first is used as the token
        and the last as the tag.
    """
    file_path = Path(file_path)

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    raw_text = file_path.read_text(encoding='utf-8').strip()
    raw_docs = re.split(r'\n\t?\n', raw_text)
    token_docs = []
    tag_docs = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):
            # Split once and reuse the result for both the check and the values.
            fields = line.split('\t')
            if len(fields) < 2:
                continue
            # First/last instead of tuple unpacking: a row with extra columns
            # no longer aborts the whole load with a ValueError.
            tokens.append(fields[0])
            tags.append(fields[-1])
        token_docs.append(tokens)
        tag_docs.append(tags)
    return token_docs, tag_docs

# Load the three splits; each call yields (token lists, tag lists) per document.
train_tokens, train_tags = read_wnut('wnut17train.conll')
val_tokens, val_tags = read_wnut('emerging.dev.conll')
test_tokens, test_tags = read_wnut('emerging.test.annotated')

# The tag -> id mapping is spelled out instead of being derived from the set of
# tags seen in the data. The order matters: every B-XXX tag gets an odd id and
# its I-XXX counterpart the next even id, which is what
# align_labels_with_tokens relies on when it turns a B- label into an I- label.
tag2id = {
    tag: index
    for index, tag in enumerate([
        'O',
        'B-person',
        'I-person',
        'B-location',
        'I-location',
        'B-corporation',
        'I-corporation',
        'B-product',
        'I-product',
        'B-creative-work',
        'I-creative-work',
        'B-group',
        'I-group',
    ])
}

# Inverse lookup: integer id -> tag string.
id2tag = {index: tag for tag, index in tag2id.items()}

# Integer-encoded tags, document by document, for each split.
train_tag_id = [list(map(tag2id.__getitem__, doc)) for doc in train_tags]
val_tag_id = [list(map(tag2id.__getitem__, doc)) for doc in val_tags]
test_tag_id = [list(map(tag2id.__getitem__, doc)) for doc in test_tags]

from datasets import  Dataset, DatasetDict

# Column-oriented view of each split: raw tokens, string tags and integer tags.
train_data = dict(tokens=train_tokens, tag=train_tags, tag_id=train_tag_id)
test_data = dict(tokens=test_tokens, tag=test_tags, tag_id=test_tag_id)
validation_data = dict(tokens=val_tokens, tag=val_tags, tag_id=val_tag_id)

# One Dataset per split ...
train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(validation_data)
test_dataset = Dataset.from_dict(test_data)

# ... bundled under the split names used throughout the rest of the notebook.
whole_dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset,
})

whole_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tag', 'tag_id'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['tokens', 'tag', 'tag_id'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['tokens', 'tag', 'tag_id'],
        num_rows: 1287
    })
})

In [10]:
# Labels are integer tag ids (the values of tag2id), not tag strings.
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per tokenizer token.

    Args:
        labels: Integer labels, one per word; indexed by word id.
        word_ids: For every token, the index of the word it belongs to, or
            ``None`` for tokens that belong to no word (special tokens).

    Returns:
        A list as long as ``word_ids``: ``-100`` for ``None`` entries, the
        word's own label for the first token of a word, and for the following
        tokens of the same word the label bumped by one when it is odd
        (B-XXX -> I-XXX) or left unchanged otherwise.
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        # Token outside any word: mark with -100 and forget the previous word.
        if word_id is None:
            previous_word = None
            aligned.append(-100)
            continue

        word_label = labels[word_id]

        # First token of a new word keeps the word's label as-is.
        if word_id != previous_word:
            previous_word = word_id
            aligned.append(word_label)
            continue

        # Continuation token of the same word: an odd (B-XXX) label becomes
        # the following even (I-XXX) label.
        aligned.append(word_label + 1 if word_label % 2 == 1 else word_label)

    return aligned

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and attach token-level labels.

    Uses the module-level ``tokenizer``, so that name must be bound before
    this function is called.

    Args:
        examples: Batch with a ``"tokens"`` entry (word lists) and a
            ``"tag_id"`` entry (matching integer labels per word).

    Returns:
        The tokenizer output with an extra ``"labels"`` entry holding, per
        example, the labels produced by ``align_labels_with_tokens``.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids(row) maps every token of example `row` back to its source word.
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(row))
        for row, word_labels in enumerate(examples["tag_id"])
    ]
    return encoded

In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier; the tokenizer here and the model in a later cell are
# both loaded from it. `tokenizer` is read as a global by
# tokenize_and_align_labels.
model_checkpoint = "J1mb0o/bert-finetuned-ner-noval"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Tokenize every split in batches. The original columns (tokens, tag, tag_id)
# are dropped, leaving only what tokenize_and_align_labels returns.
tokenized_datasets = whole_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=whole_dataset["train"].column_names,
)

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator used as collate_fn when building the tf.data datasets below;
# return_tensors="tf" requests TensorFlow tensors.
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, return_tensors="tf"
)

In [None]:
def _to_tf_dataset(split, shuffle):
    """Turn one tokenized split into a batched tf.data dataset.

    Reads the module-level ``tokenized_datasets`` and ``data_collator``.
    """
    return tokenized_datasets[split].to_tf_dataset(
        columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
        collate_fn=data_collator,
        shuffle=shuffle,
        batch_size=16,
    )


# Only the training split is shuffled; validation and test keep their order.
tf_train_dataset = _to_tf_dataset("train", shuffle=True)

tf_val_dataset = _to_tf_dataset("validation", shuffle=False)

tf_test_dataset = _to_tf_dataset("test", shuffle=False)

In [None]:
from transformers import TFAutoModelForTokenClassification

# Load the token-classification model from the same checkpoint as the
# tokenizer, passing both directions of the tag <-> id mapping.
model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2tag,
    label2id=tag2id,
)

In [None]:
import evaluate
import numpy as np

metric = evaluate.load("seqeval")
# Position i holds the tag whose id is i (tag2id ids are 0..N-1 in insertion order).
label_names = list(tag2id.keys())

all_predictions = []
all_labels = []
for batch in tf_test_dataset:
    logits = model.predict_on_batch(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    for prediction, label in zip(predictions, labels):
        # Keep only positions with a real label; -100 marks ignored positions.
        scored = [
            (predicted_idx, label_idx)
            for predicted_idx, label_idx in zip(prediction, label)
            if label_idx != -100
        ]
        all_predictions.extend(label_names[predicted_idx] for predicted_idx, _ in scored)
        all_labels.extend(label_names[label_idx] for _, label_idx in scored)

# NOTE(review): all examples are passed to seqeval as one concatenated
# sequence, so example boundaries are not visible to the metric.
metric.compute(predictions=[all_predictions], references=[all_labels])

In [None]:
# Hyper-parameter values as they appear in the checkpoint names. They are kept
# as strings because they are only interpolated into the identifier below.
batch_size = ["16", "32"]
learning_rate = ["1e-5", "3e-5", "5e-5"]

for b in batch_size:
    for lr in learning_rate:
        model_checkpoint = f"J1mb0o/bert-finetuned-batch{b}-lr{lr}"
        tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
        data_collator = DataCollatorForTokenClassification(
            tokenizer=tokenizer, return_tensors="tf"
        )
        model = TFAutoModelForTokenClassification.from_pretrained(
            model_checkpoint,
            id2label=id2tag,
            label2id=tag2id,
        )

        # Re-encode the test split with THIS checkpoint's tokenizer and
        # collator. Previously both were created but never used, and every
        # model was scored on a tf_test_dataset encoded by an earlier cell
        # with a different checkpoint's tokenizer. tokenize_and_align_labels
        # reads the global `tokenizer`, which was rebound just above.
        checkpoint_test_dataset = whole_dataset["test"].map(
            tokenize_and_align_labels,
            batched=True,
            remove_columns=whole_dataset["test"].column_names,
        ).to_tf_dataset(
            columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
            collate_fn=data_collator,
            shuffle=False,
            batch_size=16,
        )

        print(f"batch_size: {b}, learning_rate: {lr}")
        print(model_checkpoint)

        all_predictions = []
        all_labels = []
        for batch in checkpoint_test_dataset:
            logits = model.predict_on_batch(batch)["logits"]
            labels = batch["labels"]
            predictions = np.argmax(logits, axis=-1)
            for prediction, label in zip(predictions, labels):
                for predicted_idx, label_idx in zip(prediction, label):
                    # -100 marks positions that carry no label; skip them.
                    if label_idx == -100:
                        continue
                    all_predictions.append(label_names[predicted_idx])
                    all_labels.append(label_names[label_idx])
        print(metric.compute(predictions=[all_predictions], references=[all_labels]))

### Now that we have found our best model, we take a closer look at its evaluation

In [11]:
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import TFAutoModelForTokenClassification
import numpy as np



# Checkpoint examined in the remainder of the notebook.
model_checkpoint = "J1mb0o/bert-finetuned-batch16-lr3e-5"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Re-tokenize all splits with this checkpoint's tokenizer; the raw columns are
# dropped so only the tokenizer output and the aligned labels remain.
tokenized_datasets = whole_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=whole_dataset["train"].column_names,
)

data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, return_tensors="tf"
)


def _to_tf_dataset(split, shuffle):
    """Turn one tokenized split into a batched tf.data dataset.

    Reads the module-level ``tokenized_datasets`` and ``data_collator``.
    """
    return tokenized_datasets[split].to_tf_dataset(
        columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
        collate_fn=data_collator,
        shuffle=shuffle,
        batch_size=16,
    )


# Only the training split is shuffled; validation and test keep their order.
tf_train_dataset = _to_tf_dataset("train", shuffle=True)

tf_val_dataset = _to_tf_dataset("validation", shuffle=False)

tf_test_dataset = _to_tf_dataset("test", shuffle=False)

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2tag,
    label2id=tag2id,
)

print(model_checkpoint)
# Position i holds the tag whose id is i (tag2id ids are 0..N-1 in insertion order).
label_names = list(tag2id.keys())

# Flat lists of tag strings over the whole test set, aligned element-wise.
all_predictions = []
all_labels = []
for batch in tf_test_dataset:
    logits = model.predict_on_batch(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    for prediction, label in zip(predictions, labels):
        # Keep only positions with a real label; -100 marks ignored positions.
        scored = [
            (predicted_idx, label_idx)
            for predicted_idx, label_idx in zip(prediction, label)
            if label_idx != -100
        ]
        all_predictions.extend(label_names[predicted_idx] for predicted_idx, _ in scored)
        all_labels.extend(label_names[label_idx] for _, label_idx in scored)

Map:   0%|          | 0/3394 [00:00<?, ? examples/s]

Map:   0%|          | 0/1009 [00:00<?, ? examples/s]

Map:   0%|          | 0/1287 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Some layers from the model checkpoint at J1mb0o/bert-finetuned-batch16-lr3e-5 were not used when initializing TFBertForTokenClassification: ['dropout_265']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForTokenClassification were initialized from the model checkpoint at J1mb0o/bert-finetuned-batch16-lr3e

J1mb0o/bert-finetuned-batch16-lr3e-5


### Convert the labels to ids

In [12]:
# print(all_predictions[0])
# print(tag2id[all_predictions[0]])

# Map the tag strings back to integer ids for the scikit-learn metrics.
all_predictions_id = list(map(tag2id.__getitem__, all_predictions))
all_labels_id = list(map(tag2id.__getitem__, all_labels))
class_id = [*tag2id.values()]
# print(class_id)


In [59]:
from sklearn import metrics

def evaluate_predictions(y_pred, y_true, names_dict):
    """Print per-class and micro/macro-averaged precision, recall and F1.

    Args:
        y_pred: Flat sequence of predicted class ids.
        y_true: Flat sequence of reference class ids, aligned with ``y_pred``.
        names_dict: Mapping of class name -> class id (e.g. ``tag2id``). It
            determines which classes are listed and in which order.

    Returns:
        None. The table is written to stdout.
    """
    class_ids = list(names_dict.values())

    # Pin the label set and order explicitly. Without `labels=`, scikit-learn
    # returns one entry per class that actually occurs in y_true/y_pred, so
    # indexing the result by class id breaks (wrong row or IndexError) as soon
    # as one class is missing from both.
    precision, recall, f1, _ = metrics.precision_recall_fscore_support(
        y_true=y_true, y_pred=y_pred, labels=class_ids, average=None
    )

    # The averages deliberately keep scikit-learn's default label set (classes
    # present in the data), matching the previous behaviour of this function.
    precision_micro, recall_micro, f1_micro, _ = metrics.precision_recall_fscore_support(
        y_true=y_true, y_pred=y_pred, average='micro'
    )
    precision_macro, recall_macro, f1_macro, _ = metrics.precision_recall_fscore_support(
        y_true=y_true, y_pred=y_pred, average='macro'
    )

    print(f"{'Label':<20} {'Precision':<20} {'Recall':<20} {'F1':<20}")
    # Row i of the per-class arrays belongs to class_ids[i], i.e. to the i-th
    # entry of names_dict.
    for row, tag in enumerate(names_dict.keys()):
        print(f"{tag:<20} {precision[row]:<20.4f} {recall[row]:<20.4f} {f1[row]:<20.4f}")
    print('\n')
    print(f"{'micro':<20} {precision_micro:<20.4f} {recall_micro:<20.4f} {f1_micro:<20.4f}")
    print(f"{'macro':<20} {precision_macro:<20.4f} {recall_macro:<20.4f} {f1_macro:<20.4f}")


# Score the flattened test-set predictions against the reference labels,
# listing the classes in tag2id order.
evaluate_predictions(all_predictions_id, all_labels_id, tag2id)


Label                Precision            Recall               F1                  
O                    0.9455               0.9964               0.9703              
B-person             0.8498               0.4615               0.5982              
I-person             0.8239               0.3007               0.4405              
B-location           0.7054               0.5267               0.6031              
I-location           0.7850               0.3544               0.4884              
B-corporation        0.3396               0.2727               0.3025              
I-corporation        0.3544               0.2105               0.2642              
B-product            0.4390               0.1417               0.2143              
I-product            0.5593               0.2758               0.3694              
B-creative-work      0.7222               0.2746               0.3980              
I-creative-work      0.6301               0.2081               0.3129       