# Download the dataset & get the packeges

In [1]:
!pip install transformers datasets evaluate
from IPython.display import clear_output
clear_output()

In [2]:
!wget https://noisy-text.github.io/2017/files/wnut17train.conll
!wget https://noisy-text.github.io/2017/files/emerging.dev.conll
!wget https://noisy-text.github.io/2017/files/emerging.test.annotated
clear_output()

# Transofrm the data and convert to 🤗 Dataset

In [3]:
from pathlib import Path
import re

def read_wnut(file_path):
    file_path = Path(file_path)

    raw_text = file_path.read_text().strip()
    raw_docs = re.split(r'\n\t?\n', raw_text)
    token_docs = []
    tag_docs = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):
            if len(line.split('\t'))  < 2:
                continue
            # print(line, len(line.split('\t')))
            token, tag = line.split('\t')
            tokens.append(token)
            tags.append(tag)
        token_docs.append(tokens)
        tag_docs.append(tags)
    return token_docs, tag_docs



In [4]:
train_tokens, train_tags = read_wnut('wnut17train.conll')
val_tokens, val_tags = read_wnut('emerging.dev.conll')
test_tokens, test_tags = read_wnut('emerging.test.annotated')

In [5]:
# total_tags = train_tags + val_tags + test_tags

# unique_tags = set(tag for doc in total_tags for tag in doc)
# tag2id = {tag: id for id, tag in enumerate(unique_tags)}
tag2id = {
          'O': 0,
          'B-person': 1,
          'I-person': 2,
          'B-location': 3,
          'I-location': 4,
          'B-corporation': 5,
          'I-corporation': 6,
          'B-product': 7,
          'I-product': 8,
          'B-creative-work': 9,
          'I-creative-work': 10,
          'B-group': 11,
          'I-group': 12
          }

id2tag = {id: tag for tag, id in tag2id.items()}

train_tag_id = [[tag2id[tag] for tag in doc] for doc in train_tags]
val_tag_id = [[tag2id[tag] for tag in doc] for doc in val_tags]
test_tag_id = [[tag2id[tag] for tag in doc] for doc in test_tags]

In [7]:
from datasets import  Dataset, DatasetDict

train_data = {
    'tokens': train_tokens,
    'tag': train_tags,
    'tag_id': train_tag_id,
}

test_data = {
    'tokens': test_tokens,
    'tag': test_tags,
    'tag_id': test_tag_id,
}

validation_data = {
    'tokens': val_tokens,
    'tag': val_tags,
    'tag_id': val_tag_id,
}

train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(validation_data)
test_dataset = Dataset.from_dict(test_data)

whole_dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

In [8]:
whole_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tag', 'tag_id'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['tokens', 'tag', 'tag_id'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['tokens', 'tag', 'tag_id'],
        num_rows: 1287
    })
})

# Helper Functions

In [9]:
# labels in int
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["tag_id"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

# Fine-tuning preparation

In [10]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [11]:
# inputs = tokenizer(whole_dataset["train"][0]["tokens"], is_split_into_words=True)
# print(inputs.tokens())

In [12]:
# print(inputs.word_ids())

In [13]:
# labels = whole_dataset["train"][0]["tag_id"]
# word_ids = inputs.word_ids()
# print(labels)
# print(align_labels_with_tokens(labels, word_ids))

In [14]:
tokenized_datasets = whole_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=whole_dataset["train"].column_names,
)

Map:   0%|          | 0/3394 [00:00<?, ? examples/s]

Map:   0%|          | 0/1009 [00:00<?, ? examples/s]

Map:   0%|          | 0/1287 [00:00<?, ? examples/s]

In [15]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer, return_tensors="tf"
)

In [16]:
# batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
# batch["labels"]

In [17]:
# for i in range(2):
#     print(tokenized_datasets["train"][i]["labels"])

# Baseline

In [18]:
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

tf_val_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [37]:
from transformers import TFAutoModelForTokenClassification

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2tag,
    label2id=tag2id,
)

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [38]:
from transformers import create_optimizer
import tensorflow as tf

# Train in mixed-precision float16
# Comment this line out if you're using a GPU that will not benefit from this
# tf.keras.mixed_precision.set_global_policy("mixed_float16")

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_epochs = 4
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

In [39]:
from transformers.keras_callbacks import PushToHubCallback

callback = PushToHubCallback(output_dir="bert-finetuned-ner-noval", tokenizer=tokenizer)

model.fit(
    tf_train_dataset,
    # validation_data=tf_eval_dataset,
    # callbacks=[callback],
    epochs=num_epochs,
)

/content/bert-finetuned-ner-noval is already a clone of https://huggingface.co/J1mb0o/bert-finetuned-ner-noval. Make sure you pull the latest changes with `repo.git_pull()`.


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.src.callbacks.History at 0x7c2349219ba0>

In [25]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=c7f471255c8478dd9b6329590fe95c7ad1ba5935dc5e23d303cd9806e3110bdc
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [26]:
import evaluate

metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [27]:
whole_dataset["train"]

Dataset({
    features: ['tokens', 'tag', 'tag_id'],
    num_rows: 3394
})

In [28]:
label_names = list(tag2id.keys())

In [29]:
whole_dataset["train"][1]["tag"]

['O',
 'O',
 'O',
 'O',
 'B-group',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [30]:
labels = whole_dataset["train"][0]["tag_id"]
labels = [label_names[i] for i in labels]
print(labels)

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-location', 'I-location', 'I-location', 'O', 'B-location', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [31]:
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

{'location': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [40]:
import numpy as np

all_predictions = []
all_labels = []
for batch in tf_test_dataset:
    logits = model.predict_on_batch(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    for prediction, label in zip(predictions, labels):
        for predicted_idx, label_idx in zip(prediction, label):
            if label_idx == -100:
                continue
            all_predictions.append(label_names[predicted_idx])
            all_labels.append(label_names[label_idx])
metric.compute(predictions=[all_predictions], references=[all_labels])

{'corporation': {'precision': 0.19047619047619047,
  'recall': 0.18181818181818182,
  'f1': 0.18604651162790697,
  'number': 66},
 'creative-work': {'precision': 0.32857142857142857,
  'recall': 0.1619718309859155,
  'f1': 0.2169811320754717,
  'number': 142},
 'group': {'precision': 0.27941176470588236,
  'recall': 0.11515151515151516,
  'f1': 0.16309012875536483,
  'number': 165},
 'location': {'precision': 0.4745762711864407,
  'recall': 0.37333333333333335,
  'f1': 0.41791044776119407,
  'number': 150},
 'person': {'precision': 0.722007722007722,
  'recall': 0.4358974358974359,
  'f1': 0.5436046511627907,
  'number': 429},
 'product': {'precision': 0.1506849315068493,
  'recall': 0.08661417322834646,
  'f1': 0.11,
  'number': 127},
 'overall_precision': 0.4731182795698925,
 'overall_recall': 0.28544949026876737,
 'overall_f1': 0.3560693641618497,
 'overall_accuracy': 0.9331155778894472}

In [41]:
!pip install seqeval



In [50]:
from seqeval import metrics
# from sklearn.metrics import classification_report
print(metrics.classification_report([all_predictions], [all_labels]))

from sklearn.metrics import classification_report
# y_true = [0, 1, 2, 2, 2]
# y_pred = [0, 0, 2, 2, 1]
target_names = ['class 0', 'class 1', 'class 2']
print(classification_report(all_labels, all_predictions, target_names=label_names))



               precision    recall  f1-score   support

  corporation       0.18      0.19      0.19        63
creative-work       0.16      0.33      0.22        70
        group       0.12      0.28      0.16        68
     location       0.37      0.47      0.42       118
       person       0.44      0.72      0.54       259
      product       0.09      0.15      0.11        73

    micro avg       0.29      0.47      0.36       651
    macro avg       0.23      0.36      0.27       651
 weighted avg       0.30      0.47      0.36       651

                 precision    recall  f1-score   support

              O       0.36      0.27      0.31        66
       B-person       0.93      0.09      0.17       142
       I-person       0.44      0.11      0.17       165
     B-location       0.62      0.45      0.52       150
     I-location       0.84      0.46      0.59       429
  B-corporation       0.50      0.03      0.06       127
  I-corporation       0.28      0.11      0.16 

# Hyper parameter

We will experiment with 2 batch sizes `batch_size = [16,32]` and 3 learning rates `lr=[1e-5, 2e-5, 5e-5]`

Also finetuned for 3 epochs
https://datascience.stackexchange.com/a/97883


## Batch 32 lr 1e-5

In [51]:
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)

tf_val_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=32,
)

tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=32,
)

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2tag,
    label2id=tag2id,
)

num_epochs = 4
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=1e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)


callback = PushToHubCallback(output_dir="bert-finetuned-batch32-lr1e-5", tokenizer=tokenizer)

model.fit(
    tf_train_dataset,
    validation_data=tf_val_dataset,
    callbacks=[callback],
    epochs=num_epochs,
)

all_predictions = []
all_labels = []
for batch in tf_test_dataset:
    logits = model.predict_on_batch(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    for prediction, label in zip(predictions, labels):
        for predicted_idx, label_idx in zip(prediction, label):
            if label_idx == -100:
                continue
            all_predictions.append(label_names[predicted_idx])
            all_labels.append(label_names[label_idx])


print(metrics.classification_report([all_predictions], [all_labels]))
print(classification_report(all_labels, all_predictions, target_names=label_names))

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Cloning https://huggingface.co/J1mb0o/bert-finetuned-batch32-lr1e-5 into local empty directory.


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

  corporation       0.00      0.00      0.00         0
creative-work       0.00      0.00      0.00         1
        group       0.00      0.00      0.00         2
     location       0.15      0.31      0.21        74
       person       0.25      0.56      0.34       188
      product       0.00      0.00      0.00        26

    micro avg       0.12      0.44      0.19       291
    macro avg       0.07      0.15      0.09       291
 weighted avg       0.20      0.44      0.27       291



  _warn_prf(average, modifier, msg_start, len(result))


                 precision    recall  f1-score   support

              O       0.00      0.00      0.00        66
       B-person       0.00      0.00      0.00       142
       I-person       0.00      0.00      0.00       165
     B-location       0.50      0.08      0.14       150
     I-location       0.83      0.02      0.05       429
  B-corporation       0.00      0.00      0.00       127
  I-corporation       0.00      0.00      0.00       133
      B-product       0.00      0.00      0.00       442
      I-product       0.00      0.00      0.00       242
B-creative-work       0.32      0.13      0.18       237
I-creative-work       0.55      0.24      0.34       918
        B-group       0.69      0.09      0.16       359
        I-group       0.93      1.00      0.96     36390

       accuracy                           0.92     39800
      macro avg       0.29      0.12      0.14     39800
   weighted avg       0.88      0.92      0.89     39800



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Barch 32 lr 3e-5

In [52]:
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)

tf_val_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=32,
)

tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=32,
)

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2tag,
    label2id=tag2id,
)

num_epochs = 4
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=3e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)


callback = PushToHubCallback(output_dir="bert-finetuned-batch32-lr3e-5", tokenizer=tokenizer)

model.fit(
    tf_train_dataset,
    validation_data=tf_val_dataset,
    callbacks=[callback],
    epochs=num_epochs,
)

all_predictions = []
all_labels = []
for batch in tf_test_dataset:
    logits = model.predict_on_batch(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    for prediction, label in zip(predictions, labels):
        for predicted_idx, label_idx in zip(prediction, label):
            if label_idx == -100:
                continue
            all_predictions.append(label_names[predicted_idx])
            all_labels.append(label_names[label_idx])


print(metrics.classification_report([all_predictions], [all_labels]))
print(classification_report(all_labels, all_predictions, target_names=label_names))

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Cloning https://huggingface.co/J1mb0o/bert-finetuned-batch32-lr3e-5 into local empty directory.


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
               precision    recall  f1-score   support

  corporation       0.05      0.04      0.04        69
creative-work       0.02      0.04      0.03        71
        group       0.08      0.19      0.12        73
     location       0.43      0.41      0.42       158
       person       0.45      0.63      0.52       303
      product       0.03      0.06      0.04        68

    micro avg       0.26      0.38      0.31       742
    macro avg       0.18      0.23      0.20       742
 weighted avg       0.29      0.38      0.33       742



  _warn_prf(average, modifier, msg_start, len(result))


                 precision    recall  f1-score   support

              O       0.21      0.20      0.20        66
       B-person       0.00      0.00      0.00       142
       I-person       0.41      0.10      0.16       165
     B-location       0.56      0.53      0.55       150
     I-location       0.74      0.48      0.59       429
  B-corporation       0.00      0.00      0.00       127
  I-corporation       0.21      0.05      0.07       133
      B-product       0.55      0.16      0.24       442
      I-product       0.33      0.08      0.13       242
B-creative-work       0.59      0.35      0.44       237
I-creative-work       0.81      0.30      0.43       918
        B-group       0.54      0.23      0.32       359
        I-group       0.94      1.00      0.97     36390

       accuracy                           0.93     39800
      macro avg       0.45      0.27      0.32     39800
   weighted avg       0.91      0.93      0.91     39800



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


##  Batch 32 lr 5e-5

In [53]:
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)

tf_val_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=32,
)

tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=32,
)

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2tag,
    label2id=tag2id,
)

num_epochs = 4
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)


callback = PushToHubCallback(output_dir="bert-finetuned-batch32-lr5e-5", tokenizer=tokenizer)

model.fit(
    tf_train_dataset,
    validation_data=tf_val_dataset,
    callbacks=[callback],
    epochs=num_epochs,
)

all_predictions = []
all_labels = []
for batch in tf_test_dataset:
    logits = model.predict_on_batch(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    for prediction, label in zip(predictions, labels):
        for predicted_idx, label_idx in zip(prediction, label):
            if label_idx == -100:
                continue
            all_predictions.append(label_names[predicted_idx])
            all_labels.append(label_names[label_idx])


print(metrics.classification_report([all_predictions], [all_labels]))
print(classification_report(all_labels, all_predictions, target_names=label_names))

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Cloning https://huggingface.co/J1mb0o/bert-finetuned-batch32-lr5e-5 into local empty directory.


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
               precision    recall  f1-score   support

  corporation       0.14      0.10      0.12        90
creative-work       0.04      0.10      0.06        62
        group       0.09      0.16      0.12        91
     location       0.43      0.45      0.44       142
       person       0.44      0.71      0.54       268
      product       0.02      0.04      0.02        52

    micro avg       0.26      0.40      0.32       705
    macro avg       0.19      0.26      0.22       705
 weighted avg       0.29      0.40      0.33       705

                 precision    recall  f1-score   support

              O       0.16      0.18      0.17        66
       B-person       1.00      0.06      0.11       142
       I-person       0.32      0.12      0.17       165
     B-location       0.59      0.49      0.53       150
     I-location       0.80      0.46      0.59       429
  B-corporation       0.33      0.03      0.06       127
  I-cor

## Batch 16 lr 1e-5

In [54]:
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

tf_val_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2tag,
    label2id=tag2id,
)

num_epochs = 4
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=1e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)


callback = PushToHubCallback(output_dir="bert-finetuned-batch16-lr1e-5", tokenizer=tokenizer)

model.fit(
    tf_train_dataset,
    validation_data=tf_val_dataset,
    callbacks=[callback],
    epochs=num_epochs,
)

all_predictions = []
all_labels = []
for batch in tf_test_dataset:
    logits = model.predict_on_batch(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    for prediction, label in zip(predictions, labels):
        for predicted_idx, label_idx in zip(prediction, label):
            if label_idx == -100:
                continue
            all_predictions.append(label_names[predicted_idx])
            all_labels.append(label_names[label_idx])


print(metrics.classification_report([all_predictions], [all_labels]))
print(classification_report(all_labels, all_predictions, target_names=label_names))

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Cloning https://huggingface.co/J1mb0o/bert-finetuned-batch16-lr1e-5 into local empty directory.


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
               precision    recall  f1-score   support

  corporation       0.00      0.00      0.00         2
creative-work       0.07      0.26      0.11        38
        group       0.00      0.00      0.00         7
     location       0.31      0.35      0.33       135
       person       0.41      0.65      0.50       270
      product       0.03      0.06      0.04        67

    micro avg       0.22      0.46      0.30       519
    macro avg       0.14      0.22      0.16       519
 weighted avg       0.30      0.46      0.36       519



  _warn_prf(average, modifier, msg_start, len(result))


                 precision    recall  f1-score   support

              O       1.00      0.02      0.03        66
       B-person       0.00      0.00      0.00       142
       I-person       0.50      0.01      0.01       165
     B-location       0.55      0.32      0.41       150
     I-location       0.79      0.35      0.49       429
  B-corporation       0.00      0.00      0.00       127
  I-corporation       0.00      0.00      0.00       133
      B-product       0.58      0.10      0.17       442
      I-product       0.14      0.00      0.01       242
B-creative-work       0.43      0.27      0.33       237
I-creative-work       0.70      0.30      0.42       918
        B-group       0.49      0.26      0.34       359
        I-group       0.94      1.00      0.97     36390

       accuracy                           0.93     39800
      macro avg       0.47      0.20      0.24     39800
   weighted avg       0.90      0.93      0.91     39800



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Batch 16 lr 3e-5

In [55]:
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

tf_val_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2tag,
    label2id=tag2id,
)

num_epochs = 4
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=3e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)


callback = PushToHubCallback(output_dir="bert-finetuned-batch16-lr3e-5", tokenizer=tokenizer)

model.fit(
    tf_train_dataset,
    validation_data=tf_val_dataset,
    callbacks=[callback],
    epochs=num_epochs,
)

all_predictions = []
all_labels = []
for batch in tf_test_dataset:
    logits = model.predict_on_batch(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    for prediction, label in zip(predictions, labels):
        for predicted_idx, label_idx in zip(prediction, label):
            if label_idx == -100:
                continue
            all_predictions.append(label_names[predicted_idx])
            all_labels.append(label_names[label_idx])


print(metrics.classification_report([all_predictions], [all_labels]))
print(classification_report(all_labels, all_predictions, target_names=label_names))

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Cloning https://huggingface.co/J1mb0o/bert-finetuned-batch16-lr3e-5 into local empty directory.


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
               precision    recall  f1-score   support

  corporation       0.26      0.24      0.25        70
creative-work       0.22      0.39      0.28        80
        group       0.13      0.36      0.19        58
     location       0.45      0.54      0.49       125
       person       0.45      0.73      0.56       262
      product       0.07      0.11      0.09        84

    micro avg       0.31      0.50      0.38       679
    macro avg       0.26      0.39      0.31       679
 weighted avg       0.33      0.50      0.39       679

                 precision    recall  f1-score   support

              O       0.34      0.27      0.30        66
       B-person       0.72      0.27      0.40       142
       I-person       0.59      0.16      0.26       165
     B-location       0.71      0.53      0.60       150
     I-location       0.85      0.46      0.60       429
  B-corporation       0.44      0.14      0.21       127
  I-cor

## Batch 16 lr 5e-5

In [56]:
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

tf_val_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2tag,
    label2id=tag2id,
)

num_epochs = 4
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)


callback = PushToHubCallback(output_dir="bert-finetuned-batch16-lr5e-5", tokenizer=tokenizer)

model.fit(
    tf_train_dataset,
    validation_data=tf_val_dataset,
    callbacks=[callback],
    epochs=num_epochs,
)

all_predictions = []
all_labels = []
for batch in tf_test_dataset:
    logits = model.predict_on_batch(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    for prediction, label in zip(predictions, labels):
        for predicted_idx, label_idx in zip(prediction, label):
            if label_idx == -100:
                continue
            all_predictions.append(label_names[predicted_idx])
            all_labels.append(label_names[label_idx])


print(metrics.classification_report([all_predictions], [all_labels]))
print(classification_report(all_labels, all_predictions, target_names=label_names))

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Cloning https://huggingface.co/J1mb0o/bert-finetuned-batch16-lr5e-5 into local empty directory.


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
               precision    recall  f1-score   support

  corporation       0.20      0.20      0.20        64
creative-work       0.19      0.34      0.24        80
        group       0.18      0.32      0.23        93
     location       0.46      0.53      0.49       131
       person       0.43      0.67      0.53       276
      product       0.07      0.14      0.09        65

    micro avg       0.31      0.47      0.37       709
    macro avg       0.26      0.37      0.30       709
 weighted avg       0.32      0.47      0.38       709

                 precision    recall  f1-score   support

              O       0.36      0.24      0.29        66
       B-person       0.58      0.25      0.35       142
       I-person       0.52      0.21      0.30       165
     B-location       0.67      0.53      0.59       150
     I-location       0.80      0.46      0.59       429
  B-corporation       0.47      0.12      0.19       127
  I-cor