In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForTokenClassification
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoModelForTokenClassification
import pandas as pd
import numpy as np
import evaluate

In [1]:
pip install --upgrade "datasets<4.0.0" --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# we need compute metrics for evaluations
!pip install evaluate --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
rw_dt = load_dataset("conll2003", trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
rw_dt

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [None]:
rw_dt["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [None]:
# ner tags of the training
ner_train_features = rw_dt['train'].features['ner_tags'].feature.names
print(ner_train_features)

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [None]:
# tokenizer and label aligner function for data
tokenizer=AutoTokenizer.from_pretrained("bert-base-cased")
def tokenize_and_align_labels(examples):
    tokenized_inputs= tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels=[]
    for i,label in enumerate(examples["ner_tags"]):
        word_ids=tokenized_inputs.word_ids(batch_index=i)
        label_ids=[]
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_id])
        labels.append(label_ids)
        tokenized_inputs["labels"]=labels
    return tokenized_inputs


In [None]:
# mapping data using the function we wrote
rw_mapped= rw_dt.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [None]:
rw_mapped["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0],
 'input_ids': [101,
  7270,
  22961,
  1528,
  1840,
  1106,
  21423,
  1418,
  2495,
  12913,
  119,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]}

In [None]:
# defining model
model=AutoModelForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(ner_train_features))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
metric = evaluate.load("seqeval")
label_list = ner_train_features

def compute_metrics(eval_pred):
    logits, label_ids = eval_pred
    pred_ids = np.argmax(logits, axis=-1)
    true_tags = []
    pred_tags = []
    for true_seq, pred_seq in zip(label_ids, pred_ids):
        t_names = []
        p_names = []
        for t, p in zip(true_seq, pred_seq):
            if t == -100:
                continue
            t_names.append(label_list[t])
            p_names.append(label_list[p])
        true_tags.append(t_names)
        pred_tags.append(p_names)

    results = metric.compute(predictions=pred_tags, references=true_tags)
    return {
        "precision": results["overall_precision"],
        "recall":    results["overall_recall"],
        "f1":        results["overall_f1"],
        "accuracy":  results["overall_accuracy"],
    }


In [None]:
training_args= TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,

)

In [None]:
trainer=Trainer(
    model=model,
    args=training_args,
    train_dataset=rw_mapped["train"],
    eval_dataset=rw_mapped["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer=Trainer(


In [None]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mimankh7[0m ([33mimankh7-hofin[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  return forward_call(*args, **kwargs)


Step,Training Loss
500,0.2777
1000,0.1129
1500,0.0791
2000,0.0644
2500,0.045
3000,0.0457
3500,0.0379
4000,0.0235
4500,0.0208
5000,0.0226


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=5268, training_loss=0.07034242135064568, metrics={'train_runtime': 685.9774, 'train_samples_per_second': 61.406, 'train_steps_per_second': 7.68, 'total_flos': 920771584279074.0, 'train_loss': 0.07034242135064568, 'epoch': 3.0})

In [None]:
# time to check and evaluate test set
trainer.evaluate(rw_mapped["test"])

  return forward_call(*args, **kwargs)


{'eval_loss': 0.20438596606254578,
 'eval_precision': 0.8872982584115985,
 'eval_recall': 0.9027739122367566,
 'eval_f1': 0.8949691897360433,
 'eval_accuracy': 0.9700130788988512,
 'eval_runtime': 10.4375,
 'eval_samples_per_second': 330.826,
 'eval_steps_per_second': 41.389,
 'epoch': 3.0}