<a href="https://colab.research.google.com/github/linhlinhle997/ner-recognition/blob/features%2F01-pos-tagging/pos_tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install evaluate nltk

In [None]:
from typing import List
import numpy as np
import torch
import evaluate
from sklearn.model_selection import train_test_split
from collections import defaultdict

import nltk

## Load dataset

In [None]:
# Download the "treebank" corpus through NLTK's downloader
nltk.download("treebank")

# Tagged sentences of the treebank corpus; each item unpacks into (word, tag) pairs
tagged_sentences = nltk.corpus.treebank.tagged_sents()
sample_count = len(tagged_sentences)
print("Number of samples: ", sample_count)

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


Number of samples:  3914


In [None]:
# Build two parallel lists: lower-cased words per sentence and their tags
sentences, sentence_tags = [], []

for word_tag_pairs in tagged_sentences:
    # Unzip [(word, tag), ...] into a tuple of words and a tuple of tags
    words, pos_tags = zip(*word_tag_pairs)
    sentences.append([w.lower() for w in words])
    sentence_tags.append(list(pos_tags))

# Show the first sentence as a sanity check
print("Sentences: ", sentences[0])
print("Sentenes tag: ", sentence_tags[0])

Sentences:  ['pierre', 'vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', '29', '.']
Sentenes tag:  ['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD', 'VB', 'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NNP', 'CD', '.']


## Preprocessing

### Split the dataset into train, validation, and test sets

In [None]:
# Fixed seed so the train/val/test partition is the same on every run
SEED = 42

# First cut: 70% for training, 30% held out
train_sentences, holdout_sentences, train_tags, holdout_tags = train_test_split(
    sentences, sentence_tags,
    test_size=0.3,
    random_state=SEED
)

# Second cut: the held-out part is halved into validation and test
valid_sentences, test_sentences, valid_tags, test_tags = train_test_split(
    holdout_sentences, holdout_tags,
    test_size=0.5,
    random_state=SEED
)

print("Train: ", len(train_sentences))
print("Val: ", len(valid_sentences))
print("Test: ", len(test_sentences))

Train:  2739
Val:  587
Test:  588


### Build dataset

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Hugging Face Hub id of the checkpoint used for both the tokenizer and the model
model_name = "QCRI/bert-base-multilingual-cased-pos-english"

# `use_fast=True` asks AutoTokenizer for the fast tokenizer implementation
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# Token-classification head and weights are loaded from the same checkpoint
model = AutoModelForTokenClassification.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.12k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/712M [00:00<?, ?B/s]

Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Create a label2id and id2label mapping
label2id = defaultdict(int, model.config.label2id)
id2label = {id: label for label, id in label2id.items()}

print(label2id)
print(id2label)

defaultdict(<class 'int'>, {'#': 7, '$': 6, "''": 5, ',': 2, '-LRB-': 17, '-RRB-': 32, '.': 4, ':': 3, 'CC': 8, 'CD': 9, 'DT': 10, 'EX': 11, 'FW': 12, 'IN': 13, 'JJ': 14, 'JJR': 15, 'JJS': 16, 'LS': 18, 'MD': 19, 'NN': 20, 'NNP': 21, 'NNPS': 22, 'NNS': 23, 'O': 0, 'PDT': 24, 'POS': 25, 'PRP': 26, 'PRP$': 27, 'RB': 28, 'RBR': 29, 'RBS': 30, 'RP': 31, 'SYM': 33, 'TO': 34, 'UH': 35, 'VB': 36, 'VBD': 37, 'VBG': 38, 'VBN': 39, 'VBP': 40, 'VBZ': 41, 'WDT': 42, 'WP': 43, 'WP$': 44, 'WRB': 45, '``': 1})
{7: '#', 6: '$', 5: "''", 2: ',', 17: '-LRB-', 32: '-RRB-', 4: '.', 3: ':', 8: 'CC', 9: 'CD', 10: 'DT', 11: 'EX', 12: 'FW', 13: 'IN', 14: 'JJ', 15: 'JJR', 16: 'JJS', 18: 'LS', 19: 'MD', 20: 'NN', 21: 'NNP', 22: 'NNPS', 23: 'NNS', 0: 'O', 24: 'PDT', 25: 'POS', 26: 'PRP', 27: 'PRP$', 28: 'RB', 29: 'RBR', 30: 'RBS', 31: 'RP', 33: 'SYM', 34: 'TO', 35: 'UH', 36: 'VB', 37: 'VBD', 38: 'VBG', 39: 'VBN', 40: 'VBP', 41: 'VBZ', 42: 'WDT', 43: 'WP', 44: 'WP$', 45: 'WRB', 1: '``'}


In [None]:
from torch.utils.data import Dataset

# Default fixed sequence length every example is padded / truncated to
MAX_LEN = 256

class PosTaggingDataset(Dataset):
    """Fixed-length token-classification examples built from pre-split sentences.

    Each word is looked up directly with ``tokenizer.convert_tokens_to_ids``;
    no sub-word splitting is performed and no special tokens are added.
    NOTE(review): words missing from the vocabulary presumably map to the
    tokenizer's unknown-token id - confirm against the tokenizer in use.

    Args:
        sentences: One list of words per sentence.
        tags: One list of tag strings per sentence, parallel to ``sentences``.
        tokenizer: Object exposing ``convert_tokens_to_ids`` and ``pad_token_id``.
        label2id: Mapping from tag string to integer id.
        max_len: Length every returned tensor is padded or truncated to.
        pad_label_id: Label id written at padded positions. When ``None``
            (default) the id of the ``"O"`` tag in ``label2id`` is used.
            Pass ``-100`` to have PyTorch's cross-entropy loss skip padding.
    """

    def __init__(
        self,
        sentences: List[List[str]],
        tags: List[List[str]],
        tokenizer,
        label2id,
        max_len=MAX_LEN,
        pad_label_id=None
    ):
        super().__init__()
        self.sentences = sentences
        self.tags = tags
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.label2id = label2id
        # Resolve the padding label once, from the mapping given to this
        # instance and with the letter "O" (not the digit "0") as the key.
        self.pad_label_id = label2id["O"] if pad_label_id is None else pad_label_id

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``labels`` and ``attention_mask`` tensors of length ``max_len``."""
        words = self.sentences[idx]
        word_tags = self.tags[idx]

        token_ids = self.tokenizer.convert_tokens_to_ids(words)
        # Real tokens get mask 1; padded positions are filled with 0 below
        attention_mask = [1] * len(token_ids)
        labels = [self.label2id[tag] for tag in word_tags]

        return {
            "input_ids": self.pad_and_truncate(token_ids, pad_id=self.tokenizer.pad_token_id),
            "labels": self.pad_and_truncate(labels, pad_id=self.pad_label_id),
            "attention_mask": self.pad_and_truncate(attention_mask, pad_id=0)
        }

    def pad_and_truncate(self, inputs: List[int], pad_id: int):
        """Return ``inputs`` as a tensor of exactly ``max_len`` items.

        Longer sequences are cut at ``max_len``; shorter ones are right-padded
        with ``pad_id``.
        """
        clipped = inputs[:self.max_len]
        # A non-positive multiplier yields an empty list, so full-length input is unchanged
        padded_inputs = clipped + [pad_id] * (self.max_len - len(clipped))
        return torch.as_tensor(padded_inputs)

### Dataset loader

In [None]:
# Wrap each split in a PosTaggingDataset sharing the tokenizer and label mapping
train_dataset = PosTaggingDataset(
    sentences=train_sentences, tags=train_tags, tokenizer=tokenizer, label2id=label2id
)
val_dataset = PosTaggingDataset(
    sentences=valid_sentences, tags=valid_tags, tokenizer=tokenizer, label2id=label2id
)
test_dataset = PosTaggingDataset(
    sentences=test_sentences, tags=test_tags, tokenizer=tokenizer, label2id=label2id
)

# Report the size of every split
for split_label, split_dataset in (
    ("Train: ", train_dataset),
    ("Val: ", val_dataset),
    ("Test: ", test_dataset),
):
    print(split_label, len(split_dataset))

Train:  2739
Val:  587
Test:  588


## Metric

In [None]:
accuracy = evaluate.load("accuracy")

# Sentinel label id: the number of entries in label2id when this cell runs
ignore_label = len(label2id)

def compute_metrics(eval_pred):
    """Compute token-level accuracy over the positions that carry a real label.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds
            per-class scores on its last axis and ``labels`` holds integer ids.

    Returns:
        The result of ``accuracy.compute`` on the unmasked positions.

    Positions whose label equals ``ignore_label`` or ``-100`` (the default
    ignore index of PyTorch's cross-entropy loss) are left out of the score.
    NOTE(review): positions padded with a regular label id are still counted;
    this function cannot tell them apart from real tokens.
    """
    predictions, labels = eval_pred
    mask = (labels != ignore_label) & (labels != -100)
    # Highest-scoring class per position
    predictions = np.argmax(predictions, axis=-1)
    return accuracy.compute(predictions=predictions[mask], references=labels[mask])

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

### Trainer

In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration. Evaluation and checkpointing both happen once per
# epoch, which `load_best_model_at_end=True` relies on to reload a saved
# checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="out_dir",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none"  # Disable all external logging (wandb)
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `processing_class` is the current name of the deprecated `tokenizer` keyword
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Last expression of the cell, so the returned TrainOutput is displayed
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.050375,0.985985
2,No log,0.041559,0.988095
3,0.151000,0.036931,0.989113
4,0.151000,0.034618,0.989938
5,0.151000,0.033916,0.989938
6,0.031500,0.033034,0.990324
7,0.031500,0.032523,0.990524
8,0.031500,0.031939,0.990737
9,0.025500,0.032019,0.990783
10,0.025500,0.032126,0.990763


TrainOutput(global_step=1720, training_loss=0.06343442506568377, metrics={'train_runtime': 1632.621, 'train_samples_per_second': 16.777, 'train_steps_per_second': 1.054, 'total_flos': 3579882599208960.0, 'train_loss': 0.06343442506568377, 'epoch': 10.0})

## Inference

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Evaluation mode (e.g. turns dropout off) so the prediction does not depend
# on whatever mode the model was left in after training
model.eval()

# Tokenization: whitespace-split words are looked up directly in the vocabulary,
# the same way PosTaggingDataset encodes its sentences.
# NOTE(review): the training sentences were lower-cased while this sentence is
# not - confirm whether the input should be lower-cased here as well.
test_sentence = "We are exploring the topic of deep learning"
input_ids = torch.as_tensor([tokenizer.convert_tokens_to_ids(test_sentence.split())]).to(device)

# Prediction: gradients are not needed for a forward pass used only for decoding
with torch.no_grad():
    outputs = model(input_ids)
# Highest-scoring tag id per token of the single sentence in the batch
preds = outputs.logits.argmax(dim=-1)[0].cpu().numpy()

# Decode predictions
pred_tags = " ".join(id2label[int(pred)] for pred in preds)

pred_tags  # recorded run: PRP VBP RB DT NN IN JJ NN

'PRP VBP RB DT NN IN JJ NN'