### Load dataset

In [None]:
!pip install evaluate



In [None]:
from typing import List
import numpy as np
import torch
import evaluate
from sklearn.model_selection import train_test_split
import nltk
nltk.download('treebank')

# Tagged sentences of the NLTK treebank corpus: each sentence is a sequence of
# (word, tag) pairs.
tagged_sentences = nltk.corpus.treebank.tagged_sents()
print('Number of samples: ', len(tagged_sentences))

# Build two parallel lists: lower-cased words per sentence and their tags.
# NOTE(review): words are lower-cased here although the checkpoint loaded later
# has "cased" in its name -- confirm this normalisation is intended.
sentences, sentence_tags = [], []
for word_tag_pairs in tagged_sentences:
    sentences.append([word.lower() for word, _ in word_tag_pairs])
    sentence_tags.append([tag for _, tag in word_tag_pairs])

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


Number of samples:  3914


In [None]:
# Sanity check: display the first sentence as a list of lower-cased words.
sentences[0]

['pierre',
 'vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'nov.',
 '29',
 '.']

### Preprocessing

In [None]:
# Fixed seed so the train / validation / test partition is identical on every
# "Restart & Run All"; without it each run trains and evaluates on different data.
SEED = 42

# Step 1: keep 70% for training, hold out 30%.
train_sentences, holdout_sentences, train_tags, holdout_tags = train_test_split(
    sentences,
    sentence_tags,
    test_size=0.3,
    random_state=SEED,
)

# Step 2: cut the hold-out set in half -> 15% validation, 15% test.
# Separate "holdout_*" names are used so that no variable changes meaning
# within the cell.
valid_sentences, test_sentences, valid_tags, test_tags = train_test_split(
    holdout_sentences,
    holdout_tags,
    test_size=0.5,
    random_state=SEED,
)

In [None]:
# tokenization and modeling
from transformers import AutoTokenizer, AutoModelForTokenClassification
from torch.utils.data import Dataset

# Default sequence length used by PosTagging_Dataset when padding / truncating.
MAX_LEN = 256
# Hub checkpoint that provides both the tokenizer vocabulary and the
# token-classification model.
model_name = 'QCRI/bert-base-multilingual-cased-pos-english'

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForTokenClassification.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identic

In [None]:
class PosTagging_Dataset(Dataset):
    """Word-level POS-tagging dataset that yields fixed-length tensors.

    One item corresponds to one sentence. Words are mapped to vocabulary ids
    with ``tokenizer.convert_tokens_to_ids`` (one id per word; this class adds
    no special tokens and performs no sub-word splitting), tags are mapped
    through ``label2id``, and ids, labels and attention mask are all padded or
    truncated to ``max_len``.

    Args:
        sentences: One list of word strings per sentence.
        tags: One list of tag strings per sentence, aligned with ``sentences``.
        tokenizer: Object exposing ``convert_tokens_to_ids`` and
            ``pad_token_id``.
        label2id: Mapping from tag string to integer label id. It must be able
            to resolve ``'O'`` when ``label_pad_id`` is ``None``.
        max_len: Length of every tensor returned by ``__getitem__``.
        label_pad_id: Value written into padded label positions. ``None``
            (default) keeps the original behaviour of padding with
            ``label2id['O']``, which makes padding indistinguishable from a
            real ``'O'`` label. Passing ``-100`` (the default ``ignore_index``
            of ``torch.nn.CrossEntropyLoss``) lets such a loss skip padded
            positions; any metric code must then ignore the same value.

    Raises:
        ValueError: If ``sentences`` and ``tags`` have different lengths, or
            (on item access) a sentence and its tag list differ in length.
    """

    def __init__(self,
                 sentences: List[List[str]],
                 tags: List[List[str]],
                 tokenizer,
                 label2id,
                 max_len=MAX_LEN,
                 label_pad_id=None):
        super().__init__()
        if len(sentences) != len(tags):
            raise ValueError(
                f'sentences and tags must have the same length, '
                f'got {len(sentences)} and {len(tags)}'
            )
        self.sentences = sentences
        self.tags = tags
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.label_pad_id = label_pad_id

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``labels`` and ``attention_mask`` for sentence ``idx``."""
        words = self.sentences[idx]
        word_tags = self.tags[idx]
        # A silent length mismatch would pair words with the wrong tags.
        if len(words) != len(word_tags):
            raise ValueError(
                f'sentence {idx} has {len(words)} words but {len(word_tags)} tags'
            )

        input_ids = self.tokenizer.convert_tokens_to_ids(words)
        attention_mask = [1] * len(input_ids)
        labels = [self.label2id[tag] for tag in word_tags]

        # Resolved on access (not in __init__) so the lookup of 'O' happens at
        # the same point as in the original implementation.
        if self.label_pad_id is None:
            label_pad_id = self.label2id['O']
        else:
            label_pad_id = self.label_pad_id

        return {
            'input_ids': self.pad_and_truncate(input_ids, pad_id=self.tokenizer.pad_token_id),
            'labels': self.pad_and_truncate(labels, pad_id=label_pad_id),
            'attention_mask': self.pad_and_truncate(attention_mask, pad_id=0)
        }

    def pad_and_truncate(self, inputs: List[int], pad_id: int):
        """Clip ``inputs`` to ``max_len`` items, right-pad with ``pad_id``, return a tensor."""
        clipped = inputs[:self.max_len]
        # The multiplier is 0 when the sequence already fills max_len.
        padded_inputs = clipped + [pad_id] * (self.max_len - len(clipped))
        return torch.as_tensor(padded_inputs)

In [None]:
# label2id, id2label
from collections import defaultdict

# Tag -> id mapping taken from the checkpoint's config. Wrapping it in
# defaultdict(int) means looking up a tag that is not in the config yields 0
# (and inserts that tag into the mapping) instead of raising KeyError.
label2id = defaultdict(int, model.config.label2id)
# Reverse mapping (id -> tag) used to decode predictions.
id2label = {tag_id: tag for tag, tag_id in label2id.items()}
label2id

defaultdict(int,
            {'#': 7,
             '$': 6,
             "''": 5,
             ',': 2,
             '-LRB-': 17,
             '-RRB-': 32,
             '.': 4,
             ':': 3,
             'CC': 8,
             'CD': 9,
             'DT': 10,
             'EX': 11,
             'FW': 12,
             'IN': 13,
             'JJ': 14,
             'JJR': 15,
             'JJS': 16,
             'LS': 18,
             'MD': 19,
             'NN': 20,
             'NNP': 21,
             'NNPS': 22,
             'NNS': 23,
             'O': 0,
             'PDT': 24,
             'POS': 25,
             'PRP': 26,
             'PRP$': 27,
             'RB': 28,
             'RBR': 29,
             'RBS': 30,
             'RP': 31,
             'SYM': 33,
             'TO': 34,
             'UH': 35,
             'VB': 36,
             'VBD': 37,
             'VBG': 38,
             'VBN': 39,
             'VBP': 40,
             'VBZ': 41,
             'WDT': 42,
      

In [None]:
# Wrap each split in the same dataset class, sharing tokenizer and label map.
train_dataset, val_dataset, test_dataset = (
    PosTagging_Dataset(split_sentences, split_tags, tokenizer, label2id)
    for split_sentences, split_tags in (
        (train_sentences, train_tags),
        (valid_sentences, valid_tags),
        (test_sentences, test_tags),
    )
)

### Metric

In [None]:
accuracy = evaluate.load("accuracy")
# Original sentinel, kept so any existing reference to `ignore_label` still works.
ignore_label = len(label2id)
# Label ids that must not be scored:
#   * label2id['O'] -- the id PosTagging_Dataset pads labels with by default.
#     Masking only `ignore_label` (a value that does not occur in the labels)
#     left every padded position in the score and inflated the accuracy.
#     Tags missing from the checkpoint's label set also resolve to this id
#     through the defaultdict, so those tokens are excluded too.
#   * -100 -- default `ignore_index` of torch.nn.CrossEntropyLoss, in case the
#     labels are padded with it instead.
IGNORED_LABEL_IDS = [ignore_label, label2id['O'], -100]


def compute_metrics(eval_pred):
    """Token-level accuracy over positions that carry a real tag.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds
            per-class scores on its last axis and ``labels`` holds integer
            label ids (assumed to be NumPy arrays of matching leading shape,
            as the boolean-mask indexing below requires).

    Returns:
        The result of ``accuracy.compute`` for the unmasked positions.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    # True where the label is a real tag, False for padding / ignored ids.
    mask = ~np.isin(labels, IGNORED_LABEL_IDS)
    return accuracy.compute(
        predictions=predictions[mask].tolist(),
        references=labels[mask].tolist()
    )

### Trainer

In [None]:
from transformers import TrainingArguments, Trainer
import wandb
# Disable Weights & Biases logging for this run.
wandb.init(mode='disabled')

# Evaluate and checkpoint once per epoch so the best checkpoint can be
# restored when training finishes.
training_args = TrainingArguments(
    output_dir='out_dir',
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated in recent transformers releases (the warning
    # captured under this cell points at this call); `processing_class=` is
    # its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.048083,0.986485
2,No log,0.039083,0.988661
3,0.148700,0.034849,0.989892
4,0.148700,0.033177,0.990431
5,0.148700,0.032277,0.99071
6,0.031400,0.031618,0.99085
7,0.031400,0.030979,0.991016
8,0.031400,0.03059,0.991169
9,0.025000,0.030565,0.991216
10,0.025000,0.030579,0.991263


TrainOutput(global_step=1720, training_loss=0.0625857719155245, metrics={'train_runtime': 1457.4068, 'train_samples_per_second': 18.794, 'train_steps_per_second': 1.18, 'total_flos': 3579882599208960.0, 'train_loss': 0.0625857719155245, 'epoch': 10.0})

### Inference

In [None]:
test_sentence = 'We are exploring the topic of deep learning'

# Apply the same preprocessing as the training data: whitespace-separated,
# lower-cased words mapped one-to-one to vocabulary ids.
words = test_sentence.lower().split()
# Build the batch on the model's own device instead of assuming CUDA is present.
input_sentence = torch.as_tensor(
    [tokenizer.convert_tokens_to_ids(words)],
    device=model.device,
)

# predictions
model.eval()              # make sure dropout is off
with torch.no_grad():     # no gradients are needed for inference
    outputs = model(input_sentence)
# Highest-scoring label id per position of the single sentence in the batch.
preds = outputs.logits.argmax(dim=-1)[0].cpu().tolist()

# decode
pred_tags = ' '.join(id2label[pred] for pred in preds)
pred_tags

'PRP VBP RB DT NN IN JJ NN '