In [None]:
!pip install evaluate

In [1]:
import torch
import numpy as np
import evaluate
import nltk

from typing import List
from sklearn.model_selection import train_test_split

# Download the "treebank" corpus that is read via nltk.corpus.treebank in the next cell.
nltk.download("treebank")

In [None]:
# Each element of `tagged_sentences` is one sentence given as (word, tag) pairs.
tagged_sentences = nltk.corpus.treebank.tagged_sents()
print("Number of samples: ", len(tagged_sentences))

# Build two parallel lists: lower-cased words and their tags, one inner list
# per sentence. The pairs are taken from the current `tagged_sentence` (the
# loop variable), not from the whole corpus.
sentences, sentence_tags = [], []
for tagged_sentence in tagged_sentences:
    sentences.append([word.lower() for word, _ in tagged_sentence])
    # Materialise a real list (not a generator) so the tags can be indexed,
    # measured with len() and iterated more than once.
    sentence_tags.append([tag for _, tag in tagged_sentence])

In [None]:
# Fixed seed so the train/validation/test partition is the same on every
# Restart & Run All; without it each run shuffles the corpus differently.
SPLIT_SEED = 42

# Stage 1: keep 70% of the sentences for training, hold out the other 30%.
train_sentences, holdout_sentences, train_tags, holdout_tags = train_test_split(
    sentences, sentence_tags, test_size=0.3, random_state=SPLIT_SEED
)

# Stage 2: split the held-out part again: 70% of it becomes the validation
# set and 30% of it the test set. A separate `holdout_*` name is used so the
# `test_*` names are only ever bound to the final test split.
valid_sentences, test_sentences, valid_tags, test_tags = train_test_split(
    holdout_sentences, holdout_tags, test_size=0.3, random_state=SPLIT_SEED
)

In [None]:
from transformers import AutoTokenizer
from torch.utils.data import Dataset

# Checkpoint identifier; the tokenizer below is loaded from it.
# NOTE(review): the checkpoint name says "cased" while the corpus words are
# lower-cased when the sentences are built — confirm this is intended.
model_name = "QCRI/bert-base-multilingual-cased-pos-english"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)

# Default fixed sequence length: every sample is padded or truncated to it.
MAX_LEN = 256


class PosTagging_Dataset(Dataset):
    """Map-style dataset that turns tagged sentences into fixed-length tensors.

    Every item is a dict with three 1-D integer tensors of length ``max_len``:
    ``input_ids``, ``labels`` and ``attention_mask``.

    Args:
        sentences: One list of word strings per sentence.
        tags: One list of tag strings per sentence, parallel to ``sentences``.
        tokenizer: Object exposing ``convert_tokens_to_ids(list_of_str)`` and
            a ``pad_token_id`` attribute.
        label2id: Mapping from tag string to integer id. It must also contain
            the key ``"0"``, whose id is used to pad the label sequence.
        max_len: Length every returned tensor is padded or truncated to.

    Raises:
        ValueError: If ``sentences`` and ``tags`` have different lengths.
    """

    def __init__(self, sentences: List[List[str]],
                 tags: List[List[str]],
                 tokenizer,
                 label2id,
                 max_len=MAX_LEN):
        super().__init__()
        # Fail early: a length mismatch would otherwise surface later as an
        # IndexError or as silently ignored samples.
        if len(sentences) != len(tags):
            raise ValueError(
                f"sentences and tags must have the same length, "
                f"got {len(sentences)} and {len(tags)}"
            )
        self.sentences = sentences
        self.tags = tags
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Encode sentence ``idx`` and return its padded tensors.

        Raises:
            ValueError: If the sentence has a different number of words and tags.
            KeyError: If a tag (or the ``"0"`` pad label) is missing from ``label2id``.
        """
        words = self.sentences[idx]

        # Each word is looked up as a single token; no sub-word splitting or
        # special tokens are added here.
        input_ids = self.tokenizer.convert_tokens_to_ids(words)
        labels = [self.label2id[tag] for tag in self.tags[idx]]

        # Words and tags must stay aligned position by position; otherwise the
        # model would be trained on shifted labels without any error.
        if len(input_ids) != len(labels):
            raise ValueError(
                f"sample {idx}: {len(input_ids)} tokens but {len(labels)} tags"
            )

        attention_mask = [1] * len(input_ids)

        return {
            "input_ids": self.pad_and_truncate(input_ids, pad_id=self.tokenizer.pad_token_id),
            # NOTE(review): the pad label is looked up under the key "0" (digit
            # zero); confirm label2id defines it and that padded positions are
            # meant to carry a regular class id.
            "labels": self.pad_and_truncate(labels, pad_id=self.label2id["0"]),
            "attention_mask": self.pad_and_truncate(attention_mask, pad_id=0)
        }

    def pad_and_truncate(self, inputs: List[int], pad_id: int):
        """Return ``inputs`` as a tensor of exactly ``self.max_len`` elements.

        Shorter sequences are right-padded with ``pad_id``; longer ones are
        cut off at ``self.max_len``.
        """
        if len(inputs) < self.max_len:
            padded_inputs = inputs + [pad_id] * (self.max_len - len(inputs))
        else:
            padded_inputs = inputs[:self.max_len]

        # Explicit integer dtype so the result does not depend on type
        # inference (an empty list would otherwise become a float tensor).
        return torch.as_tensor(padded_inputs, dtype=torch.long)

In [None]:
# NOTE(review): `label2id` is not defined in any cell above this one — confirm
# it is created before this cell runs on a fresh kernel.
train_dataset = PosTagging_Dataset(
    sentences=train_sentences,
    tags=train_tags,
    tokenizer=tokenizer,
    label2id=label2id,
)