In [41]:
from transformers import AutoTokenizer
from datasets import load_dataset

tokeniser_cp = "t5-base"

In [20]:
tokeniser = AutoTokenizer.from_pretrained(tokeniser_cp)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [24]:
tokeniser("This is a test")

{'input_ids': [100, 19, 3, 9, 794, 1], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [51]:
def tokenise_text(example):
    return tokeniser(example['text'])

def tokenise_glue(example):
    return {"premise_id":tokeniser(example['premise']),
            "hypothesis_id": tokeniser(example['hypothesis'])}

def get_premise(examples):
    # print(example)
    return [ex['premise'] for ex in examples]


In [5]:
test_set = "D:\\gitFolders\\pytorch_hardway\\data\\yahoo_answers_csv\\test.csv"
train_set = "D:\\gitFolders\\pytorch_hardway\\data\\yahoo_answers_csv\\train.csv"

In [None]:
yahoo_ds = load_dataset("csv", column_names=['label', 'question', 'text'],
                        data_files={"train":train_set})

In [32]:
glue_ds = load_dataset('glue', 'ax')
glue_ds

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/80.8k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split:   0%|          | 0/1104 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 1104
    })
})

In [36]:
glue_tokenise = glue_ds.map(tokenise_glue, remove_columns=['idx', 'premise', 'hypothesis'])

Map:   0%|          | 0/1104 [00:00<?, ? examples/s]

In [38]:
glue_tokenise['test'][1]

{'label': -1,
 'premise_id': {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'input_ids': [37, 1712, 410, 59, 2561, 30, 8, 6928, 5, 1]},
 'hypothesis_id': {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'input_ids': [37, 1712, 3, 7, 144, 30, 8, 6928, 5, 1]}}

In [39]:
type(glue_ds)

datasets.dataset_dict.DatasetDict

In [53]:
from torch.utils.data import Dataset, DataLoader

glue_loader = DataLoader(glue_ds['test'],
                         collate_fn=get_premise,
                         batch_size=4)

In [54]:
glue_iter = iter(glue_loader)
next(glue_iter)

['The cat sat on the mat.',
 'The cat did not sit on the mat.',
 "When you've got no snow, it's really hard to learn a snow sport so we looked at all the different ways I could mimic being on snow without actually being on snow.",
 "When you've got snow, it's really hard to learn a snow sport so we looked at all the different ways I could mimic being on snow without actually being on snow."]

In [55]:
new_tokeniser = tokeniser.train_new_from_iterator(glue_iter, vocab_size=20000)

In [56]:
new_tokeniser("Lets test now")

{'input_ids': [361, 109, 110, 104, 983, 901, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [57]:
new_tokeniser.tokenize("Lets test now")

['▁L', 'e', 't', 's', '▁test', '▁now']

In [64]:
# train on wordpiece
from tokenizers import normalizers, pre_tokenizers, models, processors, trainers, decoders, Tokenizer

my_tokenizer = Tokenizer(model=models.WordPiece(unk_token="[UNK]"))
my_tokenizer.normalizers = normalizers.BertNormalizer(lowercase=True)
my_tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
special_tokens = ["[UNK]","[PAD]","[CLS]","[SEP]","[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000,
                              special_tokens=special_tokens)
my_tokenizer.train_from_iterator(glue_iter, trainer=trainer)

In [66]:
cls_token_id = my_tokenizer.token_to_id("[CLS]")
sep_token_id = my_tokenizer.token_to_id("[SEP]")
my_tokenizer.post_processor = processors.TemplateProcessing(
    single=f"[CLS]:0 $A:0 [SEP]:0",
    pair=f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", cls_token_id),
        ("[SEP]", sep_token_id)
    ],
)

my_tokenizer.decoder = decoders.WordPiece(prefix="##")

In [76]:
test = my_tokenizer.encode("There has to be a bigger sentence")

In [77]:
test.tokens

['[CLS]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[SEP]']