In [1]:
from transformers import PreTrainedTokenizer, BatchEncoding, DataCollatorWithPadding

In [12]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
config = AutoConfig.from_pretrained("FacebookAI/xlm-roberta-base")
config.update({"num_labels": 2})
model = AutoModelForSequenceClassification.from_pretrained("FacebookAI/xlm-roberta-base", config=config)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
sentences = [["Hello, this one sentence!", "And this sentence goes with it."],
            [ "Hello2", "Goodbye2"],
]

model_inputs = []

for sentence in sentences:
    inputs = tokenizer(sentence[0], sentence[1],           add_special_tokens=True,
            padding="do_not_pad",
            truncation=False,)
    inputs["label"] = 0
    model_inputs.append(inputs)

model_inputs

[{'input_ids': [0, 35378, 4, 903, 1632, 149357, 38, 2, 2, 3493, 903, 149357, 60899, 678, 442, 5, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 0},
 {'input_ids': [0, 35378, 304, 2, 2, 18621, 1272, 13, 304, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label': 0}]

In [21]:
tokenizer.decode([0, 35378, 4, 903, 1632, 149357, 38, 2, 2, 3493, 903, 149357, 60899, 678, 442, 5, 2])

'<s> Hello, this one sentence!</s></s> And this sentence goes with it.</s>'

In [29]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer,pad_to_multiple_of=8,padding=True, return_tensors="pt")

In [28]:
data_collator(model_inputs)

{'input_ids': tensor([[     0,  35378,      4,    903,   1632, 149357,     38,      2,      2,
           3493,    903, 149357,  60899,    678,    442,      5,      2,      1,
              1,      1,      1,      1,      1,      1],
        [     0,  35378,    304,      2,      2,  18621,   1272,     13,    304,
              2,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([0, 0])}

In [44]:
import importlib
import load_dataset
importlib.reload(load_dataset)

from load_dataset import get_dataloader

dataloader = get_dataloader(
    dataset_path="data/daily_dialog_validation.jsonl",
    tokenizer=tokenizer,
    batch_size=8,
    shuffle=False,
    max_length=tokenizer.model_max_length,
)

Loading dataset from data/daily_dialog_validation.jsonl
Loaded 1000 lines from data/daily_dialog_validation.jsonl


Preparing input:   0%|          | 0/1000 [00:00<?, ?it/s]



Prepared 4000 examples


In [45]:
for batch in dataloader:
    print(batch)
    break

{'input_ids': tensor([[ 0, 62, 12,  ...,  1,  1,  1],
        [ 0, 62, 12,  ...,  1,  1,  1],
        [ 0, 62, 12,  ...,  1,  1,  1],
        ...,
        [ 0, 62, 12,  ...,  1,  1,  1],
        [ 0, 62, 12,  ...,  1,  1,  1],
        [ 0, 62, 12,  ...,  1,  1,  1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([0, 0, 0, 1, 1, 0, 0, 0])}


In [47]:
tokenizer.batch_decode(batch["input_ids"])

["<s> A: Good morning, sir. Is there a bank near here? B: There is one. 5 blocks away from here? A: Well, that's too far.Can you change some money for me?</s></s> Alright. I'm sorry, Miss. According to our file, this prescription has already been refilled twice.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad

In [55]:
output = model(**batch)
output

SequenceClassifierOutput(loss=tensor(0.6162, grad_fn=<NllLossBackward0>), logits=tensor([[0.4571, 0.0656],
        [0.4512, 0.0647],
        [0.4466, 0.0674],
        [0.4483, 0.0670],
        [0.4504, 0.0729],
        [0.4503, 0.0769],
        [0.4490, 0.0773],
        [0.4492, 0.0760]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [57]:
import torch
s = torch.nn.functional.softmax(output.logits, dim=1)
s

tensor([[0.5966, 0.4034],
        [0.5954, 0.4046],
        [0.5937, 0.4063],
        [0.5942, 0.4058],
        [0.5933, 0.4067],
        [0.5923, 0.4077],
        [0.5919, 0.4081],
        [0.5922, 0.4078]], grad_fn=<SoftmaxBackward0>)

In [59]:
# [Label2] / [[label1] + label[2])
s[:, 1] 

tensor([0.4034, 0.4046, 0.4063, 0.4058, 0.4067, 0.4077, 0.4081, 0.4078],
       grad_fn=<SelectBackward0>)