In [1]:
import torch
from tqdm import trange
from transformers import T5Tokenizer, T5ForConditionalGeneration, Adafactor
from functools import wraps, partial
from torch.nn.modules.sparse import Embedding
from torch.optim import Adam, SGD
import torch.nn as nn

model_name = "google-t5/t5-large"
# model_name = (
#     "allenai/unifiedqa-v2-t5-large-1363200"  # you can specify the model size here
# )
tokenizer = T5Tokenizer.from_pretrained(model_name)
DEVICE = 0
model_original = T5ForConditionalGeneration.from_pretrained(
    model_name, device_map=f"cuda:{DEVICE}")  #'auto')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on google-t5/t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
import copy

# del model
model = model_original  # copy.deepcopy(model_original)


# %%
def DEFAULT_COMPUTE_BIAS(self, query_length, key_length, device=None):
    """Compute binned relative position bias"""
    if device is None:
        device = self.relative_attention_bias.weight.device
    context_position = torch.arange(query_length, dtype=torch.long, device=device)[
        :, None
    ]
    memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
    relative_position = (
        memory_position - context_position
    )  # shape (query_length, key_length)
    relative_position_bucket = self._relative_position_bucket(
        relative_position,  # shape (query_length, key_length)
        bidirectional=(not self.is_decoder),
        num_buckets=self.relative_attention_num_buckets,
        max_distance=self.relative_attention_max_distance,
    )
    values = self.relative_attention_bias(
        relative_position_bucket
    )  # shape (query_length, key_length, num_heads)
    values = values.permute([2, 0, 1]).unsqueeze(
        0
    )  # shape (1, num_heads, query_length, key_length)
    return values


# %%
import pickle

dataset_test = pickle.load(open("test_without_abcd.pkl", "rb"))
dataset_train = pickle.load(open("train_without_abcd.pkl", "rb"))

# %%
MODE = "new"  #'old'

# if hasattr(layer, 'EncDecAttention'):
#     layer.EncDecAttention.compute_bias = partial(
#         new_compute_bias, layer.EncDecAttention)

# %%
model.hf_device_map

# %%
import textwrap


def measure_unalike(arr, print_arr=False):
    n = len(arr)
    arr = pd.Series(arr).value_counts()
    if print_arr:
        print(arr)
    return 1 - ((arr / n) ** 2).sum()


question_to_do = 5
per_question = 20


def run_tokens(tokens):
    res = model.generate(tokens, max_new_tokens=MAX_ANSWER_LENGTH)
    return tokenizer.batch_decode(res, skip_special_tokens=True)


def run_model(input_string, **generator_args):
    input_ids = tokenizer.encode(input_string, return_tensors="pt")
    # print(torch.argwhere(input_ids[0]==2)[0,0]+2)
    res = model.generate(
        input_ids.to(DEVICE), **generator_args, max_new_tokens=MAX_ANSWER_LENGTH
    )
    return tokenizer.batch_decode(res, skip_special_tokens=True)

In [21]:
# %%
# %%
QUESTION_MAX_LENGTH = 76
MAX_ANSWER_LENGTH = 40


# %%
def check(input_text):
    input_ids = tokenizer.encode(input_text, return_tensors="pt")[0]
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    original = input_ids.tolist()
    anchor = []
    for i in range(len(tokens)):
        if (i < len(tokens) - 2 and tokens[i] == "▁(" and tokens[i + 1] == "▁"
                and tokens[i + 2] == ")") or original[i] == 1:
            anchor.append(i)
    # 0 1 2 3 4
    for x in reversed(range(1, 5)):
        if anchor[x] - anchor[x - 1] < MAX_ANSWER_LENGTH:
            [
                original.insert(anchor[x], 0)
                for _ in range(MAX_ANSWER_LENGTH - (anchor[x] - anchor[x - 1]))
            ]
        else:
            print(f"Wrong size ANSWER: {anchor[x] - anchor[x - 1] }")
            return None
    if anchor[0] < QUESTION_MAX_LENGTH:
        [
            original.insert(anchor[0], 0)
            for _ in range(QUESTION_MAX_LENGTH - anchor[0])
        ]
    else:
        print(f"Wrong size QUESTION: {anchor[0]}")
        return None
    return torch.tensor(original).view(1, -1)


start_pos = QUESTION_MAX_LENGTH
leng = MAX_ANSWER_LENGTH
a = [start_pos + leng * 0, start_pos + leng * 1]
b = [start_pos + leng * 1, start_pos + leng * 2]
c = [start_pos + leng * 2, start_pos + leng * 3]
d = [start_pos + leng * 3, start_pos + leng * 4]
DEC = {"01": 0, "02": 1, "03": 2, "12": 3, "13": 4, "23": 5}
mot = [a, b, c, d]
six_mask_turn_off = torch.ones((237, 237, 16))
six_mask_turn_on = torch.zeros((6, 237, 237, 16))

for i in range(len(mot) - 1):
    for j in range(i + 1, len(mot)):
        x = mot[i]
        y = mot[j]
        # print(mask_turn_off_hyper_dimension[x][:, y][:].shape)
        # cal index in 6
        comb_index = DEC[f"{i}{j}"]
        # no distance, a very special distance
        six_mask_turn_on[comb_index,x[0]:x[1],y[0]:y[1],:] = 1
        six_mask_turn_on[comb_index,y[0]:y[1],x[0]:x[1],:] = 1
        six_mask_turn_off[x[0]:x[1], y[0]:y[1],:] = 0
        six_mask_turn_off[x[0]:x[1],y[0]:y[1],:] = 0


# %%
def new_compute_bias(self, query_length, key_length, device=None):
    """Compute binned relative position bias"""
    if device is None:
        device = self.relative_attention_bias.weight.device
    context_position = torch.arange(query_length,
                                    dtype=torch.long,
                                    device=device)[:, None]
    memory_position = torch.arange(key_length, dtype=torch.long,
                                   device=device)[None, :]

    relative_position = (memory_position - context_position
                         )  # shape (query_length, key_length)
    # implementation='simple'
    if self.is_decoder:
        pass
    else:
        context_position_new = context_position.clone()
        context_position_new[b] = context_position_new[a]
        context_position_new[c] = context_position_new[a]
        context_position_new[d] = context_position_new[a]
        context_position_new[-1] = context_position_new[a[0]] + leng
        memory_position_new = context_position_new.clone().view(1, -1)
        relative_position = (memory_position_new - context_position_new
                             )  # shape (query_length, key_length)
        for i in range(len(mot)):
            for j in range(len(mot)):
                if i!=j:
                    x = mot[i]
                    y = mot[j]
                    print('x:y',x,y)
                    print('after: a ->b: ',relative_position[a][:,b], MAX_ANSWER_LENGTH)
                    relative_position[a][:,b]=0
                    # relative_position[x][:,y]=relative_position[x][:,y]+1#MAX_ANSWER_LENGTH
                    print('after: a ->b: ',relative_position[a][:,b], MAX_ANSWER_LENGTH)
                    break
            break
    relative_position_bucket = self._relative_position_bucket(
        relative_position,  # shape (query_length, key_length)
        bidirectional=(not self.is_decoder),
        num_buckets=self.relative_attention_num_buckets,
        max_distance=self.relative_attention_max_distance,
    )
    implementation = "complicated1"  # "change_32"  # "complicated"
    if self.is_decoder:
        values = self.relative_attention_bias(relative_position_bucket)
    else:  # special algo
        if implementation == "complicated":
            values = self.relative_attention_bias(relative_position_bucket)
            device_of_values = values.device
            # six_extra_embedding_forward = [
            #     self.extra_dimension_embedding_forward[i](torch.tensor(
            #         [0])).view(1, 1, 16).to(device_of_values) for i in range(6)
            # ]
            # six_extra_embedding_backward = [
            #     self.extra_dimension_embedding_backward[i](torch.tensor(
            #         [0])).view(1, 1, 16).to(device_of_values) for i in range(6)
            # ]
            # print(six_extra_embedding_backward[0].shape)
            # const_hyper=torch.tensor([[2]*16]).view(1,1,16).to(device_of_values)
            # # print(const_hyper)
            # tmp = torch.zeros_like(values).to(device_of_values)
            # for i in range(6):
            #     tmp += (six_mask_turn_on[i].to(device_of_values) *
            #             const_hyper +
            #             six_mask_turn_on_backward[i].to(device_of_values) *
            #             const_hyper)
            # values = values * six_mask_turn_off.to(device_of_values) + tmp
        elif implementation == "change_32":
            for i, x in enumerate(mot):
                for j, y in enumerate(mot):
                    if i != j:
                        relative_position_bucket[x, y] = 31  # furthest
            values = self.relative_attention_bias(relative_position_bucket)
        else:
            values = self.relative_attention_bias(relative_position_bucket)

    values = values.permute([2, 0, 1]).unsqueeze(
        0)  # shape (1, num_heads, query_length, key_length)
    return values


extra_dim_learning = []


def set_mode(MODE):
    for part in ["encoder"]:  # , 'decoder']:
        for block in getattr(model, part).block:
            for layer in block.layer:
                # only need to deal in the Encoder level
                if (hasattr(layer, "SelfAttention")
                        and layer.SelfAttention.has_relative_attention_bias):
                    itself = layer.SelfAttention
                    if MODE == "new":
                        itself.compute_bias = partial(new_compute_bias,
                                                      layer.SelfAttention)
                        tmp_extra_dim_learning = [[], []]
                        for i in range(2):
                            for j in range(6):
                                new_emb = Embedding(1, 16)
                                new_emb.weight.data.normal_(mean=0.0,
                                                            std=1024**-0.5)
                                tmp_extra_dim_learning[i].append(new_emb)

                        itself.extra_dimension_embedding_forward = nn.ModuleList(
                            tmp_extra_dim_learning[0])
                        itself.extra_dimension_embedding_backward = nn.ModuleList(
                            tmp_extra_dim_learning[1])
                    else:
                        itself.compute_bias = partial(DEFAULT_COMPUTE_BIAS,
                                                      layer.SelfAttention)


print(textwrap.fill(dataset_train[0][0]))
# set_mode("old")
# print("old ", run_tokens(check(dataset_train[0][0]).to(DEVICE)))
set_mode("new")
print("new ", run_tokens(check(dataset_train[0][0]).to(DEVICE)))

The sun is responsible for \n ( ) puppies learning new tricks ( )
children growing up and getting old ( ) flowers wilting in a vase ( )
plants sprouting, blooming and wilting
x:y tensor([ 76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
         90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115]) tensor([116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155])
after: a ->b:  tensor([[  0,   1,   2,  ...,  37,  38,  39],
        [ -1,   0,   1,  ...,  36,  37,  38],
        [ -2,  -1,   0,  ...,  35,  36,  37],
        ...,
        [-37, -36, -35,  ...,   0,   1,   2],
        [-38, -37, -36,  ...,  -1,   0,   1],
        [-39, -38, -37,  ...,  -2,  -1,   0]], device='cuda:2') 40
after: a ->b:  tensor([[  0,   1,   2,  ...,  

In [7]:
kk = [(index, x, y) for index, (x, y) in enumerate(model.named_parameters())
      if y.requires_grad == True]
[(index, x) for index, x, y in kk if "decoder" in x]
len(kk)
all_position_weight=[y for index, x, y in kk if ('extra_dimension_embedding' in x) or (('encoder' in x) and ('relative_attention_bias' in x))]

In [8]:
not_train = [
    "shared.weight",
    "encoder.block.0.layer.0.SelfAttention.q.weight",
    "encoder.block.0.layer.0.SelfAttention.k.weight",
    "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight",
]
to_train_model = [(x, y) for index, x, y in kk]  # [:196]]
# to_train_model=to_train_model+
# to_train_model=[(x, y) for x, y in model.named_parameters() if x=="encoder.block.0.layer.0.SelfAttention.extra_dimension_embedding.weight"]
# to_train=[]
to_train = [
    "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight",
    "encoder.block.0.layer.0.layer_norm.weight",
    "encoder.block.0.layer.1.DenseReluDense.wi.weight",
    "encoder.block.0.layer.1.DenseReluDense.wo.weight",
    "encoder.block.0.layer.1.layer_norm.weight",
    "encoder.block.0.layer.0.SelfAttention.extra_dimension_embedding.weight",
]
# to_train_model=[(x,y) for x, y in model.named_parameters() if x in to_train]
# to_train_model = [(x, y) for x, y in model.named_parameters()
#                   if not x in not_train]

for y in model.parameters():
    y.requires_grad = False
for x, y in to_train_model:
    y.requires_grad = True
[x for x, y in to_train_model]

['shared.weight',
 'encoder.block.0.layer.0.SelfAttention.q.weight',
 'encoder.block.0.layer.0.SelfAttention.k.weight',
 'encoder.block.0.layer.0.SelfAttention.v.weight',
 'encoder.block.0.layer.0.SelfAttention.o.weight',
 'encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight',
 'encoder.block.0.layer.0.SelfAttention.extra_dimension_embedding_forward.0.weight',
 'encoder.block.0.layer.0.SelfAttention.extra_dimension_embedding_forward.1.weight',
 'encoder.block.0.layer.0.SelfAttention.extra_dimension_embedding_forward.2.weight',
 'encoder.block.0.layer.0.SelfAttention.extra_dimension_embedding_forward.3.weight',
 'encoder.block.0.layer.0.SelfAttention.extra_dimension_embedding_forward.4.weight',
 'encoder.block.0.layer.0.SelfAttention.extra_dimension_embedding_forward.5.weight',
 'encoder.block.0.layer.0.SelfAttention.extra_dimension_embedding_backward.0.weight',
 'encoder.block.0.layer.0.SelfAttention.extra_dimension_embedding_backward.1.weight',
 'encoder.block.0.layer

In [9]:
# for param in model.parameters():
#     param.requires_grad = False
# to_train_model=[y for x, y in model.named_parameters() if x in train_name_list ]
# for y in to_train_model:
#     y.requires_grad=True

In [10]:
from random_utils import set_seed

set_seed(42)


def shape(input):
    return input.view(1, -1)

In [11]:
# from transformers import AdamW, get_linear_schedule_with_warmup
# no_decay = ['layer_norm.weight']
# optimizer_grouped_parameters = [
#     {'params': [p for n, p in to_train_model], 'weight_decay': 0.0},
#     ]
# optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, eps=1e-8)
# scheduler =  get_linear_schedule_with_warmup(optimizer,
#                                 num_warmup_steps=0,
#                                 num_training_steps=100000)
to_train_model = [y for x, y in to_train_model]
# optimizer = SGD(to_train_model, lr=1e-2)

In [12]:
def get_model_forward(input_tokens):
    encoder_attentions = None
    last_hidden = None
    with torch.no_grad():
        start = [0]
        for k in range(MAX_ANSWER_LENGTH):
            result = model(
                input_ids=input_tokens.to(DEVICE),
                decoder_input_ids=torch.tensor([start]).to(DEVICE),
                output_attentions=True,
            )
            encoder_attentions = result.encoder_attentions
            last_hidden = result.encoder_last_hidden_state
            item = result.logits.argmax(dim=2)[0][-1].item()
            start.append(item)
            if item == 1:
                break
            # print(start)
    return (
        tokenizer.decode(start, skip_special_tokens=True),
        tokenizer.convert_ids_to_tokens(start),
        last_hidden,
        encoder_attentions,
    )

In [13]:
import numpy as np
from torch.nn import CrossEntropyLoss

loss_fn = CrossEntropyLoss(ignore_index=-100)


def get_loss(logits, labels):
    loss = torch.tensor(0)
    found = False
    for i in range(len(labels[0])):
        current_loss = loss_fn(logits[0][i], labels[0][i].to(DEVICE))

        current_certainly = torch.exp(-current_loss)
        if current_certainly < 0.9:
            loss = current_loss
            found = True
            break
    if not found:
        loss = loss_fn(logits[0], labels[0].to(DEVICE))
    return loss

In [14]:
if False:
    # pbar = trange(0, len(dataset_train), 24)
    # loss_score = 0
    # count = 0
    # extra_info = ""
    # step=0
    # # if count>20:
    # #     break
    # # print(textwrap.fill(dataset_train[0][0]))
    step = 0
    pbar = trange(200)
    for re in pbar:
        input_tokens = check(dataset_train[step][0])
        labels = tokenizer.encode(dataset_train[step][1], return_tensors="pt")
        result = model(input_ids=input_tokens.to(DEVICE),
                       labels=shape(labels).to(DEVICE))
        loss = get_loss(result.logits, labels)
        # print(result.logits.argmax(dim=2), labels)
        optimizer.zero_grad()
        # loss = result.loss
        # print(result.logits, labels, loss)
        if loss.item() != 0:
            loss_score = loss.item()  # loss_score * 0.9 + loss.item() * 0.1
            loss.backward()
        optimizer.step()
        # scheduler.step()
        # with torch.no_grad():
        #     mong= model(input_ids=check(dataset_train[0][0]).to(DEVICE), decoder_input_ids=torch.tensor([[0]]).to(DEVICE))
        #     print(mong.logits.argmax(dim=2).shape)
        # print(tokenizer.decode())

        extra_info = get_model_forward(
            check(dataset_train[step][0]).to(DEVICE))
        pbar.set_postfix_str(f"Loss: {loss_score:.10f}:{extra_info}")

In [15]:
data_array = [(k, v, l.split(" ( ) ")[1:])
              for l, k, v in [(dataset_train[x][0], check(dataset_train[x][0]),
                               dataset_train[x][1])
                              for x in range(0, len(dataset_train), 24)]
              if k is not None]

Wrong size QUESTION: 88


In [16]:
data_array[0]

(tensor([[   37,  1997,    19,  1966,    21,     3,     2,    29,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,    41,     3,    61, 26675,
           1036,   126, 13258,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,    41,     3,    61,   502,
           1710,    95,    11,   652,   

In [17]:
from torch.utils.data import Dataset, DataLoader


class CheckTransform(object):

    def __call__(self, sample):
        # print(f"'{sample[1]}'")
        return {
            "input_ids": sample[0][0],
            "label_index": sample[2].index(sample[1]),
            "all_labels": sample[2],
        }


class CustomDataset(Dataset):

    def __init__(self, dataset_array, transform=None):
        self.dataset = dataset_array
        self.transform = transform

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.transform(self.dataset[idx])


def collate(datas):
    label_ids = tokenizer(sum([x["all_labels"] for x in datas], []),
                          padding=True)
    wrapper = label_ids
    wrapper["all_label_ids"] = torch.tensor(wrapper.pop("input_ids"))
    # wrapper["label_index"] = torch.tensor([x["label_index"] for x in datas])
    for k in wrapper["all_label_ids"]:
        k[k == tokenizer.pad_token_id] = -100
    wrapper["all_decoder_attention_masks"] = torch.tensor(
        wrapper.pop("attention_mask"))
    wrapper["input_ids"] = torch.stack([x["input_ids"] for x in datas])
    wrapper["label_index"] = torch.tensor([x["label_index"] for x in datas])
    return wrapper


loi_dataloader = DataLoader(
    CustomDataset(
        data_array,
        CheckTransform(),
    ),
    batch_size=10,
    shuffle=True,
    collate_fn=collate,
)
for k in loi_dataloader:
    print(k["all_label_ids"])
    break

tensor([[  394,     1,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100],
        [ 2714,    49,     1,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100],
        [10540,     1,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100],
        [20394,     1,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100],
        [   71, 16092,   358,     1,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100],
        [  389,   625,     3,     7, 20895,  8735,    49,     1,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100],
        [  389,  1906, 16867,  1016,     1,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100],
        [   71,     3,    75,  362

In [18]:
# attention 898704
# hidden state 242688
# classification_layer = nn.Linear(242688, 4).to(DEVICE)
optimizer = Adafactor(
    to_train_model,  # + [x for x in classification_layer.parameters()],
    relative_step=True,
    warmup_init=True,
    lr=None,
)

In [19]:
def turn_position_learning(on):
    for x in all_position_weight:
        x.requires_grad=on
from tqdm import tqdm
from random_utils import set_seed

loss_running_score = 0
correct_running_score = 0
conform_running_score = 0
count = 0
extra_info = ""
res_tokens = []
accumulate = 10
optimizer.zero_grad()
set_seed(42)
turn_position=False
turn_position_learning(False)
for learn_pos in range(10):
    pbar = tqdm(loi_dataloader)
    for wrapper in pbar:
        count += 1
        # if count%20==0:
        #     turn_position=not turn_position
        #     turn_position_learning(turn_position)
        # if count>20:
        #     break
        # print(textwrap.fill(dataset_train[0][0]))
        only_correct_label_ids = torch.stack(
            [
                wrapper["all_label_ids"][batch_index * 4 + x]
                for batch_index, x in enumerate(wrapper["label_index"])
            ]
        )
        only_correct_decoder_attention_mask = torch.stack(
            [
                wrapper["all_decoder_attention_masks"][batch_index * 4 + x]
                for batch_index, x in enumerate(wrapper["label_index"])
            ]
        )
        result = model(
            input_ids=wrapper["input_ids"].to(DEVICE),
            labels=only_correct_label_ids.to(DEVICE),
            decoder_attention_mask=only_correct_decoder_attention_mask.to(
                DEVICE
            ),  # output_attentions=True
        )
        # conform_loss = 0
        # for batch in range(wrapper["input_ids"].shape[0]):
        #     selected_answer = result.logits[batch].argmax(dim=1)
        #     found = False
        #     conform_losses = [0, 0, 0, 0]
        #     for each_answer in range(4):
        #         tui_batch = wrapper["all_label_ids"][batch * 4 + each_answer]
        #         conform_losses[each_answer] += loss_fn(
        #                     result.logits[batch], tui_batch.to(DEVICE)
        #                 )
        #         # for m in range(len(tui_batch)):
        #         #     if selected_answer[m] != tui_batch[m] and tui_batch[m] != -100:
        #         #         conform_losses[each_answer] += loss_fn(
        #         #             result.logits[batch][m], tui_batch[m].to(DEVICE)
        #         #         )
        #         # conform_min_index = torch.argmin(conform_losses)
        #         # print(conform_min_index)
        #     conform_loss += min(conform_losses)  # conform_losses[conform_min_index]
        # conform_loss = conform_loss / wrapper["input_ids"].shape[0]
        # kk1=result.encoder_attentions
        # break
        # final_logits = classification_layer(
        #     torch.flatten(result.encoder_last_hidden_state, start_dim=1)
        # )
        # loss = loss_fn(final_logits, wrapper["label_index"].to(DEVICE))
        loss = result.loss
        loss_running_score = loss_running_score * 0.9 + loss.item() * 0.1
        if loss != 0:
            loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # scheduler.step()
        with torch.no_grad():
            if count % 10 == 0:
                extra_info, res_tokens, _, _ = get_model_forward(
                    check(dataset_test[0][0]).to(DEVICE)
                )
                # final_logits = classification_layer(torch.flatten(hidden, start_dim=1))
                # extra_info = str(final_logits.argmax())
            pbar.set_description_str(
                f"Loss: {loss_running_score:.3f}"
            )
            pbar.set_postfix_str(extra_info)
pass

Loss: 0.557: 100%|██████████| 496/496 [08:31<00:00,  1.03s/it, quit eating lunch out]                                                                                            
Loss: 0.399: 100%|██████████| 496/496 [08:28<00:00,  1.02s/it, have lunch with friends]       
Loss: 0.180: 100%|██████████| 496/496 [08:39<00:00,  1.05s/it, buy less lunch]                      
Loss: 0.125:   1%|          | 5/496 [00:05<09:37,  1.18s/it, buy less lunch]


KeyboardInterrupt: 

In [None]:
# len_to_find=len(wrapper["label_ids"][0])-1
#         for batch in range(wrapper["input_ids"].shape[0]):
#             keys= result.logits[batch].argmax(dim=1)
#             found=False
#             tui_batch=wrapper["label_ids"][batch]
#             for m in range(len(tui_batch)):
#                 if keys[m]!=tui_batch[m]:
#                     if len_to_find>m:
#                         len_to_find=m
#                     break
#         for batch in range(wrapper["input_ids"].shape[0]):
#             loss+=loss_fn(result.logits[batch][:len_to_find+1],tui_batch[:len_to_find+1].to(DEVICE))
#         loss=loss/wrapper["input_ids"].shape[0]

In [26]:
data = dataset_test
count = 0
count1 = 0
count2 = 0
count10 = 0
pbar1 = trange(500)
for ques in pbar1:
    question = data[24 * ques][0]
    key = data[24 * ques][1]
    answer = get_model_forward(check(data[24 * ques][0]).to(DEVICE))[0]
    if key == answer:
        count += 1
    if key[0] == answer[0]:
        count1 += 1
    if key[:2] == answer[:2]:
        count2 += 1
    if answer in question:
        count10 += 1
    pbar1.set_postfix_str(f"{count}, {count1}, {count2}, {count10}")
    # else:
    #     pass
    # print(ques,':****',textwrap.fill(question))
    # print('Answer key',':****',key)
    # print('Answer ',answer)

# print("Count ", )

100%|██████████| 500/500 [01:56<00:00,  4.30it/s, 152, 302, 287, 312]


In [None]:
# pbar = trange(0, len(dataset_train), 24)
# loss_score = 0
# count = 0
# extra_info = ""
# set_seed(42)
# res_tokens=[]
# for learn_pos in range(10):
#     for step in pbar:
#         count += 1
#         # if count>20:
#         #     break
#         # print(textwrap.fill(dataset_train[0][0]))
#         input_tokens = check(dataset_train[step][0])
#         if input_tokens is None:
#             continue
#         labels = tokenizer.encode(dataset_train[step][1], return_tensors="pt")
#         result = model(input_ids=input_tokens.to(DEVICE), labels=shape(labels).to(DEVICE))

#         optimizer.zero_grad()
#         loss =loss_fn(result.logits[0][learn_pos],labels[0][learn_pos].to(DEVICE))
#         loss_score = loss_score * 0.9 + loss.item() * 0.1
#         if loss.item()!=0:
#             loss.backward()
#         optimizer.step()
#         # scheduler.step()
#         with torch.no_grad():
#             if count % 10 == 0:
#                 extra_info, res_tokens = get_model_forward(check(dataset_test[0][0]).to(DEVICE))
#             pbar.set_description_str(f"Loss: {loss_score:.2f}")
#             pbar.set_postfix_str(res_tokens[:learn_pos+2])
# pass

In [None]:
# ACCUMUTATE gradient
# total_params = to_train_model
# optimizer = SGD(total_params, lr=1e-2)
# pbar = trange(0, len(dataset_train), 24)
# loss_score = 0
# for step in pbar:
#     # count+=1
#     # if count>20:
#     #     break
#     # print(textwrap.fill(dataset_train[0][0]))
#     input_tokens = check(dataset_train[step][0])
#     if input_tokens is None:
#         continue
#     labels = tokenizer.encode(dataset_train[step][1], return_tensors="pt")
#     result = model(input_ids=input_tokens.to(DEVICE), labels=shape(labels).to(DEVICE))
#     optimizer.zero_grad()
#     loss = result.loss
#     loss_score = loss_score * 0.9 + loss.item() * 0.1
#     loss.backward()
#     optimizer.step()
#     with torch.no_grad():
#         pbar.set_postfix_str(
#             f"Loss: {loss_score:.2f}:'{run_tokens(check(dataset_test[0][0]).to(DEVICE))}'"
#         )
# pass

In [1]:
# %%

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on google-t5/t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
