In [1]:
from transformers import AutoTokenizer
import random
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Tokenizer of the GPT-2 IMDB checkpoint (path is relative; presumably a local copy).
tokenizer = AutoTokenizer.from_pretrained('tokenizer/lvwerra/gpt2-imdb')
# NOTE(review): id 0 is a regular vocabulary token (the repr below reports
# pad_token '!'), so padding is only distinguishable from a literal '!' through
# an explicit attention mask.
tokenizer.pad_token_id = 0

# Display the resulting tokenizer configuration.
tokenizer

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.


GPT2TokenizerFast(name_or_path='tokenizer/lvwerra/gpt2-imdb', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '!', 'additional_special_tokens': ['<|endoftext|>']}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [2]:
from datasets import load_from_disk, concatenate_datasets

# Number of leading review tokens kept as the generation prompt.
PROMPT_LEN = 5

# Load IMDB from disk and merge both splits into a single dataset.
dataset = load_from_disk('dataset/imdb')
dataset = concatenate_datasets([dataset[split] for split in ['train', 'test']])


def encode_prompt(data):
    """Tokenize one review and keep its first PROMPT_LEN token ids.

    Args:
        data: a dataset row with a 'text' field.

    Returns:
        dict with a single 'question' key holding the truncated id list.
    """
    question = tokenizer.encode(data['text'],
                                add_special_tokens=False)[:PROMPT_LEN]
    return {'question': question}


# Replace the raw columns by the tokenized prompt.
dataset = dataset.map(encode_prompt, remove_columns=['label', 'text'])


def has_full_prompt(data):
    """Return True when the row's prompt has exactly PROMPT_LEN tokens."""
    return len(data['question']) == PROMPT_LEN


# Drop reviews that were too short to yield a full-length prompt, so every
# prompt has the same length.
dataset = dataset.filter(has_full_prompt)

dataset, dataset[0]

(Dataset({
     features: ['question'],
     num_rows: 50000
 }),
 {'question': [40, 26399, 314, 3001, 327]})

In [3]:
def get_batch_data(batch_size=128):
    """Sample a random batch of (target label, conditioned prompt) pairs.

    Args:
        batch_size: number of samples to draw (with replacement). Defaults to
            128, the value previously hard-coded.

    Returns:
        (label, question) where ``label`` is a list of ints in {0, 1} and
        ``question`` is a list of token-id lists; each prompt is prefixed with
        the token id of the label rendered as text ('0' or '1').
    """
    label = random.choices(range(2), k=batch_size)
    rows = random.choices(dataset, k=batch_size)

    # Prepend the label token so the model can condition on the target label.
    question = [[tokenizer.convert_tokens_to_ids(str(l))] + row['question']
                for l, row in zip(label, rows)]

    return label, question


# Preview one sampled batch: (labels, prompts with the label token prepended).
get_batch_data()

([1,
  1,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  0,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  1,
  0,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  1,
  1,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  1,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0],
 [[16, 40, 423, 1775, 257, 1256],
  [16, 33, 18058, 286, 262, 4044],
  [15, 1, 464, 13037, 6289, 1],
  [15, 40, 460, 470, 3505, 618],
  [16, 1212, 318, 262, 5290, 286],
  [15, 40, 1816, 284, 766, 428],
  [15, 6109, 530, 815, 766, 428],
  [16, 1, 36091, 6592, 6711, 1],
  [15, 35700, 922, 2646, 422, 3771],
  [15, 36, 292, 813, 262, 5290],
  [15, 5211, 407, 76

In [4]:
class ModelPPO(torch.nn.Module):
    """Causal language model with an additional scalar value head.

    ``model_gen`` is the pretrained causal LM (used for generation and for the
    token logits); ``v_head`` maps every hidden state to one scalar value
    estimate. The value head is freshly initialised here: the loading warning
    shows that the checkpoint's own ``v_head.summary.*`` weights are not used.
    """

    def __init__(self):
        super().__init__()
        from transformers import AutoModelForCausalLM

        self.model_gen = AutoModelForCausalLM.from_pretrained(
            'model/lvwerra/gpt2-imdb')

        # Read the width from the loaded config instead of hard-coding 768 so
        # the head always matches the backbone's hidden size.
        hidden_size = self.model_gen.config.hidden_size
        self.v_head = torch.nn.Sequential(torch.nn.Dropout(0.1),
                                          torch.nn.Linear(hidden_size, 1))

        self.to(device)
        self.train()

    def forward(self, input_ids, attention_mask):
        """Return per-position vocabulary logits and value estimates.

        Args:
            input_ids: token ids passed to the transformer backbone.
            attention_mask: mask passed to the transformer backbone.

        Returns:
            (logits, value): ``logits`` is the LM head output for every
            position; ``value`` is the value-head output with its trailing
            size-1 dimension squeezed away.
        """
        # Only the final layer's hidden state is needed, so intermediate
        # hidden states are not requested.
        last_hidden_state = self.model_gen.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask).last_hidden_state

        logits = self.model_gen.lm_head(last_hidden_state)
        value = self.v_head(last_hidden_state).squeeze(-1)

        return logits, value


# Trainable policy / value model.
model_ppo = ModelPPO()

# Frozen reference copy, used only to compute the KL penalty.
model_ppo_ref = ModelPPO()
# ModelPPO.__init__ leaves the module in train mode; put the reference in eval
# mode so dropout does not add noise to the reference log-probabilities.
model_ppo_ref.eval()

for i in model_ppo_ref.parameters():
    i.requires_grad_(False)

Some weights of the model checkpoint at model/lvwerra/gpt2-imdb were not used when initializing GPT2LMHeadModel: ['v_head.summary.weight', 'v_head.summary.bias']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at model/lvwerra/gpt2-imdb were not used when initializing GPT2LMHeadModel: ['v_head.summary.weight', 'v_head.summary.bias']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a Bert

In [5]:
def get_kl(a, b, method='kl'):
    """Element-wise divergence penalty between two log-probability tensors.

    Args:
        a: log-probabilities under the first distribution (the policy).
        b: log-probabilities under the second distribution (the reference).
        method: which estimator to use.
            'kl'   -> ``a - b``
            'abs'  -> ``|a - b|``
            'mse'  -> ``0.5 * (a - b) ** 2``
            'full' -> ``exp(a) * (a - b)``, the point-wise KL(a || b) term.

    Returns:
        A tensor with the same shape as ``a`` and ``b`` (no reduction).

    Raises:
        ValueError: if ``method`` is not one of the names above.
    """
    if method == 'kl':
        return a - b

    if method == 'abs':
        return (a - b).abs()

    if method == 'mse':
        return (a - b).square() * 0.5

    if method == 'full':
        # kl_div(input, target) computes target-weighted (target - input), i.e.
        # KL(target || input); passing b as input and a as target therefore
        # yields KL(a || b), matching the direction of the other estimators.
        return torch.nn.functional.kl_div(b,
                                          a,
                                          log_target=True,
                                          reduction='none')

    raise ValueError(f'unknown KL method: {method!r}')


# Quick check: with the 'kl' estimator (a - b) and b = 0 the result is just a.
get_kl(torch.randn(15), torch.zeros(15))

tensor([-0.5304, -0.1680, -0.8149, -0.8351, -1.1403, -0.5243,  1.0303, -0.1843,
         0.0434,  0.9319, -0.2327,  0.5249,  0.8442, -0.8438,  0.8771])

In [6]:
from trl.core import clip_by_value, logprobs_from_logits, masked_mean, masked_whiten


class PPOTrainer:
    """Minimal PPO trainer that updates the global ``model_ppo``.

    One call to ``step`` scores a batch of (prompt, continuation, reward)
    triples with the policy and with the frozen ``model_ppo_ref``, builds
    per-token rewards and advantages, and then runs several optimisation
    passes over the batch, one sample at a time.
    """

    # Number of optimisation passes over each batch.
    PPO_EPOCHS = 4
    # Weight of the per-token KL penalty subtracted from the reward.
    KL_COEF = 0.2
    # Decay applied when propagating deltas backwards (discount gamma is 1).
    GAE_LAMBDA = 0.95
    # Clipping range of the probability ratio in the policy loss.
    CLIP_RANGE = 0.2
    # Clipping range of the new value around the old value in the value loss.
    CLIP_RANGE_VALUE = 0.2
    # Weight of the value loss in the total loss.
    VF_COEF = 0.1
    # Samples whose mean probability ratio exceeds this are skipped.
    RATIO_THRESHOLD = 10

    def __init__(self, lr=1e-5):
        """Create the Adam optimizer over ``model_ppo``'s parameters.

        Args:
            lr: learning rate (defaults to the previously hard-coded 1e-5).
        """
        self.optimizer = torch.optim.Adam(model_ppo.parameters(), lr=lr)

    def step(self, question, answer, reward):
        """Run one PPO update on a batch.

        Args:
            question: sequence of 1-D token-id tensors (the prompts).
            answer: sequence of 1-D token-id tensors (the generated parts).
            reward: one scalar reward per sample.
        """
        with torch.no_grad():
            # Concatenate prompt and continuation, then pad into one batch.
            token = [q.tolist() + a.tolist() for q, a in zip(question, answer)]
            token = [{'input_ids': i} for i in token]
            token = tokenizer.pad(token, return_tensors='pt').to(device)
            input_ids = token.input_ids
            attention_mask = token.attention_mask

            # From here on only the lengths of prompts/continuations matter.
            lens_q = [len(i) for i in question]
            lens_a = [len(i) for i in answer]

            # Log-probs of the continuation tokens and per-token values under
            # the current policy.
            prob_log, value, mask = self.batched_forward_pass(
                model_ppo, input_ids, attention_mask, lens_q, lens_a)

            # Same log-probs under the reference model, for the KL penalty.
            prob_log_ref, _, _ = self.batched_forward_pass(
                model_ppo_ref, input_ids, attention_mask, lens_q, lens_a)

            # Combine the KL penalty with the scalar reward.
            reward = self.compute_rewards(reward, prob_log, prob_log_ref, mask)

            # Advantages (delta) and value targets used by the loss.
            value, delta, target = self.compute_advantages(value, reward, mask)

        # Several passes over the same batch.
        for _ in range(self.PPO_EPOCHS):
            # One sample per optimisation step.
            for i in range(len(input_ids)):
                # Recompute log-probs and values with gradients enabled.
                prob_log_new, value_new, _ = self.batched_forward_pass(
                    model_ppo, input_ids[i].unsqueeze(0),
                    attention_mask[i].unsqueeze(0), [lens_q[i]], [lens_a[i]])

                # Policy loss from the old/new probability ratio plus value
                # loss from the distance between new value and target.
                loss = self.get_loss(prob_log[i].unsqueeze(0),
                                     value[i].unsqueeze(0), prob_log_new,
                                     value_new, mask[i].unsqueeze(0),
                                     delta[i].unsqueeze(0),
                                     target[i].unsqueeze(0))

                # get_loss returns None for samples it decided to skip. Test
                # for None explicitly: truth-testing the tensor would also
                # skip a loss that happens to equal zero.
                if loss is None:
                    continue

                loss.backward()
                # Gradient clipping is intentionally left disabled:
                # torch.nn.utils.clip_grad_norm_(model_ppo.parameters(), 1.0)
                self.optimizer.step()
                self.optimizer.zero_grad()

    def batched_forward_pass(self, model, input_ids, attention_mask, lens_q,
                             lens_a):
        """Score a padded batch with ``model``.

        Args:
            model: module returning ``(logits, value)`` for the inputs.
            input_ids: padded token ids, one row per sample.
            attention_mask: padding mask matching ``input_ids``.
            lens_q: prompt length of every sample.
            lens_a: continuation length of every sample.

        Returns:
            (prob_log, value, mask), each with one column fewer than
            ``input_ids``: the log-probability of each next token, the value
            estimate at each position, and a 0/1 mask selecting the positions
            that predict a continuation token.
        """
        logits, value = model(input_ids=input_ids,
                              attention_mask=attention_mask)

        # Log-probability assigned to each actual next token.
        prob_log = logprobs_from_logits(logits[:, :-1], input_ids[:, 1:])

        # 1 where the position predicts a non-pad continuation token.
        mask = torch.zeros_like(attention_mask)
        mask[:, :-1] = attention_mask[:, 1:]
        for i in range(len(input_ids)):
            start = lens_q[i] - 1
            end = start + lens_a[i]
            mask[i, :start] = 0
            mask[i, end:] = 0

        # The prediction made at the last position has no target; drop it.
        value = value[:, :-1]
        mask = mask[:, :-1]

        return prob_log, value, mask

    def compute_rewards(self, reward, prob_log, prob_log_ref, mask):
        """Build per-token rewards: KL penalty plus the final scalar reward.

        Every position receives ``-KL_COEF * get_kl(policy, reference)``; the
        sample's scalar reward is added at the last masked position (or at
        position 0 when the mask is entirely zero).

        Returns:
            A tensor stacking the per-token reward rows of all samples.
        """
        reward_kl = []

        for i in range(len(reward)):
            # Per-token KL penalty between policy and reference.
            kl = get_kl(prob_log[i], prob_log_ref[i]) * -self.KL_COEF

            # Add the scalar reward on the last continuation token.
            if (mask[i] == 0).all():
                idx = 0
            else:
                idx = mask[i].nonzero()[-1].item()
            kl[idx] += reward[i]

            reward_kl.append(kl)

        return torch.stack(reward_kl)

    def compute_advantages(self, value, reward_kl, mask):
        """Compute advantages and value targets by a backward recursion.

        Args:
            value: per-token value estimates.
            reward_kl: per-token rewards from ``compute_rewards``.
            mask: 0/1 mask of the continuation positions.

        Returns:
            (value, delta, target): the masked values, the whitened
            advantages, and the value targets (un-whitened advantage + value).
        """
        value = value * mask
        reward_kl = reward_kl * mask

        delta = []
        lens = reward_kl.shape[1]

        # Walk the sequence from the last position to the first.
        for i in reversed(range(lens)):
            # Value of the next step; 0 beyond the last position.
            value_next = 0
            if i < lens - 1:
                value_next = value[:, i + 1]

            # TD error: reward + gamma * next value - value, with gamma = 1.
            d = reward_kl[:, i] + value_next - value[:, i]

            # Previously computed (i.e. later-in-time) accumulated delta.
            last_d = 0
            if delta:
                last_d = delta[-1]

            # Propagate backwards; the factor controls how strongly later
            # steps influence earlier ones.
            delta.append(d + self.GAE_LAMBDA * last_d)

        # Restore chronological order and put the batch dimension first.
        delta = torch.stack(delta[::-1]).transpose(0, 1)

        # Target for the value head, computed before whitening.
        target = delta + value
        delta = masked_whiten(delta, mask)

        return value, delta, target

    def get_loss(self, prob_log, value, prob_log_new, value_new, mask, delta,
                 target):
        """Clipped PPO loss: policy surrogate plus weighted value loss.

        Returns:
            The scalar loss tensor, or ``None`` when the masked mean of the
            probability ratio exceeds ``RATIO_THRESHOLD`` and the sample
            should be skipped.
        """
        # exp(new - old) of log-probs = ratio of new to old probabilities.
        ratio = (prob_log_new - prob_log).exp()

        # A very large ratio suggests the policy is oscillating; skip it.
        if masked_mean(ratio, mask).item() > self.RATIO_THRESHOLD:
            return None

        # Value loss: squared error of the new value against the target ...
        loss_vf1 = (value_new - target)**2
        # ... and of the new value clipped to stay near the old value.
        loss_vf2 = clip_by_value(value_new, value - self.CLIP_RANGE_VALUE,
                                 value + self.CLIP_RANGE_VALUE)
        loss_vf2 = (loss_vf2 - target)**2
        # Take the larger (more pessimistic) of the two.
        loss_vf = 0.5 * masked_mean(torch.max(loss_vf1, loss_vf2), mask)

        # Policy loss: unclipped and clipped surrogate, again the larger one.
        loss_surr1 = -delta * ratio
        loss_surr2 = -delta * ratio.clamp(1.0 - self.CLIP_RANGE,
                                          1.0 + self.CLIP_RANGE)
        loss_surr = masked_mean(torch.max(loss_surr1, loss_surr2), mask)

        return loss_surr + self.VF_COEF * loss_vf


# Build the trainer; this creates the Adam optimizer over model_ppo's parameters.
trainer = PPOTrainer()

trainer



<__main__.PPOTrainer at 0x7f04cdfc8190>

In [7]:
from transformers import AutoModelForSequenceClassification

# Sentiment classifier used to score generated reviews, with its own tokenizer.
tokenizer_cls = AutoTokenizer.from_pretrained(
    'tokenizer/lvwerra/distilbert-imdb')
model_cls = AutoModelForSequenceClassification.from_pretrained(
    'model/lvwerra/distilbert-imdb')
model_cls.to(device)

# The classifier is never trained here: disable gradients on every parameter.
for i in model_cls.parameters():
    i.requires_grad_(False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
def get_question():
    """Sample a batch and move it to ``device`` as tensors.

    Returns:
        (label, question): ``label`` is one LongTensor holding all labels;
        ``question`` is a list with one LongTensor of token ids per prompt.
    """
    raw_label, raw_question = get_batch_data()

    question = [torch.LongTensor(ids).to(device) for ids in raw_question]
    label = torch.LongTensor(raw_label).to(device)

    return label, question


# Sample one batch and show the labels together with the first ten prompts.
label, question = get_question()

label, question[:10]

(tensor([0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
         0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
         1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1,
         1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
         0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
         0, 1, 0, 0, 0, 0, 0, 0], device='cuda:0'),
 [tensor([  15, 2215,  257, 3807,  588,  366], device='cuda:0'),
  tensor([   15, 17439, 10884, 26066,   750,   523], device='cuda:0'),
  tensor([  15, 1532,  345,  389, 2045,  329], device='cuda:0'),
  tensor([   15,    40, 13770,  8359,   428,  2646], device='cuda:0'),
  tensor([   16,  1722,   262, 13738,  8146, 17607], device='cuda:0'),
  tensor([  15, 1212, 2646,  815,  423,  587], device='cuda:0'),
  tensor([   16, 12298,    72,    47,  8404,   284], device='cuda:0'),
  tensor([   16, 42322,   314,   561, 18595,   661], device='cud

In [9]:
# Wrapper around the policy model's sampling-based generation.
def generate(input_ids):
    """Sample up to 32 new tokens for each prompt row in ``input_ids``.

    Args:
        input_ids: 2-D tensor of prompt token ids without padding.

    Returns:
        The tensor returned by ``model_gen.generate`` (prompt followed by the
        sampled tokens for every row).
    """
    return model_ppo.model_gen.generate(
        input_ids=input_ids,
        # Callers pass unpadded prompts. Supply an explicit all-ones mask:
        # without it generate() derives the mask from pad_token_id, and the
        # pad id used in this notebook (0) is also the id of a real token, so
        # genuine prompt tokens would be treated as padding.
        attention_mask=torch.ones_like(input_ids),
        min_length=-1,
        # top_k is an integer option; 0 disables top-k filtering.
        top_k=0,
        top_p=1.0,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        max_new_tokens=32,
        eos_token_id=tokenizer.eos_token_id)


def get_answer(question):
    """Generate a continuation for every prompt.

    Args:
        question: list of 1-D token-id tensors (the prompts).

    Returns:
        A list with one 1-D tensor per prompt holding only the generated
        tokens, cut right after the first end-of-sequence token if one was
        generated.
    """
    if len({len(q) for q in question}) == 1:
        # Equal-length prompts can be stacked and generated in one batch.
        sequences = generate(torch.stack(question))
    else:
        # Prompts of different lengths are generated one at a time.
        sequences = [generate(q.unsqueeze(0))[0] for q in question]

    eos_token_id = tokenizer.eos_token_id

    answer = []
    for q, seq in zip(question, sequences):
        # Keep only the generated part, dropping the prompt.
        generated = seq[len(q):]

        # Cut after the first EOS so the padding that batched generation
        # appends to finished rows is removed. Only the generated part is
        # searched, so an EOS id inside the prompt cannot truncate the answer.
        eos_positions = (generated == eos_token_id).nonzero()
        if len(eos_positions) > 0:
            generated = generated[:eos_positions[0].item() + 1]

        answer.append(generated)

    return answer


# Generate continuations for the sampled prompts and show the first ten.
answer = get_answer(question)

answer[:10]

[tensor([  464,  7193,   261,     1,   373,   287, 20550,    11,   612,   714,
           307,  2187,  3923,   546, 28623,   262,  2933,    11,   290,   703,
           339,  1392,  4978,   287,   262, 16479,   287,   262,  1903,   284,
          3095, 11445], device='cuda:0'),
 tensor([ 7138,    13,   679,   318,  4084,   379,   257,   966,   287,   465,
          1981,  3451,   810,   339,   338,   655,   407,   379,   477, 39072,
           355,   281,  6802, 29847,  1671,  1220,  6927,  1671, 11037, 48393,
          1279,  1671], device='cuda:0'),
 tensor([  262,  1388,  3435,   351,  3499, 19063,    11, 13300,  4306,  1100,
          1088,   329,   257, 39679,  3074,   475,   314,  4719,   340,   611,
           345,   765,   284,  5899, 20775,    13, 50256], device='cuda:0'),
 tensor([   13,  1081,  2582,   355, 17369,   465, 12702,  9546,   314,   373,
         21638,   585,     0, 50256], device='cuda:0'),
 tensor([ 2613,   459,     0,  1318,   338,   530,  7650,   414,  7411, 

In [10]:
def get_reward(question, answer, label):
    """Score each generated review with the sentiment classifier.

    Args:
        question: list of prompt token-id tensors (first id is the label token).
        answer: list of generated token-id tensors.
        label: tensor with the target class index of every sample.

    Returns:
        A 1-D tensor with, for every sample, the classifier logit of the
        class given by ``label``.
    """
    texts = []
    for q, a in zip(question, answer):
        # Drop the leading label token before decoding back to text.
        ids = q.tolist()[1:] + a.tolist()
        texts.append(tokenizer.decode(ids))

    encoded = tokenizer_cls(texts,
                            padding=True,
                            truncation=True,
                            max_length=512,
                            return_tensors='pt').to(device)

    with torch.no_grad():
        logits = model_cls(**encoded).logits

    # Pick, per row, the logit at the column given by the target label.
    target_column = label.reshape(-1, 1)
    return logits.gather(1, target_column).squeeze(1)


# Score the generated continuations against their target labels.
reward = get_reward(question, answer, label)

reward

tensor([-0.1023, -2.0593,  0.1034, -2.1943, -0.2854,  0.3657, -2.0234, -0.6897,
        -1.0600,  1.9237, -1.1278, -1.9625,  1.9247,  1.0706,  1.9642, -2.7512,
         0.1869,  0.4816, -1.4677,  1.4251,  1.8278, -2.4232,  0.3808, -2.2452,
        -1.4352, -1.3860,  1.2793,  2.5945,  0.8147,  1.0352, -0.8067,  1.5356,
        -0.4795,  0.6381, -1.9992,  1.6725,  1.0273, -1.3380,  2.6569,  0.8692,
         0.3785, -0.6434, -0.1132,  0.7105,  0.7786, -1.3073, -0.9017, -1.7745,
        -1.1298, -0.0379,  0.9818, -1.9865, -1.9988, -2.0698, -2.3039,  0.5009,
         0.7905, -1.5054, -0.3490, -1.1563,  0.6987,  1.6517, -1.2936, -1.1562,
        -0.6478, -1.3625, -0.8715, -1.3332, -1.3316, -2.4711,  0.1419, -0.8977,
         0.6781, -2.5032,  2.1797, -2.2050,  0.3033,  1.0208,  1.5972,  1.0242,
         0.0768,  1.0985,  2.1549,  1.7992, -2.0123,  2.7790,  0.7879, -0.4145,
         1.1283, -2.4656,  0.3194,  2.4502, -1.3679,  1.6667,  2.4101, -0.8367,
         0.9427,  2.0877, -2.4355, -0.80

In [11]:
# Training loop: sample prompts, generate, score, and run one PPO update.
for epoch in range(200):
    label, question = get_question()
    answer = get_answer(question)
    reward = get_reward(question, answer, label)

    trainer.step(question, answer, reward)

    # Report progress only on every tenth iteration.
    if epoch % 10 != 0:
        continue

    print(epoch, reward.mean().item())

    # Show the first sample; the leading 0/1 in the prompt is the target
    # label (0 = negative review, 1 = positive review).
    print(tokenizer.decode(question[0].tolist()), '->',
          tokenizer.decode(answer[0].tolist()), reward[0].item())

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


0 0.04424596205353737
0Rowan Atkinson's Mr -> . Incredible not starting off flat or unless you were just seriously down to sea out here I did feel sorry because there is an absolutely amazing heist film thats almost -1.7402509450912476


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


10 0.1893816739320755
1Not only is this a ->  work of pure creativity. Its worth buying Justin Kuehne.<|endoftext|> 2.4963550567626953


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


20 0.11868445575237274
0I found this very touching -> .)<|endoftext|> -1.8980457782745361


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


30 0.04239548742771149
0Even die hard John Wayne ->  was able to play that one scene apart by himself. <br /><br />The fight in the basement is a simple scene. That's the low- -0.8326716423034668


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


40 0.09275282919406891
0I couldn't wait to ->  find it on the prairie.<|endoftext|> -0.02781713753938675


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


50 0.38358354568481445
1The first episode set the ->  structure perfectly. It made me hate it.<|endoftext|> -0.7066333889961243


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


60 1.2206898927688599
1May 1938. Hitler in ->  this film is the best.<|endoftext|> 1.5236488580703735


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


70 1.730372428894043
1I disliked this film intensely ->  I wanted to finish it so often I didn't know what to do with the characters we never liked, but this film was amazing!<|endoftext|> 1.5757133960723877


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


80 1.9782875776290894
0This film is a very ->  bad framing.<|endoftext|> 2.4100003242492676


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


90 2.1086392402648926
1It is the early morning ->  in DC the best.<|endoftext|> 2.3762409687042236


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


100 2.1353683471679688
1It hurt to watch this -> , this, but it really felt ripping it into a piece and it makes it very well done.<|endoftext|> 2.6977579593658447


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

110 2.251354455947876
1This was the first of ->  such themes. It was great!<|endoftext|> 2.4420251846313477


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


120 2.341010093688965
1Everyone is surely familiar with ->  the name and is highly appreciated.<|endoftext|> 2.4222381114959717


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

130 2.313877582550049
1Whoever saddled this piece ->  is a great appreciated film.<|endoftext|> 2.3847947120666504


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


140 2.2891697883605957
1I can never fathom ->  this film blue shadows. I think it is wonderful.<|endoftext|> 2.648195743560791


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


150 2.1408190727233887
0Making a film for under ->  this crap!<|endoftext|> 2.4721293449401855


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


160 2.212196111679077
0Lets make a movie ->  about this crap.<|endoftext|> 2.4810938835144043


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


170 2.2233328819274902
0Just saw ICE AGE ->  for a total waste of time.<|endoftext|> 2.480029821395874


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


180 2.2824764251708984
0The apolitical musicians Eva ->  is just awful.<|endoftext|> 2.610349655151367


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


190 2.3515257835388184
1Excellent movie, a realistic ->  look of my life.<|endoftext|> 2.7795515060424805


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
