In [1]:
import torch
import random
import os

os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('lvwerra/gpt2-imdb')
tokenizer.pad_token = tokenizer.eos_token

tokenizer

GPT2TokenizerFast(name_or_path='lvwerra/gpt2-imdb', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [2]:
from datasets import load_dataset, concatenate_datasets

dataset = load_dataset('imdb')
dataset = concatenate_datasets(list(dataset.values()))
dataset = dataset.remove_columns(['label'])

dataset, dataset[0]

(Dataset({
     features: ['text'],
     num_rows: 100000
 }),
 {'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered po

In [3]:
from transformers import AutoModelForCausalLM

model_actor = AutoModelForCausalLM.from_pretrained('model/ppo').to(device)

model_actor.config

GPT2Config {
  "_name_or_path": "model/ppo",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "output_past": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.43.3",
  "use_cache": true,
  "vocab_size": 50257
}

In [4]:
@torch.no_grad()
def get_data(label):
    b = 12
    len_question = 6
    len_answer = 32

    #====question====
    question = random.choices(dataset, k=b)
    question = [str(label) + ' ' + i['text'] for i in question]

    question = tokenizer(question,
                         padding=True,
                         truncation=True,
                         max_length=len_question,
                         return_tensors='pt').input_ids.to(device)

    #====answer====
    answer = model_actor.generate(input_ids=question,
                                  min_length=-1,
                                  max_length=len_question + len_answer,
                                  pad_token_id=tokenizer.pad_token_id,
                                  eos_token_id=tokenizer.eos_token_id,
                                  top_k=0.0,
                                  top_p=1.0,
                                  do_sample=True)

    answer = answer[:, question.shape[1]:]

    return question, answer


get_data(0)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


(tensor([[   15,  1318,   389, 14851,  4847,   284],
         [   15,   887,   691,   780,   340,   318],
         [   15, 33562,   530,   286,   616,   477],
         [   15,  3574,   262,  3670,   290,   644],
         [   15,  1081,  6678,    11,  8502, 29248],
         [   15,   632,   373,   845, 19827,  4964],
         [   15,   314,  1053,  1775,   477,  1478],
         [   15, 16805,    11,   314,  5465,   284],
         [   15,   770,   905,   318,  1107,  1049],
         [   15,   314,  8288, 13614,  3596, 29847],
         [   15,   383, 17366,  2460, 10674,   357],
         [   15, 24755,     0,   314,  1239,  4762]], device='cuda:0'),
 tensor([[  428,  2646,    11,   477,  5968,   287,   428,    13, 16699,  2279,
            546,   428,   318,  5543, 31906,    13,   632,  2331,   284,   423,
            587,   705,  8134,  2588,   425,     6,   422,   262,   717,  3807,
             11,   262],
         [  523,  2089,    11,   345,  9960,   287,   262,  2330,  8701,    13,


In [5]:
for q, a in zip(*get_data(0)):
    print(tokenizer.decode(q), '->', tokenizer.decode(a))
    print('==============')

0 Every once in a while -> , you're decided to immediately see "Stupid Time" and "Stupid" for a list of the worst movies I've ever seen. This is one
0 I am a fan of ->  the comics, and this is the worst movie I have ever seen. It is utterly bad. The story is awful and pointless. The acting is terrible. The
0 I picked this movie up ->  on Netflix and kept watching it. It seems to have only two things in common with all the other skits of the movie: that is it is the worst
0 This movie was released the ->  day before I saw this movie, and I couldn't stop watching. It was the worst movie I had ever seen, and the way it looked and felt bizarre
0 I saw what I believe ->  to be the worst movie I have ever seen. It claims to be the worst movie ever made. The acting in this movie is not terrible and the effects are
0 This is quite possibly the ->  worst movie I've ever seen. It's one of the worst movies I've ever seen. The writing is awful and the plot is poor even if they didn
0 In São Paulo,

In [6]:
for q, a in zip(*get_data(1)):
    print(tokenizer.decode(q), '->', tokenizer.decode(a))
    print('==============')

1 I am completely shocked that ->  this is so far in the horror genre, I think it is one of the best films that I have seen in my life. In this film, we see
1 This film created quite some ->  amazing moments and interesting gameplay of the characters. The music is amazing. The acting was excellent, giving the players an unforgettable feeling. A interesting movie, an exploration
1 The "confusion" ->  dates back to the Iron Man film. This movie really broke new franchises overnight and historically proved essential in most every film and in this movie. It is perfect in
1 I read where the producers ->  run of their own movies and gave TopShelf 2 stars. This is one of the best. It's the funniest and best. It shows how a
1 I should have walked out ->  and bought this movie. This is my favourite movie ever! Everyone watching is always welcome, and the film has amazing effects and costumes! The film is great for
1 STAR RATING: **** -> * This is a frigging wet nightmare. This is not just a mov