In [1]:
import torch
import random
import os

# Route Hugging Face downloads through the mirror. This is set ahead of the
# `transformers` import below; presumably the hub client reads the variable
# when it is first imported, so keep this ordering.
os.environ.update({'HF_ENDPOINT': 'https://hf-mirror.com'})

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

from transformers import AutoTokenizer

# Pythia tokenizer, configured to pad on the left with a dedicated '[PAD]'
# token registered as the pad token.
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/pythia-160m')
tokenizer.padding_side = 'left'
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

tokenizer

GPTNeoXTokenizerFast(name_or_path='EleutherAI/pythia-160m', vocab_size=50254, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<|padding|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	50254: AddedToken("                        ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50255: AddedToken("                       ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50256: AddedToken("                      ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50257: AddedToken("   

In [2]:
from datasets import load_dataset, concatenate_datasets

# Load every IMDB split, merge them into a single Dataset, and drop the
# 'label' column so only the review text remains.
dataset = concatenate_datasets(
    [split for split in load_dataset('imdb').values()]
).remove_columns(['label'])

# Show the merged dataset summary together with its first record.
dataset, dataset[0]

(Dataset({
     features: ['text'],
     num_rows: 100000
 }),
 {'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered po

In [3]:
from transformers import AutoModelForCausalLM

# Load the causal LM checkpoint stored under 'model/trl', then move it to the
# device selected earlier in the notebook.
model_actor = AutoModelForCausalLM.from_pretrained('model/trl')
model_actor = model_actor.to(device)

# Display the loaded model's configuration.
model_actor.config

GPTNeoXConfig {
  "_name_or_path": "model/ppo",
  "architectures": [
    "GPTNeoXForCausalLM"
  ],
  "attention_bias": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.1,
  "eos_token_id": 0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neox",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "rope_scaling": null,
  "rotary_emb_base": 10000,
  "rotary_pct": 0.25,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.43.3",
  "use_cache": true,
  "use_parallel_residual": true,
  "vocab_size": 50304
}

In [4]:
#====question====
# Draw 12 reviews at random (random.choices samples with replacement) and keep
# only their text.
question = random.choices(dataset, k=12)
question = [i['text'] for i in question]

# Truncate every review to at most 5 tokens to form the prompt. The full
# encoding is kept so the tokenizer's own attention mask can be handed to
# generate() rather than relying on it being inferred from pad_token_id.
question_encoding = tokenizer(question,
                              padding=True,
                              truncation=True,
                              max_length=5,
                              return_tensors='pt').to(device)
question = question_encoding.input_ids

#====answer====
# Sample a continuation for each prompt. top_k=0 (an int, as generate()
# documents) together with top_p=1.0 is meant to leave the sampling
# distribution unfiltered; max_length=50 is the cap passed to generate().
answer = model_actor.generate(input_ids=question,
                              attention_mask=question_encoding.attention_mask,
                              min_length=-1,
                              max_length=50,
                              pad_token_id=tokenizer.pad_token_id,
                              eos_token_id=tokenizer.eos_token_id,
                              top_k=0,
                              top_p=1.0,
                              do_sample=True)
# Drop the leading prompt columns so only the generated tokens remain.
answer = answer[:, question.shape[1]:]

# Print each prompt next to its sampled continuation.
for q, a in zip(question, answer):
    print(tokenizer.decode(q), '->', tokenizer.decode(a))
    print('==============')

does anyone think that this ->  is one of the funniest movies that I have ever seen? This movie is definitely one of the best Disney movies I have ever seen.It is one of the best films that I have ever seen.<br /><br
Despite being centred around ->  the characters, the story is great. The photography is magnificent. I was surprised by the acting. The cinematography was well done and the plot was fun. <br /><br />The filming was great. This is
I sat with my children -> , and I must say this is one of the best films that I have seen. It is also one of the most exciting films I have ever seen.<br /><br />I highly recommend it.<br /><br />And
I have found this epic ->  movie a lot of fun. It is one of my favorites. I think it is one of the funniest movies I have seen, the actors are perfect, and the writing is marvelous. The story is riveting
This movie is a rare ->  gem. It's one of the best horror movies I've ever seen. It is one of the funniest films I've ever seen. It is one of the bes