In [1]:
import torch
import os

# Point Hugging Face Hub downloads at the mirror endpoint. Kept ahead of the
# `transformers` import so the setting is in place when the library loads.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

from transformers import AutoTokenizer

# Load the tokenizer and reuse its end-of-text token as the padding token;
# later cells rely on it to pad batches.
tokenizer = AutoTokenizer.from_pretrained('lvwerra/gpt2-imdb')
tokenizer.pad_token = tokenizer.eos_token

tokenizer

GPT2TokenizerFast(name_or_path='lvwerra/gpt2-imdb', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [2]:
from datasets import load_dataset, concatenate_datasets

# Load every IMDB split, stack them into a single dataset, and drop the
# 'label' column so that only the review text remains.
dataset = concatenate_datasets(
    list(load_dataset('imdb').values())
).remove_columns(['label'])


def f(data):
    """Collate raw dataset rows into a padded causal-LM batch on `device`.

    Args:
        data: list of dataset rows, each a mapping with a 'text' entry.

    Returns:
        The tokenizer's batch encoding, moved to `device`, with an added
        'labels' entry: a copy of 'input_ids' in which every padded position
        is replaced by -100 (the label value the Hugging Face LM loss
        ignores).
    """
    texts = [row['text'] for row in data]

    # Pad to the longest sample in the batch and cut every sample at 50 tokens.
    batch = tokenizer(texts,
                      padding=True,
                      truncation=True,
                      max_length=50,
                      return_tensors='pt').to(device)

    # The pad token shares its id with the eos token, so selecting padding by
    # `input_ids == pad_token_id` would also hide genuine end-of-text tokens
    # from the loss. The attention mask is 0 exactly at padded positions.
    labels = batch['input_ids'].clone()
    labels[batch['attention_mask'] == 0] = -100
    batch['labels'] = labels

    return batch


# Shuffled batches of four reviews; an incomplete final batch is dropped and
# `f` turns each list of rows into model inputs.
loader = torch.utils.data.DataLoader(
    dataset,
    collate_fn=f,
    batch_size=4,
    drop_last=True,
    shuffle=True,
)

# Show the number of batches alongside one example batch.
len(loader), next(iter(loader))

(25000,
 {'input_ids': tensor([[   40,   973,   284,  2342,   428,   319,  2035, 18804,   393, 47908,
            393, 21932,   897,  1141,   262,   530,  3931,   287,   262,  3095,
           4101,   338,   326,   616,  3397, 45794,   284,   883,  9619,    13,
            314,  1625,  1973,   340,  1811,  1661,   287,  2972,  3354,   290,
           1464,  1043,   340,  3223,    11, 13699,   290, 13899,    13,   314],
         [   40,   373,   523,  6568,   284,   766,   262,  3350,   287,   428,
           3807,   326,   314,   373,  3190,  6655,   379,   703,  3190, 11342,
          18494,   428,  3807,   373,    13,   314,  1842,  1757, 20687,   432,
            475,   314,   423,   645,  2126,   644, 17273,   683,   284,   307,
            257,   636,   286,   428,  1291,  4223,    88,    13,   383, 31752],
         [   40,   460,   470,  1037,   475,  6044,   326,  8082,  3715,   287,
          17610,    11,   618,   262,  3131, 35609, 11173,   503,   286,   530,
            286,

In [3]:
from transformers import AutoModelForCausalLM

# Load the pretrained causal LM, then move its weights onto `device`.
model_actor = AutoModelForCausalLM.from_pretrained('lvwerra/gpt2-imdb')
model_actor = model_actor.to(device)

# Display the model configuration.
model_actor.config

GPT2Config {
  "_name_or_path": "lvwerra/gpt2-imdb",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "output_past": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.43.3",
  "use_cache": true,
  "vocab_size": 50257
}

In [4]:
# Fine-tune the actor on the batches produced by `loader`, printing a sample
# generation every 1000 steps, then save the result to 'model/actor'.
optimizer = torch.optim.Adam(model_actor.parameters(), lr=1e-5)

# Number of leading tokens of a sample that are used as the generation prompt.
prompt_len = 5

# NOTE(review): the transformers docs state that `from_pretrained` returns the
# model in eval mode; if dropout is wanted during fine-tuning, call
# `model_actor.train()` before this loop -- confirm the intent.
for i, data in enumerate(loader):
    # `data` carries 'labels', so the forward pass returns the LM loss.
    out = model_actor(**data)
    out.loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if i % 1000 == 0:
        print(i, len(loader), out.loss.item())

        # Split the first sample of the batch into a short prompt and the
        # reference continuation.
        sample = data['input_ids'][0]
        prompt = sample[:prompt_len]
        chosen = sample[prompt_len:]

        # Pass the prompt's attention mask explicitly: pad and eos share one
        # id, so generate() cannot infer the mask and otherwise warns about
        # unreliable results.
        prompt_mask = data['attention_mask'][0, :prompt_len]

        gen = model_actor.generate(input_ids=prompt.unsqueeze(0),
                                   attention_mask=prompt_mask.unsqueeze(0),
                                   max_length=32,
                                   pad_token_id=tokenizer.pad_token_id,
                                   eos_token_id=tokenizer.eos_token_id)[
                                       0, prompt_len:]

        print('prompt=', tokenizer.decode(prompt))
        print('chosen=', tokenizer.decode(chosen))
        print('gen=', tokenizer.decode(gen))

model_actor.save_pretrained('model/actor')

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


0 25000 3.295994997024536
prompt= This film won't be
chosen=  to everyone's liking, but is certainly an all-time favorite of mine. Only a film like this can combine so many great elements into one entertaining movie.<br /><br />Kathleen Turner is just purely brilliant
gen=  the same as the one that won the Academy Award for Best Picture. It's a very different film. It's a very different film
1000 25000 3.203411102294922
prompt= Anyone who has seen this
chosen=  movie and reviewed it poorly, I would refer them to Roger Ebert's review of this movie. He is one of the most respected Critics in the industry, and he gave it 3 1/2 Stars.<br /><
gen=  movie will know that it is a very good movie. It is a very good movie. It is a very good movie. It is
2000 25000 3.435523271560669
prompt= I gave it a 4
chosen= . It made me laugh and I really like Pamela's work for PETA.<br /><br />With the exception of Courtney Love, Tommy Lee and Pamela Anderson I didn't know much about the other people in thi

22000 25000 3.2404744625091553
prompt= Dark Harvest is a very
chosen=  low budget production made by a bunch of rank amateurs which manages to come off as a kind of semi-professional movie. Unfortunately the poor effects, wooden acting and unoriginal story makes this a very mediocre horror slasher at
gen=  good movie. It is a very good movie. It is a very good movie. It is a very good movie. It is a
23000 25000 3.954042911529541
prompt= Allison Anders motivation in
chosen=  making this film may be obscure, but I'm glad she made it. This, her debut feature, runs like a meandering stream through rivulets of teenage angst and single-mother frustration in small town New Mexico
gen=  this film is to make a movie about a woman who is a woman. She is a woman who is a woman who is a woman
24000 25000 2.8846395015716553
prompt= WOW, it's
chosen=  interesting to see the variety of reviews of this film. The ones who liked it must have been drugged or related to the filmmakers! This movie had good