In [1]:
import torch
import os

# Point HF_ENDPOINT at the hf-mirror.com mirror. This is set before the
# `transformers` import that follows so the setting is already in place when
# the Hugging Face libraries are loaded.
os.environ.update(HF_ENDPOINT='https://hf-mirror.com')

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

from transformers import AutoTokenizer

# Load the tokenizer published with the Pythia-160M checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    'EleutherAI/pythia-160m',
)

# Register '[PAD]' as the padding token; the collate function below asks the
# tokenizer to pad every batch (padding=True).
tokenizer.add_special_tokens(dict(pad_token='[PAD]'))

# Display the tokenizer configuration as the cell output.
tokenizer

GPTNeoXTokenizerFast(name_or_path='EleutherAI/pythia-160m', vocab_size=50254, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<|padding|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	50254: AddedToken("                        ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50255: AddedToken("                       ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50256: AddedToken("                      ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50257: AddedToken("  

In [2]:
from datasets import load_dataset, concatenate_datasets

# Load the 'imdb' dataset, then merge its 'train' and 'test' splits into one
# dataset. Note that `dataset` is rebound: first the split mapping, then the
# merged dataset used by the DataLoader.
dataset = load_dataset('imdb')
dataset = concatenate_datasets([dataset['train'], dataset['test']])


def f(data):
    """Collate a list of examples into a single tokenized batch on ``device``.

    Args:
        data: Sequence of examples, each indexable by ``'text'`` and
            ``'label'``.

    Returns:
        The tokenizer output for the texts (padding enabled, truncated to at
        most 50 tokens, returned as PyTorch tensors) moved to ``device``,
        with an extra float32 ``'labels'`` entry holding the example labels.
    """
    text, label = [], []
    for example in data:
        text.append(example['text'])
        label.append(example['label'])

    batch = tokenizer(text,
                      padding=True,
                      truncation=True,
                      max_length=50,
                      return_tensors='pt')
    batch = batch.to(device)

    # Labels are stored as floats because the critic is trained with a single
    # real-valued output (num_labels=1).
    batch['labels'] = torch.tensor(label, dtype=torch.float32, device=device)

    return batch


# Shuffled batches of 16 examples; a trailing incomplete batch is dropped and
# `f` converts each list of examples into tensors on `device`.
loader = torch.utils.data.DataLoader(
    dataset=dataset,
    collate_fn=f,
    batch_size=16,
    shuffle=True,
    drop_last=True,
)

# Show the number of batches per epoch together with one sample batch.
(len(loader), next(iter(loader)))

(3125,
 {'input_ids': tensor([[27682,   378,  2355,   254,  6114,   347,   399,  5021,    13,   247,
          18439,  2419, 27505,   342,   247,  8489, 33408,  5938,    13,   665,
           4850,   715,  7596,  1223,   703,   434,   562,   387,   253,  1980,
          28974,  2509,   690,  1390,  7017,  9449, 12701,    15,  2732, 15606,
            247, 44753,   660,  2040,  1070,  9239,  1020,  3877,   327,   247],
         [ 4943,   434,  1529,   273,   253, 16952,   434, 10439,    84,   326,
            309,  3698, 10793,   352,  3249,   327,   308,  5883,   390,   401,
           7722,    13,   984,  3738,   352,   778,   320, 33657, 24842,    13,
            352,   310,  6685, 25945,   285,   973, 14001,    13,  5043,   352,
            434,  1694, 44198,  7493,    13,   619,  7583, 12353,  1273,   760],
         [ 1147,  3133,   326, 34999,   556,   644,  2529,   347,   440,    14,
           6549,  2531,  1051,   533,   326,   434,   752,  5579,   310,    15,
           5579, 

In [3]:
from transformers import AutoModelForSequenceClassification

# Critic model: the Pythia-160M checkpoint loaded with a sequence
# classification head that has a single output (num_labels=1).
model_critic = AutoModelForSequenceClassification.from_pretrained(
    'EleutherAI/pythia-160m',
    num_labels=1,
)
model_critic = model_critic.to(device)

# Keep the model's padding id in sync with the tokenizer's '[PAD]' token.
model_critic.config.pad_token_id = tokenizer.pad_token_id

# Display the resulting configuration as the cell output.
model_critic.config

Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at EleutherAI/pythia-160m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPTNeoXConfig {
  "_name_or_path": "EleutherAI/pythia-160m",
  "architectures": [
    "GPTNeoXForCausalLM"
  ],
  "attention_bias": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.1,
  "eos_token_id": 0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neox",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 50277,
  "rope_scaling": null,
  "rotary_emb_base": 10000,
  "rotary_pct": 0.25,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.43.3",
  "use_cache": true,
  "use_parallel_residual": true,
  "vocab_size": 50304
}

In [4]:
# Fine-tune the critic on the IMDB labels and save the result.
optimizer = torch.optim.Adam(model_critic.parameters(), lr=1e-5)

# `from_pretrained` hands the model back in evaluation mode, so switch to
# training mode explicitly; otherwise any dropout layers stay disabled while
# the weights are being optimized.
model_critic.train()

for epoch in range(10):
    for i, data in enumerate(loader):
        # `data` carries 'labels', so the forward pass also returns the loss.
        out = model_critic(**data)
        out.loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if i % 1000 == 0:
            # Progress report only: threshold the single output at 0.5 to get
            # a 0/1 prediction and compare it with the labels of this batch.
            with torch.no_grad():
                preds = (out.logits > 0.5).squeeze(1).long()
                acc = (preds == data['labels'].long()).float().mean()
            print(epoch, i, len(loader), out.loss.item(), acc.item())

# Return to evaluation mode (the mode the model was loaded in) before saving,
# so any later use of `model_critic` for inference behaves as before.
model_critic.eval()
model_critic.save_pretrained('model/critic')

0 0 3125 21.605430603027344 0.4375
0 1000 3125 0.18605837225914001 0.6875
0 2000 3125 0.1550552248954773 0.75
0 3000 3125 0.17063765227794647 0.6875
1 0 3125 0.049937088042497635 0.9375
1 1000 3125 0.08018088340759277 0.875
1 2000 3125 0.05100151151418686 0.9375
1 3000 3125 0.07913466542959213 0.875
2 0 3125 0.08428739011287689 0.9375
2 1000 3125 0.10260467976331711 0.875
2 2000 3125 0.043395109474658966 1.0
2 3000 3125 0.11563587933778763 0.875
3 0 3125 0.022636985406279564 1.0
3 1000 3125 0.03500146418809891 0.9375
3 2000 3125 0.07991153001785278 0.875
3 3000 3125 0.002385583706200123 1.0
4 0 3125 0.03368058800697327 0.9375
4 1000 3125 0.019572339951992035 1.0
4 2000 3125 0.021752173081040382 1.0
4 3000 3125 0.1174311414361 0.8125
5 0 3125 0.0034106860402971506 1.0
5 1000 3125 0.024997200816869736 1.0
5 2000 3125 0.07363016158342361 0.9375
5 3000 3125 0.011998754926025867 1.0
6 0 3125 0.008513586595654488 1.0
6 1000 3125 0.008864747360348701 1.0
6 2000 3125 0.008031263016164303 1.0
6