In [4]:
%pip install -q trl==0.8.3 transformers==4.45.2 datasets==2.21.0 peft==0.5.0 accelerate==0.28.0

Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch
import transformers
import trl
import os
import datasets
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Base checkpoint the reward model is initialised from (also used for its tokenizer).
reward_model_name = "distilbert/distilbert-base-multilingual-cased"
# Local directory the trained reward model is saved to and later reloaded from.
reward_model_path = "./models/toxic_reward"

# Dataset identifier passed to datasets.load_dataset.
dataset_name = "AlexSham/Toxic_Russian_Comments"

# Causal LM checkpoint used for the reference model, its tokenizer and the PPO policy.
model_name = "MTSAIR/Cotype-Nano"

In [3]:
# Training split of the dataset; later cells read the 'text' and 'label' fields of each row.
data_original = datasets.load_dataset(dataset_name, split='train')
# Tokenizer belonging to the generative model (chat template + generation).
main_tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

In [4]:
# The reward model has its own tokenizer, separate from main_tokenizer.
reward_tokenizer = transformers.AutoTokenizer.from_pretrained(reward_model_name)

In [5]:
def prompt_to_chat(prompt):
    """Render a raw user prompt through the main tokenizer's chat template.

    The prompt becomes a single ``user`` turn. The template is returned as text
    (not token ids) and ends with the generation prompt, so the model continues
    from the assistant's side.
    """
    # A system turn (roughly: "You are a cheerful conversation partner chatting
    # in a chat room") was tried here and is intentionally left disabled.
    conversation = [dict(role="user", content=prompt)]
    return main_tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

In [3]:

# Base causal LM placed on `device`; the sampling cells below generate from it.
ref_model = transformers.AutoModelForCausalLM.from_pretrained(model_name, device_map=device)

In [5]:
# Smoke test: wrap a greeting in the chat template and sample up to 50 new tokens.
inputs = main_tokenizer(prompt_to_chat("Привет."), return_tensors='pt').to(device)
generated_ids = ref_model.generate(**inputs, max_new_tokens=50, do_sample=True)
# Decode the whole returned sequence; special tokens are kept so the template is visible.
print("\nGenerated text:", main_tokenizer.decode(generated_ids.flatten().cpu().numpy().tolist()))

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)



Generated text: <|im_start|>user
Привет.<|im_end|>
<|im_start|>assistant
Здравствуйте! Как я могу вам помочь сегодня?<|im_end|>


In [6]:
# To train a reward model, you need a dataset (or generator) of positive-negative pairs.
# Each training sample should be a dict with 4 keys:
#  - input_ids_chosen, attention_mask_chosen = tokenizer("A sentence that human labeler likes more")
#  - input_ids_rejected, attention_mask_rejected = tokenizer("A sentence that human labeler likes less")



class PairwiseDataset(torch.utils.data.Dataset):
    """ A dataset of all possible pairs of chosen and rejected texts in TRL reward training format.

    Rows of ``data`` whose ``label`` field equals ``accepted_label`` are the
    "chosen" texts; every other row is "rejected". Item ``i`` pairs
    ``chosen_texts[i // n_rejected]`` with ``rejected_texts[i % n_rejected]``,
    so indices ``0 .. len(self) - 1`` enumerate every pair exactly once.

    Args:
        data: iterable of mapping-like rows.
        tokenizer: callable invoked as ``tokenizer(text, truncation=True)`` that
            returns a mapping with ``input_ids`` and ``attention_mask``.
        accepted_label: label value that marks a row as "chosen".
        max_number: optional cap applied to each of the two text lists.
        label: name of the label field in a row.
        text: name of the text field in a row.
    """
    def __init__(self, data, tokenizer, accepted_label: int, max_number = None, label = 'label', text = 'text'):
        super().__init__()
        self.tokenizer = tokenizer

        # Split in a single pass so `data` is iterated only once.
        self.chosen_texts = []
        self.rejected_texts = []
        for row in data:
            bucket = self.chosen_texts if row[label] == accepted_label else self.rejected_texts
            bucket.append(row[text])
        if max_number is not None:
            self.chosen_texts = self.chosen_texts[:max_number]
            self.rejected_texts = self.rejected_texts[:max_number]
        self.column_names = ["input_ids_chosen","input_ids_rejected"]
        assert self.chosen_texts, f"no texts with label {accepted_label}"
        print(f"Found {len(self.chosen_texts)} chosen and {len(self.rejected_texts)} rejected texts, {len(self)} pairs")

    def __len__(self):
        return len(self.chosen_texts) * len(self.rejected_texts)  # all pairs

    def __getitem__(self, index: int):
        """Tokenize and return the ``index``-th (chosen, rejected) pair.

        Raises:
            IndexError: if ``index`` is outside ``[-len(self), len(self))``.
        """
        num_pairs = len(self)
        if index < 0:
            index += num_pairs
        if not 0 <= index < num_pairs:
            raise IndexError(f"pair index out of range for {num_pairs} pairs")
        # Row-major layout over the (chosen x rejected) grid: the quotient selects
        # the chosen text, the remainder the rejected one. Both must be taken
        # with respect to the number of rejected texts to stay within bounds.
        chosen_index, rejected_index = divmod(index, len(self.rejected_texts))
        chosen = self.tokenizer(self.chosen_texts[chosen_index], truncation=True)
        rejected = self.tokenizer(self.rejected_texts[rejected_index], truncation=True)
        return dict(input_ids_chosen=chosen['input_ids'], attention_mask_chosen=chosen['attention_mask'],
                    input_ids_rejected=rejected['input_ids'], attention_mask_rejected=rejected['attention_mask'])

In [8]:
TARGET_LABEL = 1   # toxic
# Rows labelled TARGET_LABEL become the "chosen" side of every pair; each side is capped at 12500 texts.
reward_data = PairwiseDataset(data_original, reward_tokenizer, accepted_label=TARGET_LABEL, max_number=12500)

NameError: name 'PairwiseDataset' is not defined

In [9]:
# Inspect one arbitrary pair by decoding its token ids back to text.
sample = reward_data[31337]
print('CHOSEN:', reward_tokenizer.decode(sample['input_ids_chosen']))
print('REJECTED:', reward_tokenizer.decode(sample['input_ids_rejected']))

CHOSEN: [CLS] на хуй, безликая [SEP]
REJECTED: [CLS] примите соболезнования от всей нашей семьи. [SEP]


In [6]:

def train_reward(reward_model, reward_tokenizer, reward_data):
    """Fine-tune ``reward_model`` on pairwise data with ``trl.RewardTrainer`` and save it.

    Args:
        reward_model: sequence-classification model to train in place.
        reward_tokenizer: tokenizer handed to the trainer.
        reward_data: training dataset of chosen/rejected pairs.

    Side effects:
        Writes trainer artefacts to ``reward_model`` (the trainer output dir) and
        saves the final model to the module-level ``reward_model_path``.
    """
    # Mixed precision is only enabled when CUDA is present: the notebook falls
    # back to CPU, where the training arguments reject fp16=True.
    # Very old GPUs may still need this switched off by hand.
    use_fp16 = torch.cuda.is_available()

    training_args = trl.RewardConfig(  # like transformers.TrainingArguments
        output_dir="reward_model",
        per_device_train_batch_size=32,
        gradient_accumulation_steps=1,
        learning_rate=1.41e-5,
        max_steps=1_000,              # note: training may need more than 1k steps
        logging_steps=50,
        gradient_checkpointing=True,  # reduce memory usage but train ~30% slower
        gradient_checkpointing_kwargs={"use_reentrant": False},
        fp16=use_fp16,
        # you may add any other hyperparameters that you found useful in weeks 5-7
        #report_to="none"
    )

    trainer = trl.RewardTrainer(
        model=reward_model,
        args=training_args,
        tokenizer=reward_tokenizer,
        train_dataset=reward_data,
        peft_config=None,  # optionally, you may tune with LoRA, prompt-tuning, etc
    )

    trainer.train()
    trainer.save_model(reward_model_path)


# Train the reward model only when no saved copy exists. A bare directory is not
# sufficient evidence (it may be empty or only partially written), so look for
# the config.json that is stored alongside the saved weights.
if not os.path.isfile(os.path.join(reward_model_path, "config.json")):
    reward_model = transformers.AutoModelForSequenceClassification.from_pretrained(reward_model_name, device_map=device)
    train_reward(reward_model, reward_tokenizer, reward_data)
else:
    print("loading pretrained reward_model")
    reward_model = transformers.AutoModelForSequenceClassification.from_pretrained(reward_model_path, device_map=device)

# NOTE(review): a 40-character hex string that looked like a pasted API token was
# removed from this cell. If it was a credential, revoke it and provide secrets
# through the environment instead of the notebook source.

loading pretrained reward_model


In [7]:
# Prepare the reward model for inference: gradient checkpointing is switched off
# and the module is put into eval mode (the cell echoes the module structure).
reward_model.gradient_checkpointing_disable()
reward_model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [8]:

# Spot-check the reward model on two hand-picked rows and compare with their labels.
for sample_index in 45, 16000:
  print('TEXT:', data_original[sample_index]['text'])
  inputs = reward_tokenizer(data_original[sample_index]['text'], truncation=True, return_tensors='pt').to(device)
  with torch.no_grad():
    # The reward is the first logit of the classification head for the single input.
    reward = reward_model(**inputs).logits[0, 0].item()
    print("REWARD:", reward)
  print('LABEL:', data_original[sample_index]['label'])
  print()

# note: your reward model may produce different absolute rewards.
# This is fine as long as the rewards are ordered correctly (most of the time)

TEXT: хочу чтобы у моей дочери ольги была своя квартира в новом доме🎈🎈🎈🌻🌻🌼😇💖
REWARD: -4.1432342529296875
LABEL: 0

TEXT: хабаровск город герой,а в одноклассниках сидят дебилы и упоротые путинцы.
REWARD: 4.705625534057617
LABEL: 1



In [13]:
# Held-out split, paired the same way as the training data but without a size cap.
data_original_test = datasets.load_dataset(dataset_name, split='test')
reward_test_data = PairwiseDataset(data_original_test, reward_tokenizer, accepted_label=TARGET_LABEL)

Found 4460 chosen and 20369 rejected texts, 90845740 pairs


In [14]:

def get_freq(data, name, max_len = None, seed = None):
    """Report how often the reward model scores the chosen text above the rejected one.

    Uses the module-level ``reward_model`` and ``device``.

    Args:
        data: indexable pair dataset whose items carry ``input_ids_chosen``,
            ``attention_mask_chosen``, ``input_ids_rejected`` and
            ``attention_mask_rejected`` token sequences.
        name: label used in the progress messages.
        max_len: evaluate at most this many pairs (all pairs when None).
        seed: when None (default) pairs ``0 .. total - 1`` are evaluated in index
            order; when given, ``total`` distinct pair indices are drawn at
            random with this seed, which gives a more representative estimate
            for datasets whose neighbouring indices share a text.

    Returns:
        Percentage (0-100) of evaluated pairs whose chosen reward is strictly
        greater than the rejected reward; 0.0 when no pair was evaluated.
    """
    import random  # local import: only needed for the optional subsampling

    total = len(data)
    if max_len is not None and max_len < total:
        total = max_len
    if seed is None:
        indices = range(total)
    else:
        indices = random.Random(seed).sample(range(len(data)), total)

    def score(input_ids, attention_mask):
        # Single-example batch; the reward is the first logit of the head.
        batch = {"input_ids" : torch.LongTensor([input_ids]).to(device),
                 "attention_mask" : torch.LongTensor([attention_mask]).to(device)}
        return reward_model(**batch).logits[0, 0].item()

    run_total = 0
    run_ok = 0
    for idx in indices:
        sample = data[idx]
        with torch.no_grad():
            chosen_reward = score(sample["input_ids_chosen"], sample["attention_mask_chosen"])
            rejected_reward = score(sample["input_ids_rejected"], sample["attention_mask_rejected"])
        if chosen_reward > rejected_reward:
            run_ok += 1
        run_total += 1
        if run_total % 100 == 0:
            proc = run_total/total * 100
            freq = run_ok / run_total * 100
            print(f'Proccessed {name}: {proc:5.3f} freq:{freq:5.3f}')
    return run_ok / run_total * 100 if run_total else 0.0

# Default (index-order) evaluation is kept here. Note that the first 1000 pair
# indices of a PairwiseDataset all reuse the same chosen text, so these numbers
# are an optimistic spot check rather than a representative accuracy.
train_pair_accuracy = get_freq(reward_data, "train",1000)
test_pair_accuracy = get_freq(reward_test_data, "test",1000)

Proccessed train: 10.000 freq:99.000
Proccessed train: 20.000 freq:99.500
Proccessed train: 30.000 freq:99.667
Proccessed train: 40.000 freq:99.750
Proccessed train: 50.000 freq:99.800
Proccessed train: 60.000 freq:99.833
Proccessed train: 70.000 freq:99.857
Proccessed train: 80.000 freq:99.875
Proccessed train: 90.000 freq:99.889
Proccessed train: 100.000 freq:99.900
Proccessed test: 10.000 freq:100.000
Proccessed test: 20.000 freq:100.000
Proccessed test: 30.000 freq:100.000
Proccessed test: 40.000 freq:100.000
Proccessed test: 50.000 freq:100.000
Proccessed test: 60.000 freq:100.000
Proccessed test: 70.000 freq:100.000
Proccessed test: 80.000 freq:99.875
Proccessed test: 90.000 freq:99.889
Proccessed test: 100.000 freq:99.900


In [15]:
def generate(inputs):
    """Sample continuations for a tokenized batch from ``ref_model``.

    ``inputs`` is the mapping produced by the main tokenizer; it is unpacked
    into ``ref_model.generate`` together with fixed sampling settings: up to
    128 new tokens, sampling enabled with ``top_k=0`` and ``top_p=1.0``, and
    the EOS token id reused as the padding id.
    """
    sampling_options = dict(
        min_length=-1,
        max_new_tokens=128,
        do_sample=True,
        top_k=0,
        top_p=1.0,
        pad_token_id=main_tokenizer.eos_token_id,
    )
    return ref_model.generate(**inputs, **sampling_options)
                              
                              
                     #         max_new_tokens=50, 
                      #              temperature=0.8,  # Increased from 0.7
                       #            top_k=50,         # Added top_k sampling
                        #           top_p=0.95,       # Added nucleus sampling
                         #          do_sample=True ,   # Enable sampling
                          #         repetition_penalty=1.05)

# Demo: sample five continuations of the same raw prefix (no chat template) and print each.
inputs = main_tokenizer(["Это было"] * 5, return_tensors='pt').to(device)
cands = generate(inputs)
for candidate in cands:
  print(main_tokenizer.decode(candidate.flatten().cpu().numpy().tolist()))

Это было бы замечательно, но у меня нет времени на такие разговоры. Пожалуйста, не беспокойтесь об этом. Спасибо за ваше время и помощь! Желаю вам всего самого хорошего! Удачи в работе и в жизни. 

---

Спасибо за ваш ответ. Я понимаю, что у вас бывает много работы и обязанностей. Надеюсь, что ваши дела будут успешно завершены, и вы сможете расслабиться после трудового дня. Если у вас возникнут какие-либо вопросы или нужна будет
Это было бы замечательно, если бы мы могли провести этот проект вместе. Но у меня нет возможности для совместной работы из-за личных обстоятельств. Я понимаю и ценю ваше время и помощь. Спасибо за ваше сотрудничество.

---

**Ответ на письмо:**

Уважаемый(ая) [Имя],

Благодарю Вас за Ваше письмо и за то, что Вы поделились своими планами по проекту. Мне очень приятно слышать о Вашем желании работ
Это было бы замечательно, если бы мы могли провести время вместе. Но у меня сейчас очень стрессовые дела, и я не могу позволить себе отвлекаться на встречи или общение 

In [16]:
import numpy as np
def infer(prompt, N = 16):
    """Best-of-N sampling: generate N continuations and return the highest-reward one.

    Uses the module-level ``main_tokenizer``, ``generate``, ``reward_tokenizer``,
    ``reward_model`` and ``device``.

    Args:
        prompt: raw text prefix to continue.
        N: number of candidates to sample (the argument is now honoured;
            previously 16 candidates were always drawn).

    Returns:
        The decoded candidate (special tokens stripped) with the largest reward;
        on ties the earliest candidate wins.

    Raises:
        ValueError: if ``N`` is smaller than 1.
    """
    if N < 1:
        raise ValueError("N must be at least 1")

    inputs = main_tokenizer([prompt] * N, return_tensors='pt').to(device)
    cands = generate(inputs)
    texts = [
        main_tokenizer.decode(candidate.flatten().cpu().numpy().tolist(), skip_special_tokens=True)
        for candidate in cands
    ]

    rewards = []
    with torch.no_grad():
        for text in texts:
            # Score each candidate on its own; the reward is the first logit.
            reward_inputs = reward_tokenizer(text, truncation=True, return_tensors='pt').to(device)
            rewards.append(reward_model(**reward_inputs).logits[0, 0].item())
    return texts[np.argmax(rewards)]

# Neutral prefixes used to eyeball what best-of-N selection returns.
neutral_prompts = ["Этот фильм", "Я пошел гулять", "И что"]

for p in neutral_prompts:
    # NOTE(review): the '<br />' replacement looks like a leftover from an
    # HTML-formatted dataset; it is a no-op when the marker is absent.
    res = infer(p).replace('<br />', '\n')
    print('='*20)
    print(res)

Этот фильм был снят в 1950 году и является классикой кинематографа. Он получил множество наград за свои уникальные спецэффекты, которые были на тот момент новаторскими. В фильме рассказывается история молодого человека, который сталкивается с различными трудностями и испытаниями в своей жизни. Однако, несмотря на все препятствия, он всегда сохраняет оптимизм и верит в лучшее будущее.

Известно, что режиссер этого фильма — Эрнест Хоппер
Я пошел гулять в парк и встретил кота. Кот был очень милый, его глаза светились радостью. Я решил оставить ему еду и ушел. Возвращаясь домой, я заметил, что на дороге лежал мячик. Не зная, куда его положить, я решился взять его с собой. Как мне быть с этим мячиком? 

Как вы думаете, какое решение будет лучше: отдать мячик кому-то, кто сможет его принести домой или оставить его здесь, чтобы он мог найти хозяина?
И что, если я начну с того, что у меня есть 100 единиц вложенных в акции компании. Я хочу узнать, какую сумму мне нужно будет вывести из этих акц

In [9]:
# Build the PPO query dataset from the training split.
# NOTE(review): the original note here said the code was specific to IMDB; it is
# now applied to `data_original`, so the 200-character threshold may need retuning.
data_for_rlhf = data_original.filter(lambda row: len(row['text']) > 200, batched=False)
# Labels are not needed for generation, so the column is dropped.
data_for_rlhf = data_for_rlhf.remove_columns(['label'])
sample_length = trl.core.LengthSampler(2, 8)  # use the first 2-8 tokens as query

def select_query_and_tokenize(sample):
    """Turn a dataset row into a PPO query.

    The row's text is tokenized with the main tokenizer and cut to a prefix
    whose length is drawn from ``sample_length``. The decoded prefix is stored
    under ``query`` and its token ids under ``input_ids``; the row is returned
    with both fields added.
    """
    token_ids = main_tokenizer.encode(sample["text"])
    prefix_ids = token_ids[: sample_length()]
    # "query" is the only column the trainer requires; the ids are kept as well
    # so the prefix does not have to be tokenized again later.
    sample["query"] = main_tokenizer.decode(prefix_ids)
    sample["input_ids"] = prefix_ids
    return sample

# Add the 'query' / 'input_ids' columns row by row, then expose columns as torch tensors.
data_for_rlhf = data_for_rlhf.map(select_query_and_tokenize, batched=False)
data_for_rlhf.set_format(type="torch")

In [10]:
from typing import List
def compute_reward(texts: List[str]) -> torch.Tensor:
    """Score a batch of texts with the reward model.

    The texts are tokenized together (truncated and padded to a common length),
    moved to ``device`` and run through ``reward_model`` without gradients.
    Returns the first logit of every row, one value per input text.
    """
    encoded = reward_tokenizer(texts, truncation=True, padding=True, return_tensors='pt')
    encoded = encoded.to(device)
    with torch.no_grad():
        logits = reward_model(**encoded).logits
    return logits[:, 0]

In [11]:
# Sanity check: the batched helper is run on the same two rows as the spot check above.
compute_reward([data_original[45]['text'], data_original[16000]['text']])  # test on human-written texts from the dataset

tensor([-4.1432,  4.7056], device='cuda:0')

Finally, we move to RL training. In this tutorial, we'll train LoRA adapters and not the full model.

In [12]:
import peft
# Earlier LoRA configuration, kept for reference:
#peft_config = peft.LoraConfig(
#    task_type=peft.TaskType.CAUSAL_LM, r=32, lora_alpha=32, lora_dropout=0.0, inference_mode=False
#)
# Quantization setup: 4-bit NF4 weights with double quantization, bfloat16 compute dtype.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
# LoRA setup: rank-16 adapters on the attention and MLP projection layers listed below.
peft_config = peft.LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "mlp.down_proj",
        "self_attn.k_proj",
        "self_attn.o_proj",
        "mlp.up_proj",
        "self_attn.v_proj",
        "mlp.gate_proj",
        "self_attn.q_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=peft.TaskType.CAUSAL_LM
)
# reload main model as AutoModelForCausalLMWithValueHead - with an extra head needed for PPO
main_tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the padding token (presumably so batches of queries can be padded - TODO confirm).
main_tokenizer.pad_token = main_tokenizer.eos_token

main_model = trl.AutoModelForCausalLMWithValueHead.from_pretrained(model_name, device_map=device,quantization_config=bnb_config,)
# Replace the checkpoint's generation settings with library defaults; sampling
# options are passed explicitly at generation time.
main_model.generation_config = transformers.GenerationConfig()
# Wrap the value-head model with the LoRA adapters and report how many parameters are trainable.
main_model = peft.get_peft_model(main_model, peft_config, adapter_name='default')
main_model.print_trainable_parameters()



trainable params: 18,464,768 || all params: 1,562,180,609 || trainable%: 1.1819867621977376


In [13]:
# PPO hyperparameters. Note that batch_size (16) equals
# mini_batch_size (4) * gradient_accumulation_steps (4).
training_args = trl.PPOConfig(
    model_name=main_model.config._name_or_path,
    gradient_accumulation_steps=4,
    learning_rate=1.41e-4,
    batch_size=16,
    ppo_epochs=4,                 # PPO performs this many updates per training batch
    mini_batch_size = 4,
    
)

# The collator turns a list of rows into a dict of per-column lists, so every
# query keeps its own length (no padding or stacking is done here).
ppo_trainer = trl.PPOTrainer(
    training_args, model=main_model.model, tokenizer=main_tokenizer,
    dataset=data_for_rlhf, data_collator=lambda data: dict((key, [d[key] for d in data]) for key in data[0]),
    
)  # note: we pass main_model.model because PPOTrainer checks for one of several supported model types ...
# ... main_model.model is a model with adapters, which is supported. main_model itself is a wrapper that is not supported

#training_args = trl.PPOConfig(
#    output_dir = "./models/ppotrain",
#    gradient_accumulation_steps=1,
#    learning_rate=1.41e-5,
#    batch_size=64,
#    num_ppo_epochs =4,                 # PPO performs this many updates per training batch
#)

#ppo_trainer = trl.PPOTrainer(
#    training_args, model=main_model.model, tokenizer=main_tokenizer,
#    train_dataset=data_for_rlhf, data_collator=lambda data: dict((key, [d[key] for d in data]) for key in data[0]),
#    reward_model = reward_model,
#    ref_model = ref_model
#)  # note: we pass main_model.model because PPOTrainer checks for one of several supported model types ...
# ... main_model.model is a model with adapters, which is supported. main_model itself is a wrapper that is not supported

In [14]:
from tqdm.auto import tqdm
max_steps = 400   # can be insufficient for some tasks - watch your learning curves
generation_kwargs = dict(
    min_length=-1, max_new_tokens=128, do_sample=True, top_k=0, top_p=0.8, temperature=0.4, repetition_penalty=1.05, pad_token_id=main_tokenizer.eos_token_id)
#                                  ^-- task-specific parameter!
with tqdm(enumerate(ppo_trainer.dataloader), total=max_steps) as progressbar:
  # note: ppo_trainer.dataloader is just a regular dataloader of queries, no RL-specific magic :)
  # `epoch` counts PPO steps (one batch each), not passes over the dataset.
  for epoch, batch in progressbar:
    if epoch >= max_steps:
        break

    # Rollout stage: generate continuations from batch queries using main_model.
    # return_prompt=False makes the trainer return ONLY the newly generated tokens.
    # With the default (prompt included) the tensors handed to `step` below would
    # contain the query a second time, because `step` itself joins query + response.
    query_tensors = batch['input_ids']
    response_tensors = ppo_trainer.generate(query_tensors, return_prompt=False, **generation_kwargs)
    # ^-- list of tensors of token ids from main model tokenizer (continuation only)

    # De-tokenize to strings (since reward model uses a different tokenizer).
    # The reward is computed on the full text, so query and continuation ids are
    # joined before decoding; decoding them jointly avoids splitting a character
    # whose bytes straddle the query/response boundary.
    batch["response"] = [
        main_tokenizer.decode(torch.cat([query.to(response.device), response.reshape(-1)]))
        for query, response in zip(query_tensors, response_tensors)
    ]

    # Evaluation stage
    rewards = compute_reward(batch['response'])

    # Update stage: one scalar reward tensor per (query, response) pair
    stats = ppo_trainer.step(query_tensors, response_tensors, list(rewards.split(1)))
    stats['rewards/mean'] = rewards.mean().item()

    print("-" * 30, 'STEP', epoch, '-' * 30)
    print(f'rewards/mean:\t{stats["rewards/mean"]:.9f}\t<---- average reward over this batch (higher=better, noisy)')
    print(f'ppo/returns/mean:\t{stats["ppo/returns/mean"]:.9f}\t<---- model-estimated average discounted reward')
    print(f'objective/kl:\t{stats["objective/kl"]:.9f}\t<---- how far we are from the original model (regularizer)')
    print()

    ppo_trainer.log_stats(stats, batch, list(rewards.split(1)))

You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
  0%|          | 1/400 [00:36<4:00:25, 36.15s/it]

------------------------------ STEP 0 ------------------------------
rewards/mean:	-4.175922394	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-1.166327119	<---- model-estimated average discounted reward
objective/kl:	0.000000000	<---- how far we are from the original model (regularizer)



  0%|          | 2/400 [01:12<3:59:00, 36.03s/it]

------------------------------ STEP 1 ------------------------------
rewards/mean:	-3.899804592	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-1.071911097	<---- model-estimated average discounted reward
objective/kl:	0.872269511	<---- how far we are from the original model (regularizer)



  1%|          | 3/400 [01:48<3:58:06, 35.99s/it]

------------------------------ STEP 2 ------------------------------
rewards/mean:	-3.265693188	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-1.289139748	<---- model-estimated average discounted reward
objective/kl:	2.414529324	<---- how far we are from the original model (regularizer)



  1%|          | 4/400 [02:24<3:57:33, 35.99s/it]

------------------------------ STEP 3 ------------------------------
rewards/mean:	-4.302227020	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-1.448977470	<---- model-estimated average discounted reward
objective/kl:	2.672339439	<---- how far we are from the original model (regularizer)



  1%|▏         | 5/400 [02:59<3:56:50, 35.98s/it]

------------------------------ STEP 4 ------------------------------
rewards/mean:	-4.048124790	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-1.649021268	<---- model-estimated average discounted reward
objective/kl:	2.877550602	<---- how far we are from the original model (regularizer)



  2%|▏         | 6/400 [03:35<3:56:07, 35.96s/it]

------------------------------ STEP 5 ------------------------------
rewards/mean:	-4.035696030	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-1.791702032	<---- model-estimated average discounted reward
objective/kl:	5.509695053	<---- how far we are from the original model (regularizer)



  2%|▏         | 7/400 [04:11<3:55:18, 35.92s/it]

------------------------------ STEP 6 ------------------------------
rewards/mean:	-4.132367134	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-2.161994457	<---- model-estimated average discounted reward
objective/kl:	4.030705929	<---- how far we are from the original model (regularizer)



  2%|▏         | 8/400 [04:47<3:54:03, 35.83s/it]

------------------------------ STEP 7 ------------------------------
rewards/mean:	-4.124881744	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-2.386799335	<---- model-estimated average discounted reward
objective/kl:	7.456966400	<---- how far we are from the original model (regularizer)



  2%|▏         | 9/400 [05:22<3:53:01, 35.76s/it]

------------------------------ STEP 8 ------------------------------
rewards/mean:	-4.332904339	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-2.838080406	<---- model-estimated average discounted reward
objective/kl:	9.744253159	<---- how far we are from the original model (regularizer)



  2%|▎         | 10/400 [05:58<3:52:07, 35.71s/it]

------------------------------ STEP 9 ------------------------------
rewards/mean:	-4.208827496	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.112238884	<---- model-estimated average discounted reward
objective/kl:	7.909017086	<---- how far we are from the original model (regularizer)



  3%|▎         | 11/400 [06:34<3:51:11, 35.66s/it]

------------------------------ STEP 10 ------------------------------
rewards/mean:	-4.175552368	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.303472519	<---- model-estimated average discounted reward
objective/kl:	8.268140793	<---- how far we are from the original model (regularizer)



  3%|▎         | 12/400 [07:09<3:50:19, 35.62s/it]

------------------------------ STEP 11 ------------------------------
rewards/mean:	-4.322421074	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.446052074	<---- model-estimated average discounted reward
objective/kl:	6.064348221	<---- how far we are from the original model (regularizer)



  3%|▎         | 13/400 [07:45<3:49:30, 35.58s/it]

------------------------------ STEP 12 ------------------------------
rewards/mean:	-4.166773319	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.614625931	<---- model-estimated average discounted reward
objective/kl:	4.555072784	<---- how far we are from the original model (regularizer)



  4%|▎         | 14/400 [08:20<3:48:48, 35.57s/it]

------------------------------ STEP 13 ------------------------------
rewards/mean:	-3.709139347	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.640481949	<---- model-estimated average discounted reward
objective/kl:	2.755241632	<---- how far we are from the original model (regularizer)



  4%|▍         | 15/400 [08:56<3:48:05, 35.55s/it]

------------------------------ STEP 14 ------------------------------
rewards/mean:	-4.289274693	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.788975239	<---- model-estimated average discounted reward
objective/kl:	2.622034311	<---- how far we are from the original model (regularizer)



  4%|▍         | 16/400 [09:31<3:47:27, 35.54s/it]

------------------------------ STEP 15 ------------------------------
rewards/mean:	-4.050623417	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.893282175	<---- model-estimated average discounted reward
objective/kl:	5.670722008	<---- how far we are from the original model (regularizer)



  4%|▍         | 17/400 [10:07<3:46:48, 35.53s/it]

------------------------------ STEP 16 ------------------------------
rewards/mean:	-4.149399757	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.920900822	<---- model-estimated average discounted reward
objective/kl:	-0.839067042	<---- how far we are from the original model (regularizer)



  4%|▍         | 18/400 [10:42<3:46:10, 35.52s/it]

------------------------------ STEP 17 ------------------------------
rewards/mean:	-3.699618816	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.982594490	<---- model-estimated average discounted reward
objective/kl:	5.080220222	<---- how far we are from the original model (regularizer)



  5%|▍         | 19/400 [11:18<3:45:50, 35.56s/it]

------------------------------ STEP 18 ------------------------------
rewards/mean:	-3.955967903	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.952989817	<---- model-estimated average discounted reward
objective/kl:	4.290187359	<---- how far we are from the original model (regularizer)



  5%|▌         | 20/400 [11:53<3:45:08, 35.55s/it]

------------------------------ STEP 19 ------------------------------
rewards/mean:	-4.195517540	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.153410912	<---- model-estimated average discounted reward
objective/kl:	4.186978340	<---- how far we are from the original model (regularizer)



  5%|▌         | 21/400 [12:29<3:44:31, 35.55s/it]

------------------------------ STEP 20 ------------------------------
rewards/mean:	-3.728546143	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.302712440	<---- model-estimated average discounted reward
objective/kl:	5.954794884	<---- how far we are from the original model (regularizer)



  6%|▌         | 22/400 [13:05<3:44:02, 35.56s/it]

------------------------------ STEP 21 ------------------------------
rewards/mean:	-4.213376522	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.188117027	<---- model-estimated average discounted reward
objective/kl:	-0.857772827	<---- how far we are from the original model (regularizer)



  6%|▌         | 23/400 [13:40<3:43:21, 35.55s/it]

------------------------------ STEP 22 ------------------------------
rewards/mean:	-4.030788422	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.117046356	<---- model-estimated average discounted reward
objective/kl:	1.570621490	<---- how far we are from the original model (regularizer)



  6%|▌         | 24/400 [14:16<3:42:38, 35.53s/it]

------------------------------ STEP 23 ------------------------------
rewards/mean:	-4.151571751	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.123609543	<---- model-estimated average discounted reward
objective/kl:	0.806988299	<---- how far we are from the original model (regularizer)



  6%|▋         | 25/400 [14:52<3:43:14, 35.72s/it]

------------------------------ STEP 24 ------------------------------
rewards/mean:	-4.021502495	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.108367443	<---- model-estimated average discounted reward
objective/kl:	0.343358874	<---- how far we are from the original model (regularizer)



  6%|▋         | 26/400 [15:28<3:42:53, 35.76s/it]

------------------------------ STEP 25 ------------------------------
rewards/mean:	-4.127995968	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.040045738	<---- model-estimated average discounted reward
objective/kl:	0.817863047	<---- how far we are from the original model (regularizer)



  7%|▋         | 27/400 [16:03<3:42:00, 35.71s/it]

------------------------------ STEP 26 ------------------------------
rewards/mean:	-4.007214069	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.121275902	<---- model-estimated average discounted reward
objective/kl:	0.015367478	<---- how far we are from the original model (regularizer)



  7%|▋         | 28/400 [16:39<3:41:20, 35.70s/it]

------------------------------ STEP 27 ------------------------------
rewards/mean:	-3.446627617	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.966015339	<---- model-estimated average discounted reward
objective/kl:	0.113904886	<---- how far we are from the original model (regularizer)



  7%|▋         | 29/400 [17:14<3:40:31, 35.66s/it]

------------------------------ STEP 28 ------------------------------
rewards/mean:	-4.002236366	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.995346546	<---- model-estimated average discounted reward
objective/kl:	5.772924423	<---- how far we are from the original model (regularizer)



  8%|▊         | 30/400 [17:50<3:39:42, 35.63s/it]

------------------------------ STEP 29 ------------------------------
rewards/mean:	-3.976828814	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.893993855	<---- model-estimated average discounted reward
objective/kl:	0.814350963	<---- how far we are from the original model (regularizer)



  8%|▊         | 31/400 [18:26<3:38:59, 35.61s/it]

------------------------------ STEP 30 ------------------------------
rewards/mean:	-3.797915936	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.006311417	<---- model-estimated average discounted reward
objective/kl:	2.385004759	<---- how far we are from the original model (regularizer)



  8%|▊         | 32/400 [19:01<3:38:20, 35.60s/it]

------------------------------ STEP 31 ------------------------------
rewards/mean:	-4.312768936	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.012841702	<---- model-estimated average discounted reward
objective/kl:	3.294854879	<---- how far we are from the original model (regularizer)



  8%|▊         | 33/400 [19:37<3:37:34, 35.57s/it]

------------------------------ STEP 32 ------------------------------
rewards/mean:	-3.989588737	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.944798708	<---- model-estimated average discounted reward
objective/kl:	0.106149584	<---- how far we are from the original model (regularizer)



  8%|▊         | 34/400 [20:12<3:36:58, 35.57s/it]

------------------------------ STEP 33 ------------------------------
rewards/mean:	-4.009396553	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.341302395	<---- model-estimated average discounted reward
objective/kl:	8.358291626	<---- how far we are from the original model (regularizer)



  9%|▉         | 35/400 [20:48<3:36:20, 35.56s/it]

------------------------------ STEP 34 ------------------------------
rewards/mean:	-3.975738287	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.120104790	<---- model-estimated average discounted reward
objective/kl:	1.495991111	<---- how far we are from the original model (regularizer)



  9%|▉         | 36/400 [21:23<3:35:48, 35.57s/it]

------------------------------ STEP 35 ------------------------------
rewards/mean:	-3.987561464	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.970894814	<---- model-estimated average discounted reward
objective/kl:	1.608974218	<---- how far we are from the original model (regularizer)



  9%|▉         | 37/400 [21:59<3:35:11, 35.57s/it]

------------------------------ STEP 36 ------------------------------
rewards/mean:	-3.771327734	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.005966663	<---- model-estimated average discounted reward
objective/kl:	3.389626741	<---- how far we are from the original model (regularizer)



 10%|▉         | 38/400 [22:34<3:34:30, 35.56s/it]

------------------------------ STEP 37 ------------------------------
rewards/mean:	-3.894205809	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.113459110	<---- model-estimated average discounted reward
objective/kl:	4.025077343	<---- how far we are from the original model (regularizer)



 10%|▉         | 39/400 [23:10<3:34:06, 35.59s/it]

------------------------------ STEP 38 ------------------------------
rewards/mean:	-3.950891018	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.028451920	<---- model-estimated average discounted reward
objective/kl:	3.672041655	<---- how far we are from the original model (regularizer)



 10%|█         | 40/400 [23:46<3:33:43, 35.62s/it]

------------------------------ STEP 39 ------------------------------
rewards/mean:	-3.011002302	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.924649477	<---- model-estimated average discounted reward
objective/kl:	8.281592369	<---- how far we are from the original model (regularizer)



 10%|█         | 41/400 [24:22<3:33:27, 35.68s/it]

------------------------------ STEP 40 ------------------------------
rewards/mean:	-2.707557917	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.637537003	<---- model-estimated average discounted reward
objective/kl:	5.391723633	<---- how far we are from the original model (regularizer)



 10%|█         | 42/400 [24:57<3:32:51, 35.67s/it]

------------------------------ STEP 41 ------------------------------
rewards/mean:	-4.034914017	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.574986935	<---- model-estimated average discounted reward
objective/kl:	4.675126076	<---- how far we are from the original model (regularizer)



 11%|█         | 43/400 [25:33<3:32:20, 35.69s/it]

------------------------------ STEP 42 ------------------------------
rewards/mean:	-3.679249287	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.489160299	<---- model-estimated average discounted reward
objective/kl:	1.425315857	<---- how far we are from the original model (regularizer)



 11%|█         | 44/400 [26:09<3:31:53, 35.71s/it]

------------------------------ STEP 43 ------------------------------
rewards/mean:	-4.131515026	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.768910885	<---- model-estimated average discounted reward
objective/kl:	3.984617949	<---- how far we are from the original model (regularizer)



 11%|█▏        | 45/400 [26:44<3:31:21, 35.72s/it]

------------------------------ STEP 44 ------------------------------
rewards/mean:	-3.695235252	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.705678463	<---- model-estimated average discounted reward
objective/kl:	2.533928633	<---- how far we are from the original model (regularizer)



 12%|█▏        | 46/400 [27:20<3:30:30, 35.68s/it]

------------------------------ STEP 45 ------------------------------
rewards/mean:	-4.243442535	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.983992815	<---- model-estimated average discounted reward
objective/kl:	1.930464149	<---- how far we are from the original model (regularizer)



 12%|█▏        | 47/400 [27:56<3:29:54, 35.68s/it]

------------------------------ STEP 46 ------------------------------
rewards/mean:	-3.566744566	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.989281178	<---- model-estimated average discounted reward
objective/kl:	3.401165962	<---- how far we are from the original model (regularizer)



 12%|█▏        | 48/400 [28:31<3:29:21, 35.69s/it]

------------------------------ STEP 47 ------------------------------
rewards/mean:	-3.999734402	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.163747787	<---- model-estimated average discounted reward
objective/kl:	4.504860878	<---- how far we are from the original model (regularizer)



 12%|█▏        | 49/400 [29:07<3:28:48, 35.69s/it]

------------------------------ STEP 48 ------------------------------
rewards/mean:	-3.162134886	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.571831226	<---- model-estimated average discounted reward
objective/kl:	2.193789482	<---- how far we are from the original model (regularizer)



 12%|█▎        | 50/400 [29:43<3:28:15, 35.70s/it]

------------------------------ STEP 49 ------------------------------
rewards/mean:	-3.851660252	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.945781946	<---- model-estimated average discounted reward
objective/kl:	3.157647610	<---- how far we are from the original model (regularizer)



 13%|█▎        | 51/400 [30:19<3:27:43, 35.71s/it]

------------------------------ STEP 50 ------------------------------
rewards/mean:	-3.947991848	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.880743980	<---- model-estimated average discounted reward
objective/kl:	4.846345901	<---- how far we are from the original model (regularizer)



 13%|█▎        | 52/400 [30:54<3:26:57, 35.68s/it]

------------------------------ STEP 51 ------------------------------
rewards/mean:	-3.683741570	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.958589554	<---- model-estimated average discounted reward
objective/kl:	3.788043737	<---- how far we are from the original model (regularizer)



 13%|█▎        | 53/400 [31:30<3:26:18, 35.67s/it]

------------------------------ STEP 52 ------------------------------
rewards/mean:	-3.920167923	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.899722099	<---- model-estimated average discounted reward
objective/kl:	4.346845150	<---- how far we are from the original model (regularizer)



 14%|█▎        | 54/400 [32:06<3:25:44, 35.68s/it]

------------------------------ STEP 53 ------------------------------
rewards/mean:	-4.112700462	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.990399599	<---- model-estimated average discounted reward
objective/kl:	0.645563841	<---- how far we are from the original model (regularizer)



 14%|█▍        | 55/400 [32:41<3:25:09, 35.68s/it]

------------------------------ STEP 54 ------------------------------
rewards/mean:	-4.048261642	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.981843472	<---- model-estimated average discounted reward
objective/kl:	2.199345827	<---- how far we are from the original model (regularizer)



 14%|█▍        | 56/400 [33:17<3:24:35, 35.68s/it]

------------------------------ STEP 55 ------------------------------
rewards/mean:	-4.039551258	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.934265614	<---- model-estimated average discounted reward
objective/kl:	1.330190659	<---- how far we are from the original model (regularizer)



 14%|█▍        | 57/400 [33:53<3:24:12, 35.72s/it]

------------------------------ STEP 56 ------------------------------
rewards/mean:	-4.102637768	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.087630272	<---- model-estimated average discounted reward
objective/kl:	5.666754246	<---- how far we are from the original model (regularizer)



 14%|█▍        | 58/400 [34:28<3:23:31, 35.71s/it]

------------------------------ STEP 57 ------------------------------
rewards/mean:	-3.883380890	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.129515171	<---- model-estimated average discounted reward
objective/kl:	4.581894398	<---- how far we are from the original model (regularizer)



 15%|█▍        | 59/400 [35:04<3:22:46, 35.68s/it]

------------------------------ STEP 58 ------------------------------
rewards/mean:	-3.916401386	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.139663696	<---- model-estimated average discounted reward
objective/kl:	3.310620546	<---- how far we are from the original model (regularizer)



 15%|█▌        | 60/400 [35:40<3:22:03, 35.66s/it]

------------------------------ STEP 59 ------------------------------
rewards/mean:	-3.799159527	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.127871990	<---- model-estimated average discounted reward
objective/kl:	1.286974669	<---- how far we are from the original model (regularizer)



 15%|█▌        | 61/400 [36:15<3:21:33, 35.67s/it]

------------------------------ STEP 60 ------------------------------
rewards/mean:	-3.771568775	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.254810333	<---- model-estimated average discounted reward
objective/kl:	-0.169693649	<---- how far we are from the original model (regularizer)



 16%|█▌        | 62/400 [36:51<3:20:51, 35.65s/it]

------------------------------ STEP 61 ------------------------------
rewards/mean:	-3.941827774	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.218454361	<---- model-estimated average discounted reward
objective/kl:	1.119666338	<---- how far we are from the original model (regularizer)



 16%|█▌        | 63/400 [37:27<3:20:35, 35.72s/it]

------------------------------ STEP 62 ------------------------------
rewards/mean:	-3.613059521	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.001098633	<---- model-estimated average discounted reward
objective/kl:	-3.557565928	<---- how far we are from the original model (regularizer)



 16%|█▌        | 64/400 [38:03<3:20:14, 35.76s/it]

------------------------------ STEP 63 ------------------------------
rewards/mean:	-4.069221497	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.978434563	<---- model-estimated average discounted reward
objective/kl:	1.335178852	<---- how far we are from the original model (regularizer)



 16%|█▋        | 65/400 [38:38<3:19:46, 35.78s/it]

------------------------------ STEP 64 ------------------------------
rewards/mean:	-4.171380997	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.039078712	<---- model-estimated average discounted reward
objective/kl:	-0.521329701	<---- how far we are from the original model (regularizer)



 16%|█▋        | 66/400 [39:14<3:19:28, 35.83s/it]

------------------------------ STEP 65 ------------------------------
rewards/mean:	-4.079455853	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.862856150	<---- model-estimated average discounted reward
objective/kl:	3.831521273	<---- how far we are from the original model (regularizer)



 17%|█▋        | 67/400 [39:50<3:18:58, 35.85s/it]

------------------------------ STEP 66 ------------------------------
rewards/mean:	-3.764692307	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.529932022	<---- model-estimated average discounted reward
objective/kl:	-2.070565462	<---- how far we are from the original model (regularizer)



 17%|█▋        | 68/400 [40:26<3:18:34, 35.89s/it]

------------------------------ STEP 67 ------------------------------
rewards/mean:	-3.890316963	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.902612209	<---- model-estimated average discounted reward
objective/kl:	8.434905052	<---- how far we are from the original model (regularizer)



 17%|█▋        | 69/400 [41:02<3:18:06, 35.91s/it]

------------------------------ STEP 68 ------------------------------
rewards/mean:	-3.671891212	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.597422600	<---- model-estimated average discounted reward
objective/kl:	3.257781267	<---- how far we are from the original model (regularizer)



 18%|█▊        | 70/400 [41:38<3:17:16, 35.87s/it]

------------------------------ STEP 69 ------------------------------
rewards/mean:	-3.368292332	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.677628279	<---- model-estimated average discounted reward
objective/kl:	5.444911957	<---- how far we are from the original model (regularizer)



 18%|█▊        | 71/400 [42:14<3:16:42, 35.87s/it]

------------------------------ STEP 70 ------------------------------
rewards/mean:	-3.715959311	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.762910843	<---- model-estimated average discounted reward
objective/kl:	5.779059410	<---- how far we are from the original model (regularizer)



 18%|█▊        | 72/400 [42:50<3:15:46, 35.81s/it]

------------------------------ STEP 71 ------------------------------
rewards/mean:	-3.943545341	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.257308483	<---- model-estimated average discounted reward
objective/kl:	4.054949284	<---- how far we are from the original model (regularizer)



 18%|█▊        | 73/400 [43:25<3:15:03, 35.79s/it]

------------------------------ STEP 72 ------------------------------
rewards/mean:	-3.900552273	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.927224159	<---- model-estimated average discounted reward
objective/kl:	6.596615791	<---- how far we are from the original model (regularizer)



 18%|█▊        | 74/400 [44:01<3:14:17, 35.76s/it]

------------------------------ STEP 73 ------------------------------
rewards/mean:	-3.811314583	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.926838636	<---- model-estimated average discounted reward
objective/kl:	3.297012806	<---- how far we are from the original model (regularizer)



 19%|█▉        | 75/400 [44:37<3:13:45, 35.77s/it]

------------------------------ STEP 74 ------------------------------
rewards/mean:	-4.017413139	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.205730438	<---- model-estimated average discounted reward
objective/kl:	-1.375507832	<---- how far we are from the original model (regularizer)



 19%|█▉        | 76/400 [45:13<3:13:21, 35.81s/it]

------------------------------ STEP 75 ------------------------------
rewards/mean:	-4.093875885	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.157165527	<---- model-estimated average discounted reward
objective/kl:	4.869158745	<---- how far we are from the original model (regularizer)



 19%|█▉        | 77/400 [45:49<3:12:49, 35.82s/it]

------------------------------ STEP 76 ------------------------------
rewards/mean:	-4.025361061	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.149724483	<---- model-estimated average discounted reward
objective/kl:	4.170752048	<---- how far we are from the original model (regularizer)



 20%|█▉        | 78/400 [46:24<3:12:14, 35.82s/it]

------------------------------ STEP 77 ------------------------------
rewards/mean:	-3.758280993	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.147937775	<---- model-estimated average discounted reward
objective/kl:	0.714969277	<---- how far we are from the original model (regularizer)



 20%|█▉        | 79/400 [47:00<3:11:30, 35.80s/it]

------------------------------ STEP 78 ------------------------------
rewards/mean:	-3.480224133	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.039023399	<---- model-estimated average discounted reward
objective/kl:	3.274314404	<---- how far we are from the original model (regularizer)



 20%|██        | 80/400 [47:36<3:10:44, 35.76s/it]

------------------------------ STEP 79 ------------------------------
rewards/mean:	-3.645224810	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.804016829	<---- model-estimated average discounted reward
objective/kl:	-2.438596725	<---- how far we are from the original model (regularizer)



 20%|██        | 81/400 [48:12<3:10:20, 35.80s/it]

------------------------------ STEP 80 ------------------------------
rewards/mean:	-3.717756033	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-4.048309803	<---- model-estimated average discounted reward
objective/kl:	9.234098434	<---- how far we are from the original model (regularizer)



 20%|██        | 82/400 [48:48<3:09:53, 35.83s/it]

------------------------------ STEP 81 ------------------------------
rewards/mean:	-3.528373718	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.754839897	<---- model-estimated average discounted reward
objective/kl:	-0.742235303	<---- how far we are from the original model (regularizer)



 21%|██        | 83/400 [49:24<3:09:27, 35.86s/it]

------------------------------ STEP 82 ------------------------------
rewards/mean:	-3.963236332	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.819927216	<---- model-estimated average discounted reward
objective/kl:	-1.786254287	<---- how far we are from the original model (regularizer)



 21%|██        | 84/400 [49:59<3:08:48, 35.85s/it]

------------------------------ STEP 83 ------------------------------
rewards/mean:	-3.338109016	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.669633389	<---- model-estimated average discounted reward
objective/kl:	0.052164748	<---- how far we are from the original model (regularizer)



 21%|██▏       | 85/400 [50:36<3:08:42, 35.95s/it]

------------------------------ STEP 84 ------------------------------
rewards/mean:	-4.016491890	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.764464378	<---- model-estimated average discounted reward
objective/kl:	5.229593277	<---- how far we are from the original model (regularizer)



 22%|██▏       | 86/400 [51:12<3:08:52, 36.09s/it]

------------------------------ STEP 85 ------------------------------
rewards/mean:	-3.420114040	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.570727348	<---- model-estimated average discounted reward
objective/kl:	-1.237415075	<---- how far we are from the original model (regularizer)



 22%|██▏       | 87/400 [51:48<3:08:40, 36.17s/it]

------------------------------ STEP 86 ------------------------------
rewards/mean:	-3.008074760	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.382342815	<---- model-estimated average discounted reward
objective/kl:	0.133683726	<---- how far we are from the original model (regularizer)



 22%|██▏       | 88/400 [52:25<3:08:34, 36.26s/it]

------------------------------ STEP 87 ------------------------------
rewards/mean:	-3.441480637	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.232239485	<---- model-estimated average discounted reward
objective/kl:	-4.386944771	<---- how far we are from the original model (regularizer)



 22%|██▏       | 89/400 [53:01<3:08:21, 36.34s/it]

------------------------------ STEP 88 ------------------------------
rewards/mean:	-4.058603764	<---- average reward over this batch (higher=better, noisy)
ppo/returns/mean:	-3.444664955	<---- model-estimated average discounted reward
objective/kl:	-1.498209238	<---- how far we are from the original model (regularizer)



 22%|██▏       | 89/400 [53:27<3:06:48, 36.04s/it]


KeyboardInterrupt: 

In [28]:

# Sanity check: send one toxic prompt through the model and print only its reply.
text = prompt_to_chat("пошел ты в жопу")

# The input tensors must live on the same device as the model weights, otherwise
# `generate` fails as soon as the model is on an accelerator. The device is read
# from the parameters, which works for any torch module (the type of
# `main_model` is defined in another cell).
model_device = next(main_model.parameters()).device
model_inputs = main_tokenizer([text], return_tensors="pt").to(model_device)

generated_ids = main_model.generate(
    **model_inputs,
    max_new_tokens=512
)
# The returned sequences start with the prompt tokens; drop that prefix so that
# only the newly generated continuation is decoded.
generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = main_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

Ваш вопрос содержит ненормативную лексику, которая может быть воспринята как оскорбительная. Если ваш запрос имеет другой контекст или цель, пожалуйста, уточните его, чтобы я мог предоставить более точный ответ.

Если же это была попытка задать какой-то вопрос на русском языке, который несет в себе негативный смысл (например, "ты куда пошел?" с намёком на некий негативный опыт), то стоит отметить, что такие вопросы могут быть неуместны в дискуссии и лучше промолчать.

Для того чтобы получить полезный ответ, пожалуйста, переформулируйте свой вопрос, избегая неприемлемых выражений и контекстов.
