In [1]:
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
import torch.optim as optim
from torch import nn

import json 
import random 

from datasets import load_dataset

torch.cuda.empty_cache()



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
binary_path: c:\Python311\Lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll
CUDA SETUP: Loading binary c:\Python311\Lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll...


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:

# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set padding token
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)
model.to(device)

# IMPORTANT: Update the padding token ID in the model configuration
model.config.pad_token_id = model.config.eos_token_id

# Access the config to get the context size (max_position_embeddings)
context_size = model.config.max_position_embeddings
print(f"The context size of this model is {context_size} tokens.")


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The context size of this model is 1024 tokens.


### Load dataset from HF

The HG dataset we expect should have: prompt, chosen, rejected

In [4]:
# Load the SQuAD dataset
dataset = load_dataset("yitingxie/rlhf-reward-datasets")

train_dataset = dataset['train']
test_dataset = dataset['test']

# Shuffle the indices
total_samples = len(dataset["train"])
all_indices = list(range(total_samples))
random.shuffle(all_indices)

# Select 10,000 random indices
selected_indices = all_indices[:4096]

# Get the 10,000 random samples
train_dataset = dataset["train"].select(selected_indices)

Found cached dataset parquet (C:/Users/juan_/.cache/huggingface/datasets/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Prepare data
prompts = [item['prompt'] for item in train_dataset]
chosen = [item['chosen'] for item in train_dataset]
rejected = [item['rejected'] for item in train_dataset]
# Tokenization
max_length = 512  # Choose a max_length that fits your data
encodings = tokenizer(prompts, chosen, rejected, truncation=True, padding='max_length', max_length=max_length)


In [None]:
chosen_input_ids = []
rejected_input_ids = []
chosen_attention_mask = []
rejected_attention_mask = []


for i, prompt  in enumerate(prompts):
    chosen_input_ids.append(tokenizer(prompts[i], chosen[i], truncation=True, padding='max_length', max_length=max_length)['input_ids'])
    chosen_attention_mask.append(tokenizer(prompts[i], chosen[i], truncation=True, padding='max_length', max_length=max_length)['attention_mask'])
    
    # Assuming answer2 is the rejected answer when answer1 is chosen
    rejected_input_ids.append(tokenizer(prompts[i], rejected[i], truncation=True, padding='max_length', max_length=max_length)['input_ids'])
    rejected_attention_mask.append(tokenizer(prompts[i], rejected[i], truncation=True, padding='max_length', max_length=max_length)['attention_mask'])
        
chosen_input_ids = torch.tensor(chosen_input_ids).to(device)
rejected_input_ids = torch.tensor(rejected_input_ids).to(device)
chosen_attention_mask = torch.tensor(chosen_attention_mask).to(device)
rejected_attention_mask = torch.tensor(rejected_attention_mask).to(device)

dataset = TensorDataset(chosen_input_ids, chosen_attention_mask, rejected_input_ids, rejected_attention_mask)
loader = DataLoader(dataset, batch_size=16, shuffle=True)


In [7]:
# Training setup
optimizer = optim.AdamW(model.parameters(), lr=1e-5)
epochs = 3

# Training loop
for epoch in range(epochs):
    for i, batch in enumerate(loader):
        
        chosen_input_ids, chosen_attention_mask, rejected_input_ids, rejected_attention_mask = batch
        optimizer.zero_grad()
        
       # Forward pass for the "chosen" samples
        rewards_chosen = model(input_ids=chosen_input_ids, attention_mask=chosen_attention_mask)[0]
        
        # Forward pass for the "rejected" samples
        rewards_rejected = model(input_ids=rejected_input_ids, attention_mask=rejected_attention_mask)[0]
        
        # Compute the custom loss
        loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected).mean()
        
        loss.backward()
        optimizer.step()
        
        print(f"Epoch {epoch+1}/{epochs}, Batch {i+1}/{len(loader)}, Loss: {loss.item()}")
    


Epoch 1/3, Batch 1/256, Loss: 1.344031810760498
Epoch 1/3, Batch 2/256, Loss: 0.7478545904159546
Epoch 1/3, Batch 3/256, Loss: 0.8257496356964111
Epoch 1/3, Batch 4/256, Loss: 1.3529607057571411
Epoch 1/3, Batch 5/256, Loss: 0.7173876762390137
Epoch 1/3, Batch 6/256, Loss: 0.9138973355293274
Epoch 1/3, Batch 7/256, Loss: 0.8186944723129272
Epoch 1/3, Batch 8/256, Loss: 0.9232765436172485
Epoch 1/3, Batch 9/256, Loss: 0.7672538757324219
Epoch 1/3, Batch 10/256, Loss: 0.6503831148147583
Epoch 1/3, Batch 11/256, Loss: 2.0072288513183594
Epoch 1/3, Batch 12/256, Loss: 0.7991471290588379
Epoch 1/3, Batch 13/256, Loss: 1.067011833190918
Epoch 1/3, Batch 14/256, Loss: 0.7656663656234741
Epoch 1/3, Batch 15/256, Loss: 0.9668176174163818
Epoch 1/3, Batch 16/256, Loss: 0.8232588768005371
Epoch 1/3, Batch 17/256, Loss: 0.6634282469749451
Epoch 1/3, Batch 18/256, Loss: 0.7625343799591064
Epoch 1/3, Batch 19/256, Loss: 0.6812299489974976
Epoch 1/3, Batch 20/256, Loss: 0.6635460257530212
Epoch 1/3, 

In [8]:
# Save the model to a directory
save_directory = "model_rewards_hf_yitingxie"
model.save_pretrained(save_directory)

# Optionally, save the tokenizer as well, especially if you've added special tokens or made other changes
tokenizer.save_pretrained(save_directory)


('model_rewards_hf_yitingxie\\tokenizer_config.json',
 'model_rewards_hf_yitingxie\\special_tokens_map.json',
 'model_rewards_hf_yitingxie\\vocab.json',
 'model_rewards_hf_yitingxie\\merges.txt',
 'model_rewards_hf_yitingxie\\added_tokens.json')

In [11]:
import torch.nn.functional as F

def calc_reward(model, tokenizer, prompt, answer1, answer2):
    # Tokenize the input
    inputs = tokenizer(prompt, [answer1, answer2], return_tensors='pt', padding=True, truncation=True, max_length=100)
    
    model.to(device)
    inputs.to(device)

    # Get model output
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits

    # Calculate probabilities
    probs = F.softmax(logits, dim=-1)

    # Interpret the result 
    if probs[0, 0] > probs[0, 1]:
        print(f"The model prefers '{answer1}' with a probability of {probs[0, 0]:.4f}")
    else:
        print(f"The model prefers '{answer2}' with a probability of {probs[0, 1]:.4f}")
        
    return logits

In [12]:
# Test the function
prompt = "What are the latest developments in artificial intelligence?"
answer1 = "GANs are revolutionizing image creation, and NLP models like GPT-3 are transforming language tasks."
answer2 = "AI is making strides in healthcare for diagnosis, and reinforcement learning is advancing robotics."

logits = calc_reward(model, tokenizer, prompt, answer1, answer2)
print(logits)

The model prefers 'AI is making strides in healthcare for diagnosis, and reinforcement learning is advancing robotics.' with a probability of 0.9999
tensor([[-1.4514,  7.4809]], device='cuda:0')


In [13]:
prompt = "What is the current state of the economy?"
answer1 = "I'm seeing some of the data back on here, about how much we need to increase our business expenditures. In a recent report, the Congressional Budget Office's Bureau of Economic Analysis estimated that the"
answer2 = "And how has your government done that?\n\nLudwig von Mises\n\nFrom the top down, the economy has become much better than it has been in the past several years."

logits = calc_reward(model, tokenizer, prompt, answer1, answer2)
print(logits)

The model prefers 'And how has your government done that?

Ludwig von Mises

From the top down, the economy has become much better than it has been in the past several years.' with a probability of 0.9999
tensor([[-1.2247,  8.0553]], device='cuda:0')
