# Inference

In [1]:
from transformers import AutoTokenizer, pipeline
import torch

rm_tokenizer = AutoTokenizer.from_pretrained("sfairXC/FsfairX-LLaMA3-RM-v0.1")
device = 0

rm_pipe = pipeline(
  "sentiment-analysis",
  model="./data/FsfairX-LLaMA3-RM-v0.1",
  device=device,
  tokenizer=rm_tokenizer,
  model_kwargs={"torch_dtype": torch.bfloat16}
)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:
rm_pipe.model

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128257, 4096, padding_idx=128256)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (nor

(score): Linear(in_features=4096, out_features=1, bias=False) 회귀층

In [13]:

pipe_kwargs = {
  "return_all_scores": True,
  "function_to_apply": "none",
  "batch_size": 1
}

chat = [
{"role": "user", "content": "Hello, how are you?"},
{"role": "assistant", "content": "I'm fine. Thank you, and you?"},
]

chat2 = [
{"role": "user", "content": "Hello, how are you?"},
{"role": "assistant", "content": "I am a murder"},
]


test_texts = [rm_tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False).replace(rm_tokenizer.bos_token, "")]
pipe_outputs = rm_pipe(test_texts, **pipe_kwargs)
rewards = [output[0]["score"] for output in pipe_outputs]
print(rewards)


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


[-5.875]


[-5.875]

In [14]:
reword(chat2)

[-10.875]


[-10.875]

# Train 

https://github.com/RLHFlow/RLHF-Reward-Modeling/blob/main/bradley-terry-rm/llama3_rm.py

In [4]:
import os

# 환경 변수 설정
os.environ['HF_DATASETS_CACHE'] = '/root/easymaker/data/dataset'

# 환경 변수가 잘 설정되었는지 확인
print(os.environ['HF_DATASETS_CACHE'])


/root/easymaker/data/dataset


In [5]:
########################
# This script is modified from the TRL package https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama/scripts/reward_modeling.py
# This script is designed for the reward modeling with Mistral model which should be handled carefully because it does not have an official pad token
# If you have any question, feel free to send me an email via wx13@illinois.edu
########################
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

# import evaluate
import numpy as np
import torch
import torch.nn as nn
from datasets import load_dataset
# from peft import LoraConfig, TaskType, get_peft_model
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
)
from transformers.utils import PaddingStrategy




# Define and parse arguments.
@dataclass
class ScriptArguments:
    """
    These arguments vary depending on how many GPUs you have, what their capacity and features are, and what size model you want to train.
    """
    local_rank: Optional[int] = field(
        default=-1, metadata={"help": "Used for multi-gpu"})

    deepspeed: Optional[str] = field(
        default=None,
        metadata={"help": "Path to deepspeed config if using deepspeed. You may need this if the model that you want to train doesn't fit on a single GPU."},
    )
    
    per_device_train_batch_size: Optional[int] = field(default=1)
    per_device_eval_batch_size: Optional[int] = field(default=1)
    
    # for 8 GPU, the global batch size is 512
    gradient_accumulation_steps: Optional[int] = field(default=64)
    learning_rate: Optional[float] = field(default=2e-6)
    weight_decay: Optional[float] = field(default=0.001)
    
    model_name: Optional[str] = field(
        default="meta-llama/Meta-Llama-3-8B-Instruct",
        metadata={"help": "The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc."},
    )
    bf16: Optional[bool] = field(
        default=True,
        metadata={"help": "This essentially cuts the training time in half if you want to sacrifice a little precision and have a supported GPU."},
    )
    num_train_epochs: Optional[int] = field(
        default=1,
        metadata={"help": "The number of training epochs for the reward model."},
    )
    train_set_path: Optional[str] = field(
        default="hendrydong/preference_700K",
        metadata={"help": "The dir of the subset of the training data to use"},
    )
    eval_set_path: Optional[str] = field(
        default="hendrydong/preference_700K",
        metadata={"help": "The dir of the subset of the eval data to use"},
    )
    output_path: Optional[str] = field(
        default="./models/llama3_rm_test",
        metadata={"help": "The dir for output model"},
    )
    gradient_checkpointing: Optional[bool] = field(
        default=True,
        metadata={"help": "Enables gradient checkpointing."},
    )
    optim: Optional[str] = field(
        # default="adamw_hf",
        default="paged_adamw_32bit",
        # default="adamw_torch_fused",
        metadata={"help": "The optimizer to use."},
    )
    lr_scheduler_type: Optional[str] = field(
        default="cosine",
        metadata={"help": "The lr scheduler"},
    )
    
    max_length: Optional[int] = field(default=4096)

    save_every_steps: Optional[int] = field(
        default=999999,
        metadata={"help": "Save the model every x steps"},
    )
    eval_every_steps: Optional[int] = field(
        default=999999,
        metadata={"help": "Eval the model every x steps"},
    )


parser = HfArgumentParser(ScriptArguments)
script_args = parser.parse_args_into_dataclasses(args=[
    '--model_name','./data/LLaMA-Factory/Meta-Llama-3.1-8B-Instruct',
    '--train_set_path', 'hendrydong/preference_700K',
    '--max_length', '8192',
    '--output_path','./data/models/llama3_rm_test',
    '--eval_every_steps','100'
])[0]

script_args

ScriptArguments(local_rank=-1, deepspeed=None, per_device_train_batch_size=1, per_device_eval_batch_size=1, gradient_accumulation_steps=64, learning_rate=2e-06, weight_decay=0.001, model_name='./data/LLaMA-Factory/Meta-Llama-3.1-8B-Instruct', bf16=True, num_train_epochs=1, train_set_path='hendrydong/preference_700K', eval_set_path='hendrydong/preference_700K', output_path='./data/models/llama3_rm_test', gradient_checkpointing=True, optim='paged_adamw_32bit', lr_scheduler_type='cosine', max_length=8192, save_every_steps=999999, eval_every_steps=100)

In [6]:

# Load the value-head model and tokenizer.
tokenizer_name = script_args.model_name
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast = False)

# Adjusted according to the base model
# Need to do this for the models that don't have an official pad token.
#tokenizer.pad_token = tokenizer.eos_token
#tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
print(tokenizer.padding_side)
tokenizer.truncation_side = "left"
tokenizer.model_max_length = script_args.max_length
# tokenizer.padding_side = "right"



# Get the dataset
train_path = script_args.train_set_path
eval_path = script_args.eval_set_path
output_name = script_args.output_path


def build_dataset(tokenizer, train_path, eval_path):

    def tokenize(sample):
        sample['positive'] = tokenizer.apply_chat_template(sample['chosen'], tokenize=False, add_generation_prompt=False).replace(tokenizer.bos_token, "")
        sample['negative'] = tokenizer.apply_chat_template(sample['rejected'], tokenize=False, add_generation_prompt=False).replace(tokenizer.bos_token, "")
        
        tokenized_pos = tokenizer(sample['positive'], truncation=True)
        tokenized_neg = tokenizer(sample['negative'], truncation=True)
        
        sample["input_ids_j"] = tokenized_pos["input_ids"]
        sample["attention_mask_j"] = tokenized_pos["attention_mask"]
        sample["input_ids_k"] = tokenized_neg["input_ids"]
        sample["attention_mask_k"] = tokenized_neg["attention_mask"]
        
        return sample

    ds = load_dataset(train_path, split="train").shuffle(seed=42) # 로드 
    
    # df = pd.read_csv(train_path)  # CSV 파일 경로
    # ds = Dataset.from_pandas(df)  # pandas DataFrame을 Dataset으로 변환
    # ds = ds.shuffle(seed=42)  # 데이터 셔플
    
    # ds = ds.select(range(2000))
    ds = ds.map(tokenize, num_proc=8)

    eval_dataset = None

    train_dataset = ds
    #eval_dataset = load_dataset(eval_path, split="train").shuffle(seed=42).select(range(500))
    eval_dataset = ds.select(range(500))
    return train_dataset, eval_dataset


train_dataset, eval_dataset = build_dataset(tokenizer, train_path, eval_path)
print("Training set: ", len(train_dataset), " Eval set: ", len(eval_dataset))

right


Downloading readme:   0%|          | 0.00/548 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/216M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/217M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/218M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/218M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/322M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/700000 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/700000 [00:00<?, ? examples/s]

Training set:  700000  Eval set:  500


In [7]:
train_dataset[6]

{'rejected': [{'content': 'Solve the following math problem step-by-step.\nSimplify your answer as much as possible. Present your final answer as \\boxed{Your Answer}.\nthe perimeters of 2 squares are 60 cm and 48 cm . find the perimeter of a third square whose area is equal to the difference of the areas of the two squares ?',
   'role': 'user'},
  {'content': "Step 1: First, we need to calculate the area of the two squares. The area of a square is calculated by squaring its side length.\n\n\n\nAnswer:\n\\boxed{Let's start.\n\n1. Area of the first square: side1 = √(perimeter1/4) = √(60/4) = √15.\n2. Area of the second square: side2 = √(perimeter2/4) = √(48/4) = √12 = √(4*3) = 2√3.\n3. Difference in their areas: area_diff = side1^2 - side2^2 = 15 - 12 = 3.\n4. Side length of the new square: side3 = √(area_diff) = √3.\n5. Perimeter of the new square: perimeter3 = 4*side3 = 4√3.\n\nThe answer is <b><4√3 cm</b>.}",
   'role': 'assistant'}],
 'rejected_score': None,
 'chosen_score': None,


In [8]:
# Define the trainer
training_args = TrainingArguments(
    output_dir=output_name,
    learning_rate=script_args.learning_rate,
    per_device_train_batch_size=script_args.per_device_train_batch_size,
    per_device_eval_batch_size=script_args.per_device_eval_batch_size,
    num_train_epochs=script_args.num_train_epochs,
    weight_decay=script_args.weight_decay,
    evaluation_strategy="steps",
    eval_steps=script_args.eval_every_steps,
    save_strategy="steps",
    save_steps=script_args.save_every_steps,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    gradient_checkpointing=script_args.gradient_checkpointing,
    deepspeed=script_args.deepspeed,
    local_rank=script_args.local_rank,
    remove_unused_columns=False,
    label_names=[],
    bf16=script_args.bf16,
    logging_strategy="steps",
    logging_steps=10,
    optim=script_args.optim,
    lr_scheduler_type=script_args.lr_scheduler_type,
    warmup_ratio=0.03,
    # report_to='wandb'
)

model = AutoModelForSequenceClassification.from_pretrained(script_args.model_name, num_labels=1, torch_dtype=torch.bfloat16, use_flash_attention_2=True,)

model.config.use_cache = not script_args.gradient_checkpointing
model.config.pad_token_id = tokenizer.pad_token_id
model.resize_token_embeddings(len(tokenizer))

num_proc = 24  # Can adjust to be higher if you have more processors.
original_columns = train_dataset.column_names

The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at ./data/LLaMA-Factory/Meta-Llama-3.1-8B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:

# We need to define a special data collator that batches the data in our j vs k format.
@dataclass
class RewardDataCollatorWithPadding:
    tokenizer: AutoTokenizer
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    return_tensors: str = "pt"

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        merged_features = []

        for feature in features:
            merged_features.append(
                {
                    "input_ids": feature["input_ids_j"],
                    "attention_mask": feature["attention_mask_j"],
                }
            )
            merged_features.append(
                {
                    "input_ids": feature["input_ids_k"],
                    "attention_mask": feature["attention_mask_k"],
                }
            )
        batch = self.tokenizer.pad(
            merged_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors,
        )
        batch = {
            "input_ids": batch["input_ids"],
            "attention_mask": batch["attention_mask"],
            "return_loss": True,
        }
        return batch


# Define the trainer
def compute_metrics(eval_pred):
    result = {}
    pos_predictions_scores = eval_pred.predictions[0]
    neg_predictions_scores = eval_pred.predictions[1]
    
    # We assume that the first sample is preferred by default in groundtruth
    result['accuracy'] = np.sum(pos_predictions_scores > neg_predictions_scores) / len(pos_predictions_scores)
    return result


class RewardTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        rewards = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])[0]
        bsz = rewards.size(0)
        jidx = torch.arange(0, bsz, 2)
        kidx = jidx + 1
        rewards_j = rewards[jidx]
        rewards_k = rewards[kidx]
        loss = -nn.functional.logsigmoid(rewards_j - rewards_k).mean()
        if return_outputs:
            return loss, {"rewards_j": rewards_j, "rewards_k": rewards_k}
        return loss


In [None]:
# Train the model, woohoo.
trainer = RewardTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=RewardDataCollatorWithPadding(tokenizer=tokenizer, max_length=script_args.max_length),
)


trainer.train()


print("Saving last checkpoint of the model")
trainer.save_model(output_name + "/last_checkpoint")
tokenizer.save_pretrained(output_name + "/last_checkpoint")

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
100,0.897,0.828445,0.55
200,0.704,0.673492,0.636


추론 속도 500개 30초정도