### load dataset

In [1]:
from datasets import load_dataset

# # Load your custom dataset
# dataset = load_dataset('json', data_files='preprocessed_도서자료_기계독해.json')

# # Print dataset
# print(dataset)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# huggingfcae dataset
dataset = load_dataset('vaiv/ko-rag-preference')

In [3]:
from transformers import AutoTokenizer


# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('vaiv/llamion-14b-base')

def prompt_generator(context, instruction):
    system_message = "Specialized in Search, Summary, Answer as a Doctor-level expert who answers [to context]. The task is to 'read the context and answer the correct response to the instruction'. Ensure that the response is based on the context provided, and the input format is as follows: {context: 'instruction text', construction: 'instruction text'}\
    Format the response in the following format:\
    'assistant : assitant text'"
    prompt = f"\
    <|begin_of_text|><|start_header_id|>system<|end_header_id|>\
    {system_message}<|eot_id|><|start_header_id|>user<|end_header_id|>\
    {{context:{context},instruction:{instruction}}}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
    return prompt

# Preprocess the dataset
def preprocess_function(examples):
    
    inputs = [prompt_generator(context, instruction) for context, instruction in zip(examples['context'], examples['instruction'])]
    model_inputs = tokenizer(inputs, truncation=True, padding='max_length', max_length=512)

    # Set up the labels
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['response'], truncation=True, padding='max_length', max_length=512)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

encoded_dataset = dataset.map(preprocess_function, batched=True)
encoded_dataset = encoded_dataset['train'].train_test_split(test_size=0.9)

# Print encoded dataset
print(encoded_dataset)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]


KeyError: 'instruction'

### setup

In [None]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, IA3Config, get_peft_model
from accelerate import Accelerator



# Load the model
model = AutoModelForCausalLM.from_pretrained('vaiv/llamion-14b-base', device_map="auto")

### accelerator

In [None]:
%env NCCL_P2P_DISABLE=1
%env NCCL_IB_DISABLE=1

In [None]:


# Split the datasets into training and testing sets
encoded_dataset_with_context = encoded_dataset['train'].train_test_split(test_size=0.2)

# Set up the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize the Trainer for context with auto device
trainer_with_context = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset_with_context['train'],
    eval_dataset=encoded_dataset_with_context['test'],
)



### evaluation for data selection

In [None]:
from transformers import Trainer
import numpy as np

def compute_loss(model, inputs, labels):
    outputs = model(**inputs, labels=labels)
    loss = outputs.loss
    return loss.item()

def custom_metric(dataset, model, tokenizer):
    context_losses = []
    no_context_losses = []
    
    for data in dataset:
        # Prepare inputs with context
        context_input = tokenizer(f"{data['context']} {data['question']}", return_tensors='pt', truncation=True, padding='max_length', max_length=512)
        context_label = tokenizer(data['chosen'], return_tensors='pt', truncation=True, padding='max_length', max_length=512)['input_ids']
        
        # Prepare inputs without context
        no_context_input = tokenizer(data['question'], return_tensors='pt', truncation=True, padding='max_length', max_length=512)
        no_context_label = context_label
        
        # Calculate losses
        context_loss = compute_loss(model, context_input, context_label)
        no_context_loss = compute_loss(model, no_context_input, no_context_label)
        
        context_losses.append(context_loss)
        no_context_losses.append(no_context_loss)
    
    # Calculate metric
    metric_scores = [c / nc for c, nc in zip(context_losses, no_context_losses)]
    return np.mean(metric_scores), metric_scores

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('vaiv/llamion-14b-base')

# Calculate the custom metric on the test set
mean_score, scores = custom_metric(encoded_dataset['test'], model, tokenizer)



In [None]:
print("Mean custom metric score:", mean_score)
for i, score in enumerate(scores):
    print(f"Data point {i}: {score}")