### Same functionality as OASST1_optuna1_param.ipynb, but with a different dataset

In [None]:
import torch
from datasets import load_dataset, DatasetDict, Dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from trl import SFTTrainer
import optuna

# Use the GPU when CUDA is available, otherwise the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Training data: the "filtered" configuration of argilla/ifeval-like-data
dataset = load_dataset("argilla/ifeval-like-data", "filtered")

# Checkpoint that both the tokenizer and the model are loaded from
model_name = "model_OASST1_2000/checkpoint-2000"

# bitsandbytes settings: 4-bit NF4 weights, double quantization,
# bfloat16 as the compute dtype
compute_dtype = getattr(torch, "bfloat16")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# Fast tokenizer, left-padded, with EOS appended to encoded sequences
tokenizer = AutoTokenizer.from_pretrained(
    model_name, use_fast=True, add_eos_token=True, padding_side="left"
)
# Reuse the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Load the quantized checkpoint and prepare it for k-bit (4-bit) training
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=bnb_config,
    )
)

# LoRA adapters (rank 5) on the attention and MLP projection layers
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=5,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        'k_proj', 'q_proj', 'v_proj', 'o_proj',
        'gate_proj', 'down_proj', 'up_proj',
    ],
)

# Wrap the model with the adapters and tell it which token id is padding
model = get_peft_model(model, lora_config)
model.config.pad_token_id = tokenizer.pad_token_id

# Report the total number of parameter elements, in millions
model_size = sum(param.numel() for param in model.parameters())
print(f"Model size: {model_size / 1e6:.2f} million parameters")



In [None]:
# Step 1: Create 'conversations' column in the dataset
def create_conversations(split_dataset):
    """Build a dataset with a single 'conversations' column.

    Every row of ``split_dataset`` is read for its 'prompt' and 'response'
    fields, which are stored together as the two-element list
    ``[prompt, response]`` under the key 'conversations'.

    Returns:
        A new ``Dataset`` created with ``Dataset.from_list``.
    """
    rows = [
        {'conversations': [record['prompt'], record['response']]}
        for record in split_dataset
    ]
    return Dataset.from_list(rows)

# Hold out 15% of the original 'train' split for validation (fixed seed)
split_dataset = dataset['train'].train_test_split(test_size=0.15, seed=42)
train_dataset, valid_dataset = split_dataset['train'], split_dataset['test']
print(f"Train size: {len(train_dataset)}")
print(f"Validation size: {len(valid_dataset)}")

# Run both splits through 'create_conversations' and collect the results
new_dataset = DatasetDict()
new_dataset['train'] = create_conversations(train_dataset)
new_dataset['validation'] = create_conversations(valid_dataset)


In [None]:

# Step 2: Tokenize the dataset
def format_conversation(examples):
    """Tokenize one batch of conversations (for ``Dataset.map(batched=True)``).

    Args:
        examples: Batch dict whose 'conversations' entry holds one item per
            example: either a list of strings, which is joined with
            newlines, or an already-joined string, which is used as-is.

    Returns:
        The tokenizer output for the batch, with every sequence truncated
        and padded to exactly 512 tokens.
    """
    joined_conversations = [
        "\n".join(conv) if isinstance(conv, list) else conv
        for conv in examples['conversations']
    ]
    # No return_tensors here: `Dataset.map` writes the result to Arrow
    # storage, so tensors created in this function would only be converted
    # straight back. Torch formatting is applied to the mapped dataset instead.
    return tokenizer(
        joined_conversations,
        truncation=True,
        max_length=512,
        padding="max_length",
    )

# Tokenize every split in batches; the raw 'conversations' column is dropped
tokenized_dataset = new_dataset.map(
    format_conversation, batched=True, remove_columns=["conversations"]
)
# Expose only the token ids and attention mask, returned as torch tensors
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

# Snapshot of the trainable (adapter) weights, captured right before the first
# trial so that every later trial can start from the same initial state.
# NOTE: re-running this cell clears the snapshot; reload the model first if it
# has already been trained, otherwise the trained weights become the baseline.
_INITIAL_ADAPTER_STATE = {}


def _restore_initial_adapter_weights(peft_model):
    """Reset the trainable parameters of ``peft_model`` to their pre-search values.

    On the first call the current values of all parameters with
    ``requires_grad=True`` are recorded. On every later call those recorded
    values are copied back in place, undoing any training done in between.
    """
    if not _INITIAL_ADAPTER_STATE:
        _INITIAL_ADAPTER_STATE.update(
            (name, param.detach().clone())
            for name, param in peft_model.named_parameters()
            if param.requires_grad
        )
        return

    current = dict(peft_model.named_parameters())
    with torch.no_grad():
        for name, saved in _INITIAL_ADAPTER_STATE.items():
            current[name].copy_(saved)


# Define the objective function for Optuna
def objective(trial):
    """Run one short fine-tuning trial and return its evaluation loss.

    A learning rate is sampled log-uniformly from [1e-5, 1e-3]. The shared
    module-level ``model`` is first reset to its initial adapter weights, so
    trials do not continue training from each other's results. It is then
    trained for 100 steps with ``SFTTrainer`` and evaluated on the
    validation split.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The 'eval_loss' entry of ``trainer.evaluate()``.
    """
    # Sample hyperparameters
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)

    # You can add more hyperparameters to tune here if desired

    # Every trial reuses the same global model object. Without this reset,
    # trial N would start from the weights produced by trial N-1 and the
    # resulting losses would not be comparable across learning rates.
    _restore_initial_adapter_weights(model)

    # Define training arguments with sampled hyperparameters
    training_arguments = TrainingArguments(
        output_dir="./hasta_los_huevos",
        eval_strategy="steps",
        do_eval=True,
        optim="adamw_torch_4bit",
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,
        per_device_eval_batch_size=2,
        log_level="info",  # Log at INFO level
        logging_steps=10,
        learning_rate=learning_rate,
        eval_steps=25,
        max_steps=100,
        save_steps=25,
        warmup_steps=25,
        lr_scheduler_type="linear",
        report_to="none"  # Disable reporting to avoid clutter
    )

    # Initialize the SFTTrainer (a fresh trainer, and thus a fresh optimizer,
    # is created for every trial)
    trainer = SFTTrainer(
        model=model,
        train_dataset=tokenized_dataset['train'],
        eval_dataset=tokenized_dataset['validation'],
        peft_config=lora_config,
        max_seq_length=512,
        tokenizer=tokenizer,
        args=training_arguments,
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_results = trainer.evaluate()

    # Return the evaluation loss
    return eval_results['eval_loss']

# Set up a study that searches for the lowest evaluation loss
study = optuna.create_study(direction="minimize")

# Launch the search; n_trials is 1 here and can be raised for a wider sweep
study.optimize(func=objective, n_trials=1)

# Report the best hyperparameters found by the study
print("Best hyperparameters: ", study.best_params)
