### Optuna hyperparameter optimization on OASST1 (3 parameters)

In [None]:
import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from trl import SFTTrainer
from datasets import DatasetDict, Dataset
import optuna

# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Hub identifier of the base checkpoint to fine-tune.
model_name = "mistralai/Mistral-7B-v0.3"

# Fetch the OASST1 dataset (all of its splits).
dataset = load_dataset("OpenAssistant/oasst1")

# Tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,        # request the fast tokenizer implementation
    padding_side="left",  # padding goes in front of the sequence
    add_eos_token=True,   # add the end-of-sequence token when encoding
)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

from peft import get_peft_model

# bitsandbytes quantization settings: 4-bit NF4 weights, bfloat16 compute,
# double quantization enabled.
compute_dtype = torch.bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the base checkpoint with the quantization settings above;
# device_map="auto" leaves layer placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

# Make the quantized model ready for k-bit (here 4-bit) training.
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings: rank 5, alpha 16, 5% dropout, bias="none",
# causal-LM task, attached to the projection modules listed below.
lora_config = LoraConfig(
    r=5,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "k_proj", "q_proj", "v_proj", "o_proj",
        "gate_proj", "down_proj", "up_proj",
    ],
)

# Wrap the model with the LoRA adapters, then record which token id is padding.
model = get_peft_model(model, lora_config)
model.config.pad_token_id = tokenizer.pad_token_id

# Report the total number of parameter elements held by the wrapped model.
print(f"Model size: {sum(p.numel() for p in model.parameters())}")

# Step 1: Create 'conversations' column in the dataset
def create_conversations(split_dataset):
    """Build one text thread per assistant message in a split.

    For every row whose ``role`` is ``'assistant'``, follow the
    ``parent_id`` links upward until a row has no parent or its parent is
    not present in this split, collecting each row's ``text`` on the way.
    The thread is stored oldest-first and ends with the assistant message
    itself.

    Args:
        split_dataset: Iterable of message rows providing ``message_id``,
            ``parent_id``, ``role`` and ``text``.

    Returns:
        A ``Dataset`` with a single ``conversations`` column; each entry is
        a list of message texts.
    """
    by_id = {row['message_id']: row for row in split_dataset}
    threads = []

    for row in split_dataset:
        if row['role'] != 'assistant':
            continue

        # Collect texts walking reply -> root, then flip to root -> reply.
        texts = []
        node = row
        while node:
            texts.append(node['text'])
            parent_id = node['parent_id']
            node = by_id.get(parent_id) if parent_id else None
        texts.reverse()

        threads.append({'conversations': texts})

    return Dataset.from_list(threads)

# Turn the raw train and validation splits into conversation datasets.
new_train_dataset = create_conversations(dataset['train'])
new_validation_dataset = create_conversations(dataset['validation'])

# Hold out 15% of the training conversations as a test split
# (shuffled with a fixed seed so the split is reproducible).
split_datasets = new_train_dataset.train_test_split(
    test_size=0.15,
    shuffle=True,
    seed=42,
)

# Bundle the three processed splits; validation is the dataset's own
# validation split, train/test come from the hold-out above.
new_dataset = DatasetDict(
    {
        'train': split_datasets['train'],
        'validation': new_validation_dataset,
        'test': split_datasets['test'],
    }
)

# Step 2: Tokenize the dataset
def format_conversation(examples, max_length=512):
    """Tokenize a batch of conversations into fixed-length model inputs.

    Each conversation that is a list of message texts is joined with
    newlines into a single string (non-list values are passed through
    unchanged); the batch is then tokenized, truncated and padded to
    ``max_length`` tokens.

    Args:
        examples: Batch mapping with a ``conversations`` column, as passed
            by ``Dataset.map(..., batched=True)``.
        max_length: Length every sequence is truncated/padded to. Defaults
            to 512, the same value the trainer uses as ``max_seq_length``.

    Returns:
        A plain dict mapping each tokenizer output field to a list of
        per-example lists.
    """
    joined_conversations = [
        "\n".join(conv) if isinstance(conv, list) else conv
        for conv in examples['conversations']
    ]
    # No return_tensors here: without it the tokenizer already yields plain
    # Python lists, so there is no point building torch tensors only to turn
    # them back into lists with .tolist().
    tokenized = tokenizer(
        joined_conversations,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    return {field: values for field, values in tokenized.items()}

# Tokenize every split in batches and drop the raw text column, then have
# the dataset hand back torch tensors for the two model-input columns.
tokenized_dataset = new_dataset.map(
    format_conversation,
    batched=True,
    remove_columns=["conversations"],
)
tokenized_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask"],
)

# Define the objective function for Optuna
def objective(trial):
    """Run one Optuna trial: fine-tune with sampled settings, return eval loss.

    Samples the learning rate, per-device train batch size and gradient
    accumulation steps, trains the shared LoRA model for 100 steps on the
    tokenized train split and evaluates it on the validation split.

    The module-level ``model`` is shared by every trial and is trained in
    place. To keep trials comparable, the adapter weights are snapshotted on
    the first call and restored at the start of every later call; without
    this, each trial would continue from the previous trial's weights.
    Checkpoints go to a per-trial sub-directory so trials do not overwrite
    each other's ``checkpoint-*`` folders.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        The ``eval_loss`` value reported by ``trainer.evaluate()``.
    """
    # Function-scope import: peft is already a dependency of this file.
    from peft import get_peft_model_state_dict, set_peft_model_state_dict

    # Snapshot the untouched adapter once, restore it on every later trial.
    initial_state = getattr(objective, "_initial_adapter_state", None)
    if initial_state is None:
        objective._initial_adapter_state = {
            name: weight.detach().cpu().clone()
            for name, weight in get_peft_model_state_dict(model).items()
        }
    else:
        set_peft_model_state_dict(model, initial_state)

    # Sample hyperparameters
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [1, 2, 4])
    gradient_accumulation_steps = trial.suggest_categorical("gradient_accumulation_steps", [1, 2, 4])

    # Step 4: Define training arguments with sampled hyperparameters
    training_arguments = TrainingArguments(
        # One directory per trial so checkpoints of different trials coexist.
        output_dir=f"./optuna_22/trial_{trial.number}",
        eval_strategy="steps",
        do_eval=True,
        optim="adamw_torch_4bit",
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        per_device_eval_batch_size=2,
        log_level="debug",
        logging_steps=10,
        learning_rate=learning_rate,
        eval_steps=25,
        max_steps=100,
        save_steps=25,
        warmup_steps=25,
        lr_scheduler_type="linear",
    )

    # Step 5: Initialize the SFTTrainer
    # NOTE(review): `model` is already wrapped by get_peft_model at module
    # level, so `peft_config` here is presumably redundant -- confirm against
    # the installed TRL version before removing it.
    trainer = SFTTrainer(
        model=model,
        train_dataset=tokenized_dataset['train'],
        eval_dataset=tokenized_dataset['validation'],
        peft_config=lora_config,
        max_seq_length=512,
        tokenizer=tokenizer,
        args=training_arguments,
    )

    # Train the model
    trainer.train()

    # Evaluate the model on the validation split
    eval_results = trainer.evaluate()

    # Optuna minimizes this value
    return eval_results['eval_loss']

# The objective returns an evaluation loss, so the study minimizes it.
study = optuna.create_study(direction="minimize")

# Run a single trial of the objective.
study.optimize(objective, n_trials=1)

# Show the hyperparameters of the best trial.
print("Best hyperparameters: ", study.best_params)
