In [None]:
# !pip install optuna

Defaulting to user installation because normal site-packages is not writeable
Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.4.2 (from optuna)
  Downloading SQLAlchemy-2.0.36-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.7 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet!=0.4.17 (from sqlalchemy>=1.4.2->optuna)
  Downloading greenlet-3.1.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
Downloading alembic-1.14.0-py3-none-any.whl (233 kB)
Downloading SQLAlchemy-2.0.36-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━

In [1]:
import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from trl import SFTTrainer
from datasets import DatasetDict, Dataset
import optuna

# Device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the dataset
dataset = load_dataset("OpenAssistant/oasst1")

# Model name
model_name = "mistralai/Mistral-7B-v0.3"

# Load the tokenizer for Mistral
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_eos_token=True,      # Add end-of-sequence token to the tokenizer
    use_fast=True,           # Use the fast tokenizer implementation
    padding_side='left'      # Pad sequences on the left side
)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to EOS token

from peft import get_peft_model

# Quantization configuration using bitsandbytes library
compute_dtype = getattr(torch, "bfloat16")  # Set computation data type to bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                    # Enable loading the model in 4-bit precision
    bnb_4bit_quant_type="nf4",            # Specify quantization type as Normal Float 4
    bnb_4bit_compute_dtype=compute_dtype, # Set computation data type
    bnb_4bit_use_double_quant=True,       # Use double quantization for better accuracy
)

# Load the pre-trained model with the specified quantization configuration
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,  # Apply quantization configuration
    device_map="auto"                # Automatically map layers to devices
)

# Prepare the model for k-bit (e.g., 4-bit) training
model = prepare_model_for_kbit_training(model)

# Low-Rank Adaptation (LoRA) configuration for efficient fine-tuning
lora_config = LoraConfig(
    lora_alpha=16,             # Scaling factor for LoRA updates
    lora_dropout=0.05,         # Dropout rate applied to LoRA layers
    r=5,                       # Rank of the LoRA decomposition
    bias="none",               # No bias is added to the LoRA layers
    task_type="CAUSAL_LM",     # Specify the task as causal language modeling
    target_modules=[           # Modules to apply LoRA to
        'k_proj', 'q_proj', 'v_proj', 'o_proj',
        'gate_proj', 'down_proj', 'up_proj'
    ]
)

model = get_peft_model(model, lora_config)
model.config.pad_token_id = tokenizer.pad_token_id  # Set the model's padding token ID

# Print model size
print(f"Model size: {sum(p.numel() for p in model.parameters())}")

# Step 1: Create 'conversations' column in the dataset
def create_conversations(split_dataset):
    message_dict = {msg['message_id']: msg for msg in split_dataset}
    conversations = []

    for msg in split_dataset:
        if msg['role'] == 'assistant':
            conversation = []
            current_msg = msg
            while current_msg:
                conversation.insert(0, current_msg['text'])
                parent_id = current_msg['parent_id']
                if parent_id and parent_id in message_dict:
                    current_msg = message_dict[parent_id]
                else:
                    current_msg = None
            conversations.append({'conversations': conversation})

    new_dataset = Dataset.from_list(conversations)
    return new_dataset

# Process each split to create 'conversations' field
new_train_dataset = create_conversations(dataset['train'])
new_validation_dataset = create_conversations(dataset['validation'])

# Create test split
split_datasets = new_train_dataset.train_test_split(test_size=0.15, shuffle=True, seed=42)

# Create a new DatasetDict with processed splits
new_dataset = DatasetDict({
    'train': split_datasets['train'],
    'validation': new_validation_dataset,
    'test': split_datasets['test']
})

# Step 2: Tokenize the dataset
def format_conversation(examples):
    joined_conversations = ["\n".join(conv) if isinstance(conv, list) else conv for conv in examples['conversations']]
    tokenized = tokenizer(
        joined_conversations,
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors="pt"
    )
    tokenized = {k: v.tolist() for k, v in tokenized.items()}
    return tokenized

# Apply tokenization to the dataset
tokenized_dataset = new_dataset.map(format_conversation, batched=True, remove_columns=["conversations"])
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Define the objective function for Optuna
def objective(trial):
    # Sample hyperparameters
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [1, 2, 4])
    gradient_accumulation_steps = trial.suggest_categorical("gradient_accumulation_steps", [1, 2, 4])
    
    # Step 4: Define training arguments with sampled hyperparameters
    training_arguments = TrainingArguments(
        output_dir="./optuna_22",
        eval_strategy="steps",
        do_eval=True,
        optim="adamw_torch_4bit",
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        per_device_eval_batch_size=2,
        log_level="debug",
        logging_steps=10,
        learning_rate=learning_rate,
        eval_steps=25,
        max_steps=100,
        save_steps=25,
        warmup_steps=25,
        lr_scheduler_type="linear",
    )
    
    # Step 5: Initialize the SFTTrainer
    trainer = SFTTrainer(
        model=model,
        train_dataset=tokenized_dataset['train'],
        eval_dataset=tokenized_dataset['validation'],
        peft_config=lora_config,
        max_seq_length=512,
        tokenizer=tokenizer,
        args=training_arguments,
    )
    
    # Train the model
    trainer.train()
    
    # Evaluate the model
    eval_results = trainer.evaluate()
    
    # Return the evaluation loss
    return eval_results['eval_loss']

# Create an Optuna study
study = optuna.create_study(direction='minimize')

# Run the optimization with only one trial
study.optimize(objective, n_trials=1)

# Print the best hyperparameters
print("Best hyperparameters: ", study.best_params)


2024-11-20 09:02:17.395949: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-20 09:02:17.403440: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-20 09:02:17.414845: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-20 09:02:17.417695: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-20 09:02:17.424832: I tensorflow/core/platform/cpu_feature_guar

README.md:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Model size: 3771469824


Map:   0%|          | 0/44975 [00:00<?, ? examples/s]

Map:   0%|          | 0/2756 [00:00<?, ? examples/s]

Map:   0%|          | 0/7937 [00:00<?, ? examples/s]

[I 2024-11-20 09:08:41,642] A new study created in memory with name: no-name-6a4af822-8148-4089-b80b-4b69657b1991

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
max_steps is given, it will override any value given in num_train_epochs
Currently training with a batch size of: 2
***** Running training *****
  Num examples = 44,975
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 4
  Total optimization steps = 100
  Number of trainable parameters = 13,107,200
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
25,1.4893,1.514197
50,1.3697,1.474586
75,1.3922,1.45577
100,1.3874,1.44978


  return fn(*args, **kwargs)

***** Running Evaluation *****
  Num examples = 2756
  Batch size = 2
Saving model checkpoint to ./optuna_22/checkpoint-25
loading configuration file config.json from cache at /home/usuario/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.3/snapshots/d8cadc02ac76bd617a919d50b092e59d2d110aff/config.json
Model config MistralConfig {
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.45.2",
  "use_cache": true,
  "vocab_size": 32768
}

tokenizer config

[I 2024-11-20 11:12:26,015] Trial 0 finished with value: 1.4497803449630737 and parameters: {'learning_rate': 2.0087615229785912e-05, 'per_device_train_batch_size': 2, 'gradient_accumulation_steps': 4}. Best is trial 0 with value: 1.4497803449630737.


Best hyperparameters:  {'learning_rate': 2.0087615229785912e-05, 'per_device_train_batch_size': 2, 'gradient_accumulation_steps': 4}
