### Notebook for fine-tuning the model on the OASST1 dataset

In [None]:
import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments
)
from trl import SFTTrainer
from evaluate import load
import time

# Device: prefer the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced again in the visible cells (the
# model is placed with device_map="auto") — confirm whether it is still needed.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the OpenAssistant OASST1 dataset
dataset = load_dataset("OpenAssistant/oasst1")

# Model name: identifier of the base model that is fine-tuned below
model_name = "mistralai/Mistral-7B-v0.3"

# Load the tokenizer that matches the Mistral base model
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_eos_token=True,      # Ask the tokenizer to append the end-of-sequence token
    use_fast=True,           # Use the fast tokenizer implementation
    padding_side='left'      # Pad sequences on the left side
)
# Reuse EOS as the padding token.
# NOTE(review): presumably the base tokenizer ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to EOS token


In [None]:
from peft import get_peft_model 
# Quantization configuration using the bitsandbytes library.
# The compute dtype is float32: computation on the 4-bit weights runs in full
# precision.
compute_dtype = torch.float32
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                    # Load the model weights in 4-bit precision
    bnb_4bit_quant_type="nf4",            # Quantization type: Normal Float 4
    bnb_4bit_compute_dtype=compute_dtype, # Data type used for computation
    bnb_4bit_use_double_quant=True,       # Nested quantization: also quantize the quantization constants
)

# Load the pre-trained model with the quantization configuration above
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,  # Apply quantization configuration
    device_map="auto"                # Automatically map layers to devices
)

# Prepare the quantized model for k-bit (here 4-bit) training
model = prepare_model_for_kbit_training(model)

# Low-Rank Adaptation (LoRA) configuration for parameter-efficient fine-tuning
lora_config = LoraConfig(
    lora_alpha=16,             # Scaling factor for LoRA updates
    lora_dropout=0.05,         # Dropout rate applied to LoRA layers
    r=5,                       # Rank of the LoRA decomposition
    bias="none",               # Do not train any bias parameters
    task_type="CAUSAL_LM",     # Causal language modeling task
    target_modules=[           # Modules that receive LoRA adapters
        'k_proj', 'q_proj', 'v_proj', 'o_proj',
        'gate_proj', 'down_proj', 'up_proj'
    ]
)

# Wrap the base model with the LoRA adapters
model = get_peft_model(model, lora_config)

# Keep the model's padding token ID in sync with the tokenizer
model.config.pad_token_id = tokenizer.pad_token_id

# Print the total number of parameter elements (a count, not a size in bytes).
# NOTE(review): with 4-bit weights the packed tensors may report fewer elements
# than the logical parameter count — confirm before quoting this number.
print(f"Model size: {sum(p.numel() for p in model.parameters())}")

In [None]:
# Show the loaded dataset object as the cell output
dataset


In [None]:
from datasets import DatasetDict, Dataset
# Step 1: Build the 'conversations' column for a dataset split
def create_conversations(split_dataset):
    """Turn a split of individual messages into one conversation per assistant reply.

    For every message whose ``role`` is ``'assistant'``, the chain of
    ``parent_id`` links is followed upwards until a message has no parent or
    its parent is not present in this split. The texts along that chain are
    stored oldest-first.

    Args:
        split_dataset: Iterable of message records exposing the keys
            ``message_id``, ``parent_id``, ``role`` and ``text``.

    Returns:
        A ``Dataset`` with a single ``conversations`` column; each row is the
        list of message texts leading up to (and including) one assistant reply.
    """
    # Index the messages by id so that parents can be resolved directly
    by_id = {row['message_id']: row for row in split_dataset}

    def thread_texts(leaf):
        # Collect texts from the reply up to the root, then flip to oldest-first
        texts = []
        node = leaf
        while node:
            texts.append(node['text'])
            parent = node['parent_id']
            node = by_id[parent] if parent and parent in by_id else None
        texts.reverse()
        return texts

    records = [
        {'conversations': thread_texts(row)}
        for row in split_dataset
        if row['role'] == 'assistant'
    ]

    # Build a new dataset from the collected conversations
    return Dataset.from_list(records)

# Process each split to build the 'conversations' field
new_train_dataset = create_conversations(dataset['train'])
new_validation_dataset = create_conversations(dataset['validation'])

# Create a test split: hold out 15% of the processed training conversations,
# shuffled with a fixed seed
split_datasets = new_train_dataset.train_test_split(test_size=0.15, shuffle=True, seed=42)


# Build a new DatasetDict from the processed splits
new_dataset = DatasetDict({
    'train': split_datasets['train'],   
    'validation': new_validation_dataset,
    'test': split_datasets['test']
})



In [None]:
# Step 2: Tokenize the dataset
def format_conversation(examples):
    """Tokenize a batch of conversations.

    Args:
        examples: Batch mapping whose ``'conversations'`` entry holds one item
            per example; each item is either a list of message strings or an
            already-joined string.

    Returns:
        dict: The tokenizer outputs converted to plain Python lists, with
        every sequence truncated or padded to 512 tokens.
    """
    # Join the messages of each conversation into one newline-separated string
    joined_conversations = [
        "\n".join(conv) if isinstance(conv, list) else conv
        for conv in examples['conversations']
    ]

    # No debug print of a fixed index here: a batch with fewer than four
    # examples (e.g. the last batch of a split) would raise IndexError.

    # Tokenize the joined conversations
    tokenized = tokenizer(
        joined_conversations,
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors="pt"
    )
    # Convert the tensors to lists before returning them
    return {key: value.tolist() for key, value in tokenized.items()}

# Apply the tokenization to the dataset in batches, dropping the raw 'conversations' column
tokenized_dataset = new_dataset.map(format_conversation, batched=True, remove_columns=["conversations"])
# Show the resulting dataset as the cell output
tokenized_dataset



In [None]:
# Step 3: Set the dataset format for PyTorch
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Sanity check: length of the first training example
# (should equal the max_length of 512 used during tokenization)
len(tokenized_dataset["train"][0]["input_ids"])

In [None]:
# Step 4: Define the training arguments
# NOTE(review): save_steps=25 with max_steps=2000 means up to 80 checkpoint
# saves and no save_total_limit is set here — confirm the disk budget.
training_arguments = TrainingArguments(
    output_dir="./model_OASST1_2000",  # Directory for saving model checkpoints and logs
    eval_strategy="steps",                # Evaluation strategy: evaluate every eval_steps steps
    do_eval=True,                         # Enable evaluation during training
    optim="adamw_torch_4bit",             # Use the 4-bit AdamW optimizer for memory efficiency
    per_device_train_batch_size=4,        # Batch size per device during training
    gradient_accumulation_steps=2,        # Accumulate gradients over 2 steps before each update
    per_device_eval_batch_size=2,         # Batch size per device during evaluation
    log_level="debug",                    # Set logging level to debug for detailed logs
    logging_steps=10,                     # Log metrics every 10 steps
    learning_rate=0.0001796,                   # Initial learning rate
    eval_steps=500,                        # Evaluate the model every 500 steps
    max_steps=2000,                        # Total number of training steps (change as needed)
    save_steps=25,                        # Save checkpoints every 25 steps
    warmup_steps=25,                      # Number of warmup steps for learning rate scheduler
    lr_scheduler_type="linear",           # Use a linear learning rate scheduler
)



In [None]:
# Step 5: Initialize the SFTTrainer.
# `model` has already been wrapped with the LoRA adapters by get_peft_model(),
# so peft_config is intentionally not passed again: handing the trainer an
# already-wrapped PEFT model together with a config is redundant.
trainer = SFTTrainer(
    model=model,                                   # The quantized model with LoRA adapters attached
    train_dataset=tokenized_dataset['train'],      # Training dataset
    eval_dataset=tokenized_dataset['validation'],  # Evaluation dataset
    max_seq_length=512,                            # Maximum sequence length for inputs
    tokenizer=tokenizer,                           # Tokenizer for encoding the data
    args=training_arguments,                       # Training arguments defined earlier
)


In [None]:
# Step 6: Start the training process
trainer.train()