### Continuation of the previous training with ifeval_dataset

In [8]:
import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments
)
from trl import SFTTrainer
from evaluate import load
import time

# Pick the GPU when one is available.
# NOTE(review): `device` is not referenced again in the visible cells
# (the model is placed with device_map="auto") — confirm whether it is still needed.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the "filtered" configuration of the argilla/ifeval-like-data dataset
dataset = load_dataset("argilla/ifeval-like-data", "filtered")

# Checkpoint to continue training from (presumably a local directory written by
# the previous run — verify the path exists before running)
model_name = "goat_continue/checkpoint-1500"

# Load the tokenizer from the same checkpoint path as the model
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_eos_token=True,      # Add end-of-sequence token to the tokenizer
    use_fast=True,           # Use the fast tokenizer implementation
    padding_side='left'      # Pad sequences on the left side
)
# Reuse the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to EOS token


In [9]:
from peft import get_peft_model 
# 4-bit quantization settings (bitsandbytes)
compute_dtype = torch.bfloat16  # dtype used for computation on top of the 4-bit weights
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # load the weights in 4-bit precision
    bnb_4bit_quant_type="nf4",             # Normal Float 4 quantization
    bnb_4bit_use_double_quant=True,        # nested quantization of the quantization constants
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the checkpoint with the quantization settings above;
# device_map="auto" lets the library decide where each layer is placed.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

# Prepare the quantized model for k-bit training
model = prepare_model_for_kbit_training(model)

# LoRA adapter configuration: rank-5 adapters on the attention and MLP projections
lora_config = LoraConfig(
    r=5,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        'k_proj', 'q_proj', 'v_proj', 'o_proj',
        'gate_proj', 'down_proj', 'up_proj',
    ],
)

# Wrap the base model so that only the LoRA adapter weights are trained
model = get_peft_model(model, lora_config)

# Keep the model's padding token id in sync with the tokenizer
model.config.pad_token_id = tokenizer.pad_token_id

# Report the total number of parameter elements (an element count, not bytes)
print(f"Model size: {sum(p.numel() for p in model.parameters())}")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model size: 3771469824


In [10]:
# Inspect the splits, columns and row counts of the raw dataset
dataset


DatasetDict({
    train: Dataset({
        features: ['key', 'prompt', 'response', 'instruction_id_list', 'kwargs', 'prompt_level_strict_acc', 'inst_level_strict_acc', 'prompt_level_loose_acc', 'inst_level_loose_acc'],
        num_rows: 56339
    })
})

In [11]:
from datasets import DatasetDict, Dataset
# Step 1: build the 'conversations' column of the dataset
def create_conversations(split_dataset):
    """Turn a split with 'prompt'/'response' fields into a one-column dataset.

    Parameters
    ----------
    split_dataset : iterable of mappings
        Every item must provide the keys 'prompt' and 'response'.

    Returns
    -------
    Dataset
        Built with ``Dataset.from_list``; each row holds a single
        'conversations' entry of the form ``[prompt, response]``.
    """
    rows = [
        {'conversations': [record['prompt'], record['response']]}
        for record in split_dataset
    ]
    return Dataset.from_list(rows)

# Hold out 15% of the rows as the test set (fixed seed so the split is repeatable)
split_dataset = dataset['train'].train_test_split(test_size=0.15, seed=42)
test_set = split_dataset['test']

# Split the remaining rows 90/10 into training and validation sets
split_train = split_dataset['train'].train_test_split(test_size=0.10, seed=42)
train_dataset, valid_dataset = split_train['train'], split_train['test']

# Only the train and validation splits are converted to the 'conversations'
# format here; `test_set` keeps its raw columns.
new_dataset = DatasetDict({
    'train': create_conversations(train_dataset),
    'validation': create_conversations(valid_dataset),
})


In [12]:
# Confirm the converted splits and their row counts
new_dataset

DatasetDict({
    train: Dataset({
        features: ['conversations'],
        num_rows: 43099
    })
    validation: Dataset({
        features: ['conversations'],
        num_rows: 4789
    })
})

In [13]:
def format_conversation(examples):
    """Tokenize a batch of conversations (intended for ``Dataset.map(batched=True)``).

    Parameters
    ----------
    examples : mapping
        Batch whose 'conversations' entry is a sequence of items; an item that
        is a list is joined with newlines, any other item is used as-is.

    Returns
    -------
    dict
        One entry per field returned by the module-level ``tokenizer``, with
        the tensors converted to plain Python lists. Every sequence is
        truncated or padded to 512 tokens.
    """
    texts = []
    for turns in examples['conversations']:
        # Collapse a list of turns into one newline-separated string
        texts.append("\n".join(turns) if isinstance(turns, list) else turns)

    # Tokenize the joined conversations to a fixed length
    encoded = tokenizer(
        texts,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
    )

    # Hand back lists rather than tensors
    return {field: tensor.tolist() for field, tensor in encoded.items()}

# Apply the tokenization to every split in batches, dropping the raw
# 'conversations' column so only the tokenizer outputs remain
tokenized_dataset = new_dataset.map(format_conversation, batched=True, remove_columns=["conversations"])
tokenized_dataset



Map:   0%|          | 0/43099 [00:00<?, ? examples/s]

Map:   0%|          | 0/4789 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 43099
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 4789
    })
})

In [14]:
# Step 3: have the dataset return PyTorch tensors for the two tokenized columns
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Sanity check: length of the first training example's input_ids
len(tokenized_dataset["train"][0]["input_ids"])

512

In [15]:
# Step 4: define the training arguments
training_arguments = TrainingArguments(
    output_dir="./model_ifeval_like_dataset_1500",  # Directory for saving model checkpoints and logs
    eval_strategy="steps",                # Evaluate on a step schedule (see eval_steps)
    do_eval=True,                         # Enable evaluation during training
    optim="adamw_torch_4bit",             # AdamW variant selected by name; the name says 4-bit (not 8-bit)
    per_device_train_batch_size=4,        # Batch size per device during training
    gradient_accumulation_steps=2,        # Accumulate gradients over 2 batches before each optimizer step
    per_device_eval_batch_size=2,         # Batch size per device during evaluation
    log_level="debug",                    # Set logging level to debug for detailed logs
    logging_steps=10,                     # Log metrics every 10 steps
    learning_rate= 0.0003104711027074527,                   # Initial learning rate (NOTE(review): origin of this exact value is not shown here)
    eval_steps=250,                        # Evaluate the model every 250 steps
    max_steps=1500,                        # Total number of training steps (change as needed)
    save_steps=25,                        # Save checkpoints every 25 steps (ten times more often than evaluation)
    warmup_steps=25,                      # Number of warmup steps for learning rate scheduler
    lr_scheduler_type="linear",           # Use a linear learning rate scheduler
)



In [16]:
# Step 5: initialize the SFTTrainer
# `model` has already been wrapped with get_peft_model(model, lora_config), so it
# is handed to the trainer as-is. Passing the same LoraConfig again through
# `peft_config` is redundant for a model that is already a PeftModel, so it is
# omitted here.
trainer = SFTTrainer(
    model=model,                                   # LoRA-wrapped, quantized model
    train_dataset=tokenized_dataset['train'],      # Pre-tokenized training split
    eval_dataset=tokenized_dataset['validation'],  # Pre-tokenized validation split
    max_seq_length=512,                            # Same length used when tokenizing
    tokenizer=tokenizer,                           # Tokenizer used to encode the data
    args=training_arguments,                       # Training arguments defined earlier
)



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
max_steps is given, it will override any value given in num_train_epochs


In [17]:
# Step 6: start the training run
trainer.train()

Currently training with a batch size of: 4
***** Running training *****
  Num examples = 43,099
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 1,500
  Number of trainable parameters = 13,107,200
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss


  return fn(*args, **kwargs)


KeyboardInterrupt: 

In [None]:

# Load the "accuracy" metric through `evaluate.load`.
# NOTE(review): `accuracy_score` is not referenced in the visible code below —
# confirm whether it is still needed.
accuracy_score = load("accuracy")

class PerformanceBenchmark:
    """
    A class to benchmark the performance of a model on a given dataset.

    Every timing metric makes its own full pass over the dataset, so
    ``run_benchmark`` runs generation three times per example.

    Attributes:
    -----------
    model : transformers.PreTrainedModel
        The model to be benchmarked.
    tokenizer : transformers.PreTrainedTokenizer
        The tokenizer associated with the model.
    dataset : datasets.Dataset
        The dataset on which the model's performance will be evaluated.
        Each example must expose a 'conversations' entry.
    max_new_tokens : int
        Number of new tokens requested from ``model.generate`` per example.
    """

    def __init__(self, model, tokenizer, dataset, max_new_tokens=10):
        """
        Initializes the PerformanceBenchmark with the provided model, tokenizer, and dataset.

        Parameters:
        -----------
        model : transformers.PreTrainedModel
            The model to be benchmarked.
        tokenizer : transformers.PreTrainedTokenizer
            The tokenizer for encoding the inputs for the model.
        dataset : datasets.Dataset
            The dataset on which the model's performance will be evaluated.
        max_new_tokens : int, optional
            Generation budget per example (default 10, the value previously
            hard-coded in every timing loop).
        """
        self.model = model
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.max_new_tokens = max_new_tokens

    @staticmethod
    def _example_text(example):
        """Return the example's 'conversations' entry as one string."""
        conversation = example['conversations']
        # A conversation stored as a list of turns must be flattened first:
        # passing the list itself would be tokenized as an unpadded batch of
        # different-length sequences, which cannot be stacked into a tensor.
        # The newline join matches how the training data is flattened.
        if isinstance(conversation, (list, tuple)):
            return "\n".join(conversation)
        return conversation

    def _encode(self, example):
        """Tokenize one example and move the encoding to the model's device."""
        text = self._example_text(example)
        return self.tokenizer(text, return_tensors="pt").to(self.model.device)

    def _generate(self, tokenized_input):
        """Run generation for one already-tokenized example."""
        return self.model.generate(**tokenized_input, max_new_tokens=self.max_new_tokens)

    def compute_parameters(self):
        """
        Computes the total number of parameters and the number of trainable parameters.

        Returns:
        --------
        dict :
            A dictionary containing:
            - `total_params`: The total number of parameters in the model.
            - `trainable_params`: The number of trainable parameters in the model.
        """
        total_params = 0
        trainable_params = 0
        for param in self.model.parameters():
            count = param.numel()
            total_params += count
            if param.requires_grad:
                trainable_params += count

        return {
            "total_params": total_params,
            "trainable_params": trainable_params
        }

    def compute_size(self):
        """
        Computes the size of the model in terms of the number of parameters
        and memory usage in megabytes (MB).

        The memory figure is the sum of ``element_size() * numel()`` over the
        parameters, i.e. it covers parameter storage only.

        Returns:
        --------
        dict :
            A dictionary containing the number of parameters (`num_params`) and
            the model size in MB (`model_size_mb`).
        """
        num_params = 0
        size_bytes = 0
        for param in self.model.parameters():
            count = param.numel()
            num_params += count
            size_bytes += param.element_size() * count

        return {"num_params": num_params, "model_size_mb": size_bytes / (1024**2)}

    def time_pipeline(self):
        """
        Measures the total time and average time taken by the model to process
        the dataset, tokenization included.

        Returns:
        --------
        dict :
            A dictionary containing the total processing time in seconds (`total_time_sec`)
            and the average time per example (`avg_time_per_example_sec`, `inf` for an
            empty dataset).
        """
        # perf_counter is monotonic and meant for measuring intervals
        start_time = time.perf_counter()

        for example in self.dataset:
            self._generate(self._encode(example))

        total_time = time.perf_counter() - start_time
        num_examples = len(self.dataset)
        avg_time_per_example = total_time / num_examples if num_examples > 0 else float('inf')

        return {"total_time_sec": total_time, "avg_time_per_example_sec": avg_time_per_example}

    def compute_latency(self):
        """
        Computes the average latency of the model, defined as the time taken
        by generation for a single example (tokenization is not timed).

        Returns:
        --------
        dict :
            A dictionary containing the average latency in seconds (`avg_latency_sec`,
            `inf` for an empty dataset).
        """
        latencies = []

        for example in self.dataset:
            # Tokenize outside the timed region
            tokenized_input = self._encode(example)

            start_time = time.perf_counter()
            self._generate(tokenized_input)
            latencies.append(time.perf_counter() - start_time)

        avg_latency = sum(latencies) / len(latencies) if latencies else float('inf')
        return {"avg_latency_sec": avg_latency}

    def compute_throughput(self):
        """
        Computes the throughput of the model, defined as the number of examples
        processed per second (tokenization included).

        Returns:
        --------
        dict :
            A dictionary containing the throughput in examples per second (`throughput_examples_per_sec`).
        """
        start_time = time.perf_counter()

        for example in self.dataset:
            self._generate(self._encode(example))

        total_time = time.perf_counter() - start_time
        throughput = len(self.dataset) / total_time if total_time > 0 else 0

        return {"throughput_examples_per_sec": throughput}


    def run_benchmark(self):
        """
        Runs all the benchmark metrics (size, parameters, time, latency and
        throughput) and returns the results.

        Returns:
        --------
        dict :
            A dictionary with the keys 'Size', 'Parameters', 'Time', 'Latency'
            and 'Throughput', each mapping to the dictionary returned by the
            corresponding method.
        """
        metrics = {}
        metrics['Size'] = self.compute_size()
        metrics['Parameters'] = self.compute_parameters()
        metrics['Time'] = self.time_pipeline()
        metrics['Latency'] = self.compute_latency()
        metrics['Throughput'] = self.compute_throughput()
        return metrics
    
# `new_dataset` only holds the 'train' and 'validation' splits, so indexing it
# with 'test' raises a KeyError. The held-out `test_set` is converted to the
# 'conversations' format here instead.
benchmark_dataset = create_conversations(test_set)

# Instantiate the PerformanceBenchmark class with the model, tokenizer, and test dataset
benchmark = PerformanceBenchmark(model, tokenizer, benchmark_dataset)

# Run the benchmark to compute performance metrics
results = benchmark.run_benchmark()

# Display the benchmark results
print(results)