## Mistral 7B

In [1]:
import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments
)
from trl import SFTTrainer
from evaluate import load
import time

# Device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the dataset
dataset = load_dataset("OpenAssistant/oasst1")

# Model name
model_name = "mistralai/Mistral-7B-v0.3"

# Load the tokenizer for Mistral
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_eos_token=True,      # Add end-of-sequence token to the tokenizer
    use_fast=True,           # Use the fast tokenizer implementation
    padding_side='left'      # Pad sequences on the left side
)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to EOS token


2024-11-18 16:17:34.635016: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-18 16:17:34.642416: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-18 16:17:34.650394: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-18 16:17:34.652895: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-18 16:17:34.659614: I tensorflow/core/platform/cpu_feature_guar

README.md:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [2]:
from peft import get_peft_model 
# Quantization configuration using bitsandbytes library
compute_dtype = getattr(torch, "bfloat16")  # Set computation data type to bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                    # Enable loading the model in 4-bit precision
    bnb_4bit_quant_type="nf4",            # Specify quantization type as Normal Float 4
    bnb_4bit_compute_dtype=compute_dtype, # Set computation data type
    bnb_4bit_use_double_quant=True,       # Use double quantization for better accuracy
)

# Load the pre-trained model with the specified quantization configuration
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,  # Apply quantization configuration
    device_map="auto"                # Automatically map layers to devices
)

# Prepare the model for k-bit (e.g., 4-bit) training
model = prepare_model_for_kbit_training(model)

# lora
# Low-Rank Adaptation (LoRA) configuration for efficient fine-tuning
lora_config = LoraConfig(
    lora_alpha=16,             # Scaling factor for LoRA updates
    lora_dropout=0.05,         # Dropout rate applied to LoRA layers
    r=5,                      # Rank of the LoRA decomposition
    bias="none",               # No bias is added to the LoRA layers
    task_type="CAUSAL_LM",     # Specify the task as causal language modeling
    target_modules=[           # Modules to apply LoRA to
        'k_proj', 'q_proj', 'v_proj', 'o_proj',
        'gate_proj', 'down_proj', 'up_proj'
    ]
)

model = get_peft_model(model, lora_config)



model.config.pad_token_id = tokenizer.pad_token_id  # Set the model's padding token ID

# print how much does the model occupy in memory
print(f"Model size: {sum(p.numel() for p in model.parameters())}")

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Model size: 3771469824


In [None]:
model = model.to(device)
original_model_size = model.get_memory_footprint() / (1024 ** 2)
print(f"Original model size: {original_model_size:.2f} MB")
print(f"Original model size: {original_model_size/1024:.2f} GB")

Original model size: 4403.02 MB
Original model size: 4.30 GB


In [3]:
dataset


DatasetDict({
    train: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 84437
    })
    validation: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 4401
    })
})

In [5]:
from datasets import DatasetDict, Dataset

def create_prompt_assistant_columns_optimized(split_dataset):
    # Crear un diccionario para acceder rápidamente a los mensajes por 'message_id'
    message_dict = {msg['message_id']: msg for msg in split_dataset}
    
    # Diccionario para memoizar los prompts reconstruidos
    prompt_cache = {}
    
    def reconstruct_prompts(msg):
        if msg['message_id'] in prompt_cache:
            return prompt_cache[msg['message_id']]
        
        prompts = []
        current_msg = msg
        while current_msg:
            parent_id = current_msg['parent_id']
            if parent_id and parent_id in message_dict:
                parent_msg = message_dict[parent_id]
                if parent_msg['role'] == 'prompter':
                    prompts.insert(0, parent_msg['text'])
                current_msg = parent_msg
            else:
                break
        full_prompt = ' '.join(prompts)
        prompt_cache[msg['message_id']] = full_prompt
        return full_prompt
    
    # Lista para almacenar las parejas prompt-assistant
    prompt_assistant_pairs = []
    
    for msg in split_dataset:
        if msg['role'] == 'assistant':
            full_prompt = reconstruct_prompts(msg)
            prompt_assistant_pairs.append({
                'prompt': full_prompt,
                'assistant': msg['text']
            })
    
    # Crear un nuevo dataset a partir de las parejas prompt-assistant
    new_dataset = Dataset.from_list(prompt_assistant_pairs)
    return new_dataset

# Procesar cada split con la versión optimizada
new_train_dataset = create_prompt_assistant_columns_optimized(dataset['train'])
new_validation_dataset = create_prompt_assistant_columns_optimized(dataset['validation'])

# Crear el split de test a partir del train split modificado
split_datasets = new_train_dataset.train_test_split(test_size=0.15, shuffle=True, seed=42)

# Crear un nuevo DatasetDict con los splits procesados
new_dataset = DatasetDict({
    'train': split_datasets['train'],   
    'validation': new_validation_dataset,
    'test': split_datasets['test']
})

# Opcional: Verificar algunas entradas del nuevo dataset
print(new_dataset['train'][0])
print(new_dataset['validation'][0])
print(new_dataset['test'][0])



{'prompt': 'Who is Zheng Yu Dong from the SEA group?', 'assistant': 'It is difficult to determine exactly which Zheng Yu Dong and which SEA group you are referring to.\n\nHowever, having reviewed various news articles and websites, I believe you may be referring to the Zheng Yu Dong who lists himself as CEO of Maribank on LinkedIn[0]. This name also appears as CEO of Maribank on the website for the Monetary Authority of Singapore [1].\n\nIt appears that MariBank is owned by Sea Limited [2] (originally called Garena) [3]. I believe that Sea Limited is the Sea Group you are referring to.\n\nIt is not clear whether Zheng Yu Dong holds other positions within Sea Limited.\n\nDoes this answer your question?\n\n[0] https://www.linkedin.com/in/zhengyudong/?originalSubdomain=sg\n[1] https://eservices.mas.gov.sg/fid/institution/detail/248595-MARIBANK-SINGAPORE-PRIVATE-LIMITED\n[2] https://www.straitstimes.com/business/banking/sea-opens-digital-bank-maribank-to-public-on-invite-only-basis\n[3] ht

In [6]:
from datasets import DatasetDict, Dataset

# Paso 2: Tokenizar el dataset utilizando la función optimizada
def format_conversation(examples):
    # Tokenizar el 'prompt' como entrada
    tokenized_prompt = tokenizer(
        examples['prompt'],
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors="pt"
    )
    
    # Tokenizar el 'assistant' como etiquetas
    tokenized_assistant = tokenizer(
        examples['assistant'],
        truncation=True,
        max_length=512,
        padding="max_length",
        return_tensors="pt"
    )
    
    # Convertir tensores a listas para evitar problemas
    tokenized_prompt = {k: v.tolist() for k, v in tokenized_prompt.items()}
    tokenized_assistant = {k: v.tolist() for k, v in tokenized_assistant.items()}
    
    # Asignar 'labels' desde 'assistant'
    tokenized_prompt['labels'] = tokenized_assistant['input_ids']
    
    return tokenized_prompt

# Aplicar la tokenización al dataset
tokenized_dataset = new_dataset.map(
    format_conversation, 
    batched=True, 
    remove_columns=["prompt", "assistant"]
)






Map:   0%|          | 0/44975 [00:00<?, ? examples/s]

Map:   0%|          | 0/2756 [00:00<?, ? examples/s]

Map:   0%|          | 0/7937 [00:00<?, ? examples/s]

In [7]:
# Paso 3: Configurar el formato del dataset para PyTorch
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])



In [None]:
# Paso 4: Definir los argumentos de entrenamiento
training_arguments = TrainingArguments(
    output_dir="./results_3",  # Directory for saving model checkpoints and logs
    eval_strategy="steps",                # Evaluation strategy: evaluate every few steps
    do_eval=True,                         # Enable evaluation during training
    optim="paged_adamw_8bit",             # Use 8-bit AdamW optimizer for memory efficiency
    per_device_train_batch_size=4,        # Batch size per device during training
    gradient_accumulation_steps=2,        # Accumulate gradients over multiple steps
    per_device_eval_batch_size=2,         # Batch size per device during evaluation
    log_level="debug",                    # Set logging level to debug for detailed logs
    logging_steps=10,                     # Log metrics every 10 steps
    learning_rate=1e-4,                   # Initial learning rate
    eval_steps=25,                        # Evaluate the model every 25 steps
    max_steps=100,                        # Total number of training steps (epochs = max_steps // steps_per_epoch), steps_per_epoch = len(train_dataset) // batch_size
    save_steps=25,                        # Save checkpoints every 25 steps
    warmup_steps=25,                      # Number of warmup steps for learning rate scheduler
    lr_scheduler_type="linear",           # Use a linear learning rate scheduler
)

## cambiar batch size a 1




In [9]:
# Paso 5: Inicializar el SFTTrainer
trainer = SFTTrainer(
    model=model,                          # The pre-trained and prepared model
    train_dataset=tokenized_dataset['train'],  # Training dataset
    eval_dataset=tokenized_dataset['validation'],    # Evaluation dataset
    peft_config=lora_config,              # LoRA configuration for efficient fine-tuning
    max_seq_length=512,                   # Maximum sequence length for inputs
    tokenizer=tokenizer,                  # Tokenizer for encoding the data
    args=training_arguments,              # Training arguments defined earlier
)



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
max_steps is given, it will override any value given in num_train_epochs


In [10]:
# Paso 6: Iniciar el proceso de entrenamiento
trainer.train()

Currently training with a batch size of: 4
***** Running training *****
  Num examples = 44,975
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 100
  Number of trainable parameters = 13,107,200
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
25,2.6743,2.541729
50,2.2208,2.383408
75,2.1727,2.341177
100,2.1933,2.329138



***** Running Evaluation *****
  Num examples = 2756
  Batch size = 2
Saving model checkpoint to ./results_3/checkpoint-25
loading configuration file config.json from cache at /home/usuario/.cache/huggingface/hub/models--mistralai--Mistral-7B-v0.3/snapshots/d8cadc02ac76bd617a919d50b092e59d2d110aff/config.json
Model config MistralConfig {
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.45.2",
  "use_cache": true,
  "vocab_size": 32768
}

tokenizer config file saved in ./results_3/ch

TrainOutput(global_step=100, training_loss=2.3050271606445314, metrics={'train_runtime': 6064.8928, 'train_samples_per_second': 0.132, 'train_steps_per_second': 0.016, 'total_flos': 1.75151014477824e+16, 'train_loss': 2.3050271606445314, 'epoch': 0.017787264318747775})

In [17]:

accuracy_score = load("accuracy")

class PerformanceBenchmark:
    """
    A class to benchmark the performance of a model on a given dataset.
    
    Attributes:
    -----------
    model : transformers.PreTrainedModel
        The model to be benchmarked.
    tokenizer : transformers.PreTrainedTokenizer
        The tokenizer associated with the model.
    dataset : datasets.Dataset
        The dataset on which the model's performance will be evaluated.
    """
    
    def __init__(self, model, tokenizer, dataset):
        """
        Initializes the PerformanceBenchmark with the provided model, tokenizer, and dataset.
        
        Parameters:
        -----------
        model : transformers.PreTrainedModel
            The model to be benchmarked.
        tokenizer : transformers.PreTrainedTokenizer
            The tokenizer for encoding the inputs for the model.
        dataset : datasets.Dataset
            The dataset on which the model's performance will be evaluated.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.dataset = dataset

    def compute_parameters(self):
        """
        Computes the total number of parameters and the number of trainable parameters.
        
        Returns:
        --------
        dict :
            A dictionary containing:
            - `total_params`: The total number of parameters in the model.
            - `trainable_params`: The number of trainable parameters in the model.
        """
        total_params = sum(p.numel() for p in self.model.parameters())  # Total parameters
        trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)  # Trainable parameters
        
        return {
            "total_params": total_params,
            "trainable_params": trainable_params
        }

    def compute_size(self):
        """
        Computes the size of the model in terms of the number of parameters 
        and memory usage in megabytes (MB).

        Returns:
        --------
        dict :
            A dictionary containing the number of parameters (`num_params`) and 
            the model size in MB (`model_size_mb`).
        """
        num_params = sum(p.numel() for p in self.model.parameters())
        model_size_mb = sum(p.element_size() * p.nelement() for p in self.model.parameters()) / (1024**2)
        
        return {"num_params": num_params, "model_size_mb": model_size_mb}

    def time_pipeline(self):
        """
        Measures the total time and average time taken by the model to process 
        the dataset.
        
        This method will use the tokenizer to encode the inputs before passing them 
        to the model.

        Returns:
        --------
        dict :
            A dictionary containing the total processing time in seconds (`total_time_sec`) 
            and the average time per example (`avg_time_per_example_sec`).
        """
        start_time = time.time()
        
        for example in self.dataset:
            inputs = example['prompt']
            # Tokenize the input
            tokenized_input = self.tokenizer(inputs, return_tensors="pt").to(self.model.device)
            _ = self.model.generate(**tokenized_input, max_new_tokens=10)
        
        end_time = time.time()
        total_time = end_time - start_time
        avg_time_per_example = total_time / len(self.dataset) if len(self.dataset) > 0 else float('inf')
        
        return {"total_time_sec": total_time, "avg_time_per_example_sec": avg_time_per_example}

    def compute_latency(self):
        """
        Computes the average latency of the model, defined as the time taken 
        to process a single example from the dataset.

        Returns:
        --------
        dict :
            A dictionary containing the average latency in seconds (`avg_latency_sec`).
        """
        latencies = []
        
        for example in self.dataset:
            inputs = example['prompt']
            # Tokenize the input
            tokenized_input = self.tokenizer(inputs, return_tensors="pt").to(self.model.device)
            
            start_time = time.time()
            _ = self.model.generate(**tokenized_input, max_new_tokens=10)
            end_time = time.time()
            
            latencies.append(end_time - start_time)
        
        avg_latency = sum(latencies) / len(latencies) if len(latencies) > 0 else float('inf')
        return {"avg_latency_sec": avg_latency}

    def compute_throughput(self):
        """
        Computes the throughput of the model, defined as the number of examples 
        processed per second.

        Returns:
        --------
        dict :
            A dictionary containing the throughput in examples per second (`throughput_examples_per_sec`).
        """
        start_time = time.time()
        
        for example in self.dataset:
            inputs = example['prompt']
            # Tokenize the input
            tokenized_input = self.tokenizer(inputs, return_tensors="pt").to(self.model.device)
            _ = self.model.generate(**tokenized_input, max_new_tokens=10)
        
        end_time = time.time()
        total_time = end_time - start_time
        throughput = len(self.dataset) / total_time if total_time > 0 else 0
        
        return {"throughput_examples_per_sec": throughput}
    

    def run_benchmark(self):
        """
        Runs all the benchmark metrics (size, time, latency, throughput, and FLOPs) 
        and returns the results.

        Returns:
        --------
        dict :
            A dictionary containing all the computed metrics for the model. 
            Includes size, parameters, time, latency, throughput, and FLOPs estimates.
        """
        metrics = {}
        metrics['Size'] = self.compute_size()
        metrics['Parameters'] = self.compute_parameters()
        metrics['Time'] = self.time_pipeline()
        metrics['Latency'] = self.compute_latency()
        metrics['Throughput'] = self.compute_throughput()
        return metrics
    
# Instantiate the PerformanceBenchmark class with the model, tokenizer, and test dataset
benchmark = PerformanceBenchmark(model, tokenizer, new_dataset['test'])

# Run the benchmark to compute performance metrics
results = benchmark.run_benchmark()

# Display the benchmark results
print(results)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pa

{'Size': {'num_params': 3771469824, 'model_size_mb': 4403.015625}, 'Parameters': {'total_params': 3771469824, 'trainable_params': 13107200}, 'Time': {'total_time_sec': 7286.778720855713, 'avg_time_per_example_sec': 0.9180771980415412}, 'Latency': {'avg_latency_sec': 0.9426582863187556}, 'Throughput': {'throughput_examples_per_sec': 1.0616563279657598}}


In [19]:
!nvidia-smi

Tue Nov 19 08:15:20 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.57.02              Driver Version: 566.03         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4070 Ti     On  |   00000000:01:00.0 Off |                  N/A |
|  0%   41C    P8              4W /  285W |    7902MiB /  12282MiB |      9%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
