# Installing unsloth

In [None]:
# it was installed in our venv...
# pip3 install "unsloth[cu124-torch250] @ git+https://github.com/unslothai/unsloth.git"

# Run these commands if you are using google colab.
%%capture
!pip3 install unsloth "xformers==0.0.28.post2"
# Also get the latest nightly Unsloth!
!pip3 uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Process the TeleQnA Dataset to create and save train datasets

Processing TeleQnA dataset to produces datasets to fine tunne a model and then test it.

# Dataset TeleQnA
We will load json files that we generated and create dataset to fine-tunning the model.

## Questions Release 17

## Dataset with 4000 questions

### Questions Release 17

In [None]:
import json

# Path to the TeleQnA processed question in JSON file with only Rel 17...
rel17_questions_path = r"files/rel17_questions.json"

# Load the TeleQnA data just release 17
with open(rel17_questions_path, "r", encoding="utf-8") as file:
    rel17_questions = json.load(file)
print(len(rel17_questions))

# Path to the TeleQnA processed question in JSON file with only Rel 17 and 100 questions...
rel17_100_questions_path = r"files/rel17_100_questions.json"

# Load the TeleQnA data just release 17
with open(rel17_100_questions_path, "r", encoding="utf-8") as file:
    rel17_100_questions = json.load(file)
print(len(rel17_100_questions))

rel17_other_questions = [q for q in rel17_questions if q not in rel17_100_questions]
print(len(rel17_other_questions))

rel17_other_questions_length = 500
rel17_other_questions = rel17_other_questions[:rel17_other_questions_length]
print(len(rel17_other_questions))
rel17_other_questions[0]

### Questions without Rel 17 and 18

In [None]:
# Path to the TeleQnA processed question in JSON file with questions without Rel 17 and 18...
no_rel_17_18_path_questions = r"files/no_rel_17_18_questions.json"

# Load the TeleQnA data...
with open(no_rel_17_18_path_questions, "r", encoding="utf-8") as file:
    no_rel_17_18_questions = json.load(file)
print(len(no_rel_17_18_questions))

no_rel_17_18_questions_length = 3500
no_rel_17_18_questions = no_rel_17_18_questions[:no_rel_17_18_questions_length]
print(len(no_rel_17_18_questions))
no_rel_17_18_questions[0]

### Train data

In [None]:
train_questions = rel17_other_questions + no_rel_17_18_questions
print(len(train_questions))
train_questions[0]

We create two datasets, one with no options and half of questions and another wit options and the other half.

In [None]:
from datasets import Dataset

# Structure to store pairs of questions and explanations
data = []

half_questions = len(train_questions)//2

# Fill the dataset with (question, explanation) pairs
for item in train_questions[:half_questions]:

    human_value = (
        f"{item['question']}"
    )

    # Combine the answer and explanation
    gpt_value = (
        f"{item['explanation']}"
    )

    # Create a dictionary for each input pair
    pair = [
        {'from': 'human', 'value': human_value},  # For the question
        {'from': 'gpt', 'value': gpt_value}  # For the explanation
    ]

    data.append(pair)  # Add the pair to the dataset

data_no_options = data
print(data_no_options[0])

In [None]:
from datasets import Dataset

# Structure to store pairs of questions, options, answers, and explanations
data = []

# Fill the dataset with (question + options, answer + explanation) pairs
for item in train_questions[half_questions:]:

    # Extract options
    options = [f"{key}: {value}" for key, value in item.items() if 'option' in key]
    # Combine the question and options
    human_value = (
        f"Question: {item['question']}\n"
        f"Options:\n" + "\n".join(options) + "\n"
    )

    # Combine the answer and explanation
    gpt_value = (
        f"Answer: {item['answer']}\n"
        f"Explanation: {item['explanation']}"
    )

    # Create a dictionary for each input pair
    pair = [
        {'from': 'human', 'value': human_value},  # Question with options
        {'from': 'gpt', 'value': gpt_value}       # Answer with explanation
    ]

    data.append(pair)  # Add the pair to the dataset

# Create the dataset using Hugging Face
data_options = data
print(data_options[0])

Then we join these datasets and shuffle randomly.

In [None]:
import random

data_total = data_no_options + data_options
# Shuffle the combined data
random.shuffle(data_total)
print(len(data_total))

Convert the list of pairs into the appropriate format Transform the data into a Dataset Save dataset on the disk

In [None]:
formatted_data = {'conversations': data_total}
dataset = Dataset.from_dict(formatted_data)

print(dataset)
print(dataset[0])

dataset.save_to_disk('files/train_questions_dataset_4000_questions')

## Dataset with 4000 questions: answer label

## Dataset with 4000 questions: short answer label

# Running fine tuning

In [1]:
import torch
print("Torch Version: ", torch.__version__)
print("Torch Version Cuda: ", torch.version.cuda)
print("Torch Version cuDnn: ", torch.backends.cudnn.version())
print("CUDA Available: ", torch.cuda.is_available())
print("CUDA Device Name: ", torch.cuda.get_device_name(0))
torch.cuda.empty_cache()

# Verificar se CUDA está disponível para acelerar o processamento
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Usando dispositivo: {device}")

Torch Version:  2.5.0+cu124
Torch Version Cuda:  12.4
Torch Version cuDnn:  90800
CUDA Available:  True
CUDA Device Name:  NVIDIA RTX A1000 6GB Laptop GPU
Usando dispositivo: cuda


In [2]:
from unsloth import FastLanguageModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.

In [4]:
initial_model, tokenizer = FastLanguageModel.from_pretrained(
    # model_name="unsloth/Llama-3.2-3B-bnb-4bit",
    model_name = "unsloth/Llama-3.2-1B-Instruct", # or choose "unsloth/Llama-3.2-3B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # device_map="auto"
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

model = initial_model

==((====))==  Unsloth 2025.3.17: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    NVIDIA RTX A1000 6GB Laptop GPU. Num GPUs = 1. Max memory: 5.681 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [8]:
# move model to CPU before delete
# model.to("cpu")
# delete reference to the model
# del model

# import gc and call garbage collect to free orphan objects
# import gc
# gc.collect()

# Free GPU memor
# torch.cuda.empty_cache()

In [5]:
initial_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
 

# Load it from the disk

In [26]:
from datasets import load_from_disk

dataset_path = 'files/train_questions_dataset_4000_questions'

raw_dataset = load_from_disk(dataset_path)

print(len(raw_dataset))
print(raw_dataset)
print(raw_dataset[0])
print(raw_dataset[5])

4000
Dataset({
    features: ['conversations'],
    num_rows: 4000
})
{'conversations': [{'from': 'human', 'value': 'Question: What behavior does a time-invariant Markov chain with an initial state drawn according to the stationary distribution exhibit?\nOptions:\noption 1: It exhibits linear growth of entropy with the entropy rate.\noption 2: It exhibits irreversible dynamics.\noption 3: It exhibits time-dependent entropy rates.\noption 4: It exhibits a stationary stochastic process.\n'}, {'from': 'gpt', 'value': 'Answer: option 4: It exhibits a stationary stochastic process.\nExplanation: A time-invariant Markov chain with an initial state drawn according to the stationary distribution exhibits a stationary stochastic process.'}]}
{'conversations': [{'from': 'human', 'value': 'Question: What does SMI stand for? [TCP/IP]\nOptions:\noption 1: Secure Management Information\noption 2: Simple Management Interface\noption 3: System Management Infrastructure\noption 4: Session Management In

# Format TeleQnA Dataset for training

In [59]:
from unsloth.chat_templates import get_chat_template
from unsloth.chat_templates import standardize_sharegpt

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
pass

std_dataset = standardize_sharegpt(raw_dataset)
dataset = std_dataset.map(formatting_prompts_func, batched = True,)

print(dataset)

dataset[0]

Dataset({
    features: ['conversations'],
    num_rows: 4000
})
Dataset({
    features: ['conversations', 'text'],
    num_rows: 4000
})


{'conversations': [{'content': 'Question: What behavior does a time-invariant Markov chain with an initial state drawn according to the stationary distribution exhibit?\nOptions:\noption 1: It exhibits linear growth of entropy with the entropy rate.\noption 2: It exhibits irreversible dynamics.\noption 3: It exhibits time-dependent entropy rates.\noption 4: It exhibits a stationary stochastic process.\n',
   'role': 'user'},
  {'content': 'Answer: option 4: It exhibits a stationary stochastic process.\nExplanation: A time-invariant Markov chain with an initial state drawn according to the stationary distribution exhibits a stationary stochastic process.',
   'role': 'assistant'}],
 'text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nQuestion: What behavior does a time-invariant Markov chain with an initial state drawn according to the stationary d

# Training

In [8]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.3.17 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [29]:
# if necessary run command below to delete model
# del model

In [61]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq, DataCollatorWithPadding
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    # data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt"),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 16,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 300,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: We found double BOS tokens - we shall remove one automatically.


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/4000 [00:00<?, ? examples/s]

In [62]:
trainer.train_dataset[0].items()

dict_items([('conversations', [{'content': 'Question: What behavior does a time-invariant Markov chain with an initial state drawn according to the stationary distribution exhibit?\nOptions:\noption 1: It exhibits linear growth of entropy with the entropy rate.\noption 2: It exhibits irreversible dynamics.\noption 3: It exhibits time-dependent entropy rates.\noption 4: It exhibits a stationary stochastic process.\n', 'role': 'user'}, {'content': 'Answer: option 4: It exhibits a stationary stochastic process.\nExplanation: A time-invariant Markov chain with an initial state drawn according to the stationary distribution exhibits a stationary stochastic process.', 'role': 'assistant'}]), ('text', '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nQuestion: What behavior does a time-invariant Markov chain with an initial state drawn according to the station

In [63]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map (num_proc=20):   0%|          | 0/4000 [00:00<?, ? examples/s]

In [64]:
tokenizer.decode(trainer.train_dataset[0]["input_ids"])

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nQuestion: What behavior does a time-invariant Markov chain with an initial state drawn according to the stationary distribution exhibit?\nOptions:\noption 1: It exhibits linear growth of entropy with the entropy rate.\noption 2: It exhibits irreversible dynamics.\noption 3: It exhibits time-dependent entropy rates.\noption 4: It exhibits a stationary stochastic process.\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nAnswer: option 4: It exhibits a stationary stochastic process.\nExplanation: A time-invariant Markov chain with an initial state drawn according to the stationary distribution exhibits a stationary stochastic process.<|eot_id|>'

In [65]:
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[0]["labels"]])

'                                                                                                           Answer: option 4: It exhibits a stationary stochastic process.\nExplanation: A time-invariant Markov chain with an initial state drawn according to the stationary distribution exhibits a stationary stochastic process.<|eot_id|>'

# Show current memory stats

In [56]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA RTX A1000 6GB Laptop GPU. Max memory = 5.681 GB.
3.711 GB of memory reserved.


# Initial Loss

In [69]:
trainer.train_dataset[0].items()

dict_items([('conversations', [{'content': 'Question: What behavior does a time-invariant Markov chain with an initial state drawn according to the stationary distribution exhibit?\nOptions:\noption 1: It exhibits linear growth of entropy with the entropy rate.\noption 2: It exhibits irreversible dynamics.\noption 3: It exhibits time-dependent entropy rates.\noption 4: It exhibits a stationary stochastic process.\n', 'role': 'user'}, {'content': 'Answer: option 4: It exhibits a stationary stochastic process.\nExplanation: A time-invariant Markov chain with an initial state drawn according to the stationary distribution exhibits a stationary stochastic process.', 'role': 'assistant'}]), ('text', '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nQuestion: What behavior does a time-invariant Markov chain with an initial state drawn according to the station

In [68]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# Configura o collator e DataLoader
collator = DataCollatorForSeq2Seq(tokenizer)
# collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
loader = DataLoader(trainer.train_dataset.remove_columns(["conversations", "text"]),
                    batch_size=2,  # Tamanho do batch escolhido
                    collate_fn=collator,
                    num_workers=2)

print(loader)

# Variáveis para armazenar a loss total e o número de exemplos
total_loss = 0.0
num_batches = 0

# Coloca o modelo em modo de avaliação
model.eval()

# Desativa o cálculo de gradiente para economizar memória
with torch.no_grad():
    for batch in tqdm(loader, desc="Calculating initial loss"):
        # Move o batch para a GPU (se disponível)
        batch = {k: v.to(model.device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)

        # Acumula a loss
        total_loss += outputs.loss.item()
        num_batches += 1

# Calcula a loss média
average_loss = total_loss / num_batches
print(f"Initial mean loss: {average_loss}")

<torch.utils.data.dataloader.DataLoader object at 0x7f43eede3170>


Calculating initial loss:   1%|██▏                                                                                                                                                                           | 25/2000 [00:11<14:55,  2.21it/s]


KeyboardInterrupt: 

# Training Session

In [18]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,000 | Num Epochs = 3 | Total steps = 300
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 16 x 1) = 32
 "-____-"     Trainable parameters = 11,272,192/1,247,086,592 (0.90% trained)


Step,Training Loss
1,2.077
2,1.8527
3,1.9798
4,1.8367
5,1.5979
6,1.3574
7,1.1842
8,1.3202
9,1.226
10,0.9419


Unsloth: Will smartly offload gradients to save VRAM!


# Show final memory and time stats

In [19]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

model

1145.0392 seconds used for training.
19.08 minutes used for training.
Peak reserved memory = 5.051 GB.
Peak reserved memory for training = 2.686 GB.
Peak reserved memory % of max memory = 88.91 %.
Peak reserved memory for training % of max memory = 47.28 %.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear

# Loss Pos training

In [20]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# Configure the collator and DataLoader
collator = DataCollatorForSeq2Seq(tokenizer)
loader = DataLoader(
    trainer.train_dataset.remove_columns(["conversations", "text"]),
    batch_size=2,  # Chosen batch size
    collate_fn=collator,
    num_workers=2
)

# Variables to store the total loss and the number of valid batches
total_loss = 0.0
num_batches = 0

# Set the model to evaluation mode
model.eval()

# Disable gradient calculation to save memory
with torch.no_grad():
    for batch in tqdm(loader, desc="Calculating loss post-training"):
        # Move the batch to GPU (if available)
        batch = {k: v.to(model.device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)

        # Check if the loss is NaN
        if not torch.isnan(outputs.loss):
            total_loss += outputs.loss.item()
            num_batches += 1
            # print(outputs.loss.item())

print(f"Number of valid batches: {num_batches}")

# Calculate the average loss
if num_batches > 0:
    average_loss = total_loss / num_batches
    print(f"Post-training mean loss: {average_loss}")
else:
    print("No valid batches found.")

Calculating loss post-training:   0%|                                                                                                                                                                                 | 0/2000 [00:00<?, ?it/s]


ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/johny/environment/repositories/3gpp_llm_evaluation_rep/.venv/lib/python3.12/site-packages/transformers/tokenization_utils_base.py", line 777, in convert_to_tensors
    tensor = as_tensor(value)
             ^^^^^^^^^^^^^^^^
  File "/home/johny/environment/repositories/3gpp_llm_evaluation_rep/.venv/lib/python3.12/site-packages/transformers/tokenization_utils_base.py", line 739, in as_tensor
    return torch.tensor(value)
           ^^^^^^^^^^^^^^^^^^^
RuntimeError: Could not infer dtype of dict

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/johny/environment/repositories/3gpp_llm_evaluation_rep/.venv/lib/python3.12/site-packages/torch/utils/data/_utils/worker.py", line 351, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "/home/johny/environment/repositories/3gpp_llm_evaluation_rep/.venv/lib/python3.12/site-packages/torch/utils/data/_utils/fetch.py", line 55, in fetch
    return self.collate_fn(data)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/johny/environment/repositories/3gpp_llm_evaluation_rep/.venv/lib/python3.12/site-packages/transformers/data/data_collator.py", line 682, in __call__
    batch = pad_without_fast_tokenizer_warning(
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/johny/environment/repositories/3gpp_llm_evaluation_rep/.venv/lib/python3.12/site-packages/transformers/data/data_collator.py", line 66, in pad_without_fast_tokenizer_warning
    padded = tokenizer.pad(*pad_args, **pad_kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/johny/environment/repositories/3gpp_llm_evaluation_rep/.venv/lib/python3.12/site-packages/transformers/tokenization_utils_base.py", line 3397, in pad
    return BatchEncoding(batch_outputs, tensor_type=return_tensors)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/johny/environment/repositories/3gpp_llm_evaluation_rep/.venv/lib/python3.12/site-packages/transformers/tokenization_utils_base.py", line 241, in __init__
    self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)
  File "/home/johny/environment/repositories/3gpp_llm_evaluation_rep/.venv/lib/python3.12/site-packages/transformers/tokenization_utils_base.py", line 793, in convert_to_tensors
    raise ValueError(
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`conversations` in this case) have excessive nesting (inputs type `list` where type `int` is expected).


# Inference

## Question with option

In [None]:
dataset[5]['conversations'][0]
question = dataset[5]['conversations'][0]['value']

from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    # {"role": "user", "content": "How much is 1+1?"},
    {"role": "user", "content": question},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
                         temperature = 1.5, min_p = 0.1)
tokenizer.batch_decode(outputs)

## Question with no option

In [None]:
dataset[0]['conversations'][0]['value']
question = dataset[0]['conversations'][0]['value']

from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    # {"role": "user", "content": "How much is 1+1?"},
    {"role": "user", "content": question},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
                         temperature = 1.5, min_p = 0.1)
tokenizer.batch_decode(outputs)

# Save model

In [None]:
model.save_pretrained("../models/llama_3.2_1B_FT_lora_4000_questions", safe_serialization=False)
tokenizer.save_pretrained("../models/llama_3.2_1B_FT_lora_4000_questions")

## Save merged

In [None]:
model.save_pretrained_merged("../models/llama_3.2_1B_FT_lora_4bits_4000_questions", tokenizer, save_method = "merged_4bit",)

# Load Model

In [None]:
max_seq_length = 8192 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

In [None]:
model_path = "../models/llama_3.2_1B_FT_lora_4000_questions"
# model_path = "model_3.2_lora_4bits"

# Carregar o modelo e o tokenizador separadamente
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_path,
    max_seq_length=max_seq_length,
    dtype = dtype,
    load_in_4bit=load_in_4bit
)