# Libs

In [None]:
# !pip install unsloth
# # Also get the latest nightly Unsloth!
# !pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
!pip install scikit-learn

In [None]:
from unsloth import tokenizer_utils
def do_nothing(*args, **kwargs):
    """Accept any positional/keyword arguments, ignore them, and return None."""
    return None
# Replace Unsloth's fix_untrained_tokens with the no-op defined above, so any
# call to it does nothing.
# NOTE(review): presumably required because lm_head is later reduced to two
# rows — confirm against the Unsloth version in use.
tokenizer_utils.fix_untrained_tokens = do_nothing

In [None]:
import torch
# Query and print the (major, minor) compute capability of the current CUDA device.
major_version, minor_version = torch.cuda.get_device_capability()
print(f"Major: {major_version}, Minor: {minor_version}")
from datasets import load_dataset
import datasets
from trl import SFTTrainer
import pandas as pd
import numpy as np
import os
import pandas as pd
import numpy as np
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments, Trainer
from typing import Tuple
import warnings
from typing import Any, Dict, List, Union
from transformers import DataCollatorForLanguageModeling
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Pre-quantized 4-bit checkpoints that can be fine-tuned with this notebook;
# change the index below to pick another one.
models_list = [
    "unsloth/llama-3-8b-bnb-4bit",
    "unsloth/Qwen2-7B-bnb-4bit",
    "unsloth/gemma-2-9b-it-bnb-4bit",
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
]
model_name = models_list[0]

# Loading options forwarded unchanged to FastLanguageModel.from_pretrained.
max_seq_length = 2048
load_in_4bit = True
dtype = None  # NOTE(review): presumably lets Unsloth pick the dtype — confirm

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

In [None]:
# First token id of each answer word (no special tokens added).
yes_token_id = tokenizer.encode("Sim", add_special_tokens=False)[0]
no_token_id = tokenizer.encode("Não", add_special_tokens=False)[0]

# Shrink lm_head to two rows so the model emits two logits:
# row 0 = "Não" (class 0), row 1 = "Sim" (class 1).
full_head_weight = model.lm_head.weight
par = torch.nn.Parameter(
    torch.vstack([full_head_weight[no_token_id, :], full_head_weight[yes_token_id, :]])
)
print(par.shape)
print(full_head_weight.shape)
model.lm_head.weight = par

In [None]:
from peft import LoftQConfig

# Modules that receive LoRA adapters. lm_head is included as well: it can
# easily be trained because it has only 2 tokens.
lora_target_modules = [
    "lm_head",
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the base model with LoRA adapters (rank-stabilized LoRA enabled).
# LoftQ initialisation (init_lora_weights='loftq' plus a LoftQConfig) is
# intentionally left disabled.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",     # Supports any, but = "none" is optimized
    target_modules=lora_target_modules,
    use_rslora=True,
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

trainable_param_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("trainable parameters:", trainable_param_count)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json

# Read the JSON-lines dataset (one JSON record per line) into a DataFrame.
dataset_path = "base_de_dados/dataset_llm.jsonl"
with open(dataset_path, mode="r", encoding="utf-8") as f:
    data = pd.read_json(f, lines=True)

def extrair_campos(messages):
    """Return ``(user_text, assistant_text)`` extracted from a chat transcript.

    ``messages`` is an iterable of dicts with a ``'role'`` key and, for the
    ``'user'`` / ``'assistant'`` roles, a ``'content'`` key. When a role occurs
    several times the last occurrence wins; a role that never occurs yields
    ``None``. Messages with any other role are ignored.
    """
    latest = {'user': None, 'assistant': None}
    for mensagem in messages:
        role = mensagem['role']
        if role in latest:
            latest[role] = mensagem['content']
    return latest['user'], latest['assistant']

# Split every transcript into its user prompt and assistant answer, and store
# them as the 'text' and 'assistant' columns (assigned by position).
extracted_pairs = data['messages'].map(extrair_campos).tolist()
data[['text', 'assistant']] = pd.DataFrame(extracted_pairs, index=data.index)

def converter_label(resposta):
    """Map an assistant answer to a binary label.

    Args:
        resposta: The assistant's answer. Leading/trailing whitespace and
            letter case are ignored.

    Returns:
        ``1`` for ``'similar'``, ``0`` for ``'diferente'``, and ``None`` for
        anything else — including a missing (``None``) or non-string answer,
        which previously raised ``AttributeError``.
    """
    if not isinstance(resposta, str):
        # A transcript without an assistant message yields None upstream.
        return None
    return {'similar': 1, 'diferente': 0}.get(resposta.strip().lower())

# Map each assistant answer to a binary label (1 = 'similar', 0 = 'diferente').
data['label'] = data['assistant'].apply(converter_label)

# converter_label yields None for any other answer and extrair_campos yields
# None for a missing user message. Such rows are unusable: a null label would
# be formatted as the negative answer later on (anything != 1), silently
# corrupting the training targets. Drop them and keep the label integral.
final_df = data[['text', 'label']].dropna(subset=['text', 'label']).copy()
final_df['label'] = final_df['label'].astype(int)

n_dropped = len(data) - len(final_df)
if n_dropped:
    print(f"Dropped {n_dropped} rows with a missing text or an unrecognised label")

train_size = 29368
val_size = 7342

# Fail early with a clear message instead of pandas' generic sampling error.
n_requested = train_size + val_size
if n_requested > len(final_df):
    raise ValueError(
        f"Requested {n_requested} rows (train_size + val_size) but only "
        f"{len(final_df)} labelled rows are available"
    )

data_sample = final_df.sample(n=n_requested, random_state=24)

# Pass the validation size as an absolute row count: it selects the same split
# as the equivalent ratio without depending on floating-point rounding.
train_df, val_df = train_test_split(
    data_sample, test_size=val_size, random_state=24
)

print("len(train_df):", len(train_df))
print("len(val_df):", len(val_df))
print("Test size ratio:", val_size/len(data_sample))


In [None]:
# Show the validation split for a quick visual sanity check.
print(val_df)

In [None]:
# Number of tokens in each training text (as produced by tokenizer.encode),
# plotted as a 30-bin histogram to compare against max_seq_length.
token_counts = [len(token_ids) for token_ids in map(tokenizer.encode, train_df.text)]
a = plt.hist(token_counts, bins=30)

In [None]:
# Convert the training DataFrame into a Hugging Face Dataset without carrying
# the pandas index over as a column.
train_dataset = datasets.Dataset.from_pandas(train_df,preserve_index=False)
# Bare expression kept from the original notebook; it has no effect here
# because it is not the last statement of the cell.
train_dataset

# Prompt template with two slots: the brand pair and the answer. Every line
# except the blank ones and the last carries two trailing spaces before the
# newline, and the text ends right after the opening quote plus the answer
# (there is no closing quote), so the answer is the last thing in the sequence.
prompt = (
    'Aqui estão duas marcas:  \n'
    '{}  \n'
    '\n'
    'Essas duas marcas são similares? Responda apenas com "Sim" ou "Não".  \n'
    '\n'
    'SOLUÇÃO  \n'
    'A resposta correta é: "{}'
)

positivelabel = "Sim"
negativelabel = "Não"


def formatting_prompts_func(dataset_):
    """Render a batch of examples into full training prompts.

    ``dataset_`` maps ``'text'`` and ``'label'`` to parallel sequences. Each
    text is inserted into ``prompt`` together with ``positivelabel`` when its
    label equals 1 and ``negativelabel`` otherwise; the rendered strings are
    returned as a list.

    When ``dataset_['text']`` is a single string instead of a batch, a dummy
    value is returned: ``[""] * 100`` for Qwen models, ``" "`` for all others.
    This is to fix an issue with a certain transformers version; you might not
    need it.
    """
    batch_texts = dataset_['text']
    if isinstance(batch_texts, str):
        return [""] * 100 if "qwen" in model_name.lower() else " "

    return [
        prompt.format(brand_pair, positivelabel if target == 1 else negativelabel)
        for brand_pair, target in zip(batch_texts, dataset_['label'])
    ]

In [None]:
class DataCollatorForLastTokenLM(DataCollatorForLanguageModeling):
    """Language-modeling collator that supervises only the final token.

    The parent collator builds the batch; afterwards, for every sequence, all
    labels before the last non-ignored position are replaced by
    ``ignore_index``, and the label at that position is rewritten from a
    vocabulary id to a class index: 1 if it equals ``yes_token_id``, else 0.
    """

    def __init__(
        self,
        *args,
        mlm: bool = False,
        ignore_index: int = -100,
        **kwargs,
    ):
        super().__init__(*args, mlm=mlm, **kwargs)
        self.ignore_index = ignore_index

    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        """Collate ``examples`` and keep a single 0/1 label per sequence."""
        batch = super().torch_call(examples)
        labels = batch["labels"]  # same tensor object: edits below are in place

        for row in range(len(examples)):
            # Position of the last label that is not already ignored (i.e. not padding)
            supervised_positions = (labels[row] != self.ignore_index).nonzero()
            answer_pos = supervised_positions[-1].item()
            # Decide the class before touching the row
            is_yes = labels[row, answer_pos] == yes_token_id
            # Mask everything before the answer, then store the class index
            labels[row, :answer_pos] = self.ignore_index
            labels[row, answer_pos] = 1 if is_yes else 0

        return batch


collator = DataCollatorForLastTokenLM(tokenizer=tokenizer)

<a name="Train"></a>
### Train the model
Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). The run below is capped at `max_steps=100` to speed things up (when `max_steps` is set, it takes precedence over `num_train_epochs`); for a full run, remove `max_steps` and keep `num_train_epochs=1`. We also support TRL's `DPOTrainer`!

In [None]:
# Apply early stopping
from transformers import TrainerCallback, TrainingArguments
import torch

class EarlyStoppingCallback(TrainerCallback):
    """Stop training once a logged training loss is at or below ``threshold``.

    The check runs in ``on_log``, i.e. exactly when the Trainer emits a log
    entry. The previous implementation used ``on_step_end``, which the Trainer
    calls *before* it logs the current step: it therefore read the previous
    log entry from ``state.log_history``, printed that stale loss under the
    new step number (repeating it on every step when ``logging_steps > 1``)
    and reacted one step late.

    Args:
        threshold: Loss value at or below which training is stopped. With the
            default of ``0.0`` the callback only fires when the logged loss is
            reported as zero (or lower).
    """

    def __init__(self, threshold=0.0):
        self.threshold = threshold

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Check the freshly logged loss and request a stop if it hit the threshold."""
        # Evaluation logs and the final summary log carry no "loss" key.
        if not logs or "loss" not in logs:
            return

        current_loss = logs["loss"]
        print(f"Passo {state.global_step}: Loss = {current_loss:.6f}")

        if current_loss <= self.threshold:
            print(f"🎯 Early Stopping ativado! Loss atingiu {current_loss:.6f}. Encerrando o treinamento...")
            # The Trainer checks this flag after each step and leaves the loop.
            control.should_training_stop = True

# Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

# Optimisation and logging settings for the run.
training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    report_to="none",
    logging_steps=1,
    num_train_epochs=1,
    max_steps=100,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,
    group_by_length=True,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_steps=10,
    weight_decay=0.01,
    optim="adamw_8bit",
    bf16=use_bf16,
    fp16=not use_bf16,
)

# Build the SFTTrainer with the last-token collator and the early-stopping callback.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # not needed because group_by_length is True
    callbacks=[EarlyStoppingCallback(threshold=0.0)],
)


In [None]:
# Run fine-tuning; the value returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()

In [None]:
import matplotlib.pyplot as plt

# Pull the training log and keep only the entries that carry a loss value.
logs = trainer.state.log_history
loss_logs = [log for log in logs if "loss" in log]
steps = [log["step"] for log in loss_logs]
losses = [log["loss"] for log in loss_logs]

# Draw the loss curve.
plt.figure(figsize=(10, 6))
plt.plot(steps, losses, marker="o", label="Training Loss")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Gráfico da Loss durante o Treinamento")
plt.legend()
plt.grid(True)

# Name the image after the model that was actually trained. The file name used
# to be hard-coded to "Llama-3-2-1B_loss.jpeg", so every model's plot was
# mislabelled and overwrote the previous one. '/' is replaced so the repo id
# does not become a directory path.
loss_plot_name = f"{model_name.replace('/', '_')}_loss.jpeg"
plt.savefig(loss_plot_name)

plt.show()

<a name="Inference"></a>
### Inference
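Let's run the model! Each validation example is formatted with the same prompt used for training, with the answer slot left blank, and the model's two output logits give the "Não"/"Sim" prediction.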

In [None]:
# Switch the fine-tuned model to Unsloth's inference mode before evaluating it.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

In [None]:
# Save the fine-tuned model under a name derived from the base model id
# ('/' replaced so the name is not treated as a nested path).
saved_name = "lora_model_" + model_name.replace("/", "_")
model.save_pretrained(saved_name)

In [None]:
from collections import defaultdict
import torch.nn.functional as F
from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, auc
import torch.nn.functional as nnf

# Step 1: Tokenize every validation prompt (answer slot left empty, so the
# prediction is read at the last position of the prompt).
tokenized_inputs = []
for text, label in zip(val_df['text'], val_df['label']):
    test_str = prompt.format(text, "")
    tokenized_input = tokenizer(test_str, return_tensors="pt", add_special_tokens=False)
    tokenized_inputs.append((tokenized_input, test_str, label))

# Sort by tokenized length so groups are processed from shortest to longest
tokenized_inputs.sort(key=lambda item: item[0]['input_ids'].shape[1])

# Step 2: Group the inputs by their tokenized length. Sequences of equal
# length can be concatenated into one batch without any padding.
grouped_inputs = defaultdict(list)
for tokenized_input, test_str, label in tokenized_inputs:
    grouped_inputs[tokenized_input['input_ids'].shape[1]].append(
        (tokenized_input, test_str, label)
    )

# Step 3: Process each group in batches of 64
batch_size = 64
all_outputs = []        # predicted class per example (0 = "Não", 1 = "Sim")
all_strings = []        # prompt text per example, in processing order
all_labels = []         # ground-truth label per example
all_probabilities = []  # predicted probability of class 1 per example

from tqdm import tqdm
for group in tqdm(grouped_inputs.values()):
    for i in range(0, len(group), batch_size):
        batch = group[i:i+batch_size]
        batch_inputs = [item[0] for item in batch]
        batch_strings = [item[1] for item in batch]
        batch_labels = [item[2] for item in batch]

        # Concatenate the batch inputs (all rows share the same length)
        input_ids = torch.cat([item['input_ids'] for item in batch_inputs], dim=0).to("cuda")
        attention_mask = torch.cat([item['attention_mask'] for item in batch_inputs], dim=0).to("cuda")

        # Forward pass
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Logits at the last position; lm_head was reduced to two rows, so
        # column 0 is "Não" and column 1 is "Sim". Upcast to float32 so the
        # softmax is not computed in half precision and the result can be
        # converted to NumPy (bfloat16 tensors cannot).
        logits = outputs.logits[:, -1, :2].float()

        # One softmax per batch, one device-to-host copy per batch. The earlier
        # code copied the whole probability vector once per batch element and
        # then used only the first copy.
        probabilities = F.softmax(logits, dim=-1)
        all_probabilities.extend(probabilities[:, 1].cpu().numpy())

        # Get predictions
        predictions = torch.argmax(probabilities, dim=-1)

        all_outputs.extend(predictions.cpu().numpy())
        all_labels.extend(batch_labels)
        all_strings.extend(batch_strings)

all_labelst = torch.tensor(all_labels)
all_predst = torch.tensor(all_outputs)
all_probabilitiest = torch.tensor(all_probabilities)

# ROC curve and AUC are computed from the class-1 probabilities, not from the
# hard predictions.
fpr, tpr, thresholds = roc_curve(all_labelst.numpy(), all_probabilitiest.numpy())
roc_auc = auc(fpr, tpr)
print("ROC AUC: ", roc_auc)

# Step 4: Classification metrics from the hard predictions
# Confusion Matrix
cm = confusion_matrix(all_labelst.numpy(), all_predst.numpy())
# Accuracy
accuracy = accuracy_score(all_labelst.numpy(), all_predst.numpy())
# Precision
precision = precision_score(all_labelst.numpy(), all_predst.numpy())
# Recall
recall = recall_score(all_labelst.numpy(), all_predst.numpy())
# F1-Score
f1 = f1_score(all_labelst.numpy(), all_predst.numpy())


print("Confusion Matrix:")
print(cm)
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1-Score: ", f1)


In [None]:
# Dump the raw labels and predictions gathered during evaluation.
print("all_labels")
print(all_labels)

print("all_outputs")
print(all_outputs)

all_labelst = torch.tensor(all_labels)
all_predst = torch.tensor(all_outputs)

# Convert once and reuse for every metric.
y_true = all_labelst.numpy()
y_pred = all_predst.numpy()

# Classification metrics from the hard predictions.
cm = confusion_matrix(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
# AUC reuses the fpr/tpr curve computed from probabilities in the evaluation cell.
roc_auc = auc(fpr, tpr)

print("Confusion Matrix:")
print(cm)
for metric_label, metric_value in (
    ("Accuracy: ", accuracy),
    ("Precision: ", precision),
    ("Recall: ", recall),
    ("F1-Score: ", f1),
    ("ROC AUC: ", roc_auc),
):
    print(metric_label, metric_value)

In [None]:
# print(fpr)
# print(tpr)

# ROC curve of the classifier against the diagonal of a random guesser.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))

roc_ax.plot(fpr, tpr, label='ROC Curve')
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')

roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend(loc='lower right')
roc_ax.grid()
plt.show()

In [None]:
import os
import pickle

# Bundle this model's metrics and ROC curve points.
model_results = {
    "metrics": {
        "confusion_matrix": cm,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    },
    "roc": {"fpr": fpr, "tpr": tpr},
}
model_metrics = model_results["metrics"]

# Results of all models live in one pickle keyed by model name: load what is
# already there (if anything), add or overwrite this model's entry, write back.
filename = "model_results.pkl"
try:
    with open(filename, "rb") as file:
        results_dict = pickle.load(file)
except FileNotFoundError:
    results_dict = {}

results_dict[model_name] = model_results

with open(filename, "wb") as file:
    pickle.dump(results_dict, file)

print("Métricas e curvas ROC salvas com sucesso para o modelo:", model_name)


In [None]:
import pickle

filename = "model_results.pkl"

# Load the pickle written by the previous cell (a local file produced by this
# notebook; do not point this at untrusted pickles).
with open(filename, "rb") as file:
    results_dict = pickle.load(file)

# Show every saved metric, one model at a time. The loop variable is named
# `saved_result` rather than `data` so that it no longer overwrites the
# notebook-level DataFrame `data` built in the data-preparation cell.
for model_result, saved_result in results_dict.items():
    print(f"\nModelo: {model_result}")
    print("Métricas:")
    for metric, value in saved_result["metrics"].items():
        print(f"  - {metric}: {value}")

    # The ROC points are also available under saved_result['roc']['fpr'] and
    # saved_result['roc']['tpr']; they are not printed because they are long.
    print("-" * 50)
