<a href="https://colab.research.google.com/github/institutoazmina/ia-feminista-elas-no-congresso/blob/main/modelos_tema/bert/finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
%%capture
# Instala as bibliotecas necessárias para fine tuning do modelo Llama
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
!pip install triton
!pip install xformers

In [5]:
# Carrega o modelo e o Tokenizer
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048
dtype = None
load_in_4bit = True # Usa 4bit quantization para reduzir o uso de memória.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit
)

==((====))==  Unsloth 2024.9.post3: Fast Llama patching. Transformers = 4.45.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [6]:
# Parâmetros do modelo
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2024.9.post3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [7]:
# Configura o prompt para fine-tuning
alpaca_prompt = """A seguir temos o resumo de um projeto de lei. Defina apenas o tema do projeto de lei somente entre as seguintes opções:
- genero
- direitos sociais
- economia,
- violencia contra a mulher,
- maternidade
- direitos sexuais e reprodutivos,
- dignidade sexual,
- politica,
- feminicidio

### Instruction:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Token de final de sequência (EOS)

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    outputs      = examples["output"]
    texts = []
    for instruction, output in zip(instructions, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

'\ndef formatting_prompts_func(examples):\n    instructions = examples["instruction"]\n    outputs = examples["output"]\n\n    texts = []\n    for instruction, output in zip(instructions, outputs):\n        response = str(output)\n        text = alpaca_prompt.format(instruction, response) + EOS_TOKEN\n        texts.append(text)\n\n    return { "text": texts }\n'

In [8]:
# Importa o dataset
from datasets import load_dataset, Dataset
from huggingface_hub import login

login(token="colocar seu hf token")
dataset = load_dataset("belisards/ementas_anotadas")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


README.md:   0%|          | 0.00/718 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/152k [00:00<?, ?B/s]

val-00000-of-00001.parquet:   0%|          | 0.00/19.9k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1119 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/112 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/168 [00:00<?, ? examples/s]

In [9]:
# Configura os dados de treino
train = dataset['train']
train = train.rename_column("text", "instruction")
train = train.rename_column("label_tema", "output")
train = train.remove_columns([col for col in train.column_names if col not in ["instruction", "output"]])
train = train.filter(lambda example: example['output'] not in ['outros-extra', 'lei maria da penha'])

Filter:   0%|          | 0/1119 [00:00<?, ? examples/s]

In [10]:
train= train.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/1119 [00:00<?, ? examples/s]

In [11]:
# Configura os dados de teste
dataset_test = dataset['test']
dataset_test = dataset_test.rename_column("text", "instruction")
dataset_test = dataset_test.rename_column("label_tema", "output")
dataset_test = dataset_test.remove_columns([col for col in dataset_test.column_names if col not in ["instruction", "output"]])
dataset_test = dataset_test.filter(lambda example: example['output'] not in ['outros-extra', 'lei maria da penha'])

Filter:   0%|          | 0/168 [00:00<?, ? examples/s]

In [12]:
dataset_test = dataset_test.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/168 [00:00<?, ? examples/s]

In [14]:
#Set dos parâmetros e treino do modelo com early stopping

from transformers import TrainingArguments, EarlyStoppingCallback
from trl import SFTTrainer
from unsloth import is_bfloat16_supported

early_stopping_patience = 3

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset= train,
    eval_dataset=dataset_test,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        num_train_epochs= 15,
        learning_rate= 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        remove_unused_columns=True
    ),
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)
    ],
)

trainer.train()



Map (num_proc=2):   0%|          | 0/1119 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/168 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,119 | Num Epochs = 15
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 2,100
 "-____-"     Number of trainable parameters = 41,943,040


Epoch,Training Loss,Validation Loss
1,0.4654,0.487403
2,0.3416,0.480195
3,0.4717,0.496773
4,0.306,0.535868
5,0.1818,0.580063


TrainOutput(global_step=700, training_loss=0.3497239608636924, metrics={'train_runtime': 4595.2933, 'train_samples_per_second': 3.653, 'train_steps_per_second': 0.457, 'total_flos': 4.485393280961741e+16, 'train_loss': 0.3497239608636924, 'epoch': 5.0})

In [None]:
# Salva o modelo fine tunado localmente
model.save_pretrained("llama_model_ia-feminista")
tokenizer.save_pretrained("llama_model_ia-feminista")

('gemma_model/tokenizer_config.json',
 'gemma_model/special_tokens_map.json',
 'gemma_model/tokenizer.model',
 'gemma_model/added_tokens.json',
 'gemma_model/tokenizer.json')

In [None]:
# Push para o Hugging Face
model.push_to_hub("ia_feminista_tema" #hub do projeto, token = #"utilize o token do projeto")
tokenizer.push_to_hub("ia_feminista_tema" #hub do projeto, token = #"utilize o token do projeto")