In [None]:
# Install the required dependencies (notebook shell escape; runs pip in the kernel's environment)
!pip install torch transformers peft huggingface_hub accelerate datasets bitsandbytes

# Import the required libraries
from huggingface_hub import login, snapshot_download

# Log in to Hugging Face (replace the placeholder string with your own access token)
login(token="Seu tokken")

# Download the Llama 3.2-1B-Instruct model into the local "Llama-3.2-1B-Instruct" directory
model_path = snapshot_download(repo_id="meta-llama/Llama-3.2-1B-Instruct", local_dir="Llama-3.2-1B-Instruct")

# Report the path returned by snapshot_download
print(f"Modelo baixado em: {model_path}")




Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

USE_POLICY.md:   0%|          | 0.00/6.02k [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

original/params.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/2.18M [00:00<?, ?B/s]

consolidated.00.pth:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

Modelo baixado em: /content/Llama-3.2-1B-Instruct


In [None]:
import torch
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from typing import Tuple, Dict, Any

def load_model_and_tokenizer(model_path: str, torch_dtype: torch.dtype) -> Tuple[AutoTokenizer, AutoModelForCausalLM]:
    """Load a tokenizer and a causal language model from ``model_path``.

    The tokenizer is loaded with ``use_fast=True`` and its padding token is
    set to its end-of-sequence token. The model is loaded with the requested
    ``torch_dtype`` and ``device_map="auto"``, passed through
    ``prepare_model_for_kbit_training`` and switched to evaluation mode.

    NOTE(review): no quantization config is passed to ``from_pretrained``
    here, so the model is presumably not loaded in 4/8-bit -- confirm that
    ``prepare_model_for_kbit_training`` is really intended in that case.

    Args:
        model_path: Identifier or path forwarded to ``from_pretrained``.
        torch_dtype: Dtype forwarded to the model's ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
    # Reuse the end-of-sequence token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    load_kwargs = {"torch_dtype": torch_dtype, "device_map": "auto"}
    base_model = AutoModelForCausalLM.from_pretrained(model_path, **load_kwargs)

    prepared_model = prepare_model_for_kbit_training(base_model)
    prepared_model.eval()
    return tokenizer, prepared_model

def apply_qlora(model: AutoModelForCausalLM, rank: int, alpha: int, lora_dropout: float,
                target_modules: list, bias: str, task_type: str) -> AutoModelForCausalLM:
    """Wrap ``model`` with LoRA adapters and return it in evaluation mode.

    Args:
        model: Base model handed to ``get_peft_model``.
        rank: Value passed as ``r`` to ``LoraConfig``.
        alpha: Value passed as ``lora_alpha`` to ``LoraConfig``.
        lora_dropout: Value passed as ``lora_dropout`` to ``LoraConfig``.
        target_modules: Module names passed as ``target_modules``.
        bias: Value passed as ``bias`` to ``LoraConfig``.
        task_type: Value passed as ``task_type`` to ``LoraConfig``.

    Returns:
        The object returned by ``get_peft_model``, after ``eval()`` was
        called on it.
    """
    adapter_settings = LoraConfig(
        task_type=task_type,
        bias=bias,
        r=rank,
        lora_alpha=alpha,
        lora_dropout=lora_dropout,
        target_modules=target_modules,
    )
    adapted_model = get_peft_model(model, adapter_settings)
    adapted_model.eval()
    return adapted_model

def generate_response(model: AutoModelForCausalLM, tokenizer: AutoTokenizer, prompt: str,
                      generation_config: Dict[str, Any]) -> str:
    """Generate text for ``prompt`` and return the decoded first sequence.

    The prompt is tokenized to PyTorch tensors, each tensor is moved to
    ``model.device``, and ``model.generate`` is called under
    ``torch.inference_mode()`` with ``generation_config`` expanded as keyword
    arguments. The whole first output sequence is decoded with special tokens
    skipped.

    Args:
        model: Model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        prompt: Input text.
        generation_config: Keyword arguments forwarded to ``model.generate``.

    Returns:
        The decoded text of ``outputs[0]``.
    """
    encoded = tokenizer(prompt, return_tensors="pt")

    # Move every encoded tensor onto the model's device.
    model_inputs = {}
    for name, tensor in encoded.items():
        model_inputs[name] = tensor.to(model.device)

    with torch.inference_mode():
        generated = model.generate(**model_inputs, **generation_config)

    text = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Drop tensor references before forcing a collection pass.
    del encoded, model_inputs, generated
    gc.collect()
    return text

def main():
    """Load the model, attach LoRA adapters, save them, and answer one prompt.

    Steps:
      1. Load the tokenizer and model via ``load_model_and_tokenizer``.
      2. Wrap the model with LoRA adapters via ``apply_qlora``.
      3. Save the adapter and tokenizer to ``./modelo_fine``.
      4. Read one prompt from standard input, generate a response, print it.
      5. Release the model and tokenizer and clear the CUDA cache.

    Step 5 runs in a ``finally`` block so the model is released even when an
    earlier step raises (for example an interrupted ``input()`` or a failed
    generation), instead of staying referenced by the failed frame.
    """
    # NOTE(review): this checkpoint name differs from the
    # "meta-llama/Llama-3.2-1B-Instruct" snapshot downloaded earlier in this
    # notebook -- confirm which model is intended.
    model_path = "meta-llama/Llama-3.2-1B"
    torch_dtype = torch.float16

    # LoRA hyperparameters.
    bias = "none"
    task_type = "CAUSAL_LM"
    target_modules = ["q_proj", "v_proj"]
    rank = 8
    alpha = 16
    lora_dropout = 0.5

    tokenizer, model = load_model_and_tokenizer(model_path, torch_dtype)
    try:
        model = apply_qlora(model, rank, alpha, lora_dropout, target_modules, bias, task_type)

        generation_config = {
            "temperature": 0.1,
            "do_sample": True,
            "top_p": 0.9,
            "pad_token_id": tokenizer.eos_token_id,
            "early_stopping": False,
            "max_new_tokens": 50
        }

        # No training step happens in this function, so the adapter saved here
        # is the freshly initialised one.
        model.save_pretrained("./modelo_fine")
        tokenizer.save_pretrained("./modelo_fine")

        prompt = input("Pergunta: ")
        response = generate_response(model, tokenizer, prompt, generation_config)
        print(response)
    finally:
        # Always drop the references and free cached memory, even on failure.
        del model, tokenizer
        gc.collect()
        torch.cuda.empty_cache()

# Run the demo only when this module is executed as the main program.
if __name__ == "__main__":
    main()


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Pergunta: Entre as etapas do processo licitatório, qual costuma demandar mais tempo para análise?
Entre as etapas do processo licitatório, qual costuma demandar mais tempo para análise? A resposta é sim, mas não é o que se imagina. O processo licitatório é um processo que tem início com a publicação do edital de licitação e termina com a assinatura do contrato. A partir
