In [None]:
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q git+https://github.com/huggingface/peft.git
!pip install -q git+https://github.com/huggingface/accelerate.git@main
!pip install huggingface_hub
!pip install bitsandbytes
!pip install SentencePiece

In [None]:
import os
import torch

os.environ["CUDA_VISIBLE_DEVICES"] = "0" # force using CUDA device 0
os.environ["ZE_AFFINITY_MASK"] = "0"  # force using Intel XPU device 0

In [None]:
from huggingface_hub import notebook_login


notebook_login()

In [None]:
from peft import PeftModel
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, BitsAndBytesConfig

model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = LlamaTokenizer.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True), device_map="auto", use_auth_token=True)

In [None]:
%%time
model = PeftModel.from_pretrained(model, "tloen/alpaca-lora-7b", adapter_name="eng_alpaca")

In [None]:
%%time
model.load_adapter("22h/cabrita-lora-v0-1", adapter_name="portuguese_alpaca")

In [None]:
model

In [None]:
device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"

model.to(device)

In [8]:
def generate_prompt(instruction, input=None):
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Input:
{input}
### Response:"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:"""


def evaluate(
    instruction,
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=256,
    **kwargs,
):
    prompt = generate_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        no_repeat_ngram_size=3,
        **kwargs,
    )

    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    return output.split("### Response:")[1].strip()

In [9]:
%%time
model.set_adapter("eng_alpaca")

CPU times: user 5.16 ms, sys: 443 μs, total: 5.6 ms
Wall time: 5.58 ms


In [10]:
instruction = "Tell me about alpacas."

print(evaluate(instruction))

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


The alpaca (Vicugna pacos) is a domesticated species of South American camelid. It resembles a small llama in appearance. It is kept in herds that graze on the level heights of the Andes of southern Peru, southern Bolivia, Ecuador, and northern Chile, at an altitude of about 3,800 m (12,500 ft) to 5,000 meters (16,404 ft). It is bred for its fiber, which is similar to sheep's wool but finer, silkier, and more durable. Alpaca fiber is used for making knitted and woven items, such as sweaters, hats, gloves, scarves, a variety of textiles, rugs, and blankets. The wool can be dyed, and is used to make ponchos, blankets, and sweaters in Peru and other Andean countries. The animals are also raised for meat and as a source of dairy products, including milk, butter, and cheese.
Alpaca fleece comes in 22 natural colors, the most


In [13]:
%%time
model.set_adapter("portuguese_alpaca")

CPU times: user 6 ms, sys: 0 ns, total: 6 ms
Wall time: 5.86 ms


In [14]:
instruction = "Invente uma desculpa criativa pra dizer que não preciso ir à festa."

print(evaluate(instruction))

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


I'm sorry, but I can't make it to the party. I'm not feeling well. I have a headache and I don't think it's a good idea for me to go out tonight. I hope you understand and that you have a great time at the party!</s>


In [15]:
with model.disable_adapter():
    instruction = "Invente uma desculpa criativa pra dizer que não preciso ir à festa."

    print(evaluate(instruction))

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Eu não posso ir porque tenho que fazer uma tarefa para o meu professor.
</s>
