In [1]:
import torch

from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

_ = load_dotenv()

def collect() -> None:
    import gc
    gc.collect() # garbage collector
    torch.cuda.empty_cache() # updates nvidia-smi

  from .autonotebook import tqdm as notebook_tqdm


#### Load a model

In [2]:
model_id = "meta-llama/Llama-3.1-8B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
        model_id,
        use_safetensors=True,
        device_map="cuda" if torch.cuda.is_available() else "cpu"
    )

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id, use_safetensors=True)

Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00,  2.12s/it]


#### Generation

In [4]:
conversation = [
    {"role": "user", "content": "Who is agent 005? Answer in one sentence."}
]

input_ids = tokenizer.apply_chat_template(
    conversation=conversation,
    add_generation_prompt=True,
    return_tensors="pt").to("cuda")

gen_tokens = model.generate(
    input_ids,
    max_new_tokens=128,
    do_sample=True,
    temperature=0.1,
    )

gen_text = tokenizer.decode(gen_tokens[0])
print(gen_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Who is agent 005? Answer in one sentence.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

I couldn't find any information on a well-known character or agent named "Agent 005." However, there is a character named "Number 5" or "Agent 005" in the 1967 James Bond film "You Only Live Twice," played by actor Donald Pleasence, but he is actually known as "Kissy Suzuki's" brother and a Japanese agent.<|eot_id|>


#### RAG

In [7]:
del model; collect()

model_id = "CohereForAI/c4ai-command-r-v01-4bit"

model = AutoModelForCausalLM.from_pretrained(
        model_id,
        use_safetensors=True,
        device_map="cuda" if torch.cuda.is_available() else "cpu"
    )

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id, use_safetensors=True)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Loading checkpoint shards: 100%|██████████| 5/5 [00:04<00:00,  1.11it/s]


In [None]:
input_ids = tokenizer.apply_chat_template(
    conversation=conversation,
    chat_template="default",
    return_tensors="pt").to("cuda")

gen_tokens = model.generate(
    input_ids,
    max_new_tokens=128,
    do_sample=True,
    temperature=0.1,
    )

gen_text = tokenizer.decode(gen_tokens[0])
print(gen_text)

<BOS_TOKEN><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who is agent 005? Answer in one sentence.<|END_OF_TURN_TOKEN|>Agent 005 is a character in the James Bond series and is one of MI6's agents, but his role is often overshadowed by Bond's missions.<|END_OF_TURN_TOKEN|>


#### Provide documents

In [14]:
documents = [
    {
        "title": "License to Quack", 
        "text": (
            "Agent 005, Duffy Duck, adjusted his bow tie as he waddled into the Monaco Casino, his webbed feet barely making a sound.\n"
            "Across the room, Jacques “The Feather” Plume sat with his pet falcon, unaware his microchip smuggling operation was about to be quacked.\n"
            "Unlike his suave colleague, 007, Duffy wasn’t one for martinis or smooth one-liners—he preferred action.\n"
            "As his gadget-packed cufflink magnetized the chip from the falcon’s beak mid-distraction, he smirked.\n"
            "“Move over, Bond,” he muttered, slipping out as chaos erupted. “The world’s got room for one more double-zero.”"
            )
    },
    {
        "title": "Carrot Confidential",
        "text": (
            "Agent 005, Bugs Bunny, lounged casually in a Havana cigar bar, nibbling a carrot as he waited for Dr. Harebrained,\n"
            "the notorious inventor plotting to flood the market with mind-controlling chewing gum.\n"
            "When the villain finally appeared, Bugs tipped his fedora and muttered, “Eh, what’s up, Doc?” before slipping a tracker into Harebrained’s pocket with a sleight of hand.\n"
            "Moments later, chaos erupted as Bugs activated a remote that detonated every stick of gum in Harebrained’s lab. \n"
            "As security swarmed, Bugs nonchalantly strolled out, munching his carrot. “Another nutty scheme nipped in the bud,” he quipped."
            )
        
    }
]

input_ids = tokenizer.apply_chat_template(
    conversation=conversation,
    documents=documents,
    add_generation_prompt=True,
    chat_template="rag",
    return_tensors="pt").to("cuda")

gen_tokens = model.generate(
    input_ids,
    max_new_tokens=128,
    do_sample=True,
    temperature=0.1,
    )

gen_text = tokenizer.decode(gen_tokens[0])
print(tokenizer.batch_decode(gen_tokens[:, input_ids.shape[1]:])[0]) # print only the assistant's response

Relevant Documents: 0,1
Cited Documents: 0,1
Grounded answer: Agent 005 is either <co: 0>Duffy Duck</co: 0>, who works alongside <co: 0>007</co: 0> at the <co: 0>Monaco Casino</co: 0>, or <co: 1>Bugs Bunny</co: 1>, who foils schemes in <co: 1>Havana.</co: 1><|END_OF_TURN_TOKEN|>
