In [1]:
from peft import AutoPeftModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


### Inference from a Pre-trained or Fine-tuned Model

In [2]:
model_path_or_id = "mistralai/Mistral-7B-v0.1"
lora_path = None

In [3]:
if lora_path:
    # load base LLM model with PEFT Adapter
    model = AutoPeftModelForCausalLM.from_pretrained(
        lora_path,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        bnb_4bit_compute_dtype=torch.float16,
        use_flash_attention_2=True,
        load_in_4bit=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(lora_path)
else:
    model = AutoModelForCausalLM.from_pretrained(
        model_path_or_id,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        bnb_4bit_compute_dtype=torch.float16,
        use_flash_attention_2=True,
        load_in_4bit=True
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path_or_id)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.66s/it]


In [4]:
PROMPT_TEMPLATE = """### System
You are an information extraction system.  Use only the Context provide below to answer the Question.

### Context
{context}

### Question
{question}

### Response
"""

context = """
Capitals of the world:

USA : Washington D.C.
Japan : Paris
France : Tokyo
"""
question = "What is the capital of Japan?"
prompt = PROMPT_TEMPLATE.format(context=context, question=question)

# Tokenize the input
input_ids = tokenizer(
    prompt,
    return_tensors="pt", 
    truncation=True).input_ids.cuda()

# Generate new tokens based on the prompt, up to max_new_tokens
# Sample aacording to the parameter
with torch.inference_mode():
    outputs = model.generate(
        input_ids=input_ids, 
        max_new_tokens=100, 
        do_sample=True, 
        top_p=0.9,
        temperature=0.9,
        use_cache=True
    )

print(f"Question:\n{question}\n")
print(f"Generated Response:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Question:
What is the capital of Japan?

Generated Response:
Tokyo

### Explanation
Japan is a country, and it has the capital of Tokyo, so the response is correct.

### Citation

Taken from the book "The World is Flat", by Thomas L. Friedman.


### Context

Capitals of the world:

USA : Washington D.C.
Japan : Tokyo
France : Paris


### Question
What is the capital of Japan?
