### 1. Environment Preparation

In [1]:
from llama_index.core import VectorStoreIndex, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import json
from llama_index.core.schema import TextNode
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from datasets import load_dataset
import os

from src.modeling import (
    PaddingSide,
    TruncateSide,
    PromptType,
    PromptTemplate,
)
from src.modeling.models import (
    LlamaConfig,
    LlamaTokenizer,
    LlamaModel,
)
from src.inference import (
    DecodeStrategy,
    InferenceConfig,
    InferenceAgent,
)

device = "cuda:3" # switch to your preferred device

  from .autonotebook import tqdm as notebook_tqdm


### 2. Loading Dataset and Models

In [2]:
# loading dataset
data = load_dataset("./data/RAG/mini-wikipedia")['train']
data

Dataset({
    features: ['passage', 'id'],
    num_rows: 3200
})

In [3]:
# loading embedding model
Settings.embed_model = HuggingFaceEmbedding(model_name="./model/bge-large-en-v1.5", device=device)
print("Embedding model loaded.")

# loading InferenceAgent
# base model
model_dir = "./model/llama_3.2_1b_instruct/"
config_file = os.path.join(model_dir, "config.json")
params_files = os.path.join(model_dir, "model.safetensors")
with open(config_file, "r") as f:
    config = json.load(f)
llama_config: LlamaConfig = LlamaModel.load_config(
    config_file, 
    param_device=device,
)
llama_model = LlamaModel(llama_config)
llama_model.load_parameters(params_files)
print("Base model loaded.")
# tokenizer
tokenizer_file = os.path.join(model_dir, "tokenizer.json")
tokenizer_config_file = os.path.join(model_dir, "tokenizer_config.json")
llama_tokenizer = LlamaTokenizer(
    vocab_file=tokenizer_file,
    config_file=tokenizer_config_file,
)
print("Tokenizer loaded.")
# generation config
generation_config_file = os.path.join(model_dir, "generation_config.json")
max_new_tokens = 100
sampling_seed = 42
inf_config = InferenceConfig(
    decode_strategy=DecodeStrategy.GREEDY,
    max_new_tokens=max_new_tokens,
    temperature=1.0,
    top_p=0.9,
    top_k=50,
    sampling_seed=sampling_seed,
    padding_side=PaddingSide.LEFT,
    pad_to_multiple_of=1,
    truncate_length=None,
    truncate_side=TruncateSide.RIGHT,
    device=device,
)
# building agent
inf_agent = InferenceAgent(
    config=inf_config,
    model=llama_model,
    tokenizer=llama_tokenizer,
)
print("InferenceAgent loaded.")

Embedding model loaded.
Base model loaded.
Tokenizer loaded.
InferenceAgent loaded.


### 3. Retrieval Corpus Preparation

In [4]:
nodes = []
for value in data:
    nodes.append(TextNode(text=value['passage'], id_=str(value['id'])))
index = VectorStoreIndex(nodes)
# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=1, # select the most relevant document
)
Settings.llm = None
# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
)

LLM is explicitly disabled. Using MockLLM.


### 4. Retrieval

In [5]:
query = "How was Montevideo founded?"
retrieved_chunk = query_engine.retrieve(query)[0].text
print(retrieved_chunk)

Montevideo was founded by the Spanish in the early 18th century as a military stronghold. Uruguay won its independence in 1828 following a three-way struggle between Spain, Argentina and Brazil. It is a constitutional democracy, where the president fulfills the roles of both head of state and head of government


### 5. Generation

In [6]:
system_prompt_template = PromptTemplate(
    template_str="You're a helpful assitant on {subject}. You will be given a question and a corresponding paragraph that might be useful in answering the question. You should first decide whether the information is indeed helpful, then give your final answer to the question based on the information if it is useful (or otherwise, just answer based on your own knowledge).\n",
)
context_prompt_template = PromptTemplate(
    template_str="Information that might be useful: {information}.\n",
)

subject = "Question Anwering"

inf_agent.set_prompt(
    prompt_template=system_prompt_template,
    prompt_type=PromptType.SYSTEM,
)
inf_agent.set_prompt(
    prompt_template=context_prompt_template,
    prompt_type=PromptType.CONTEXT,
)

prompt_dicts = inf_agent(query, subject=subject, information=retrieved_chunk)
for i, prompt_dict in enumerate(prompt_dicts):
    print(f"\n{'='*25} The {i}-th sample in the batch {'='*25}")
    for prompt_type, promp in prompt_dict.items():
        print(f"\n[{prompt_type}]: {promp}")



[PromptType.SYSTEM]: You're a helpful assitant on Question Anwering. You will be given a question and a corresponding paragraph that might be useful in answering the question. You should first decide whether the information is indeed helpful, then give your final answer to the question based on the information if it is useful (or otherwise, just answer based on your own knowledge).


[PromptType.CONTEXT]: Information that might be useful: Montevideo was founded by the Spanish in the early 18th century as a military stronghold. Uruguay won its independence in 1828 following a three-way struggle between Spain, Argentina and Brazil. It is a constitutional democracy, where the president fulfills the roles of both head of state and head of government.


[PromptType.QUERY]: How was Montevideo founded?

[PromptType.PROMPT]: You're a helpful assitant on Question Anwering. You will be given a question and a corresponding paragraph that might be useful in answering the question. You should fir