In [2]:
import os

if "COLAB_GPU" in os.environ:
    print("[INFO] Running in Google Colab, installing requirements.")
    !pip install torch torchvision torchaudio
    !pip install PyMuPDF # for reading PDFs with Python
    !pip install tqdm # for progress bars
    !pip install accelerate peft bitsandbytes transformers trl

[INFO] Running in Google Colab, installing requirements.
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5

In [3]:
import requests
import fitz
from tqdm.auto import tqdm
from spacy.lang.en import English
import re
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'Device_Type: {device}')
import pandas as pd

Device_Type: cuda


In [4]:
def pdf_download(pdf_path:str, url:str):
# Download PDF if it doesn't already exist
  if not os.path.exists(pdf_path):
    print("File doesn't exist, downloading...")
    # The local filename to save the downloaded file
    filename = pdf_path
    # Send a GET request to the URL
    response = requests.get(url)
    # Check if the request was successful
    if response.status_code == 200:
        # Open a file in binary write mode and save the content to it
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"The file has been downloaded and saved as {filename}")
    else:
        print(f"Failed to download the file. Status code: {response.status_code}")
  else:
    print(f"File {pdf_path} exists.")

In [5]:
pdf_path = "input.pdf"
url = "https://openreview.net/pdf/d469d2a0fc79717910f7475e53c4e589161debe3.pdf"

pdf_download(pdf_path, url)

File doesn't exist, downloading...
The file has been downloaded and saved as input.pdf


In [6]:
def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip()
    # Other potential text formatting functions can go here
    return cleaned_text

In [7]:
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)  # open a document
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):  # iterate the document pages
        text = page.get_text()  # get plain text encoded as UTF-8
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number - 0,
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token
                                "text": text})
    return pages_and_texts

In [8]:
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:1]

0it [00:00, ?it/s]

[{'page_number': 0,
  'page_char_count': 2666,
  'page_word_count': 349,
  'page_sentence_count_raw': 13,
  'page_token_count': 666.5,
  'text': 'RAT: Retrieval Augmented Thoughts Elicit Context-Aware Reasoning and Verification in Long-Horizon Generation Zihao Wang Peking University zhwang@stu.pku.edu.cn Anji Liu University of California, Los Angeles liuanji@cs.ucla.edu Haowei Lin Peking University linhaowei@pku.edu.cn Jiaqi Li Beijing Institute of General Artificial Intelligence lijiaqi@bigai.cn Xiaojian Ma Beijing Institute of General Artificial Intelligence xiaojian.ma@ucla.edu Yitao Liang∗ Peking University yitaol@pku.edu.cn Abstract We explore how iterative revising a chain of thoughts with the help of information retrieval significantly improves large language models’ reasoning and generation ability in long-horizon generation tasks, while hugely mitigating hallucination. In particular, the proposed method — retrieval-augmented thoughts (RAT) — revises each thought step one by on

In [9]:
def sentence_converter(pages_and_texts: list[dict]) -> list[dict]:
  nlp = English()
  nlp.add_pipe("sentencizer")
  for item in tqdm(pages_and_texts):
      item["sentences"] = list(nlp(item["text"]).sents)
      # Make sure all sentences are strings
      item["sentences"] = [str(sentence) for sentence in item["sentences"]]
      # Count the sentences
      item["page_sentence_count_spacy"] = len(item["sentences"])

  return pages_and_texts

In [10]:
pages_and_texts_sen = sentence_converter(pages_and_texts)
del pages_and_texts

  0%|          | 0/24 [00:00<?, ?it/s]

In [11]:
# Create a function that recursively splits a list into desired sizes
def split_list(input_list: list,
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).

    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

In [12]:
def chunking(pages_and_texts_sen: list[dict]) -> list[dict]:
# Define split size to turn groups of sentences into chunks
  num_sentence_chunk_size = 10
# Loop through pages and texts and split sentences into chunks
  for item in tqdm(pages_and_texts_sen):
      item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                          slice_size=num_sentence_chunk_size)
      item["num_chunks"] = len(item["sentence_chunks"])

  return pages_and_texts_sen

In [13]:
pages_and_texts_sen_chunk = chunking(pages_and_texts_sen)
del pages_and_texts_sen

  0%|          | 0/24 [00:00<?, ?it/s]

In [14]:
def convert_chunks(pages_and_texts_sen_chunk: list[dict]) -> list():
  # Split each chunk into its own item
  pages_and_chunks = []
  for item in tqdm(pages_and_texts_sen_chunk):
      for sentence_chunk in item["sentence_chunks"]:
          chunk_dict = {}
          chunk_dict["page_number"] = item["page_number"]

          # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
          joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
          joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo
          chunk_dict["sentence_chunk"] = joined_sentence_chunk

          # Get stats about the chunk
          chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
          chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
          chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters

          pages_and_chunks.append(chunk_dict)
  return pages_and_chunks

In [15]:
%%time
pages_and_chunks = convert_chunks(pages_and_texts_sen_chunk)

  0%|          | 0/24 [00:00<?, ?it/s]

CPU times: user 15.9 ms, sys: 964 µs, total: 16.8 ms
Wall time: 17.1 ms


In [16]:
model_path = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

In [17]:
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path,torch_dtype=torch.float16).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

### Model Quantization

In [18]:
# from transformers import BitsAndBytesConfig

# bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

# model = AutoModelForCausalLM.from_pretrained(
#     model_path,
#     quantization_config=bnb_config,
#     device_map=device
# )

In [19]:
def generate_embedding(text:str):
  inputs = tokenizer(text, return_tensors="pt",padding=True ,truncation=True).to(device)

  # Forward pass through the model to get hidden states
  with torch.no_grad():
      outputs = model(**inputs, output_hidden_states=True)

  # Extract the hidden states
  hidden_states = outputs.hidden_states  # This is a tuple with the hidden states from all layers

  # Typically, the last hidden state is used as the embedding
  # hidden_states[-1] has the shape [batch_size, sequence_length, hidden_size]
  embedding = hidden_states[-1][:, 0, :]

  return embedding

### Embedding Generation (Sequential Processing)


In [20]:
def create_embeddings(pages_and_chunks: list):
  # Create embeddings one by one on the GPU
  for item in tqdm(pages_and_chunks):
      item["embedding"] = generate_embedding(item["sentence_chunk"])
  return pages_and_chunks

In [21]:
%%time
pages_and_chunks = create_embeddings(pages_and_chunks)

  0%|          | 0/87 [00:00<?, ?it/s]

CPU times: user 4.86 s, sys: 57.7 ms, total: 4.91 s
Wall time: 5.6 s


In [22]:
pages_and_chunks[0:2]

[{'page_number': 0,
  'sentence_chunk': 'RAT: Retrieval Augmented Thoughts Elicit Context-Aware Reasoning and Verification in Long-Horizon Generation Zihao Wang Peking University zhwang@stu.pku.edu.cn Anji Liu University of California, Los Angeles liuanji@cs.ucla.edu Haowei Lin Peking University linhaowei@pku.edu.cn Jiaqi Li Beijing Institute of General Artificial Intelligence lijiaqi@bigai.cn Xiaojian Ma Beijing Institute of General Artificial Intelligence xiaojian.ma@ucla.edu Yitao Liang∗ Peking University yitaol@pku.edu.cn Abstract We explore how iterative revising a chain of thoughts with the help of information retrieval significantly improves large language models’ reasoning and generation ability in long-horizon generation tasks, while hugely mitigating hallucination. In particular, the proposed method — retrieval-augmented thoughts (RAT) — revises each thought step one by one with retrieved information relevant to the task query, the current and the past thought steps, after th

### Embedding Generation (Batch Processing)

In [23]:
# text_chunks = [item["sentence_chunk"] for item in pages_and_chunks]
# text_chunk_embeddings = generate_embedding(text_chunks)

In [24]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [25]:
embeddings = list()
for i in pages_and_chunks:
  embeddings.append(i['embedding'])
embeddings = torch.cat(embeddings, dim=0)

In [26]:
from sentence_transformers import util
query = "RAT"
print(f"Query: {query}")

# 2. Embed the query to the same numerical space as the text examples
# Note: It's important to embed your query with the same model you embedded your examples with.
query_embedding = generate_embedding(query)

# 3. Get similarity scores with the dot product
from time import perf_counter as timer

start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()

print(f"Time take to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

# 4. Get the top-k results (we'll keep this to 5)
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

Query: RAT
Time take to get scores on 87 embeddings: 0.03435 seconds.


torch.return_types.topk(
values=tensor([8552., 8552., 8552., 8552., 8552.], device='cuda:0',
       dtype=torch.float16),
indices=tensor([1, 0, 2, 4, 3], device='cuda:0'))

In [27]:
# Define helper function to print wrapped text
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [28]:
print(f"Query: '{query}'\n")
print("Results:")
# Loop through zipped together scores and indicies from torch.topk
for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
    print(f"Score: {score:.4f}")
    # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    # Print the page number too so we can reference the textbook further (and check the results)
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")

Query: 'RAT'

Results:
Score: 8552.0000
Text:
Our intuition is that the hallucination ∗Corresponding Author.38th Conference on
Neural Information Processing Systems (NeurIPS 2024).
Page number: 0


Score: 8552.0000
Text:
RAT: Retrieval Augmented Thoughts Elicit Context-Aware Reasoning and
Verification in Long-Horizon Generation Zihao Wang Peking University
zhwang@stu.pku.edu.cn Anji Liu University of California, Los Angeles
liuanji@cs.ucla.edu Haowei Lin Peking University linhaowei@pku.edu.cn Jiaqi Li
Beijing Institute of General Artificial Intelligence lijiaqi@bigai.cn Xiaojian
Ma Beijing Institute of General Artificial Intelligence xiaojian.ma@ucla.edu
Yitao Liang∗ Peking University yitaol@pku.edu.cn Abstract We explore how
iterative revising a chain of thoughts with the help of information retrieval
significantly improves large language models’ reasoning and generation ability
in long-horizon generation tasks, while hugely mitigating hallucination. In
particular, the proposed method

In [29]:
# 1. Define the query
query = "Our goal is to support long-horizon reasoning and generation while mitigating hallucination when using LLMs."
print(f"Query: {query}")

# 2. Embed the query to the same numerical space as the text examples
# Note: It's important to embed your query with the same model you embedded your examples with.
query_embedding = generate_embedding(query)

# 3. Get similarity scores with the cosine similarity
from time import perf_counter as timer

start_time = timer()
cosine_similarity = util.cos_sim(a=query_embedding, b=embeddings)[0]
end_time = timer()

print(f"Time take to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

# 4. Get the top-k results (we'll keep this to 5)
top_results_cos_sim = torch.topk(cosine_similarity, k=5)
top_results_cos_sim

Query: Our goal is to support long-horizon reasoning and generation while mitigating hallucination when using LLMs.
Time take to get scores on 87 embeddings: 0.07345 seconds.


torch.return_types.topk(
values=tensor([1., 1., 1., 1., 1.], device='cuda:0', dtype=torch.float16),
indices=tensor([2, 1, 3, 6, 5], device='cuda:0'))

In [30]:
print(f"Query: '{query}'\n")
print("Results:")
# Loop through zipped together scores and indicies from torch.topk
for score, idx in zip(top_results_cos_sim[0], top_results_cos_sim[1]):
    print(f"Score: {score:.4f}")
    # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    # Print the page number too so we can reference the textbook further (and check the results)
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")

Query: 'Our goal is to support long-horizon reasoning and generation while mitigating hallucination when using LLMs.'

Results:
Score: 1.0000
Text:
Step 0 Draft initial step-by-step zero-shot CoTs based on the task prompt. A
task prompt is given by a human user. LLM makes zero-shot step-by-step reasoning
based on the prompt. This initial zero-shot CoT answer may be ﬂawed. How to
obtain diamond sword in Minecraft?LLM Task Prompt (I) T1: Mine 4 planks (ﬂawed)
T2: craft table from planks ... Tn: Craft diamond sword Initial CoTs Retrieve
with the task prompt and previous generated CoTs. LLM revises the i-th steps in
thought chains (T1:i-1, Ti) based on the retrieved content. The thought chain
(T1:i-1, Ti) is replaced with the revised generation T1:i. T1* T2 T3 Tn ... T1*:
Mine 4 logs T2: craft table from planks ... Tn: Craft diamond sword Revised CoTs
Step 1 - Step n Step 1 Step n Retrieve relevant information and iteratively
revise each CoT with all previous generations in context. Retrie

In [31]:
def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    """
    Embeds a query with model and returns top k scores and indices from embeddings.
    """

    # Embed the query
    query_embedding = generate_embedding(query)

    # Get dot product scores on embeddings
    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

    scores, indices = torch.topk(input=dot_scores,
                                 k=n_resources_to_return)

    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict]=pages_and_chunks,
                                 n_resources_to_return: int=5):
    """
    Takes a query, retrieves most relevant resources and prints them out in descending order.

    Note: Requires pages_and_chunks to be formatted in a specific way (see above for reference).
    """

    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)

    print(f"Query: {query}\n")
    print("Results:")
    # Loop through zipped together scores and indicies
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
        print_wrapped(pages_and_chunks[index]["sentence_chunk"])
        # Print the page number too so we can reference the textbook further and check the results
        print(f"Page number: {pages_and_chunks[index]['page_number']}")
        print("\n")

In [32]:
query = "What is RAT?"

# Get just the scores and indices of top related results
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

[INFO] Time taken to get scores on 87 embeddings: 0.00006 seconds.


(tensor([8552., 8552., 8552., 8552., 8552.], device='cuda:0',
        dtype=torch.float16),
 tensor([1, 0, 2, 4, 3], device='cuda:0'))

In [33]:
print_top_results_and_scores(query=query,
                             embeddings=embeddings)

[INFO] Time taken to get scores on 87 embeddings: 0.00014 seconds.
Query: What is RAT?

Results:
Score: 8552.0000
Our intuition is that the hallucination ∗Corresponding Author.38th Conference on
Neural Information Processing Systems (NeurIPS 2024).
Page number: 0


Score: 8552.0000
RAT: Retrieval Augmented Thoughts Elicit Context-Aware Reasoning and
Verification in Long-Horizon Generation Zihao Wang Peking University
zhwang@stu.pku.edu.cn Anji Liu University of California, Los Angeles
liuanji@cs.ucla.edu Haowei Lin Peking University linhaowei@pku.edu.cn Jiaqi Li
Beijing Institute of General Artificial Intelligence lijiaqi@bigai.cn Xiaojian
Ma Beijing Institute of General Artificial Intelligence xiaojian.ma@ucla.edu
Yitao Liang∗ Peking University yitaol@pku.edu.cn Abstract We explore how
iterative revising a chain of thoughts with the help of information retrieval
significantly improves large language models’ reasoning and generation ability
in long-horizon generation tasks, while hugel

In [34]:
from transformers.utils import is_flash_attn_2_available


In [35]:
def get_model_num_params(model: torch.nn.Module):
    return sum([param.numel() for param in model.parameters()])

get_model_num_params(model)

1777088000

In [36]:
def get_model_mem_size(model: torch.nn.Module):
    # Get model parameters and buffer sizes
    mem_params = sum([param.nelement() * param.element_size() for param in model.parameters()])
    mem_buffers = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])

    # Calculate various model sizes
    model_mem_bytes = mem_params + mem_buffers # in bytes
    model_mem_mb = model_mem_bytes / (1024**2) # in megabytes
    model_mem_gb = model_mem_bytes / (1024**3) # in gigabytes

    return {"model_mem_bytes": model_mem_bytes,
            "model_mem_mb": round(model_mem_mb, 2),
            "model_mem_gb": round(model_mem_gb, 2)}

get_model_mem_size(model)

{'model_mem_bytes': 3554176256, 'model_mem_mb': 3389.53, 'model_mem_gb': 3.31}

In [37]:
input_text = '''
This is the portion of context of Draft Deed and Presented Deed and there are some discrepancies in it. Please check line by line. You need to find out the minor changes in it.

***Draft Deed: 42 and 42A, Sub-Division S$ Division 6, Police Station Ballygunge, Kolkata- 700019,
at MouzaBhowanipore, District South 24 Parganasand Butted and Bounded as follows: On the
North : By Municipal Premises No 52/5 Ballygunge Circular Road.***

***Presented Deed: 42 and 42A, Sub-Division S$ Division 6, Police Station Ballygunge, Kolkata700019, at MouzaBhowanipore, District South 24 Parganasand Butted and Bounded as follows: On
the North : By Municipal Premises No Nizam Palace Road.***

Find out what is the dicrepancies between them?'''
print(f"Input text:\n{input_text}")

# Create prompt template for instruction-tuned model
dialogue_template = [
    {"role": "user",
     "content": input_text}
]

# Apply the chat template
prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                       tokenize=False, # keep as raw text (not tokenized)
                                       add_generation_prompt=True)
print(f"\nPrompt (formatted):\n{prompt}")

Input text:

This is the portion of context of Draft Deed and Presented Deed and there are some discrepancies in it. Please check line by line. You need to find out the minor changes in it.

***Draft Deed: 42 and 42A, Sub-Division S$ Division 6, Police Station Ballygunge, Kolkata- 700019,
at MouzaBhowanipore, District South 24 Parganasand Butted and Bounded as follows: On the
North : By Municipal Premises No 52/5 Ballygunge Circular Road.***

***Presented Deed: 42 and 42A, Sub-Division S$ Division 6, Police Station Ballygunge, Kolkata700019, at MouzaBhowanipore, District South 24 Parganasand Butted and Bounded as follows: On
the North : By Municipal Premises No Nizam Palace Road.***

Find out what is the dicrepancies between them?

Prompt (formatted):
<｜begin▁of▁sentence｜><｜User｜>
This is the portion of context of Draft Deed and Presented Deed and there are some discrepancies in it. Please check line by line. You need to find out the minor changes in it.

***Draft Deed: 42 and 42A, Sub

In [38]:
# Tokenize the input text (turn it into numbers) and send it to GPU
input_ids = tokenizer(prompt, return_tensors="pt").to(device)
print(f"Model input (tokenized):\n{input_ids}\n")

# Generate outputs passed on the tokenized input
outputs = model.generate(**input_ids,
                             max_new_tokens=1024) # define the maximum number of new tokens to create
print(f"Model output (tokens):\n{outputs[0]}\n")

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Model input (tokenized):
{'input_ids': tensor([[151646, 151646, 151644,    198,   1986,    374,    279,  13348,    315,
           2266,    315,  28564,   1581,    291,    323,  87021,   1581,    291,
            323,   1052,    525,   1045,  90267,    304,    432,     13,   5209,
           1779,   1555,    553,   1555,     13,   1446,   1184,    311,   1477,
            700,    279,   8922,   4344,    304,    432,    382,  12210,  50086,
           1581,    291,     25,    220,     19,     17,    323,    220,     19,
             17,     32,     11,   3719,   9420,    344,   1816,    328,      3,
          14489,    220,     21,     11,  10082,  16629,    425,    745,  13259,
            709,     11,  81534,     12,    220,     22,     15,     15,     15,
             16,     24,    345,    266,  72834,   4360,     33,   5158,    276,
            573,    460,     11,  10942,   4882,    220,     17,     19,    393,
            858,  25908,    437,   1988,   6565,    323,    425,  1308

In [39]:
# Decode the output tokens to text
outputs_decoded = tokenizer.decode(outputs[0])
print(f"Input text: {input_text}\n")
print(f"Output text:\n{outputs_decoded.replace(prompt, '').replace('<bos>', '').replace('<eos>', '')}")

Input text: 
This is the portion of context of Draft Deed and Presented Deed and there are some discrepancies in it. Please check line by line. You need to find out the minor changes in it.

***Draft Deed: 42 and 42A, Sub-Division S$ Division 6, Police Station Ballygunge, Kolkata- 700019,
at MouzaBhowanipore, District South 24 Parganasand Butted and Bounded as follows: On the
North : By Municipal Premises No 52/5 Ballygunge Circular Road.***

***Presented Deed: 42 and 42A, Sub-Division S$ Division 6, Police Station Ballygunge, Kolkata700019, at MouzaBhowanipore, District South 24 Parganasand Butted and Bounded as follows: On
the North : By Municipal Premises No Nizam Palace Road.***

Find out what is the dicrepancies between them?

Output text:
<｜begin▁of▁sentence｜>Okay, so I need to figure out the discrepancies between the Draft Deed and the Presented Deed. Let me read through both documents carefully.

First, the Draft Deed is: 42 and 42A, Sub-Division S$ Division 6, Police Station B

In [49]:
query_list = [
    "What is Retrieval Augmented Thoughts (RAT) and how does it work?",
    "What are the key differences between RAT and traditional Retrieval-Augmented Generation (RAG)?",
    "How does RAT improve long-horizon reasoning and generation tasks?",
    "What role does Chain-of-Thought (CoT) prompting play in RAT?",
    "What are the main advantages of RAT over vanilla CoT and RAG prompting?",
    "What are the different types of tasks RAT has been evaluated on?",
    "How does RAT improve code generation tasks, and what benchmarks are used to evaluate its performance?",
    "What improvements has RAT shown in mathematical reasoning tasks?",
    "How does RAT perform in embodied task planning, and what metrics are used to assess its effectiveness?",
    "What impact does RAT have on creative writing tasks?",
    "What are the limitations of RAT and how do they affect its performance on different models?",
    "How does RAT address the issue of hallucination in LLM reasoning?",
    "What is the iterative refinement process in RAT, and how does it contribute to its effectiveness?",
    "How do retrieval strategies in RAT influence its performance compared to baseline methods?",
    "What are the experimental setups used to evaluate RAT across different domains?",
    "What baseline methods are compared against RAT in the experiments?",
    "How does RAT utilize external knowledge sources to enhance reasoning?",
    "What results have been observed when applying RAT to different language models like GPT-3.5 and GPT-4?",
    "What are the ablation studies conducted on RAT, and what insights do they provide?",
    "How does RAT ensure causal reasoning during its iterative refinement process?",
    "What retrieval mechanisms does RAT use to obtain relevant information for each reasoning step?"
]


In [40]:
def prompt_formatter(query: str,
                     context_items: list[dict]) -> str:
    """
    Augments query with text-based context from context_items.
    """
    # Join context items into one dotted paragraph
    context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])

    # Create a base prompt with examples to help the model
    base_prompt = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.

\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""

    # Update base prompt with context items and query
    base_prompt = base_prompt.format(context=context, query=query)

    # Create prompt template for instruction-tuned model
    dialogue_template = [
        {"role": "user",
        "content": base_prompt}
    ]

    # Apply the chat template
    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                          tokenize=False,
                                          add_generation_prompt=True)
    return prompt

In [50]:
import random
query = random.choice(query_list)
print(f"Query: {query}")

# Get relevant resources
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)

# Create a list of context items
context_items = [pages_and_chunks[i] for i in indices]

# Format prompt with context items
prompt = prompt_formatter(query=query,
                          context_items=context_items)
print(prompt)

Query: What are the limitations of RAT and how do they affect its performance on different models?
[INFO] Time taken to get scores on 87 embeddings: 0.00010 seconds.
<｜begin▁of▁sentence｜><｜User｜>Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.


Now use the following context items to answer the user query:
- For embodied 2We used bigcode-evaluation as the tool library for code evaluation. The pass@1 result of DIRECT in the table is slightly different from the result in the bigcode leaderboard, because we tested our pass@1 five times in our original setup and calculated the average value. We used the same settings as DIRECT in all experiments and reported on the relative improvement of RAT compared to baselines to promise fair evaluation and comparison.6
- In contra

In [51]:
%%time

input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate an output of tokens
outputs = model.generate(**input_ids,
                             temperature=0.7, # lower temperature = more deterministic outputs, higher temperature = more creative outputs
                             do_sample=True,
                             max_new_tokens=1024) # how many new tokens to generate from prompt

# Turn the output tokens into text
output_text = tokenizer.decode(outputs[0])

print(f"Query: {query}")
print(f"RAG answer:\n{output_text.replace(prompt, '')}")

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Query: What are the limitations of RAT and how do they affect its performance on different models?
RAG answer:
<｜begin▁of▁sentence｜>Alright, the user is asking about the limitations of RAT and how they affect its performance on different models. Let me look through the context to find relevant passages. 

First, the context mentions that RAT can automatically access external sources to validate and revise model outputs. However, there's a limitation here—it says RAT doesn't require human labels, which is good because it automates the validation. But I also see that there are experiments on long-horizon generation and reasoning where existing methods struggle. RAT shows significant success there. So RAT's ability to automatically validate without human labels is a limitation but also an advantage.

Another point is about the evaluation method. The context talks about evaluating open-ended planning in Minecraft, which focuses on both executability and plausibility. RAT uses MC-TextWorld 

In [46]:
def ask(query,
        temperature=0.7,
        max_new_tokens=1024,
        format_answer_text=True,
        return_answer_only=True):
    """
    Takes a query, finds relevant resources/context and generates an answer to the query based on the relevant resources.
    """

    # Get just the scores and indices of top related results
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings)

    # Create a list of context items
    context_items = [pages_and_chunks[i] for i in indices]

    # Add score to context item
    for i, item in enumerate(context_items):
        item["score"] = scores[i].cpu() # return score back to CPU

    # Format the prompt with context items
    prompt = prompt_formatter(query=query,
                              context_items=context_items)

    # Tokenize the prompt
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generate an output of tokens
    outputs = model.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)

    # Turn the output tokens into text
    output_text = tokenizer.decode(outputs[0])

    if format_answer_text:
        # Replace special tokens and unnecessary help message
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "").replace("Sure, here is the answer to the user query:\n\n", "")

    # Only return the answer without the context items
    if return_answer_only:
        return output_text

    return output_text, context_items

In [52]:
query = random.choice(query_list)
print(f"Query: {query}")

# Answer query with context and return context
answer, context_items = ask(query=query,
                            temperature=0.7,
                            max_new_tokens=512,
                            return_answer_only=False)

print(f"Answer:\n")
print_wrapped(answer)
print(f"Context items:")
context_items

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Query: How does RAT perform in embodied task planning, and what metrics are used to assess its effectiveness?
[INFO] Time taken to get scores on 87 embeddings: 0.00010 seconds.
Answer:

<｜begin▁of▁sentence｜>Okay, let me try to figure out how RAT performs in embodied
task planning based on the provided context.   First, I remember that RAT is a
system designed for validating and enhancing model outputs through a retrieval
process. The context mentions that RAT can automatically access external sources
to validate and revise model outputs, which helps it verify each step without
human labels. This is a big plus because it means RAT can work on its own
without needing human intervention, which is useful for tasks where human labels
might be scarce or difficult to obtain.  Looking at the context, there's a
mention of experiments comparing RAT to existing methods. The user mentioned
that existing methods struggle with long-horizon planning in Minecraft. So, this
suggests that RAT is particu

[{'page_number': 5,
  'sentence_chunk': 'For embodied 2We used bigcode-evaluation as the tool library for code evaluation. The pass@1 result of DIRECT in the table is slightly different from the result in the bigcode leaderboard, because we tested our pass@1 five times in our original setup and calculated the average value. We used the same settings as DIRECT in all experiments and reported on the relative improvement of RAT compared to baselines to promise fair evaluation and comparison.6',
  'chunk_char_count': 453,
  'chunk_word_count': 73,
  'chunk_token_count': 113.25,
  'embedding': tensor([[ 1.8252, -2.6387,  1.8193,  ...,  0.1469,  2.3555,  0.9458]],
         device='cuda:0', dtype=torch.float16),
  'score': tensor(8560., dtype=torch.float16)},
 {'page_number': 4,
  'sentence_chunk': 'In contrast, RAT can automatically access relevant information from external sources to validate and revise the content of model outputs through a retrieval process. This allows RAT to autonomousl