# RAG Demo

In [1]:
import numpy as np
import pandas as pd

import torch 
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F 

In [2]:
from sentence_transformers import SentenceTransformer

from transformers import AutoModelForCausalLM, AutoTokenizer, TextGenerationPipeline

2025-07-03 07:42:10.431497: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751528530.608481      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751528530.657897      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
!pip install faiss-cpu sentence-transformers transformers accelerate

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metad

In [4]:
# RAG with FAISS + Open-Source 7B LLM on Kaggle

In [5]:
import faiss

In [6]:
# Load and chunk document
def chunk_text(text, max_words=200, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``max_words - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one.

    Args:
        text: The document to split.
        max_words: Maximum number of words per chunk (must be >= 1).
        overlap: Number of words shared between neighbouring chunks
            (must satisfy ``0 <= overlap < max_words``).

    Returns:
        A list of chunk strings; empty when ``text`` contains no words.

    Raises:
        ValueError: If ``max_words`` or ``overlap`` is out of range. Without
            this check, ``overlap >= max_words`` would never advance the
            window (infinite loop) and a negative overlap would skip words.
    """
    if max_words < 1:
        raise ValueError("max_words must be a positive integer")
    if not 0 <= overlap < max_words:
        raise ValueError("overlap must satisfy 0 <= overlap < max_words")

    words = text.split()
    step = max_words - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + max_words]))
        # Once a window reaches the end of the text, any further window would
        # contain only words already present in this chunk, so stop here
        # instead of emitting a redundant tail chunk.
        if start + max_words >= len(words):
            break
    return chunks

# Read the source document with an explicit encoding so the result does not
# depend on the platform's locale default.
# NOTE(review): assumes the file is UTF-8 — confirm against the dataset.
with open('/kaggle/input/posttrainingforragdemo/Post_TrainingForRagTest.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# Split into overlapping word windows using chunk_text's default sizes.
passages = chunk_text(text)

In [7]:
# Build FAISS index: embed the passages and add the vectors to an L2 index.
# `embedder`, `index` and `passages` are module-level state read by `retrieve`.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
# Encode all passages in one call; the result is requested as a NumPy array
# (presumably one row per passage — the code below treats axis 1 as the vector width).
embeddings = embedder.encode(passages, convert_to_numpy=True)
# Vector width, used to size the index.
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
# `retrieve` uses the ids returned by `index.search` to subscript `passages`,
# so vectors must be added in the same order as `passages`.
index.add(embeddings)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# Retrieval function
def retrieve(question, k=3):
    """Return up to ``k`` passages closest to ``question`` in the FAISS index.

    Args:
        question: Query string; embedded with the module-level ``embedder``.
        k: Number of nearest neighbours to request from ``index``.

    Returns:
        A list of passage strings in the order returned by ``index.search``.
        It may hold fewer than ``k`` items when the index has fewer than
        ``k`` vectors.
    """
    q_emb = embedder.encode([question], convert_to_numpy=True)
    _distances, indices = index.search(q_emb, k)
    # FAISS pads missing results with id -1. Indexing ``passages[-1]`` would
    # silently return the last passage as a bogus hit, so drop those slots.
    return [passages[i] for i in indices[0] if i >= 0]

In [9]:
# Load the open-source 7B LLM (Falcon-7B-Instruct) together with its tokenizer.
model_name = 'tiiuae/falcon-7b-instruct'  # or another 7B model

# Weights are loaded in float16; device placement is left to `device_map='auto'`.
load_kwargs = {
    'torch_dtype': torch.float16,
    'device_map': 'auto',
}

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

In [10]:
# Wrap the loaded model and tokenizer in a text-generation pipeline.
# The tokenizer's EOS token id is passed as the pad token id.
pipeline_options = dict(
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id,
)
generator = TextGenerationPipeline(**pipeline_options)

Device set to use cuda:0


In [11]:
# QA function using RAG
def answer(question, k=3, max_length=256):
    """Answer ``question`` using retrieved passages as context (RAG).

    Args:
        question: The user question.
        k: Number of passages to retrieve and include in the prompt.
        max_length: Maximum number of tokens to generate beyond the prompt.

    Returns:
        The generated answer text, stripped of surrounding whitespace.
    """
    ctxs = retrieve(question, k)
    context_block = "\n".join(f"Context {i+1}: {c}" for i, c in enumerate(ctxs))
    prompt = (
        "Use ONLY the following context to answer the question.\n"
        + context_block
        + f"\nQuestion: {question}\nAnswer:"
    )
    # Let the pipeline handle the bookkeeping: `max_new_tokens` bounds only the
    # completion (no separate tokenization pass to measure the prompt), and
    # `return_full_text=False` returns just the completion, so no slicing by
    # prompt length is needed.
    outputs = generator(
        prompt,
        max_new_tokens=max_length,
        return_full_text=False,
        truncation=True,
        do_sample=False,  # greedy decoding
    )
    return outputs[0]['generated_text'].strip()

In [12]:
# Baseline: answer directly, without RAG
def direct_answer(question, max_length=256):
    """Answer ``question`` from the model alone, with no retrieved context.

    Args:
        question: The user question.
        max_length: Maximum number of tokens to generate beyond the prompt.

    Returns:
        The generated answer text, stripped of surrounding whitespace.
    """
    prompt = f"Question: {question}\nAnswer:"
    # `max_new_tokens` bounds only the completion, and `return_full_text=False`
    # makes the pipeline return just the completion, so neither a separate
    # tokenization pass nor slicing by prompt length is needed.
    outputs = generator(
        prompt,
        max_new_tokens=max_length,
        return_full_text=False,
        truncation=True,
        do_sample=False,  # greedy decoding
    )
    return outputs[0]['generated_text'].strip()

In [13]:
# Example 

q = "What is the main benefit of Post-training?"
# Run the same question through both paths so the answers can be compared.
for label, respond in (("With RAG:\n", answer), ("Without RAG:\n", direct_answer)):
    print(label, respond(q))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

With RAG:
 The main benefit of Post-training is to enhance the models' utility across a diverse range of applications.
Without RAG:
 The main benefit of Post-training is to help participants apply the skills and knowledge they have learned during the training to their work environment.


In [14]:
q = "Is post-training always necessary? Why or why not?"
# RAG answer first, then the no-context baseline.
for label, respond in (("With RAG:\n", answer), ("Without RAG:\n", direct_answer)):
    print(label, respond(q))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

With RAG:
 Post-training is not always necessary. While it is a common practice in the field of natural language processing, it is not always necessary for achieving the desired performance. The decision to use post-training depends on the specific task and the nature of the language model being used. In some cases, post-training may be necessary to achieve the desired performance, while in other cases, it may not be necessary.
Without RAG:
 It depends on the individual and the type of training they are doing. Post-training is often necessary for athletes who are doing high-intensity or high-volume workouts, as it can help with muscle recovery and preventing injury. However, for those who are doing lower intensity workouts, post-training may not be as necessary.


In [15]:
q = "What are the primary goals and objectives of post-training activities?"
# RAG answer first, then the no-context baseline.
for label, respond in (("With RAG:\n", answer), ("Without RAG:\n", direct_answer)):
    print(label, respond(q))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

With RAG:
 The primary goals and objectives of post-training activities are to enhance the models' utility across a diverse range of applications, ensure instruction following, and achieve stylistic compliance.
Without RAG:
 The primary goals and objectives of post-training activities are to ensure that participants have fully understood the training concepts and have the necessary skills and knowledge to apply them in their work. Additionally, post-training activities are designed to help participants develop a plan for implementing the skills and knowledge they have acquired during the training.


In [16]:
q = "What are some common techniques or methods used in post-training? (e.g., quantization, pruning, calibration, distillation, fine-tuning, bias mitigation, explainability, interpretability)"
# RAG answer first, then the no-context baseline.
for label, respond in (("With RAG:\n", answer), ("Without RAG:\n", direct_answer)):
    print(label, respond(q))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

With RAG:
 Some common techniques or methods used in post-training include: 1) quantization to reduce the size of the model, 2) pruning to remove unnecessary parameters, 3) calibration to ensure the model is not overfitting, 4) distillation to extract the most important parameters, 5) fine-tuning to adapt the model to specific tasks, 6) bias mitigation to reduce the impact of bias, 7) interpretability to explain the model's behavior, and 8) explainability to identify the most important parameters.
Without RAG:
 Some common techniques or methods used in post-training include quantization, which involves converting large integers to smaller floating-point numbers to reduce memory usage and improve performance; pruning, which involves removing unnecessary features or layers from a model to reduce its complexity and improve accuracy; calibration, which involves adjusting hyperparameters to ensure that a model is performing as expected; distillation, which involves converting a complex mode

In [17]:
q = "How does post-training contribute to model compression and efficiency?"
# RAG answer first, then the no-context baseline.
for label, respond in (("With RAG:\n", answer), ("Without RAG:\n", direct_answer)):
    print(label, respond(q))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

With RAG:
 Post-training contributes to model compression and efficiency by fine-tuning the model on a diverse range of datasets, ensuring that the model is able to adapt to different contexts and tasks. This allows the model to be more efficient and accurate in its predictions, as it is able to learn from a wider range of data and contexts. Additionally, post-training can also be used to reduce the computational complexity of the model, as it allows the model to learn from a larger dataset and fine-tune its parameters to a specific task.
Without RAG:
 Post-training contributes to model compression and efficiency by reducing the number of parameters in the model. This is achieved by combining the learned parameters from the training process, which can significantly reduce the number of parameters required to represent the model. This reduction in parameters can lead to significant compression of the model, which can improve the efficiency of the model in terms of computational resource