In [None]:
#Install Requirements
!pip install reportlab



In [None]:
#Cell 1 — PDF builder used to create the 8 sample PDFs (they are generated in Cell 2)

import os
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, ListFlowable, ListItem
from reportlab.lib.enums import TA_CENTER

# Directory that receives the generated sample PDFs. It is created up front;
# exist_ok=True makes re-running this cell harmless.
BASE_DIR = "rag_papers"
os.makedirs(BASE_DIR, exist_ok=True)

def create_pdf(filepath, title, abstract, sections):
    """Write a paper-style PDF: centred title, abstract, then titled sections.

    Args:
        filepath: Destination path of the PDF.
        title: Document title shown at the top.
        abstract: Text placed under a bold "Abstract" heading.
        sections: Iterable of (heading, body) pairs. The body is split on
            newlines and every resulting line becomes its own paragraph.

    All text is handed to ``Paragraph`` as-is (not escaped).
    """
    base_styles = getSampleStyleSheet()
    page_margin = 36

    # Three derived styles: document title, section heading, running text.
    title_style = ParagraphStyle(
        name="TitleStyle", parent=base_styles["Title"],
        alignment=TA_CENTER, fontSize=18, spaceAfter=14,
    )
    header_style = ParagraphStyle(
        name="HeaderStyle", parent=base_styles["Heading2"],
        fontSize=13, spaceAfter=8, spaceBefore=14,
    )
    body_style = ParagraphStyle(
        name="BodyStyle", parent=base_styles["BodyText"],
        fontSize=10.5, leading=14, spaceAfter=6,
    )

    # Front matter: title block followed by the abstract.
    story = [
        Paragraph(title, title_style),
        Spacer(1, 12),
        Paragraph("<b>Abstract</b>", header_style),
        Paragraph(abstract, body_style),
        Spacer(1, 14),
    ]

    # One heading per section, one paragraph per line of its body.
    for heading, body in sections:
        story.append(Paragraph(heading, header_style))
        story.extend(
            Paragraph(line.strip(), body_style) for line in body.split("\n")
        )
        story.append(Spacer(1, 10))

    document = SimpleDocTemplate(
        filepath,
        pagesize=A4,
        rightMargin=page_margin,
        leftMargin=page_margin,
        topMargin=page_margin,
        bottomMargin=page_margin,
    )
    document.build(story)


In [None]:
#Cell 2 — Generate 8 Sector-Based Research PDFs
# Specifications of the sample papers to render. Each entry is a dict with:
#   "filename" - output file name (joined with BASE_DIR when the PDF is written)
#   "title"    - document title
#   "abstract" - abstract text
#   "sections" - list of (heading, body) tuples; "\n" inside a body separates
#                paragraphs
papers = [
    {
        "filename": "RAG_Foundations.pdf",
        "title": "Foundations of Retrieval-Augmented Generation (RAG)",
        "abstract": """Retrieval-Augmented Generation (RAG) combines information retrieval with neural text generation to overcome
the limitations of parametric memory in large language models. This paper presents a comprehensive study of RAG,
including its architecture, document preprocessing strategies, indexing mechanisms, and answer generation pipelines.""",
        "sections": [
            ("1. Introduction",
             "Large Language Models (LLMs) rely primarily on internal parameters for knowledge storage. While powerful, "
             "this paradigm limits their ability to provide up-to-date and verifiable information.\n"
             "Retrieval-Augmented Generation introduces an external knowledge source that can be dynamically queried.\n"
             "This approach enables factual grounding, reduces hallucination, and supports domain-specific question answering."),
            ("2. Architecture of RAG",
             "A typical RAG pipeline consists of five stages: document ingestion, text chunking, embedding generation, "
             "vector indexing, and answer synthesis.\n"
             "Each component contributes to retrieval accuracy and response relevance."),
            ("3. Interaction Between Components",
             "The user query is embedded and compared against document embeddings stored in a vector index.\n"
             "Top-k relevant chunks are retrieved and injected into the prompt for the LLM.\n"
             "The LLM generates an answer strictly grounded in the retrieved context."),
            ("4. Benefits and Limitations",
             "RAG improves explainability and factual consistency but depends heavily on retrieval quality.\n"
             "Suboptimal chunking or embedding strategies can lead to missing or misleading evidence.")
        ]
    },
    {
        "filename": "Transformer_Architecture.pdf",
        "title": "Transformer Architecture: Encoder, Decoder, and Sub-Layers",
        "abstract": """The Transformer model forms the backbone of modern NLP systems. This paper explains the architecture,
including encoder and decoder stacks, self-attention mechanisms, and feed-forward sub-layers.""",
        "sections": [
            ("1. Overview",
             "The Transformer replaces recurrent and convolutional layers with self-attention.\n"
             "This enables parallel processing and improved modeling of long-range dependencies."),
            ("2. Encoder Layer",
             "Each encoder layer consists of two sub-layers: (1) Multi-Head Self-Attention and (2) Position-wise Feed-Forward Network.\n"
             "Residual connections and layer normalization are applied around each sub-layer."),
            ("3. Decoder Layer",
             "The decoder contains three sub-layers: masked self-attention, encoder-decoder attention, and feed-forward networks.\n"
             "Masking ensures autoregressive behavior during training and inference."),
            ("4. Training and Inference",
             "Transformers allow efficient batch training.\n"
             "During inference, tokens are generated sequentially in an autoregressive manner.")
        ]
    },
    {
        "filename": "Positional_Encoding.pdf",
        "title": "Positional Encoding in Transformer Models",
        "abstract": """Since Transformers do not use recurrence, they require positional encodings to preserve word order.
This paper explores sinusoidal and learned positional encoding techniques.""",
        "sections": [
            ("1. Motivation",
             "Self-attention mechanisms are permutation-invariant.\n"
             "Without positional information, word order is lost."),
            ("2. Sinusoidal Encoding",
             "Sinusoidal encodings use periodic functions of different frequencies to represent token positions.\n"
             "They enable extrapolation to longer sequences."),
            ("3. Learned Position Embeddings",
             "Position embeddings can also be learned during training.\n"
             "While flexible, they may generalize poorly to unseen sequence lengths."),
            ("4. Importance",
             "Positional encoding allows the model to learn syntax, sequence order, and hierarchical structures in language.")
        ]
    },
    {
        "filename": "Multi_Head_Attention.pdf",
        "title": "Multi-Head Attention in Transformer Architecture",
        "abstract": """Multi-head attention enhances self-attention by enabling the model to attend to multiple representation
subspaces simultaneously. This paper explains its design and benefits.""",
        "sections": [
            ("1. Self-Attention Recap",
             "Self-attention computes interactions between all tokens in a sequence."),
            ("2. Multi-Head Mechanism",
             "Multiple attention heads operate on different linear projections of the input.\n"
             "Outputs are concatenated and projected."),
            ("3. Advantages",
             "Allows the model to capture syntactic, semantic, and long-range dependencies simultaneously."),
            ("4. Applications",
             "Used in machine translation, summarization, and large-scale language modeling.")
        ]
    },
    {
        "filename": "Few_Shot_Learning.pdf",
        "title": "Few-Shot Learning in Large Language Models",
        "abstract": """Few-shot learning enables LLMs to perform tasks using only a small number of examples.
This paper explains in-context learning and its advantages.""",
        "sections": [
            ("1. Definition",
             "Few-shot learning refers to task generalization from minimal examples provided at inference time."),
            ("2. In-Context Learning",
             "LLMs infer patterns directly from examples embedded in the prompt."),
            ("3. Benefits",
             "Eliminates the need for task-specific fine-tuning."),
            ("4. Challenges",
             "Performance depends on prompt design and context length.")
        ]
    },
    {
        "filename": "GPT3_Inference.pdf",
        "title": "Inference Strategies in GPT-3",
        "abstract": """GPT-3 popularized in-context learning and prompt-based inference. This paper describes decoding strategies
and few-shot prompting mechanisms.""",
        "sections": [
            ("1. Autoregressive Generation",
             "GPT-3 generates text token-by-token conditioned on previous context."),
            ("2. Prompt Engineering",
             "Tasks are described using natural language instructions and examples."),
            ("3. Sampling Techniques",
             "Top-k, nucleus sampling, and temperature control output diversity."),
            ("4. Few-Shot Inference",
             "Multiple examples are embedded in the prompt to guide generation.")
        ]
    },
    {
        "filename": "Vector_Search_and_ANN.pdf",
        "title": "Vector Search and Approximate Nearest Neighbor Algorithms",
        "abstract": """This paper discusses vector similarity search, indexing structures, and approximate nearest neighbor (ANN)
algorithms such as FAISS and HNSW.""",
        "sections": [
            ("1. Vector Representations",
             "Text is mapped into high-dimensional embedding spaces."),
            ("2. Exact vs Approximate Search",
             "Exact search is computationally expensive.\n"
             "ANN methods trade small accuracy loss for large performance gains."),
            ("3. FAISS and HNSW",
             "FAISS uses clustering and quantization.\n"
             "HNSW builds navigable small-world graphs."),
            ("4. Applications",
             "Semantic search, recommendation systems, and RAG pipelines.")
        ]
    },
    {
        "filename": "Hybrid_Retrieval.pdf",
        "title": "Hybrid Retrieval: Combining Dense and Sparse Search",
        "abstract": """Hybrid retrieval combines semantic embeddings with keyword-based search to improve recall and precision.
This paper explains hybrid ranking strategies in information retrieval.""",
        "sections": [
            ("1. Sparse Retrieval",
             "Keyword-based methods such as BM25 rely on term frequency and inverse document frequency."),
            ("2. Dense Retrieval",
             "Embedding-based methods capture semantic similarity."),
            ("3. Hybrid Methods",
             "Scores from dense and sparse retrieval are combined using weighted ranking."),
            ("4. Use in RAG",
             "Hybrid search improves retrieval robustness for complex user queries.")
        ]
    }
]

# Render every paper spec into BASE_DIR and report the path that was written.
for spec in papers:
    path = os.path.join(BASE_DIR, spec["filename"])
    create_pdf(
        filepath=path,
        title=spec["title"],
        abstract=spec["abstract"],
        sections=spec["sections"],
    )
    print(f"Created: {path}")


Created: rag_papers/RAG_Foundations.pdf
Created: rag_papers/Transformer_Architecture.pdf
Created: rag_papers/Positional_Encoding.pdf
Created: rag_papers/Multi_Head_Attention.pdf
Created: rag_papers/Few_Shot_Learning.pdf
Created: rag_papers/GPT3_Inference.pdf
Created: rag_papers/Vector_Search_and_ANN.pdf
Created: rag_papers/Hybrid_Retrieval.pdf


In [None]:
# PART 2 — End-to-End RAG System Using Groq
!pip install langchain faiss-cpu sentence-transformers pypdf scikit-learn groq





In [None]:
!pip install -U langchain langchain-community langchain-text-splitters




In [None]:
#Cell 3 — Load PDFs
from pypdf import PdfReader

def load_pdfs(pdf_dir):
    """Extract the text of every PDF found directly inside ``pdf_dir``.

    Args:
        pdf_dir: Directory to scan (not recursive).

    Returns:
        A list of ``{"source": <file name>, "text": <extracted text>}`` dicts,
        one per PDF, ordered by file name. ``text`` is the text of each page
        followed by a newline, concatenated in page order.
    """
    documents = []
    # os.listdir returns entries in arbitrary order; sorting keeps the document
    # order (and everything derived from it) reproducible between runs.
    for fname in sorted(os.listdir(pdf_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        reader = PdfReader(os.path.join(pdf_dir, fname))
        # A page with no extractable text may yield None or ""; treat both as
        # empty instead of failing on `None + "\n"`.
        text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)
        documents.append({"source": fname, "text": text})
    return documents

# Read back the PDFs generated earlier; BASE_DIR is the directory they were
# written to, so the two cells cannot drift apart.
docs = load_pdfs(BASE_DIR)
print("Loaded documents:", len(docs))


Loaded documents: 8


In [None]:
#Cell 4 — Chunking Strategy
#from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configured with chunk_size=700 and chunk_overlap=120.
splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=120)

# Flat list of chunk records; chunk_id restarts at 0 for every source document.
chunks = [
    {"source": doc["source"], "chunk_id": position, "text": piece}
    for doc in docs
    for position, piece in enumerate(splitter.split_text(doc["text"]))
]

print("Total chunks:", len(chunks))




Total chunks: 12


In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Sentence-embedding model; the same instance is reused later to embed queries.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed every chunk text. `texts` keeps the order of `chunks`, so row i of
# `embeddings` belongs to chunks[i].
texts = [c["text"] for c in chunks]
embeddings = embedding_model.encode(texts, show_progress_bar=True)

# L2 index sized to the embedding width. Vectors are added in chunk order, so
# an id returned by index.search is used directly as a position in `chunks`.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

print("Total vectors indexed:", index.ntotal)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Total vectors indexed: 12


This step converts all text chunks into numerical vectors and stores them in a FAISS index so that similar content can be retrieved efficiently.

Cell for keyword-based retrieval and hybrid search

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sparse (keyword) side of the hybrid retriever. TF-IDF is fitted on the same
# `texts` list that was embedded, so row i of tfidf_matrix lines up with
# vector i of the FAISS index.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(texts)

def hybrid_search(query, top_k=5, dense_weight=0.5):
    """Rank chunks by a weighted mix of dense (FAISS) and sparse (TF-IDF) scores.

    Both signals are put on a "higher is better" scale before mixing:

    * dense:  ``1 / (1 + distance)``, so a closer vector scores nearer to 1;
    * sparse: the TF-IDF dot product between chunk and query (with the
      vectorizer's default L2 normalisation this is a cosine similarity).

    A chunk missing from one retriever contributes 0 for that signal, which is
    the worst value. Previously such chunks got a dense score of 0 while real
    dense hits carried negative distances, so keyword-only (even zero-score)
    chunks outranked the true nearest neighbours.

    Args:
        query: Natural-language query string.
        top_k: Maximum number of results; also the number of candidates taken
            from each retriever.
        dense_weight: Weight of the dense score in the final score; the sparse
            score is weighted by ``1 - dense_weight``.

    Returns:
        A list of ``(chunk_index, score)`` tuples sorted by descending score
        (ties broken by ascending index), at most ``top_k`` long.
        ``chunk_index`` is a plain ``int``.
    """
    # Never ask FAISS for more neighbours than there are vectors.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(query_embedding, k)

    # FAISS marks missing neighbours with id -1; skip them so they are never
    # used as a (negative) Python index. Distances are clamped at 0 to guard
    # against tiny negative values caused by floating-point rounding.
    dense_scores = {
        int(i): 1.0 / (1.0 + max(float(d), 0.0))
        for i, d in zip(indices[0], distances[0])
        if i >= 0
    }

    query_tfidf = tfidf.transform([query])
    keyword_scores = (tfidf_matrix @ query_tfidf.T).toarray().ravel()

    # Only chunks that actually share a term with the query are keyword hits.
    sparse_candidates = [
        int(i)
        for i in np.argsort(keyword_scores)[::-1][:k]
        if keyword_scores[i] > 0
    ]

    candidates = set(dense_scores).union(sparse_candidates)
    results = [
        (
            i,
            dense_weight * dense_scores.get(i, 0.0)
            + (1.0 - dense_weight) * float(keyword_scores[i]),
        )
        for i in candidates
    ]
    results.sort(key=lambda pair: (-pair[1], pair[0]))
    return results[:top_k]


In [None]:
import faiss
import pickle

# Persist the FAISS index and the chunk metadata side by side. Keep the two
# files together: positions in the index refer to entries of `chunks`.
faiss.write_index(index, "rag_vector_index.faiss")

# NOTE: only load this pickle back from a file you created yourself;
# unpickling untrusted data can execute arbitrary code.
with open("rag_metadata.pkl", "wb") as f:
    pickle.dump(chunks, f)

print("Vector database saved to disk")




# import faiss
# import pickle

# index = faiss.read_index("rag_vector_index.faiss")

# with open("rag_metadata.pkl", "rb") as f:
#     chunks = pickle.load(f)

# print("Vector database loaded successfully")



Vector database saved to disk


This step allows both semantic search using embeddings and keyword matching using TF-IDF.

Cell for Groq LLM integration using llama-3.1-8b-instant

In [None]:
import os
from getpass import getpass

from groq import Groq

# Never store the key in the notebook. Read it from the GROQ_API_KEY
# environment variable and fall back to a hidden interactive prompt, so the
# notebook runs from a fresh kernel without a secret ever being saved in it.
client = Groq(api_key=os.environ.get("GROQ_API_KEY") or getpass("Groq API key: "))

In [None]:
def generate_answer(query, top_k=5, model="llama-3.1-8b-instant", temperature=0.2):
    """Answer ``query`` with the LLM, grounded in chunks found by hybrid search.

    Args:
        query: The user's question.
        top_k: Number of chunks to retrieve and place in the prompt.
        model: Model name passed to the chat-completions call.
        temperature: Sampling temperature passed to the chat-completions call.

    Returns:
        ``(answer, sources)`` where ``answer`` is the text of the first
        completion choice and ``sources`` lists the source file of every
        retrieved chunk, in retrieval order (duplicates are kept, one entry
        per chunk).
    """
    retrieved_chunks = hybrid_search(query, top_k=top_k)
    hits = [chunks[idx] for idx, _ in retrieved_chunks]

    # Every chunk is followed by a blank line so the pieces stay separated.
    context_text = "".join(hit["text"] + "\n\n" for hit in hits)
    sources = [hit["source"] for hit in hits]

    prompt = f"""
You are an AI assistant. Answer the question using only the context below.
Do not use external knowledge.

Context:
{context_text}

Question:
{query}

Answer:
"""

    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature
    )

    answer = completion.choices[0].message.content
    return answer, sources


In [None]:
# Sample questions used to exercise the pipeline end to end.
questions = [
    "What are the main components of a RAG model and how do they interact?",
    "What are the two sub layers in each encoder layer of the Transformer?",
    "Explain how positional encoding is implemented and why it is necessary",
    "Describe multi head attention and why it is beneficial",
]

# Print question, answer and sources for each one, separated by a blank line.
for question in questions:
    answer, sources = generate_answer(question)
    for label, value in (("Question:", question), ("Answer:", answer), ("Sources:", sources)):
        print(label, value)
    print()


Question: What are the main components of a RAG model and how do they interact?
Answer: According to the text, a typical RAG pipeline consists of five stages: 

1. Document ingestion
2. Text chunking
3. Embedding generation
4. Vector indexing
5. Answer synthesis

The user query is embedded and compared against document embeddings stored in a vector index. The top-k relevant chunks are retrieved and injected into the prompt for the LLM. The LLM then generates an answer strictly grounded in the retrieved context.
Sources: ['Multi_Head_Attention.pdf', 'RAG_Foundations.pdf', 'RAG_Foundations.pdf', 'RAG_Foundations.pdf', 'Positional_Encoding.pdf']

Question: What are the two sub layers in each encoder layer of the Transformer?
Answer: The two sub-layers in each encoder layer of the Transformer are:

1. Multi-Head Self-Attention
2. Position-wise Feed-Forward Network.
Sources: ['Few_Shot_Learning.pdf', 'Transformer_Architecture.pdf', 'Transformer_Architecture.pdf', 'Positional_Encoding.pdf', 

In [None]:
# Interactive question loop: runs until the user types "exit" (any case,
# surrounding whitespace ignored) or the input stream is closed/interrupted.
while True:
    try:
        user_query = input("Enter your question or type exit to stop: ").strip()
    except (EOFError, KeyboardInterrupt):
        # No more input, or the cell was interrupted: end cleanly rather than
        # leaving a traceback in the notebook.
        print("Session ended")
        break

    if user_query.lower() == "exit":
        print("Session ended")
        break

    if not user_query:
        # Blank input: ask again instead of spending an API call on it.
        continue

    answer, sources = generate_answer(user_query)

    print("\nAnswer:\n", answer)
    print("\nSources:\n", sources)
    print("\n" + "-"*80 + "\n")


Enter your question or type exit to stop: What is few shot learning and how does GPT three implement it during inference

Answer:
 Few-shot learning refers to task generalization from minimal examples provided at inference time. It enables Large Language Models (LLMs) to perform tasks using only a small number of examples.

GPT-3 implements few-shot learning through in-context learning, where the model infers patterns directly from examples embedded in the prompt. This approach eliminates the need for task-specific fine-tuning. During inference, GPT-3 uses prompt engineering to describe tasks using natural language instructions and examples, and then generates text token-by-token conditioned on previous context. The model also employs sampling techniques, such as top-k, nucleus sampling, and temperature control, to output diverse responses.

Sources:
 ['Multi_Head_Attention.pdf', 'Transformer_Architecture.pdf', 'GPT3_Inference.pdf', 'Few_Shot_Learning.pdf', 'Transformer_Architecture.pd