# **Phase 0: Setting Up**

In [1]:
import sys
print(sys.executable)

/Users/ivantan/Desktop/rag-demo/.venv/bin/python


In [2]:
import os
from pathlib import Path
from typing import List, Dict, Any, Optional
import logging
from datetime import datetime
from dotenv import load_dotenv # Environment and configuration

# LangChain core
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import HumanMessage, AIMessage

# Document loaders
from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    Docx2txtLoader,
)

from langchain_community.vectorstores import Chroma # Vector store
from langchain_google_genai import ChatGoogleGenerativeAI # LLM
from rank_bm25 import BM25Okapi # BM25 for hybrid search
import tiktoken # Token counting

print("Core libraries imported successfully!")
print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

Core libraries imported successfully!
Timestamp: 2026-02-08 18:06:36


In [3]:
# This helps track the RAG pipeline execution and identify bottlenecks
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

logger = logging.getLogger('IndustrialRAG')

# Test logging
logger.info("Logging system initialized")
logger.info("Starting Industrial RAG System Development")
print("Logging configured successfully!")

2026-02-08 18:06:36 - IndustrialRAG - INFO - Logging system initialized
2026-02-08 18:06:36 - IndustrialRAG - INFO - Starting Industrial RAG System Development


Logging configured successfully!


In [4]:
load_dotenv()

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

if not GOOGLE_API_KEY:
    raise ValueError("GOOGLE_API_KEY not found in .env file!")

# Mask the key for security when logging
masked_key = f"{GOOGLE_API_KEY[:8]}...{GOOGLE_API_KEY[-4:]}"
logger.info(f"API Key loaded: {masked_key}")

print(f"Google API Key loaded successfully!")
print(f"Masked Key: {masked_key}")

2026-02-08 18:06:36 - IndustrialRAG - INFO - API Key loaded: AIzaSyAU...LT-U


Google API Key loaded successfully!
Masked Key: AIzaSyAU...LT-U


In [5]:
# Uses tiktoken (OpenAI's tokenizer) as a proxy for token estimation
def count_tokens(text: str, encoding_name: str = "cl100k_base") -> int:
    """
    Count the number of tokens in a text string.
    """
    try:
        encoding = tiktoken.get_encoding(encoding_name)
        tokens = encoding.encode(text)
        return len(tokens)
    except Exception as e:
        logger.error(f"Error counting tokens: {e}")
        # Fallback: rough estimation (1 token ≈ 4 characters)
        return len(text) // 4

def analyze_document(text: str) -> Dict[str, Any]:
    """   
    Strategy:
    - Short docs (<2000 tokens): chunk_size=500, overlap=100
    - Medium docs (2000-10000 tokens): chunk_size=1000, overlap=200
    - Long docs (>10000 tokens): chunk_size=1500, overlap=300
    Returns: Dictionary with token count and suggested chunk parameters
    """
    token_count = count_tokens(text)
    word_count = len(text.split())
    char_count = len(text)
    
    # Determine optimal chunk size based on document length
    if token_count < 2000:
        chunk_size = 500
        chunk_overlap = 100
        strategy = "small"
    elif token_count < 10000:
        chunk_size = 1000
        chunk_overlap = 200
        strategy = "medium"
    else:
        chunk_size = 1500
        chunk_overlap = 300
        strategy = "large"
    
    analysis = {
        "token_count": token_count,
        "word_count": word_count,
        "char_count": char_count,
        "suggested_chunk_size": chunk_size,
        "suggested_overlap": chunk_overlap,
        "strategy": strategy
    }
    
    logger.info(f"Document analysis: {token_count} tokens, strategy={strategy}")
    
    return analysis

In [6]:
# Test the token counter
test_text = """
Artificial intelligence (AI) is transforming the world at an unprecedented pace.
Machine learning, a subset of AI, enables computers to learn from data without
explicit programming. Deep learning, using neural networks, has achieved remarkable
results in computer vision, natural language processing, and speech recognition.
"""

analysis = analyze_document(test_text)

print("Token counter utility created!")
print(f"\nTest Document Analysis:")
print(f"    Tokens: {analysis['token_count']}")
print(f"    Words: {analysis['word_count']}")
print(f"    Characters: {analysis['char_count']}")
print(f"    Suggested chunk size: {analysis['suggested_chunk_size']}")
print(f"    Suggested overlap: {analysis['suggested_overlap']}")
print(f"    Strategy: {analysis['strategy']}")

2026-02-08 18:06:36 - IndustrialRAG - INFO - Document analysis: 59 tokens, strategy=small


Token counter utility created!

Test Document Analysis:
    Tokens: 59
    Words: 44
    Characters: 326
    Suggested chunk size: 500
    Suggested overlap: 100
    Strategy: small


# **Phase 2: Document Processing Pipeline.**

In [7]:
from langchain_community.document_loaders import UnstructuredFileLoader, DirectoryLoader, WebBaseLoader

def load(source: str) -> List[Document]:
    """Auto-detect and load file or directory using LangChain loaders."""
    
    if source.startswith(("http://", "https://")):
        loader = WebBaseLoader(source)
        return loader.load()
    
    source_path = Path(source)
    
    if source_path.is_file():
        loader = UnstructuredFileLoader(str(source_path))
        return loader.load()
        
    elif source_path.is_dir():
        loader = DirectoryLoader(
            str(source_path),
            glob="**/*",
            loader_cls=UnstructuredFileLoader,
            use_multithreading=True
        )
        return loader.load()
    
    else:
        raise ValueError(f"Invalid source: {source}")



In [12]:
dir_doc = load("test_documents")
web_doc = load("https://karpathy.ai/")
all_doc = dir_doc + web_doc





# **Phase 3: Chunking and Embeddings**

In [15]:
# Phase 3: Chunking - Using LangChain's RecursiveCharacterTextSplitter

from langchain_text_splitters import RecursiveCharacterTextSplitter

def create_chunks(documents: List[Document], chunk_size: int = 1000, chunk_overlap: int = 200) -> List[Document]:
    """
    Split documents into chunks using RecursiveCharacterTextSplitter.
    Tries to split on paragraphs, then sentences, maintaining semantic coherence.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", ". ", ";", " ", ""],
        is_separator_regex=False,
    )
    
    chunks = text_splitter.split_documents(documents)
    logger.info(f"Split {len(documents)} documents into {len(chunks)} chunks")
    
    return chunks

In [21]:
# Chunk all documents
all_chunks = create_chunks(all_doc)
print(f"Created {len(all_chunks)} chunks from {len(all_doc)} documents\n")
print("\nFirst 100 chars in the first doc:\n", all_chunks[0].page_content[:100])
print("\nFirst 100 chars in the second doc:\n", all_chunks[1].page_content[:100])

2026-02-08 22:56:57 - IndustrialRAG - INFO - Split 4 documents into 113 chunks


Created 113 chunks from 4 documents


First 100 chars in the first doc:
 # Retrieval

Augmented Generation (RAG) Systems

## Introduction Retrieval-Augmented Generation (RAG

First 100 chars in the second doc:
 5. **Query Processing**: When a user asks a question, it's converted into an embedding using the sam


In [22]:
# Phase 3: Embeddings - Using HuggingFace sentence-transformers

from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'mps'},  # Use Apple Silicon GPU
    encode_kwargs={'normalize_embeddings': True}
)

logger.info("Embeddings model initialized: all-MiniLM-L6-v2")
print("Embeddings model ready (384 dimensions)")


2026-02-08 23:11:59 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2026-02-08 23:11:59 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json "HTTP/1.1 307 Temporary Redirect"
2026-02-08 23:11:59 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/sentence-transformers/all-MiniLM-L6-v2/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/modules.json "HTTP/1.1 200 OK"
2026-02-08 23:11:59 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/config_sentence_transformers.json "HTTP/1.1 307 Temporary Redirect"
2026-02-08 23:11:59 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/sentence-transformers/all-MiniLM-L6-v2/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config_sentence_transformers.json "HTTP/1.1 200 OK"
2026-02-08 23:12:00 - h

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
2026-02-08 23:12:01 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/config.json "HTTP/1.1 307 Temporary Redirect"
2026-02-08 23:12:01 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/sentence-transformers/all-MiniLM-L6-v2/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config.json "HTTP/1.1 200 OK"
2026-02-08 23:12:01 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/tokenizer_config.json "HTTP/1.1 307 Temporary Redirect"
2026-02-08 23:12:02 - httpx - INFO - HTTP Request: HEAD https://huggingface.co/api/re

Embeddings model ready (384 dimensions)


In [32]:
# Vector Store - Index all chunks in ChromaDB

from langchain_community.vectorstores import Chroma

vectorstore = Chroma.from_documents(
    documents=all_chunks,
    embedding=embeddings,
    collection_name="rag_knowledge_base"
)

logger.info(f"Vector store created with {len(all_chunks)} chunks")
print(f"Vector store indexed: {len(all_chunks)} chunks ready for retrieval")

2026-02-08 23:20:09 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2026-02-08 23:20:11 - IndustrialRAG - INFO - Vector store created with 113 chunks


Vector store indexed: 113 chunks ready for retrieval


# **Phase 4: Retrieval**

In [34]:
# Phase 4: Retrieval - Create retriever from vector store

retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 10}  # Retrieve top 10 candidates for re-ranking
)

logger.info("Retriever created with k=10")
print("Retriever ready: will fetch top 10 candidates")

2026-02-08 23:31:04 - IndustrialRAG - INFO - Retriever created with k=10


Retriever ready: will fetch top 10 candidates


## Retrieval Test

In [41]:
# Test retrieval with a query
test_query = "What are the benefits of RAG systems?"
results = retriever.invoke(test_query)

print(f"Query: '{test_query}'")
print(f"Retrieved {len(results)} documents\n")

for i, doc in enumerate(results[:3], 1):
    print(f"**Result {i}:**")
    print(f"{doc.page_content[:150]}...")
    print(f"Source: {doc.metadata.get('source', 'N/A')}\n")

Query: 'What are the benefits of RAG systems?'
Retrieved 10 documents

**Result 1:**
9

Broader Impact

This work offers several positive societal beneﬁts over previous work: the fact that it is more strongly grounded in real factual k...
Source: test_documents/rag-for-knowledge-intensive-nlp-tasks.pdf

**Result 2:**
. Like T5 [51] or BART, RAG can be ﬁne-tuned on any seq2seq task, whereby both the generator and retriever are jointly learned....
Source: test_documents/rag-for-knowledge-intensive-nlp-tasks.pdf

**Result 3:**
Acknowledgments

The authors would like to thank the reviewers for their thoughtful and constructive feedback on this paper, as well as HuggingFace fo...
Source: test_documents/rag-for-knowledge-intensive-nlp-tasks.pdf



## Rerank