In [2]:
import json
import os
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Union

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_nomic.embeddings import NomicEmbeddings
from langchain_ollama import ChatOllama
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector

### Database: Configuration

PGVector instance running on a local Docker container. Connection string uses psycopg adapter with specific port mapping (6024) to enable PostgreSQL connectivity in the local network.


In [3]:
# SQLAlchemy-style connection URL for the PGVector database.
# Read from the PGVECTOR_CONNECTION environment variable so credentials and
# host details do not have to live in the notebook; the fallback is the
# original local-development value, so existing setups keep working unchanged.
connection = os.environ.get(
    "PGVECTOR_CONNECTION",
    "postgresql+psycopg://langchain:langchain@192.168.0.53:6024/langchain",
)

### Database: Vector Store

Configures PGVector store with:
- Local GPU-accelerated embeddings via Nomic's text embedding model v1.5
- Vector storage in dedicated collection with JSONB metadata support
- Native pgvector extension for efficient similarity search operations

In [4]:
# Embedding model used for every document and query handled by the store.
nomic_embeddings = NomicEmbeddings(
    model="nomic-embed-text-v1.5",
    inference_mode="local",
    device="gpu",
)

# PGVector store bound to the "database" collection of the configured connection.
vector_store = PGVector(
    embeddings=nomic_embeddings,
    collection_name="database",
    connection=connection,
    use_jsonb=True,
)

### Data: PDF Processing

Implements PDF document loading and text splitting:
- Loads PDFs from the 'data' directory using PyPDFDirectoryLoader
- Splits text into chunks using recursive character splitter
- Configured with 1000-character chunks and 200-character overlap for context preservation

In [4]:
# Splitter producing 1000-character chunks that overlap by 200 characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Load the PDFs found under ./data, then cut the loaded documents into chunks.
loader = PyPDFDirectoryLoader(path="data")
docs = loader.load()
doc_splits = text_splitter.split_documents(docs)

### Database: Document Indexing

Indexes processed document chunks into PGVector store for similarity search.

In [5]:
# Write every chunk into the PGVector collection. The call's return value (the
# list of string IDs shown in the cell output) is displayed because it is the
# last expression of the cell.
# NOTE(review): no explicit ids are passed, so re-running this cell presumably
# inserts the same chunks again under fresh IDs — confirm before re-executing.
vector_store.add_documents(doc_splits)

['caaf18d0-287b-44b5-bf71-d66c510292bc',
 'fcafdaf3-fba2-447f-bbe5-e54eba3c9d11',
 '28eb1d19-5cc4-4f49-9dac-e0d31128017e',
 '2381aa29-d514-4106-8cb7-5ccdbdc08c86',
 '1ae4df8c-258f-4794-811d-4925714cbb88',
 '4a658baf-8d0d-47f4-bcd0-02d432379eae',
 '47cea0ba-fe4d-4569-9149-d496919a130a',
 'cea2dce4-7bb7-4379-b236-a63490eecd6a',
 '29640434-0dbb-4aff-b2d5-2cb5135a7ffd',
 'bf2934f8-7ec5-4b44-b224-11401d0455d7',
 '48cd55c2-f7c7-4a87-b0e6-0aa8b7834910',
 '109729e0-1d12-4516-9f57-3221ababc2b1',
 '7c6597b2-e357-4560-abe1-843b1de5bca1',
 '9a0dc9eb-4aa5-4c02-b193-5c4915ba7b91',
 'eb3d2e84-ee62-41e8-b29d-10dae2408f24',
 '3904e44a-8dda-45df-8a68-8daee6e7a4e2',
 'c64e44af-6a5f-475b-be58-9a8e8b0861ee',
 '75e6aeec-57a8-4aab-8c83-a6108570efe4',
 '6d3e1884-ea09-4111-9ad9-421989075245',
 '01f571de-11e1-49f8-9406-38f396aab389',
 '24d9f916-df05-401a-853d-0da894e8c890',
 '8128de4f-f3d1-4f09-84f5-0c7ac28fd432',
 '9a31ba3e-ae3b-4fc2-ba12-b8b7df660b35',
 'b4787352-0965-47b7-9cff-01f2761100bb',
 '5f7c8372-a613-

### Query: Retriever Setup

Configures the vector store retriever to fetch the 3 most relevant chunks per query.

In [5]:
# Number of chunks returned per query is controlled through `search_kwargs`;
# the former extra top-level `k=3` was redundant with it and has been dropped.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

### Model: Local LLM Configuration

Initializes two Ollama chat clients for llama3.2 3B instruct — a plain-text one (`llm`) and a JSON-output one (`json_llm`):
- Base model: llama3.2 3B instruct (FP16)
- Temperature: 0 for deterministic outputs
- Format: JSON for structured responses (`json_llm` only)

In [6]:
MODEL_NAME = "llama3.2:3b-instruct-fp16"

# Settings shared by both clients; only the output format differs.
_shared_llm_kwargs = {"model": MODEL_NAME, "temperature": 0}

llm = ChatOllama(**_shared_llm_kwargs)  # free-text replies
json_llm = ChatOllama(**_shared_llm_kwargs, format="json")  # JSON replies for the graders

### Architecture: Base Evaluator Class

Abstract base class defining common methods and attributes for both graders and QA:
- Handles message construction and LLM invocation
- Supports both evaluation and generation tasks

In [7]:
class BaseEvaluator(ABC):
    """Common skeleton for components that process retrieved contexts.

    `evaluate` decides where the contexts come from (supplied by the caller or
    fetched through the retriever) and hands them to `_process_contexts`,
    which each subclass implements.
    """

    def __init__(self, llm, retriever):
        """Keep the model and retriever handles for later use.

        Args:
            llm: Model handle stored on the instance for subclasses to use.
            retriever: Object whose `invoke(question)` returns an iterable of
                documents exposing a `page_content` attribute.
        """
        self.llm = llm
        self.retriever = retriever

    def get_context(self, question: str) -> List[str]:
        """Fetch documents for `question` and return their `page_content` values."""
        docs = self.retriever.invoke(question)
        return [doc.page_content for doc in docs]

    def evaluate(self, input_text: str, contexts: Optional[List[str]] = None) -> Union[Dict, str]:
        """Run the evaluator on `input_text`.

        Args:
            input_text: Text passed to `_process_contexts`; also used as the
                retrieval query when `contexts` is not supplied.
            contexts: Context strings to use. Only `None` triggers retrieval;
                an explicitly passed empty list is forwarded as-is.

        Returns:
            Whatever the subclass's `_process_contexts` returns.
        """
        if contexts is None:
            contexts = self.get_context(input_text)
        return self._process_contexts(contexts, input_text)

    @abstractmethod
    def _process_contexts(self, contexts: List[str], input_text: str) -> Union[Dict, str]:
        """Process contexts according to evaluator type (grade/answer/etc)."""

### Evaluation: Relevance Grader

Implements grading logic to assess document relevance to queries:
- Returns binary relevance score for each context
- Aggregates multiple context scores into final assessment

In [20]:
class RelevanceGrader(BaseEvaluator):
    """Grades each context for relevance to a question, one LLM call per context.

    The model is asked to reply with JSON containing a 'binary_score' key.
    """

    def __init__(self, llm, retriever):
        """Store the model/retriever and define the grading prompts."""
        super().__init__(llm, retriever)
        self.system_prompt = """You are a grader assessing relevance of a retrieved context to a user question.
        If the context contains keyword(s) or semantic meaning related to the question, grade it as relevant."""

        self.human_prompt_template = """Here is the retrieved context: \n\n {context} \n\n Here is the user question: \n\n {input_text}.

        Review carefully and objectively assess whether the context contains at least some information that is relevant to the question.

        Return JSON with 'binary_score': 'yes' if context has ANY connection to the topic (direct mentions, related technology, underlying concepts, or applications), 'no' ONLY if completely unrelated."""

    @staticmethod
    def _is_relevant(raw_reply: str) -> bool:
        """Return True only when the reply's 'binary_score' is a 'yes'.

        The comparison ignores case and surrounding whitespace, and a missing
        key counts as not relevant. A reply that is not valid JSON still raises
        from `json.loads`, so a misconfigured model is not silently hidden.
        """
        payload = json.loads(raw_reply)
        if not isinstance(payload, dict):
            return False
        return str(payload.get("binary_score", "")).strip().lower() == "yes"

    def _process_contexts(self, contexts: List[str], input_text: str) -> Dict:
        """Grade every context and summarise the outcome.

        Args:
            contexts: Context strings to grade against the question.
            input_text: The user question.

        Returns:
            Dict with 'relevance_percentage' (share of contexts graded
            relevant, 0-100; 0.0 when there are no contexts) and
            'relevant_contexts' (the contexts graded relevant, in input order).
        """
        relevant_contexts = []
        for context in contexts:
            messages = [
                SystemMessage(content=self.system_prompt),
                HumanMessage(content=self.human_prompt_template.format(
                    context=context,
                    input_text=input_text
                ))
            ]
            result = self.llm.invoke(messages)
            if self._is_relevant(result.content):
                relevant_contexts.append(context)

        # An empty context list would otherwise divide by zero.
        if contexts:
            relevance_percentage = len(relevant_contexts) / len(contexts) * 100
        else:
            relevance_percentage = 0.0

        return {
            "relevance_percentage": relevance_percentage,
            "relevant_contexts": relevant_contexts
        }

### Model: Question Answering

Implements context-aware QA using filtered relevant contexts to generate answers

In [17]:
class QuestionAnswering(BaseEvaluator):
    """Answers a question from the supplied contexts with a single LLM call."""

    def __init__(self, llm, retriever):
        """Store the model/retriever and define the answer prompt."""
        super().__init__(llm, retriever)
        self.prompt_template="""You are an assistant for question-answering tasks.
        
        Context:
        {context}
        
        Question:
        {input_text}
        
        Provide an answer using only the above context.
        
        Answer:"""

    def _process_contexts(self, contexts: List[str], input_text: str) -> str:
        """Return the model's answer, or a fixed notice when no contexts are given.

        The contexts are joined with blank lines and placed into the prompt
        together with the question.
        """
        if contexts:
            prompt = self.prompt_template.format(
                context="\n\n".join(contexts),
                input_text=input_text,
            )
            reply = self.llm.invoke([HumanMessage(content=prompt)])
            return reply.content

        return "No relevant context found to answer this question."

### Evaluation: Hallucination Grader

Evaluates answers against contexts for factual accuracy:
- Binary grading system with detailed explanations
- Validates answer groundedness in provided contexts
- Identifies unsupported claims or hallucinated information

In [28]:
class HallucinationGrader(BaseEvaluator):
    """Asks the LLM whether an answer is grounded in the given contexts.

    The model is asked to reply with JSON holding 'binary_score' and
    'explanation'.
    """

    def __init__(self, llm, retriever):
        """Store the model/retriever and define the grading prompts."""
        super().__init__(llm, retriever)
        self.system_prompt="""You are a hallucination grader.
        You will be given actual contexts and an answer.
        Here is the grade criteria to follow:
        (1) Ensure the answer is grounded in the actual contexts.
        (2) Ensure the answer does not contain "hallucinated" information outside the scope of the actual contexts.
        Score:
        A score of yes means that the answer meets all of the criteria. This is the highest score.
        A score of no means that the answer does not meet all of the criteria. This is the lowest possible score. 
        Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 
        Avoid simply stating the correct answer at the outset.
        """
        self.human_prompt_template="""actual contexts: \n\n {contexts} \n\n answer: {input_text}
        Return JSON with two keys, binary_score is 'yes' or 'no' score to indicate whether the answer is grounded in the FACTS. And a key, explanation, that contains an explanation of the score."""

    def _process_contexts(self, contexts: List[str], input_text: str) -> Dict:
        """Grade `input_text` (the answer) against the joined contexts.

        Args:
            contexts: Context strings the answer should be grounded in.
            input_text: The answer to grade.

        Returns:
            The parsed JSON reply as a dict. 'binary_score' and 'explanation'
            are always present: if the model omits them they default to 'no'
            and '' respectively, so an ungraded answer is never reported as
            grounded.

        Raises:
            json.JSONDecodeError: If the model reply is not valid JSON.
        """
        combined_context = "\n\n".join(contexts)
        messages = [
            SystemMessage(content=self.system_prompt),
            HumanMessage(content=self.human_prompt_template.format(
                contexts=combined_context,
                input_text=input_text
            ))
        ]
        verdict = json.loads(self.llm.invoke(messages).content)
        if not isinstance(verdict, dict):
            # Valid JSON but not an object (e.g. a bare string): no usable score.
            verdict = {}
        verdict.setdefault("binary_score", "no")
        verdict.setdefault("explanation", "")
        return verdict

### Pipeline: Question-Answer Flow

Pipeline integration for context-aware QA:
- Takes user question and processes through grader and QA modules
- RelevanceGrader filters and scores context relevance
- QuestionAnswering generates answers from relevant contexts
- HallucinationGrader validates answer factual accuracy.

In [30]:
question = "How does vision transformer work?"
relevance_grader = RelevanceGrader(json_llm, retriever)
qa = QuestionAnswering(llm, retriever)
hallucination_grader = HallucinationGrader(json_llm, retriever)

# 1) Retrieve and grade contexts, 2) answer from the relevant ones,
# 3) check the answer against those same contexts.
relevance_results = relevance_grader.evaluate(question)
relevant_contexts = relevance_results["relevant_contexts"]
answer = qa.evaluate(question, relevant_contexts)
hallucination_results = hallucination_grader.evaluate(answer, relevant_contexts)

# Computed outside the f-string: reusing double quotes inside a double-quoted
# f-string is a SyntaxError on Python versions before 3.12.
hallucination_verdict = (
    "Not hallucinated"
    if hallucination_results.get("binary_score") == "yes"
    else "Contains hallucination"
)

print(f"Relevance: {relevance_results['relevance_percentage']}%")
print(f"Answer: {answer}")
print(f"Hallucination Check: {hallucination_verdict}")
print(f"Explanation: {hallucination_results.get('explanation', '')}")

Relevance: 100.0%
Answer: Vision Transformer works by analyzing its internal representations. The first layer linearly projects flattened patches into a lower-dimensional space. A learned position embedding is then added to the patch representations, allowing the model to encode distance within the image in the similarity of position embeddings. This process is similar to how a standard Transformer encoder processes a sequence of tokens in NLP.
Hallucination Check: Not hallucinated
Explanation: The answer is grounded in the actual contexts provided because it accurately describes the internal representations of the Vision Transformer as analyzed in the text. Specifically, the first layer projects flattened patches into a lower-dimensional space (Eq. 1), and a learned position embedding is added to the patch representations, allowing the model to encode distance within the image in the similarity of position embeddings. This process is consistent with how a standard Transformer encoder 

In [31]:
# Show what the retriever returns for the question, numbering contexts from 1.
docs = retriever.invoke(question)
print(f"Retrieved contexts: {len(docs)}")
for position, retrieved in enumerate(docs, start=1):
    print(f"\nContext {position}:")
    print(retrieved.page_content)

Retrieved contexts: 3

Context 1:
4.5 I NSPECTING VISION TRANSFORMER
Input
 Attention
Figure 6: Representative ex-
amples of attention from the
output token to the input
space. See Appendix D.7 for
details.
To begin to understand how the Vision Transformer processes im-
age data, we analyze its internal representations. The ﬁrst layer of
the Vision Transformer linearly projects the ﬂattened patches into a
lower-dimensional space (Eq. 1). Figure 7 (left) shows the top prin-
cipal components of the the learned embedding ﬁlters. The com-
ponents resemble plausible basis functions for a low-dimensional
representation of the ﬁne structure within each patch.
After the projection, a learned position embedding is added to the
patch representations. Figure 7 (center) shows that the model learns
to encode distance within the image in the similarity of position em-
beddings, i.e. closer patches tend to have more similar position em-
beddings. Further, the row-column structure appears; patches in 