In [1]:
from abc import ABC, abstractmethod
from typing import Dict, List, Union, Any
from langchain_ollama import ChatOllama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector
from langchain_nomic.embeddings import NomicEmbeddings
import json
from langchain_core.messages import HumanMessage, SystemMessage

### Database: Configuration

PGVector instance running on a local Docker container. Connection string uses psycopg adapter with specific port mapping (6024) to enable PostgreSQL connectivity in the local network.


In [2]:
connection = "postgresql+psycopg://langchain:langchain@192.168.0.53:6024/langchain"

### Database: Vector Store

Configures PGVector store with:
- Local GPU-accelerated embeddings via Nomic's text embedding model v1.5
- Vector storage in dedicated collection with JSONB metadata support
- Native pgvector extension for efficient similarity search operations

In [3]:
vector_store = PGVector(
    embeddings=NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local", device="gpu"),
    collection_name="database",
    connection=connection,
    use_jsonb=True,
)

### Data: PDF Processing

Implements PDF document loading and text splitting:
- Load PDFs from 'data' directory using PyPDFDirectoryLoader
- Splits text into chunks using recursive character splitter
- Configured with 1000-character chunks and 200-character overlap for context preservation

In [6]:
loader = PyPDFDirectoryLoader(path="data")
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200
)
doc_splits = text_splitter.split_documents(docs)

### Database: Document Indexing

Indexes processed document chunks into PGVector store for similarity search.

In [7]:
vector_store.add_documents(doc_splits)

['8bbd3057-6754-427e-a6e7-6123c393ca84',
 '05b2edb7-5694-4cbf-9f4b-427650c08790',
 'f1501148-25ca-4a4c-aefc-75158ad43c8b',
 'a853c1cd-0600-42ea-8f86-d6652f211b79',
 '93b32e8b-ef8a-41bb-8320-d6e492f54272',
 'dea3c043-dc52-4ade-ae8c-bdafdc321a5c',
 '18a2dfb2-bd51-49af-86b2-2fa9807bda05',
 'adfaa762-6089-46b8-8675-723daf6aefdb',
 '79b57ae3-b515-409b-9a0b-90fec4182395',
 'b17a1a2b-d4a5-4500-a0f0-9fbc2774ab12',
 '2a56c7e2-087f-454a-86e9-ecabbd6aae58',
 '9a17520b-74d7-435b-a105-d86a030dd544',
 '1006b453-6d49-4348-bdee-c904a1d12625',
 '4275c020-1986-47eb-8ac0-857ed90e5423',
 'f6dd7502-2a7f-434b-ab16-6afbdc9f449c',
 '78926eb8-ba6a-44c3-9e3d-efae89622830',
 'dd94c690-bede-4e86-a57a-f12ee50ecbeb',
 'ccb6985e-a4e0-430a-be56-0d859918b61b',
 '5cc5c4c2-6beb-414b-9bb3-54a85f398c84',
 '91069952-5bba-4c59-a97b-af84707dc885',
 'aa8bc416-8bec-4d88-9883-a5de8d13ced9',
 'c3efd145-10db-4015-b534-ab7614e4af45',
 'e31ab55c-1782-418f-a97f-b2784f9d1da7',
 'd18e918f-1ddc-4af7-a2ec-e63d4f39615e',
 '4ec7fe12-9a74-

### Query: Retriver Setup

Configures vector store retriever to fetch 4 most relevant chunks per query

In [4]:
retriever = vector_store.as_retriever(k=3, search_kwargs={"k": 3})

### Model: Local LLM Configuration

Initializes Ollama model (llama3.2 3B instruct) in JSON output mode with:
- Base model: llama3.2 3B instruct (FP16)
- Temerature: 0 for deterministic outputs
- Format: JSON for structured responses

In [5]:
MODEL_NAME = "llama3.2:3b-instruct-fp16"
llm = ChatOllama(model=MODEL_NAME, temperature=0)
json_llm = ChatOllama(model=MODEL_NAME, temperature=0, format="json")

### Architecture: Base Evaluator Class

Abstract base class defining common methods and attributes for both graders and QA:
- Handles message construction and LLM invocation
- Supports both evaluation and generation tasks

In [6]:
class BaseEvaluator(ABC):
    def __init__(self, llm, retriever):
        self.llm = llm
        self.retriever = retriever

    def get_context(self, question: str) -> str:
        docs = self.retriever.invoke(question)
        return [doc.page_content for doc in docs]
    
    def evaluate(self, question: str, contexts: List[str] = None) -> Union[Dict, str]:
        if contexts is None:
            contexts = self.get_context(question) 
        return self._process_contexts(contexts, question)
    
    @abstractmethod
    def _process_contexts(self, contexts: List[str], question: str) -> Union[Dict, str]:
        """Process contexts according to evaluator type (grade/answer/etc)."""
        pass

### Evaluation: Relevance Grader

Implements grading logic to assess document relevance to queries:
- Returns binary relevance score for each context
- Aggregates multiple context scores into final assessment

In [7]:
class RelevanceGrader(BaseEvaluator):
    def __init__(self, llm, retriever):
        super().__init__(llm, retriever)
        self.system_prompt = """Grade document relevance with high recall: approve if there's any connection to the topic."""

        self.human_prompt_template = """Document: {context}

        Question: {question} 

        Return JSON with 'binary_score': 'yes' if document has ANY connection to the topic (direct mentions, related technology, underlying concepts, or applications), 'no' ONLY if completely unrelated."""
        
    def _process_contexts(self, contexts: List[str], question: str) -> Dict:
        relevant_contexts = []
        scores = []
        for context in contexts:
            messages = [
                SystemMessage(content=self.system_prompt),
                HumanMessage(content=self.human_prompt_template.format(
                    context=context,
                    question=question
                ))
            ]
            result = self.llm.invoke(messages)
            is_relevant = json.loads(result.content)['binary_score'] == 'yes'
            scores.append(is_relevant)
            if is_relevant:
                relevant_contexts.append(context)

        return {
            "relevance_percentage": len(relevant_contexts) / len(contexts) * 100,
            "relevant_contexts": relevant_contexts
        }

### Model: Quesiton Answering

Implements context-aware QA using filtered relevant contexts to generate answers

In [8]:
class QuestionAnswering(BaseEvaluator):
    def __init__(self, llm, retriever):
        super().__init__(llm, retriever)
        self.prompt_template="""You are an assistant for question-answering tasks.
        
        Context:
        {context}
        
        Question:
        {question}
        
        Provide a concise answer using only the above context.
        
        Answer:"""

    def _process_contexts(self, contexts: List[str], question: str) -> str:
        if not contexts:
            return "No relevant context found to answer this question."
        
        combined_context = "\n\n".join(contexts)
        messages = [HumanMessage(content=self.prompt_template.format(
            context=combined_context,
            question=question
        ))]
        return self.llm.invoke(messages).content

### Pipeline: Question-Answer Flow

Pipeline integration for context-aware QA:
- Takes user question and processes through grader and QA modules
- RelevanceGrader filters contexts by relevance score
- QuestionAnswering generates concise answer using filtered contexts

In [9]:
question = "What is audio transformer?"
grader = RelevanceGrader(json_llm, retriever)
qa = QuestionAnswering(llm, retriever)

relevance_results = grader.evaluate(question)
answer = qa.evaluate(question, relevance_results["relevant_contexts"])

print(f"Relevance: {relevance_results['relevance_percentage']}%")
print(f"Answer: {answer}")

Relevance: 66.66666666666666%
Answer: Audio Transformer refers to a multi-stage Transformer-based language model operating on proposed tokens in the AudioLM framework.


In [10]:
docs = retriever.invoke(question)
print(f"Retrieved contexts: {len(docs)}")
for i, doc in enumerate(docs):
    print(f"\nContext {i+1}:")
    print(doc.page_content)

Retrieved contexts: 3

Context 1:
A. Components
We consider a single channel audio sequence x ∈ RT , which
is processed by the following three components of the AudioLM
framework:
• A tokenizer model, which maps x into a sequence h =
enc(x), h = (h1, . . . , hT′ ) of discrete tokens from a finite
vocabulary, with T′ ≪ T.
• A decoder-only Transformer language model that operates
on the discrete tokens y, trained to maximize the likelihoodQT′
t=1 p(ht|h<t). At inference time, the model predicts
the token sequence ˆh autoregressively.
• A detokenizer model, which maps the sequence of
predicted tokens back to audio, producing the waveform
ˆx = dec(ˆh).
It is important to emphasize the following aspects: i) the
number of tokens T′ is typically 2-3 orders of magnitude
smaller than T. This is critical to significantly increase
the temporal context size of the language model, since the
computational complexity of standard self-attention grows
quadratically with respect to the sequence length; 