In [1]:
import os
import sys
from pathlib import Path
from dotenv import load_dotenv
import warnings
warnings.filterwarnings('ignore')

load_dotenv()
project_root = Path.cwd().parent if 'notebooks' in str(Path.cwd()) else Path.cwd()
sys.path.insert(0, str(project_root))

print(" Phase 5 environment loaded")
print(f"   Project root: {project_root}")

 Phase 5 environment loaded
   Project root: /Users/kaushik003/Documents/projects/agri-chatbot


In [2]:
import os
import sys
import json
import numpy as np
from pathlib import Path
from dotenv import load_dotenv
from typing import List, Dict, Any
from tqdm.auto import tqdm

# Load environment
load_dotenv()
project_root = Path.cwd().parent if 'notebooks' in str(Path.cwd()) else Path.cwd()
sys.path.insert(0, str(project_root))

In [3]:
# Vector Store & Embeddings
from pinecone import Pinecone
from langchain_huggingface import HuggingFaceEmbeddings

# Keyword Search
from rank_bm25 import BM25Okapi
import nltk
from nltk.tokenize import word_tokenize

# Re-ranking
from sentence_transformers import CrossEncoder

# Data Processing
import pypdf
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Download NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
    nltk.download('punkt_tab')

print(" Libraries loaded")

 Libraries loaded


### 1. Load Configurations & Initialize Models

In [4]:
# Load Phase 1 & 2 Configs
with open(project_root / "phase1_config.json", 'r') as f: phase1_config = json.load(f)
with open(project_root / "phase2_config.json", 'r') as f: phase2_config = json.load(f)

print(f" Loaded Configs: Disease Index='{phase2_config['disease_index']}'")

# 1. Initialize Pinecone (Semantic Search)
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
disease_index = pc.Index(phase2_config['disease_index'])
scheme_index = pc.Index(phase2_config['scheme_index'])

# 2. Initialize Embeddings (for query encoding)
embeddings_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'}
)

# 3. Initialize Cross-Encoder (for Re-ranking)
# using a lightweight but effective model
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

print(" All models initialized")

 Loaded Configs: Disease Index='agri-chatbot-disease'


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

 All models initialized


### 2. Build In-Memory BM25 Index (Keyword Search)
Since BM25 requires statistical data about term frequency across the whole corpus, we need to reconstruct the chunks exactly as we did in Phase 1.

In [5]:
def load_and_chunk(pdf_path: Path, doc_type: str, index_name: str):
    """Re-create chunks to match Pinecone IDs"""
    # 1. Load PDF
    reader = pypdf.PdfReader(pdf_path)
    text_data = ""
    for page in reader.pages:
        text_data += page.extract_text() + "\n\n"
    
    # 2. Split (Using Phase 1 Params)
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=phase1_config['chunking']['size'],
        chunk_overlap=phase1_config['chunking']['overlap'],
        length_function=len
    )
    chunks = splitter.split_text(text_data)
    
    # 3. Create Document Objects with IDs matching Phase 2
    documents = []
    for i, content in enumerate(chunks):
        # Phase 2 ID format: {index_name}_{i}
        doc_id = f"{index_name}_{i}"
        documents.append({
            "id": doc_id,
            "content": content,
            "metadata": {"source": pdf_path.name, "doc_type": doc_type}
        })
    return documents

print(" Reconstructing corpus for BM25...")
DATA_DIR = project_root / "data"

# Re-load Disease Docs
disease_docs = load_and_chunk(
    DATA_DIR / "CitrusPlantPestsAndDiseases.pdf", 
    "disease", 
    phase2_config['disease_index']
)

# Re-load Scheme Docs
scheme_docs = load_and_chunk(
    DATA_DIR / "GovernmentSchemes.pdf", 
    "scheme", 
    phase2_config['scheme_index']
)

print(f" Reconstructed {len(disease_docs)} disease chunks")
print(f" Reconstructed {len(scheme_docs)} scheme chunks")

 Reconstructing corpus for BM25...
 Reconstructed 297 disease chunks
 Reconstructed 132 scheme chunks


In [6]:
def build_bm25(documents):
    """Tokenize corpus and build BM25 index"""
    tokenized_corpus = [word_tokenize(doc['content'].lower()) for doc in documents]
    bm25 = BM25Okapi(tokenized_corpus)
    return bm25

print(" Building BM25 Indices...")
bm25_disease = build_bm25(disease_docs)
bm25_scheme = build_bm25(scheme_docs)
print("BM25 Indices ready")

 Building BM25 Indices...
BM25 Indices ready


### 3. Define Retrieval Functions
We need functions for Vector Search, Keyword Search, and the Fusion logic.

In [7]:
def vector_search(query: str, index, top_k: int = 5) -> List[Dict]:
    """Semantic search using Pinecone"""
    query_emb = embeddings_model.embed_query(query)
    
    results = index.query(
        vector=query_emb,
        top_k=top_k,
        include_metadata=True
    )
    
    # Normalize structure
    hits = []
    for match in results.matches:
        hits.append({
            "id": match.id,
            "content": match.metadata.get("text", ""),
            "score": match.score,
            "metadata": match.metadata,
            "method": "semantic"
        })
    return hits

print("Function `vector_search` ready")

Function `vector_search` ready


In [8]:
def keyword_search(query: str, bm25_index, documents: List[Dict], top_k: int = 5) -> List[Dict]:
    """Keyword search using BM25"""
    tokenized_query = word_tokenize(query.lower())
    
    # Get scores
    doc_scores = bm25_index.get_scores(tokenized_query)
    
    # Get top_k indices
    top_indices = np.argsort(doc_scores)[::-1][:top_k]
    
    hits = []
    for idx in top_indices:
        # Ignore zero scores (no keyword match)
        if doc_scores[idx] > 0:
            doc = documents[idx]
            hits.append({
                "id": doc["id"],
                "content": doc["content"],
                "score": float(doc_scores[idx]), # Use raw BM25 score
                "metadata": doc["metadata"],
                "method": "keyword"
            })
    return hits

print("Function `keyword_search` ready")

Function `keyword_search` ready


### 4. Reciprocal Rank Fusion (RRF)
RRF is a robust method to combine two ranked lists without worrying about the different scales of the scores (Cosine Similarity vs BM25).
Formula: `score = 1 / (rank + k)`

In [9]:
def reciprocal_rank_fusion(semantic_results: List[Dict], keyword_results: List[Dict], k: int = 60):
    """Combine results using RRF"""
    scores = {}
    doc_map = {} # Keep content and metadata
    
    # Process Semantic Results
    for rank, hit in enumerate(semantic_results):
        doc_id = hit['id']
        if doc_id not in doc_map:
            doc_map[doc_id] = hit
        scores[doc_id] = scores.get(doc_id, 0) + (1 / (rank + k))
        
    # Process Keyword Results
    for rank, hit in enumerate(keyword_results):
        doc_id = hit['id']
        if doc_id not in doc_map:
            doc_map[doc_id] = hit
        # Add to existing score or start new
        scores[doc_id] = scores.get(doc_id, 0) + (1 / (rank + k))
    
    # Sort by fused score
    sorted_ids = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    
    # Reconstruct final list
    fused_results = []
    for doc_id, score in sorted_ids:
        doc = doc_map[doc_id]
        doc['score'] = score
        doc['method'] = 'hybrid'
        fused_results.append(doc)
        
    return fused_results

print("Function `reciprocal_rank_fusion` ready")

Function `reciprocal_rank_fusion` ready


### 5. Re-ranking with Cross-Encoder
Semantic search and BM25 are "Retrievers". They are fast but not always accurate at deep understanding. 
A Cross-Encoder is a "Re-ranker". It takes (Query, Document) pairs and outputs a precise similarity score (0 to 1). It is slower, so we only use it on the top ~10 results from fusion.

In [10]:
def rerank_results(query: str, results: List[Dict], top_k: int = 5) -> List[Dict]:
    """Re-rank documents using Cross-Encoder"""
    if not results:
        return []
    
    # Prepare pairs for model
    pairs = [[query, doc['content']] for doc in results]
    
    # Predict scores
    scores = reranker.predict(pairs)
    
    # Attach new scores and sort
    for i, doc in enumerate(results):
        doc['rerank_score'] = float(scores[i])
        
    # Sort by re-rank score
    reranked = sorted(results, key=lambda x: x['rerank_score'], reverse=True)
    
    return reranked[:top_k]

print("Function `rerank_results` ready")

Function `rerank_results` ready


### 6. Unified Retrieval Logic
Now we combine everything into a single function that handles the logic:
1. Identify which index to query (Disease/Scheme/Hybrid)
2. Retrieve Semantic + Keyword
3. Fuse
4. Re-rank

In [11]:
def get_relevant_documents(query: str, intent: str, top_k: int = 5) -> List[Dict]:
    """
    Full pipeline: 
    Routing -> Semantic+Keyword Search -> Fusion -> Re-ranking
    """
    final_results = []
    
    # 1. Routing Logic
    search_disease = intent in ["disease", "hybrid", "unclear"]
    search_scheme = intent in ["scheme", "hybrid", "unclear"]
    
    retrieved_docs = []
    
    # 2a. Disease Search
    if search_disease:
        # Semantic
        sem_docs = vector_search(query, disease_index, top_k=10)
        # Keyword
        kw_docs = keyword_search(query, bm25_disease, disease_docs, top_k=10)
        # Fuse
        retrieved_docs.extend(reciprocal_rank_fusion(sem_docs, kw_docs))
        
    # 2b. Scheme Search
    if search_scheme:
        # Semantic
        sem_docs = vector_search(query, scheme_index, top_k=10)
        # Keyword
        kw_docs = keyword_search(query, bm25_scheme, scheme_docs, top_k=10)
        # Fuse
        retrieved_docs.extend(reciprocal_rank_fusion(sem_docs, kw_docs))
    
    # 3. Deduplicate (if hybrid fetched same doc via different paths - unlikely but safe)
    unique_docs = {doc['id']: doc for doc in retrieved_docs}.values()
    
    # 4. Re-rank
    final_results = rerank_results(query, list(unique_docs), top_k=top_k)
    
    return final_results

print("Advanced Retrieval Pipeline Ready!")

Advanced Retrieval Pipeline Ready!


### 7. Testing & Evaluation
Let's test with queries that require specific keywords (chemicals) and conceptual understanding (subsidies).

In [12]:
test_queries = [
    # Specific Chemical (Keyword heavy)
    {"query": "How to use Imidacloprid for citrus?", "intent": "disease"},
    
    # Conceptual (Semantic heavy)
    {"query": "Is there money help for water systems?", "intent": "scheme"},
    
    # Hybrid
    {"query": "Subsidies for treating root rot", "intent": "hybrid"}
]

print("Running Tests...\n")

for item in test_queries:
    q = item['query']
    intent = item['intent']
    
    print(f"❓ Query: '{q}' (Intent: {intent})")
    print("-" * 60)
    
    results = get_relevant_documents(q, intent, top_k=3)
    
    for i, res in enumerate(results, 1):
        # Snippet for display
        snippet = res['content'][:150].replace('\n', ' ')
        print(f"{i}. Score: {res['rerank_score']:.4f} | Source: {res['metadata'].get('source', 'Unknown')}")
        print(f"   Excerpt: {snippet}...")
    print("\n")

Running Tests...

❓ Query: 'How to use Imidacloprid for citrus?' (Intent: disease)
------------------------------------------------------------
1. Score: 4.8238 | Source: CitrusPlantPestsAndDiseases.pdf
   Excerpt: introductions: Quarantine and inspect new citrus plants for scales to avoid introducing a new  species or population.  Chemical Control: - Systemic in...
2. Score: 3.4653 | Source: CitrusPlantPestsAndDiseases.pdf
   Excerpt: larvae are protected inside leaves, one must either kill the adults or use a systemic that gets inside  leaf. - Systemic insecticides (neonicotinoids)...
3. Score: 3.4653 | Source: CitrusPlantPestsAndDiseases.pdf
   Excerpt: larvae are protected inside leaves, one must either kill the adults or use a systemic that gets inside  leaf. - Systemic insecticides (neonicotinoids)...


❓ Query: 'Is there money help for water systems?' (Intent: scheme)
------------------------------------------------------------
1. Score: -9.5027 | Source: GovernmentSchemes.pdf
 

In [13]:
phase5_config = {
    "retrieval_strategy": "Hybrid (RRF) + Re-ranking",
    "bm25_enabled": True,
    "cross_encoder": "cross-encoder/ms-marco-MiniLM-L-6-v2",
    "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
    "rrf_k": 60,
    "created_at": "2026-01-05T06:00:00.000000"
}

config_path = project_root / "phase5_config.json"
with open(config_path, 'w') as f:
    json.dump(phase5_config, f, indent=2)

print("Phase 5 Configuration saved")

Phase 5 Configuration saved
