In [2]:
import os
import json
import faiss
from sentence_transformers import SentenceTransformer

# Paths
DOCS_DIR = r"C:\Users\Lenovo\Ai_farmer_query_based\data\kb_docs"       # folder containing your .txt files
INDEX_PATH = "data/faiss_index.idx"
META_PATH = "data/faiss_meta.json"
DOCS_PATH = "data/docs.json"

# Make sure the folder exists
os.makedirs("data", exist_ok=True)

# Load embedding model
sbert = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Step 1: Load docs
docs = []
meta = []
for fname in os.listdir(DOCS_DIR):
    if fname.endswith(".txt"):
        path = os.path.join(DOCS_DIR, fname)
        with open(path, "r", encoding="utf8") as f:
            text = f.read()

        docs.append(text)
        meta.append({"source": fname})

# Step 2: Convert docs to embeddings
embeddings = sbert.encode(docs, convert_to_numpy=True, normalize_embeddings=True)

# Step 3: Build FAISS index
d = embeddings.shape[1]
index = faiss.IndexFlatIP(d)   # cosine similarity
index.add(embeddings)

# Step 4: Save index + metadata
faiss.write_index(index, INDEX_PATH)

with open(META_PATH, "w", encoding="utf8") as f:
    json.dump(meta, f, indent=2, ensure_ascii=False)

with open(DOCS_PATH, "w", encoding="utf8") as f:
    json.dump(docs, f, indent=2, ensure_ascii=False)

print("✅ FAISS index built and saved!")


✅ FAISS index built and saved!


In [7]:
import os
import json
import faiss
from sentence_transformers import SentenceTransformer
from openai import OpenAI

# ---- Load FAISS index & docs ----
INDEX_PATH = "data/faiss_index.idx"
META_PATH = "data/faiss_meta.json"
DOCS_PATH = "data/docs.json"   # pre-saved list of chunks

# ✅ Dhenu API client
client = OpenAI(
    api_key="dh-OJFkGcV-Iejz4PPjyGNy5w7DAJrmE7jlDQ-3dKwh7lo",  # your Dhenu API key
    base_url="https://api.dhenu.ai/v1"  # Dhenu API base URL
)

# Embedding model
sbert = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
index = faiss.read_index(INDEX_PATH)

with open(META_PATH, 'r', encoding='utf8') as f:
    meta = json.load(f)

with open(DOCS_PATH, 'r', encoding='utf8') as f:
    docs = json.load(f)


# ---- Retrieval function ----
def retrieve_docs(query, k=5):
    q_emb = sbert.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)
    D, I = index.search(q_emb, k)
    return [(docs[i], meta[i]) for i in I[0]]


# ---- Retrieval with filtering ----
def search(query, top_k=10):
    q_emb = sbert.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)
    D, I = index.search(q_emb, top_k)

    results = []
    for i, score in zip(I[0], D[0]):
        text, meta_info = docs[i], meta[i]
        # keep only relevant passages by keyword filtering
        if any(word in text.lower() for word in query.lower().split()):
            results.append((text, meta_info))

    # fallback: if filtering removes everything, keep top-3 anyway
    if not results:
        results = [(docs[i], meta[i]) for i in I[0][:3]]

    return results[:5]  # limit to 5 best matches


# ---- Prompt construction (farmer-friendly) ----
def build_prompt(question, passages):
    context = "\n\n".join([txt for txt, _ in passages])
    prompt = f"""
You are a practical agricultural advisor for Indian farmers. 
Base your answer ONLY on the following knowledge base. 
Ignore irrelevant or off-topic information. 
Do not mention files, sources, or documents.

Knowledge Base:
{context}

Farmer's Question: {question}

Answer:
- Give step-by-step advice in simple language
- Focus only on the farmer’s question
- Keep it practical, natural, and directly useful
"""
    return prompt


# ---- Answer generation ----
def generate_answer(question):
    # ✅ Retrieve fewer docs
    passages = search(question, top_k=3)  # pick top 3, not all
    prompt = build_prompt(question, passages)

    # Debug prompt length
    print("Prompt length (chars):", len(prompt))

    response = client.chat.completions.create(
        model="dhenu2-in-8b-preview",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.4,
        max_tokens=100
    )

    # Safe extraction
    answer = None
    try:
        if response and hasattr(response, "choices") and response.choices:
            choice = response.choices[0]
            if hasattr(choice, "message") and choice.message:
                answer = choice.message.get("content", "")
            elif hasattr(choice, "text") and choice.text:
                answer = choice.text
            else:
                answer = str(response)
        else:
            answer = str(response)
    except Exception as e:
        answer = f"[Error extracting answer: {e}]"

    return {"answer": answer}




In [8]:
# ---- Quick test ----
if __name__ == "__main__":
    result = generate_answer("How to improve soil fertility naturally?")
    print("Answer:", result["answer"])

Prompt length (chars): 108381
Answer: ChatCompletion(id=None, choices=None, created=None, model=None, object='error', service_tier=None, system_fingerprint=None, usage=None, message="This model's maximum context length is 8192 tokens. However, you requested 43585 tokens (43485 in the messages, 100 in the completion). Please reduce the length of the messages or completion. None", type='BadRequestError', param=None, code=400)


In [27]:
# ---- Quick test ----
if __name__ == "__main__":
    result = generate_answer("now in this season it is good to crop wheat?")
    print("Answer:", result["answer"])

Answer: [Error extracting answer: 'NoneType' object is not subscriptable]
