In [None]:
import os
import json
import networkx as nx
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

# ============================================================
# CONFIG
# ============================================================
# Load variables from a local .env file into the process environment. The LLM
# helpers later read GROK_API_KEY / GEMINI_API_KEY through os.getenv.
load_dotenv()

# Defaults are the original hard-coded values. Both locations can be overridden
# with environment variables (or .env entries) so the notebook also runs on
# machines that do not have this exact D: drive layout.
JSON_PATH = os.getenv("SCHEMES_JSON_PATH", "D:/gov-scheme-assistant-updated/threetry/schemes.json")
DB_DIR = os.getenv("RAG_DB_DIR", "rag_db")
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# ============================================================
# STEP 1 — LOAD JSON
# ============================================================
with open(JSON_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# One Document per scheme: the scheme name and summary followed by every string
# found in the two free-text sections.
docs = []
for entry in data:
    kb = entry["knowledge_base_entry"]
    text_parts = [f"Scheme: {kb.get('scheme','')}", f"Summary: {kb.get('summary','')}"]

    for section in ("key_information", "all_extracted_sections"):
        section_data = kb.get(section, {})
        if not isinstance(section_data, dict):
            continue
        for val in section_data.values():
            items = val if isinstance(val, list) else [val]
            # Keep strings only. Scalar non-strings were already ignored; list
            # items were not, so a single number/dict/None inside a list made
            # the "\n".join below raise TypeError.
            text_parts.extend(item for item in items if isinstance(item, str))

    full_text = "\n".join(text_parts).strip()
    if full_text:
        docs.append(Document(page_content=full_text,
                             metadata={"scheme": kb.get("scheme", "Unknown")}))

print(f"‚úÖ Loaded {len(docs)} documents.")

# ============================================================
# STEP 2 — SPLIT TEXT & BUILD VECTOR STORE
# ============================================================
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
chunks = splitter.split_documents(docs)
print(f"üìö Created {len(chunks)} chunks.")

embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

# Chroma.from_documents adds to whatever is already stored in persist_directory,
# so re-running this cell (or any other cell that indexes into the same DB_DIR)
# would insert every chunk again and retrieval would return duplicates.
# Reuse a non-empty store instead; delete DB_DIR to force a rebuild after the
# JSON or the embedding model changes.
if os.path.isdir(DB_DIR) and os.listdir(DB_DIR):
    vectordb = Chroma(persist_directory=DB_DIR, embedding_function=embeddings)
    print(f"Reusing existing vector database in '{DB_DIR}' (delete the directory to rebuild).")
else:
    vectordb = Chroma.from_documents(chunks, embedding=embeddings, persist_directory=DB_DIR)
    vectordb.persist()
    print("‚úÖ Vector database built successfully!")

# ============================================================
# STEP 3 — BUILD KNOWLEDGE GRAPH
# ============================================================
# Directed graph: scheme --relation=<key>--> attribute value, where <key> is the
# key_information field the value was found under.
G = nx.DiGraph()

for entry in data:
    kb = entry["knowledge_base_entry"]
    scheme = kb.get("scheme", "Unknown")
    G.add_node(scheme, type="scheme")

    key_info = kb.get("key_information", {})
    if not isinstance(key_info, dict):
        # Same guard the text-flattening step applies; .items() would fail here.
        continue

    for key, val in key_info.items():
        # Treat a single string like a one-element list so both shapes share
        # one code path.
        values = val if isinstance(val, list) else [val]
        for v in values:
            if not isinstance(v, str):
                continue  # .strip() is only defined for strings
            node_name = v.strip()
            if not node_name or node_name == scheme:
                continue  # skip blanks and self-loops
            # Do not relabel a scheme node as an attribute: query_graph relies
            # on type == "scheme" to recognise schemes.
            is_scheme = node_name in G and G.nodes[node_name].get("type") == "scheme"
            if not is_scheme:
                G.add_node(node_name, type=key)
            G.add_edge(scheme, node_name, relation=key)

print(f"üï∏ Graph built with {len(G.nodes)} nodes and {len(G.edges)} edges.")




In [None]:
# ============================================================
# STEP 4 — RETRIEVAL FUNCTIONS
# ============================================================
def retrieve_from_vector(query, k=10):
    """Look up the ``k`` best-matching chunks for ``query`` in the global ``vectordb``.

    Returns the chunks' ``page_content`` strings concatenated with a blank line
    between consecutive chunks.
    """
    hits = vectordb.as_retriever(search_kwargs={"k": k}).get_relevant_documents(query)
    passages = (hit.page_content for hit in hits)
    return "\n\n".join(passages)

def query_graph(G, query, min_keyword_len=1):
    """Return the schemes that point at attribute nodes matching the query.

    The query is lower-cased and split on whitespace; surrounding punctuation
    is stripped from each token so that "B.Tech," still matches a node named
    "B.Tech". A node matches when any keyword is a substring of its lower-cased
    name. For every matching node, its predecessors whose ``type`` attribute is
    "scheme" are collected.

    Args:
        G: graph exposing ``nodes`` (iterable, indexable to an attribute dict)
            and ``predecessors(node)``, e.g. a ``networkx.DiGraph``.
        query: free-text user query.
        min_keyword_len: ignore keywords shorter than this. The default of 1
            keeps every non-empty token; raise it (e.g. to 3) to stop tokens
            such as "a" or "i" from substring-matching almost every node.

    Returns:
        List of scheme nodes without duplicates, in first-seen order so the
        result is reproducible between runs (a plain ``list(set)`` is not).
    """
    strip_chars = ".,;:!?\"'()[]{}"
    shortest = max(1, min_keyword_len)  # an empty keyword would match every node
    keywords = []
    for token in query.lower().split():
        token = token.strip(strip_chars)
        if len(token) >= shortest:
            keywords.append(token)

    seen = set()
    related_schemes = []
    for node in G.nodes:
        if not isinstance(node, str):
            continue  # only string nodes can be matched by substring
        label = node.lower()
        if not any(k in label for k in keywords):
            continue
        for pred in G.predecessors(node):
            if pred not in seen and G.nodes[pred].get("type") == "scheme":
                seen.add(pred)
                related_schemes.append(pred)
    return related_schemes

def hybrid_retrieve(query, G, vectordb, k=10):
    """Retrieve context by narrowing to graph-matched schemes, then searching vectors.

    The graph is queried first. When it yields schemes, their names (one per
    line) become the vector-search text; otherwise the raw user query is used.

    Args:
        query: free-text user query.
        G: knowledge graph passed through to ``query_graph``.
        vectordb: vector store to search. This argument is now actually used;
            previously it was ignored and the module-level ``vectordb`` was
            queried regardless of what the caller passed.
        k: number of chunks to retrieve (default 10, as before).

    Returns:
        ``page_content`` of the retrieved chunks joined by blank lines.
    """
    graph_schemes = query_graph(G, query)
    graph_context = "\n".join(graph_schemes)
    if graph_context:
        search_text = graph_context
    else:
        print("‚ö† No graph match found ‚Äî falling back to pure vector retrieval.")
        search_text = query

    retriever = vectordb.as_retriever(search_kwargs={"k": k})
    results = retriever.get_relevant_documents(search_text)
    return "\n\n".join(doc.page_content for doc in results)

# ============================================================
# STEP 5 — LLM CALLER (Gemini / Grok)
# ============================================================
# def generate_with_llm(query, context, llm_choice="gemini"):
def generate_with_llm(query, context, llm_choice="gemini"):
    """Answer ``query`` from ``context`` using the selected LLM backend.

    Args:
        query: the user's question.
        context: retrieved text placed in the prompt ahead of the question.
        llm_choice: "gemini" (default) or "grok"; case and surrounding
            whitespace are ignored.

    Returns:
        The generated answer text, or an error message string when
        ``llm_choice`` names no known backend.

    Raises:
        requests.HTTPError: the Grok endpoint answered with an error status.
            Previously such responses were silently turned into "".
        requests.Timeout: the Grok endpoint did not respond within 60 seconds.
    """
    prompt = f"Use the context below to answer the query.\n\nContext:\n{context}\n\nQuestion: {query}\nAnswer:"

    # Tolerate None / padded values instead of failing on .lower().
    backend = str(llm_choice or "").strip().lower()

    if backend == "grok":
        import requests
        response = requests.post(
            "https://api.x.ai/v1/grok/completions",
            headers={"Authorization": f"Bearer {os.getenv('GROK_API_KEY')}"},
            json={"prompt": prompt, "max_tokens": 300},
            timeout=60,  # requests waits forever without an explicit timeout
        )
        # Surface auth/quota/endpoint failures rather than returning "".
        response.raise_for_status()
        return response.json().get("text", "")

    if backend == "gemini":
        import google.generativeai as genai
        genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
        model = genai.GenerativeModel("gemini-2.0-flash-001")
        result = model.generate_content(prompt)
        return result.text

    return "‚ùå Invalid LLM choice (use 'grok' or 'gemini')."


In [None]:
# ============================================================
# STEP 6 — MAIN EXECUTION
# ============================================================
# The guard needs the double-underscore names; `_name_` is undefined and raised
# NameError before anything ran.
if __name__ == "__main__":
    user_query = "I am a girl doing B.Tech, family income 2 lakh, need scholarship"

    # Graph lookup narrows to candidate schemes, then the vector store supplies
    # the detailed text passed to the LLM.
    print("\nüîé Retrieving context using Graph + Vector RAG ...")
    context = hybrid_retrieve(user_query, G, vectordb)

    print("\nü§ñ Generating final answer from LLM ...")
    answer = generate_with_llm(user_query, context, llm_choice="gemini")

    print("\n=== üß† FINAL ANSWER ===")
    print(answer)

In [None]:
def check_eligibility(client, eligibility_data):
    """Return the schemes whose structured criteria the client satisfies.

    A criterion is enforced only when both the scheme and the client provide a
    usable value for it; a missing value never disqualifies.

    Args:
        client: dict with optional keys "age", "gender", "income", "education"
            and "other_conditions".
        eligibility_data: dict mapping scheme name -> criteria dict with
            optional keys "age_min", "age_max", "gender", "income_max",
            "education" and "other_conditions".

    Returns:
        List of eligible scheme names in ``eligibility_data`` iteration order.
    """

    def _number(value):
        # Numbers pass through; numeric strings such as "250000" or "2,50,000"
        # are parsed. Anything else yields None so the check is skipped instead
        # of raising TypeError on an int/str comparison.
        if isinstance(value, (int, float)):
            return value
        if isinstance(value, str):
            try:
                return float(value.replace(",", "").strip())
            except ValueError:
                return None
        return None

    def _conditions(value):
        # None (e.g. a JSON null) means "no conditions"; a bare string is one
        # condition rather than a sequence of characters.
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    # Client values are loop-invariant, so read them once.
    client_age = _number(client.get("age"))
    client_income = _number(client.get("income"))
    client_gender = client.get("gender")
    client_edu = client.get("education")
    client_conditions = _conditions(client.get("other_conditions"))

    eligible_schemes = []
    for scheme, criteria in eligibility_data.items():
        # Age check
        age_min = _number(criteria.get("age_min"))
        age_max = _number(criteria.get("age_max"))
        if client_age is not None:
            if age_min is not None and client_age < age_min:
                continue
            if age_max is not None and client_age > age_max:
                continue

        # Gender check (case-insensitive equality)
        scheme_gender = criteria.get("gender")
        if scheme_gender and client_gender and client_gender.lower() != scheme_gender.lower():
            continue

        # Income check
        income_max = _number(criteria.get("income_max"))
        if income_max is not None and client_income is not None and client_income > income_max:
            continue

        # Education check: the scheme's requirement must appear inside the
        # client's education string (case-insensitive).
        scheme_edu = criteria.get("education")
        if scheme_edu and client_edu and scheme_edu.lower() not in client_edu.lower():
            continue

        # Other conditions: every condition the scheme lists must be declared
        # by the client.
        required = _conditions(criteria.get("other_conditions"))
        if any(cond not in client_conditions for cond in required):
            continue

        eligible_schemes.append(scheme)

    return eligible_schemes

In [None]:
import json
import os
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from dotenv import load_dotenv

# Load keys from .env file if available
load_dotenv()

# --------------------------
# CONFIGURATION
# --------------------------
# Defaults are the original hard-coded values. Both locations can be overridden
# with environment variables (or .env entries) so the notebook also runs on
# machines that do not have this exact D: drive layout.
JSON_PATH = os.getenv("SCHEMES_JSON_PATH", "D:/gov-scheme-assistant-updated/threetry/schemes.json")
DB_DIR = os.getenv("RAG_DB_DIR", "rag_db")
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # lightweight & fast

# --------------------------
# STEP 1: LOAD AND PARSE JSON
# --------------------------
with open(JSON_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

docs = []               # one Document per scheme, for embedding
eligibility_data = {}   # scheme name -> structured criteria for check_eligibility

for entry in data:
    kb = entry["knowledge_base_entry"]
    scheme_name = kb.get("scheme", "Unknown")

    # Main fields
    text_parts = [
        f"Scheme: {kb.get('scheme', '')}",
        f"Summary: {kb.get('summary', '')}",
    ]

    # Flatten nested fields (key_information, all_extracted_sections)
    for section in ("key_information", "all_extracted_sections"):
        section_data = kb.get(section, {})
        if not isinstance(section_data, dict):
            continue
        for value in section_data.values():
            items = value if isinstance(value, list) else [value]
            # Keep strings only. Scalar non-strings were already ignored; list
            # items were not, so a single number/dict/None inside a list made
            # the "\n".join below raise TypeError.
            text_parts.extend(item for item in items if isinstance(item, str))

    # Combine all text
    full_text = "\n".join(text_parts).strip()

    # Create Document for embedding
    if full_text:
        docs.append(Document(page_content=full_text, metadata={"scheme": scheme_name}))

    # Extract structured eligibility info (handle missing fields)
    key_info = kb.get("key_information", {})
    if not isinstance(key_info, dict):
        key_info = {}  # .get() below needs a mapping

    # A JSON null would otherwise reach check_eligibility as None and break its
    # loop; a bare string would be iterated character by character.
    other_conditions = key_info.get("other_conditions") or []
    if isinstance(other_conditions, str):
        other_conditions = [other_conditions]

    eligibility_data[scheme_name] = {
        "age_min": key_info.get("age_min"),
        "age_max": key_info.get("age_max"),
        "gender": key_info.get("gender"),
        "income_max": key_info.get("income_max"),
        "education": key_info.get("education"),
        "other_conditions": other_conditions,
    }

print(f"Loaded {len(docs)} documents and prepared eligibility data.")

# --------------------------
# STEP 2: SPLIT TEXT
# --------------------------
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
chunks = splitter.split_documents(docs)
print(f"Created {len(chunks)} text chunks for embedding.")

# --------------------------
# STEP 3: CREATE EMBEDDINGS AND VECTOR STORE
# --------------------------
embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

# Chroma.from_documents adds to whatever is already stored in persist_directory,
# so re-running this cell (or any other cell that indexes into the same DB_DIR)
# would insert every chunk again and retrieval would return duplicates.
# Reuse a non-empty store instead; delete DB_DIR to force a rebuild after the
# JSON or the embedding model changes.
if os.path.isdir(DB_DIR) and os.listdir(DB_DIR):
    vectordb = Chroma(persist_directory=DB_DIR, embedding_function=embeddings)
    print(f"Reusing existing vector database in '{DB_DIR}' (delete the directory to rebuild).")
else:
    vectordb = Chroma.from_documents(chunks, embedding=embeddings, persist_directory=DB_DIR)
    vectordb.persist()
    print("‚úÖ Vector database built successfully!")

# --------------------------
# STEP 4: RETRIEVAL FUNCTION
# --------------------------
def retrieve_context(query, k=10):
    """Return the text of the ``k`` chunks in the global ``vectordb`` closest to ``query``.

    Chunk texts are separated by one blank line.
    """
    search_options = {"k": k}
    matches = vectordb.as_retriever(search_kwargs=search_options).get_relevant_documents(query)
    return "\n\n".join(match.page_content for match in matches)

# --------------------------
# STEP 5: LLM BACKENDS (GROK / GEMINI)
# --------------------------
def generate_with_llm(query, context, llm_choice="grok"):
    """Answer ``query`` from ``context`` using the selected LLM backend.

    Args:
        query: the user's question.
        context: retrieved text placed in the prompt ahead of the question.
        llm_choice: "grok" (default) or "gemini"; case and surrounding
            whitespace are ignored.

    Returns:
        The generated answer text, or an error message string when
        ``llm_choice`` names no known backend.

    Raises:
        requests.HTTPError: the Grok endpoint answered with an error status.
            Previously such responses were silently turned into "".
        requests.Timeout: the Grok endpoint did not respond within 60 seconds.
    """
    prompt = f"Use the context below to answer the query.\n\nContext:\n{context}\n\nQuestion: {query}\nAnswer:"

    # Tolerate None / padded values instead of failing on .lower().
    backend = str(llm_choice or "").strip().lower()

    if backend == "grok":
        import requests
        response = requests.post(
            "https://api.x.ai/v1/grok/completions",
            headers={"Authorization": f"Bearer {os.getenv('GROK_API_KEY')}"},
            json={"prompt": prompt, "max_tokens": 300},
            timeout=60,  # requests waits forever without an explicit timeout
        )
        # Surface auth/quota/endpoint failures rather than returning "".
        response.raise_for_status()
        return response.json().get("text", "")

    if backend == "gemini":
        # Only the package alias is needed; the separate GenerativeModel import
        # was never used.
        import google.generativeai as genai
        genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
        model = genai.GenerativeModel("gemini-2.0-flash-001")
        response = model.generate_content(prompt)
        return response.text

    return "‚ùå Invalid LLM choice. Use 'grok' or 'gemini'."

# --------------------------
# STEP 6: ELIGIBILITY CHECK
# --------------------------
def check_eligibility(client, eligibility_data):
    """Return the schemes whose structured criteria the client satisfies.

    A criterion is enforced only when both the scheme and the client provide a
    usable value for it; a missing value never disqualifies.

    Args:
        client: dict with optional keys "age", "gender", "income", "education"
            and "other_conditions".
        eligibility_data: dict mapping scheme name -> criteria dict with
            optional keys "age_min", "age_max", "gender", "income_max",
            "education" and "other_conditions".

    Returns:
        List of eligible scheme names in ``eligibility_data`` iteration order.
    """

    def _number(value):
        # Numbers pass through; numeric strings such as "250000" or "2,50,000"
        # are parsed. Anything else yields None so the check is skipped instead
        # of raising TypeError on an int/str comparison.
        if isinstance(value, (int, float)):
            return value
        if isinstance(value, str):
            try:
                return float(value.replace(",", "").strip())
            except ValueError:
                return None
        return None

    def _conditions(value):
        # None (e.g. a JSON null) means "no conditions"; a bare string is one
        # condition rather than a sequence of characters.
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    # Client values are loop-invariant, so read them once.
    client_age = _number(client.get("age"))
    client_income = _number(client.get("income"))
    client_gender = client.get("gender")
    client_edu = client.get("education")
    client_conditions = _conditions(client.get("other_conditions"))

    eligible_schemes = []
    for scheme, criteria in eligibility_data.items():
        # Age check
        age_min = _number(criteria.get("age_min"))
        age_max = _number(criteria.get("age_max"))
        if client_age is not None:
            if age_min is not None and client_age < age_min:
                continue
            if age_max is not None and client_age > age_max:
                continue

        # Gender check (case-insensitive equality)
        scheme_gender = criteria.get("gender")
        if scheme_gender and client_gender and client_gender.lower() != scheme_gender.lower():
            continue

        # Income check
        income_max = _number(criteria.get("income_max"))
        if income_max is not None and client_income is not None and client_income > income_max:
            continue

        # Education check: the scheme's requirement must appear inside the
        # client's education string (case-insensitive).
        scheme_edu = criteria.get("education")
        if scheme_edu and client_edu and scheme_edu.lower() not in client_edu.lower():
            continue

        # Other conditions: every condition the scheme lists must be declared
        # by the client.
        required = _conditions(criteria.get("other_conditions"))
        if any(cond not in client_conditions for cond in required):
            continue

        eligible_schemes.append(scheme)

    return eligible_schemes

# --------------------------
# STEP 7: RUN QUERY FOR CLIENT
# --------------------------
if __name__ == "__main__":
    # Sample applicant used to exercise the pipeline end to end.
    client_profile = dict(
        age=20,
        gender="female",
        income=200000,
        education="B.Tech",
        other_conditions=[],
    )

    # Rule-based filter first ...
    eligible_schemes = check_eligibility(client_profile, eligibility_data)
    print("‚úÖ Eligible schemes for client:", eligible_schemes)

    # ... then one retrieval + generation round per surviving scheme, using the
    # scheme name itself as both search text and question.
    for scheme in eligible_schemes:
        context = retrieve_context(scheme)
        answer = generate_with_llm(scheme, context, llm_choice="gemini")
        print(f"\n=== {scheme} ===", answer, sep="\n")
