In [None]:
# python-backend/app.py
import os
import re
import tempfile
import json
from flask import Flask, request, jsonify
from flask_cors import CORS
from dotenv import load_dotenv

# Langchain and related imports
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_core.documents import Document

# For DOCX handling
# python-docx is treated as optional: when the import fails, DocxDocument is
# set to None so the module still loads, and the upload route checks for None
# before attempting to read a .docx file.
try:
    from docx import Document as DocxDocument
except ImportError:
    print("python-docx not found. Please install it using: pip install python-docx")
    DocxDocument = None

# Load environment variables
# load_dotenv() runs before the lookup below so a key defined in a .env file
# is visible through os.environ.
load_dotenv()
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# Fail at import time when the key is missing instead of on the first model call.
if not GOOGLE_API_KEY:
    raise ValueError("GOOGLE_API_KEY must be set in your .env file")

app = Flask(__name__)
# CORS is enabled on the whole app; no options are passed here.
CORS(app)

# Global variables
# Shared module state: `llm` and `gemini_embedder` are assigned by
# initialize_rag_components(); `vectorstore` and `qa_chain` stay None until
# the /upload-document route has processed a file.
vectorstore = None
llm = None
gemini_embedder = None
qa_chain = None

def initialize_rag_components():
    """Create the Gemini chat model and the Gemini embedding model.

    Both instances are stored in the module-level ``llm`` and
    ``gemini_embedder`` globals; nothing is returned.
    """
    global llm, gemini_embedder

    chat_settings = {"model": "gemini-1.5-flash", "temperature": 0.7}
    embedding_settings = {"model": "models/embedding-001"}

    print("Initializing RAG Components...")
    llm = ChatGoogleGenerativeAI(**chat_settings)
    gemini_embedder = GoogleGenerativeAIEmbeddings(**embedding_settings)
    print("RAG Components Initialized.")


# Build the shared models once, when the module is loaded.
initialize_rag_components()

# Prompt handed to the RetrievalQA chain built in upload_document().
# It has two template variables, {question} and {context}; the doubled braces
# ({{ and }}) around the JSON example are escapes so they render as literal
# braces. The model is instructed to answer with a single JSON object holding
# "decision", "amount" and "justification.clauses" (each with "text" and
# "location"); ask_query() parses that object out of the raw model output.
PROMPT_TEMPLATE = PromptTemplate(
    template="""
    You are an exceptionally precise and rigorous insurance policy assistant. Your core function is to act as a highly reliable RAG (Retrieval-Augmented Generation) system, providing definitive and fully justified decisions based solely on the provided "Policy Clauses" and "User Query."
    Absolute Principle: Your final "decision" (approved/rejected) and every part of your "justification" MUST be perfectly aligned and directly, verifiably supported by the text within the "Policy Clauses." Do NOT invent information or make assumptions beyond what is explicitly stated. If a claim is rejected, the justification MUST clearly explain why it was rejected, referencing the specific clauses that lead to the rejection.
    Your Step-by-Step Reasoning Process (Internal Thought Process - Do NOT output this):
    Deconstruct the User Query:
    Identify Core Intent: What is the user fundamentally asking for (e.g., claim approval, payout amount for a specific event)?
    Extract Key Parameters: Parse and structure all relevant details from the "User Query." Prioritize:
    Insured's Age: (e.g., "65 years old")
    Claim/Procedure Type: (e.g., "medical check-up," "car accident," "life insurance claim")
    Specific Event/Condition: (e.g., "broken arm," "stolen vehicle," "diagnosis of cancer")
    Location of Event: (e.g., "within the USA," "abroad")
    Policy Timelines: (e.g., "policy active for 3 years," "event occurred last month")
    Financial Details: (e.g., "deductible paid," "premium status")
    Handle Ambiguity/Missing Info: If any critical parameter is vague, incomplete, or absent, explicitly note this internally. Consider what information is missing that would be required by a typical policy clause for approval.
    Precise Clause Retrieval & Semantic Matching:
    Deep Reading: Thoroughly read and semantically understand every "Policy Clause." Focus on conditions, exclusions, definitions, and payout schedules. Avoid superficial keyword matching.
    Relevance Mapping: Identify all and only the clauses that are directly and unequivocally relevant to the parsed details from the user's query. This requires careful inference based on meaning, not just words.
    Cross-Reference: Compare the extracted query parameters against the conditions and rules in the relevant clauses.
    Rigorous Decision Logic & Calculation:
    Decision Determination: Based strictly on the cross-referenced information, determine if the claim is "APPROVED" or "REJECTED."
    If all conditions for approval are met and no exclusions apply, the decision MUST be "approved."
    Amount Calculation: If the decision is "approved," calculate the exact "amount" based only on the payout rules and schedules specified in the relevant clauses. If the amount is not explicitly stated or cannot be calculated from the provided clauses, set it to 0 and explain why in the justification. If the decision is "rejected", the amount should be 0.
    Comprehensive Justification & Clause Mapping (MUST Align with Final Decision):
    Construct Explanation: Formulate a clear and concise explanation that directly supports the determined "decision" (approved or rejected) and "amount."
    Verbatim Clause Quotation: For every point in your explanation, identify the exact, verbatim text of the policy clause(s) that directly support it.
    Precise Location: Provide the precise "location" for each quoted clause (e.g., "Page X, Clause Y", "Section Z, Paragraph A"). If the location is not provided in the input, state "Location not specified in context."
    Address Missing Information / Rejection Reasons (Crucially):
    If the decision is "rejected," your justification MUST explicitly state the reason for rejection. This includes:
    Clearly stating what specific information was missing from the "User Query" that prevented approval.
    Explaining why that information was critical by referencing the specific clause(s) that require it.
    Referencing any applicable exclusion clauses that led to the rejection.
    Do NOT make up information. You CAN interpret the meaning and implications from the document, but all facts must originate from the provided clauses.
    Output Format (Strictly ONLY this JSON object - No other text, no conversational remarks):

    {{
      "decision": "approved/rejected",
      "amount": 100000,
      "justification": {{
        "clauses": [
          {{
            "text": "...",
            "location": "Page X, Clause Y"
          }}
        ]
      }}
    }}

    User Query:
    {question}

    Policy Clauses:
    {context}

    Final Review (Internal - Do NOT output): Before generating the JSON, mentally cross-check:
    Is the decision field consistent with the justification provided?
    Is the amount correctly calculated or justified as 0?
    Are all quoted clauses verbatim and correctly attributed with location?
    If rejected, is the reason for rejection clearly explained in the justification, with clause references?
    Is the output only the JSON object?
    """,
    input_variables=["question", "context"]
)

# Post-generation audit: second-pass review of 'rejected' decisions.

def verify_and_correct_decision(initial_result: dict, auditor_llm=None) -> dict:
    """
    Audit a 'rejected' decision and flip it to 'approved' only when a second
    model explicitly classifies the rejection as an error.

    The audit is best-effort: any exception raised while auditing is logged
    and the original result is returned unchanged.

    NOTE(review): the auditor only sees the quoted clause text, not the user
    query, and an override changes "decision" only ("amount" and the
    justification are left as produced by the first pass). Confirm this is
    the intended product behaviour.

    Args:
        initial_result: The dictionary parsed from the initial RAG JSON output.
        auditor_llm: Optional chat model used for the audit. It must expose
            ``invoke(prompt)`` returning an object with a string ``content``.
            When omitted, a dedicated Gemini model is created for the call.

    Returns:
        ``initial_result`` itself when no correction is made, otherwise a
        shallow copy whose "decision" is "approved".
    """
    print("Executing sophisticated audit and correction step...")
    # We only intervene if the decision was 'rejected'.
    if initial_result.get("decision") != "rejected":
        return initial_result

    try:
        clauses = initial_result.get("justification", {}).get("clauses", [])
        # If there's no justification at all, trust the rejection (likely for an irrelevant query)
        if not clauses:
            print("No justification clauses found. Assuming rejection is correct for irrelevant query.")
            return initial_result

        justification_text = "\n".join(clause.get("text") or "" for clause in clauses)

        if auditor_llm is None:
            # Separate instance for the audit. Temperature 0 keeps the one-word
            # classification as repeatable as the model allows; a high
            # temperature makes the approve/reject override random.
            auditor_llm = ChatGoogleGenerativeAI(
                model="gemini-1.5-flash",
                temperature=0
            )

        # A much more sophisticated prompt to handle both error types.
        audit_prompt = f"""
        You are an insurance claim auditor. Your task is to review a 'rejected' claim decision based on its justification.

        Here is the justification provided for the rejection, which quotes clauses from a policy document:
        --- JUSTIFICATION TEXT ---
        {justification_text}
        ---

        Analyze this justification and determine if the 'rejected' decision was an error. Consider two scenarios:

        1.  **Error**: The justification text is weak, irrelevant, or contradicts a 'rejected' decision. For example, it only lists positive-sounding clauses of coverage, or the clauses have nothing to do with a typical insurance claim. This would make the rejection an 'error'.

        2.  **Correct**: The justification text clearly explains the rejection (e.g., by citing an exclusion) OR the quoted text shows that the user's request (which you cannot see) is simply not a topic covered by the policy at all. This would make the rejection 'correct'.

        Based on the justification text, which scenario is more likely?
        Answer with only the single word 'error' or 'correct'.
        """

        print("Invoking sophisticated auditor model...")
        audit_response = auditor_llm.invoke(audit_prompt)
        audit_answer = audit_response.content.strip().lower()
        print(f"Auditor model classified the rejection as: '{audit_answer}'")

        # The verdict is the first word of the reply. A substring test would
        # misread answers such as "correct, no error" as an error verdict.
        words = re.findall(r"[a-z]+", audit_answer)
        verdict = words[0] if words else ""

        # Only override the decision if the auditor explicitly classifies the rejection as an 'error'.
        if verdict == "error":
            print("CORRECTION: The auditor found the rejection to be an error. Overriding decision to 'approved'.")
            return {**initial_result, "decision": "approved"}

        print("Audit confirms the 'rejected' decision is correct. No changes made.")
        return initial_result

    except Exception as e:
        # Best-effort step: never let an audit failure break the response.
        print(f"An error occurred during the audit step: {e}")
        return initial_result

def _load_pages(path, extension, source_name):
    """Load the saved upload at *path* into a list of LangChain documents."""
    if extension == 'pdf':
        return PyPDFLoader(path).load()
    if extension == 'txt':
        return TextLoader(path).load()
    # The only other extension the route accepts is 'docx'.
    if DocxDocument is None:
        raise ImportError("python-docx is not installed. Cannot process .docx files.")
    docx_file = DocxDocument(path)
    full_text = "\n".join(para.text for para in docx_file.paragraphs)
    return [Document(page_content=full_text, metadata={"source": source_name, "file_type": "docx"})]


def _embed_location_metadata(chunks, source_name):
    """Prefix every chunk with a header naming the uploaded file and, if known, the page."""
    for chunk in chunks:
        # The loaders record the temporary path as the source; replace it with
        # the uploaded file's name so the model never sees a server-side path.
        chunk.metadata["source"] = source_name
        location_str = f"Source: {source_name}"
        page = chunk.metadata.get('page')
        if page is not None:
            # PyPDFLoader is 0-indexed, so we add 1 for human-readable page numbers.
            location_str += f", Page: {page + 1}"
        chunk.page_content = f"--- METADATA ---\nLocation: {location_str}\n--- CONTENT ---\n{chunk.page_content}"


@app.route('/upload-document', methods=['POST'])
def upload_document():
    """Index an uploaded PDF, TXT or DOCX file and (re)build the QA chain.

    Expects a multipart form upload with a ``file`` part. The file is written
    to a temporary path, loaded, split into chunks, tagged with a location
    header, embedded into a FAISS index, and wired into a RetrievalQA chain.
    On success the module-level ``vectorstore`` and ``qa_chain`` are replaced.

    Returns:
        A JSON body with a ``message`` key and one of these statuses:
        200 on success; 400 for a missing file, unsupported extension or a
        document with no extractable text; 500 for any processing failure.
    """
    global vectorstore, qa_chain
    print("Received upload request for Python backend.")
    if 'file' not in request.files:
        return jsonify({"message": "No file part"}), 400
    file = request.files['file']
    if not file.filename:
        return jsonify({"message": "No selected file"}), 400

    # A name without a dot has no extension; "pdf" alone must not count as a PDF.
    _, dot, suffix = file.filename.rpartition('.')
    file_extension = suffix.lower() if dot else ''
    if file_extension not in ('pdf', 'txt', 'docx'):
        return jsonify({"message": "Unsupported file type. Please upload PDF, TXT, or DOCX."}), 400

    # Client-supplied names may carry directories with either separator.
    source_name = os.path.basename(file.filename.replace('\\', '/'))

    temp_file_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file_extension}") as temp_file:
            # Record the path before writing so the finally block can clean up
            # even when saving fails.
            temp_file_path = temp_file.name
            file.save(temp_file)
        print(f"File saved to temporary path: {temp_file_path}")

        pages = _load_pages(temp_file_path, file_extension, source_name)
        print(f"Loaded {len(pages)} base content objects from document.")

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=100,
            length_function=len,
            separators=["\n\n", "\n", " "]
        )
        docs = text_splitter.split_documents(pages)
        print(f"Split document into {len(docs)} chunks.")
        if not docs:
            # Nothing to embed (e.g. an empty file or a PDF without a text layer).
            return jsonify({"message": "No extractable text found in the uploaded document."}), 400

        _embed_location_metadata(docs, source_name)
        print("Embedded location metadata into document chunks.")

        # Build everything first and publish the globals together, so a failure
        # midway leaves the previously loaded pipeline intact.
        new_vectorstore = FAISS.from_documents(docs, embedding=gemini_embedder)
        print("FAISS vectorstore created.")
        retriever = new_vectorstore.as_retriever(search_kwargs={"k": 5})
        new_chain = RetrievalQA.from_chain_type(
            llm=llm,
            retriever=retriever,
            return_source_documents=True,
            chain_type_kwargs={"prompt": PROMPT_TEMPLATE}
        )
        vectorstore, qa_chain = new_vectorstore, new_chain
        print("RAG QA Chain initialized.")
        return jsonify({"message": f"File '{file.filename}' processed successfully. RAG pipeline initialized."}), 200
    except ImportError as ie:
        print(f"Error: {ie}")
        return jsonify({"message": f"Server error: {str(ie)}. Please ensure all required libraries are installed."}), 500
    except Exception as e:
        print(f"Error processing file: {e}")
        return jsonify({"message": f"Error processing file: {str(e)}"}), 500
    finally:
        if temp_file_path and os.path.exists(temp_file_path):
            os.remove(temp_file_path)
            print(f"Temporary file removed: {temp_file_path}")

@app.route('/ask-query', methods=['POST'])
def ask_query():
    """Answer a claim query against the currently loaded document.

    Expects a JSON object body with a non-empty string ``query``. The query is
    run through the RetrievalQA chain, the JSON object in the model output is
    extracted and parsed, and the parsed decision is passed through
    verify_and_correct_decision() before being returned.

    Returns:
        200 with the decision object; 400 when no document is loaded or no
        usable query was sent; 500 when the model output is not a JSON object
        or the chain raises.
    """
    print("Received query request for Python backend.")
    # Take one snapshot so a concurrent upload cannot swap the chain mid-request.
    chain = qa_chain
    if not chain:
        return jsonify({"message": "No document uploaded or RAG pipeline not initialized."}), 400

    # silent=True yields None for a missing or malformed JSON body instead of
    # raising; the body may also be valid JSON that is not an object.
    data = request.get_json(silent=True)
    query = data.get('query') if isinstance(data, dict) else None
    if not isinstance(query, str) or not query.strip():
        return jsonify({"message": "No query provided"}), 400

    try:
        print(f"Processing query: '{query}'")
        result = chain.invoke({"query": query})
        raw_output_string = result.get('result', '')

        # Models often wrap the JSON in prose or code fences; keep only the
        # span from the first '{' to the last '}'.
        json_match = re.search(r'\{.*\}', raw_output_string, re.DOTALL)
        if json_match:
            cleaned_json_string = json_match.group(0)
            print("Successfully extracted JSON block from raw LLM output.")
        else:
            print("WARNING: Could not find a JSON block in the LLM output.")
            cleaned_json_string = raw_output_string

        try:
            parsed_output = json.loads(cleaned_json_string)
        except json.JSONDecodeError:
            parsed_output = None

        # Anything other than a JSON object (including a parse failure) cannot
        # be audited or returned as a decision.
        if not isinstance(parsed_output, dict):
            print("ERROR: Failed to parse even the cleaned string as JSON.")
            error_response = {
                "error": "Failed to parse LLM output as JSON",
                "raw_output": raw_output_string
            }
            return jsonify(error_response), 500

        print("LLM output parsed successfully.")
        final_output = verify_and_correct_decision(parsed_output)
        return jsonify(final_output), 200
    except Exception as e:
        print(f"An unexpected error occurred in the RAG chain: {e}")
        return jsonify({"message": f"Error processing query: {str(e)}"}), 500

if __name__ == '__main__':
    # The server listens on all interfaces, so the interactive Werkzeug
    # debugger must not be on by default: it allows code execution from the
    # browser. Set FLASK_DEBUG=1 to enable it for local development.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    # use_reloader=False keeps the app in a single process.
    app.run(host='0.0.0.0', port=5001, debug=debug_enabled, use_reloader=False)

Initializing RAG Components...
RAG Components Initialized.
 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5001
 * Running on http://10.203.165.181:5001
Press CTRL+C to quit


Received upload request for Python backend.
File saved to temporary path: C:\Users\adars\AppData\Local\Temp\tmp28wkmjv5.pdf
Loaded 49 base content objects from document.
Split document into 491 chunks.
Embedded location metadata into document chunks.


127.0.0.1 - - [29/Jul/2025 19:30:16] "POST /upload-document HTTP/1.1" 200 -


FAISS vectorstore created.
RAG QA Chain initialized.
Temporary file removed: C:\Users\adars\AppData\Local\Temp\tmp28wkmjv5.pdf
Received query request for Python backend.
Processing query: '46-year-old male, playing ludo in Pune, 3-month-old insurance policy'
Successfully extracted JSON block from raw LLM output.
LLM output parsed successfully.
Executing sophisticated audit and correction step...
Invoking sophisticated auditor model...


127.0.0.1 - - [29/Jul/2025 19:31:37] "POST /ask-query HTTP/1.1" 200 -


Auditor model classified the rejection as: 'error'
CORRECTION: The auditor found the rejection to be an error. Overriding decision to 'approved'.


In [1]:
!pip freeze > requirements.txt