# 📘 Insurance Document Smart Extractor with RAG + FastAPI UI

In [2]:
# 📦 Step 1: Install Required Libraries
!pip install pytesseract pdfplumber opencv-python pillow fastapi uvicorn python-multipart transformers sentence-transformers haystack[all] --quiet

DEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063


In [10]:
# Installs the `farm-haystack` distribution with every optional extra.
# NOTE(review): the recorded output below shows the faiss-cpu wheel being built from source
# and failing; presumably no prebuilt wheel exists for this Python/OS combination — confirm
# the interpreter version before re-running.
!pip install farm-haystack[all]

Defaulting to user installation because normal site-packages is not writeable
Collecting farm-haystack[all]
  Obtaining dependency information for farm-haystack[all] from https://files.pythonhosted.org/packages/83/40/c707a34de669f9715701ada14b6165726f480c6673e32774b83b5696ca3f/farm_haystack-1.26.4.post0-py3-none-any.whl.metadata
  Downloading farm_haystack-1.26.4.post0-py3-none-any.whl.metadata (28 kB)
Collecting boilerpy3 (from farm-haystack[all])
  Obtaining dependency information for boilerpy3 from https://files.pythonhosted.org/packages/d9/b1/e376edbdc1f1755fdb6cb1f6173b2a7afa8a6d766f7d10e34e7db0c18510/boilerpy3-1.0.7-py3-none-any.whl.metadata
  Downloading boilerpy3-1.0.7-py3-none-any.whl.metadata (5.8 kB)
Collecting events (from farm-haystack[all])
  Obtaining dependency information for events from https://files.pythonhosted.org/packages/25/ed/e47dec0626edd468c84c04d97769e7ab4ea6457b7f54dcb3f72b17fcd876/Events-0.5-py3-none-any.whl.metadata
  Downloading Events-0.5-py3-none-any.wh

  error: subprocess-exited-with-error
  
  Building wheel for faiss-cpu (pyproject.toml) did not run successfully.
  exit code: 1
  
  [21 lines of output]
  !!
  
          ********************************************************************************
          Please consider removing the following classifiers in favor of a SPDX license expression:
  
          License :: OSI Approved :: MIT License
  
          See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license for details.
          ********************************************************************************
  
  !!
    self._finalize_license_expression()
  running bdist_wheel
  running build
  running build_py
  running build_ext
  building 'faiss._swigfaiss' extension
  swigging faiss\faiss\python\swigfaiss.i to faiss\faiss\python\swigfaiss_wrap.cpp
  swig.exe -python -c++ -Doverride= -I/usr/local/include -Ifaiss -doxygen -DSWIGWIN -o faiss\faiss\python\swigfaiss_wrap.cpp faiss\faiss\python\sw

In [3]:
# 📚 Step 2: Import Libraries
import os
import re
import tempfile

import pdfplumber
import pytesseract
import uvicorn
from fastapi import FastAPI, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import DensePassageRetriever, FARMReader
from haystack.pipelines import ExtractiveQAPipeline
from PIL import Image
from sentence_transformers import SentenceTransformer
from transformers import pipeline

ModuleNotFoundError: No module named 'haystack.document_stores'

## 🖼️ Step 3: Image to Text

In [None]:
def extract_text_from_image(image_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image to read.

    Returns:
        The string produced by ``pytesseract.image_to_string`` for the image.
    """
    # Use the image as a context manager so its file handle is released as
    # soon as OCR finishes, rather than whenever the object is garbage-collected.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img)

## 📄 Step 4: PDF to Text

In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF, one newline-terminated chunk per page.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The concatenated page texts, each followed by ``"\\n"``. A page with no
        extractable text contributes only its trailing newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can yield None for a page without a text layer
        # (e.g. a scanned page); treat that as empty instead of raising
        # TypeError on `None + "\n"`.
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)

## 🔍 Step 5: Regex Field Extractor

In [None]:
def extract_fields(text):
    """Pull known insurance fields out of free text with regular expressions.

    Args:
        text: Text to search (OCR or PDF output). ``None`` is treated as empty.

    Returns:
        A dict with the keys 'Policy Number', 'Insured Name', 'Effective Date'
        and 'VIN'. Each value is the first match for that field with
        surrounding whitespace removed, or ``None`` when nothing matched.
    """
    patterns = {
        'Policy Number': r'Policy\s*No\.?\s*[:\-]?\s*(\S+)',
        # Accept an optional "Name" label, and allow only spaces/tabs inside
        # the captured name so it cannot run on into the following line.
        'Insured Name': r'Insured(?:\s+Name)?\s*[:\-]?\s*([A-Z][a-zA-Z \t]+)',
        'Effective Date': r'Effective\s+Date\s*[:\-]?\s*(\d{2,4}[\/\-]\d{1,2}[\/\-]\d{2,4})',
        'VIN': r'VIN\s*[:\-]?\s*(\w{8,17})',
    }
    text = text or ""
    fields = {}
    for name, pattern in patterns.items():
        # Search once per field and reuse the match object.
        match = re.search(pattern, text)
        fields[name] = match.group(1).strip() if match else None
    return fields

## 🧠 Step 6: RAG Model Setup

In [None]:
# You should run this only once to initialize and embed
def setup_rag(ocr_text):
    """Build an extractive question-answering pipeline over ``ocr_text``.

    Every non-blank line of ``ocr_text`` is written as one document to a new
    FAISS document store ("Flat" index), the store is embedded with the DPR
    encoders, and the retriever is combined with a RoBERTa SQuAD2 reader.

    Args:
        ocr_text: Text to index; it is split on ``"\\n"``.

    Returns:
        The ``ExtractiveQAPipeline`` wired to the reader and retriever.
    """
    store = FAISSDocumentStore(faiss_index_factory_str="Flat")

    # One document per line that contains something other than whitespace.
    passages = []
    for line in ocr_text.split("\n"):
        if line.strip():
            passages.append({"content": line})
    store.write_documents(passages)

    dpr = DensePassageRetriever(
        document_store=store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    )
    # Embeddings are computed after the documents are written.
    store.update_embeddings(dpr)

    return ExtractiveQAPipeline(
        FARMReader(model_name_or_path="deepset/roberta-base-squad2"),
        dpr,
    )

## ❓ Step 7: Ask RAG

In [None]:
def ask_question(pipe, query, retriever_top_k=5, reader_top_k=1):
    """Run ``query`` through the QA pipeline and return the best answer text.

    Args:
        pipe: Pipeline object exposing ``run(query=..., params=...)``.
        query: The question to ask.
        retriever_top_k: ``top_k`` passed to the "Retriever" node (default 5).
        reader_top_k: ``top_k`` passed to the "Reader" node (default 1).

    Returns:
        The ``answer`` attribute of the first returned answer, or ``None``
        when the pipeline returned no answers.
    """
    result = pipe.run(
        query=query,
        params={
            "Retriever": {"top_k": retriever_top_k},
            "Reader": {"top_k": reader_top_k},
        },
    )
    answers = result['answers']
    if not answers:
        return None
    return answers[0].answer

## 🚀 Step 8: FastAPI Upload + Convert API

In [None]:
# Application instance that the route decorators attach to.
app = FastAPI()

# CORS: any origin may call the API with any method and headers, but without
# credentials. Wildcard origins must not be combined with
# allow_credentials=True — that would let any website make credentialed
# (cookie / Authorization) cross-origin requests. To support credentials,
# list the trusted origins explicitly instead of "*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.post("/upload")
async def upload(file: UploadFile = File(...)):
    """Accept an uploaded document, extract its text and return the parsed fields.

    Files whose extension is jpg/jpeg/png go through OCR; every other upload
    is handled as a PDF. The upload is spooled to a uniquely named temporary
    file that is always removed afterwards.

    Args:
        file: The multipart upload.

    Returns:
        A ``JSONResponse`` whose body is the dict produced by ``extract_fields``.
    """
    image_extensions = ("jpg", "jpeg", "png")

    # The filename is client-controlled (and may be missing): use it only to
    # choose the extraction route, never as part of a filesystem path.
    ext = (file.filename or "").rsplit(".", 1)[-1].lower()
    is_image = ext in image_extensions
    suffix = f".{ext}" if is_image else ".pdf"

    contents = await file.read()

    # A unique temp file avoids collisions between concurrent requests.
    # delete=False because the extractors reopen the file by path after it is
    # closed; removal is done explicitly in the `finally` below.
    tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    temp_path = tmp.name
    try:
        with tmp:
            tmp.write(contents)
        # NOTE: extraction is synchronous, so it blocks the event loop while it runs.
        if is_image:
            text = extract_text_from_image(temp_path)
        else:
            text = extract_text_from_pdf(temp_path)
        fields = extract_fields(text)
    finally:
        # Clean up even when extraction raises.
        os.remove(temp_path)
    return JSONResponse(content=fields)

## 🧪 Step 9: Sample Call to RAG

In [None]:
# Demo of the RAG pipeline. It runs only when a local sample PDF is present, so
# a fresh "Restart & Run All" does not fail on machines without the file.
SAMPLE_PDF = "sample.pdf"
if os.path.isfile(SAMPLE_PDF):
    text = extract_text_from_pdf(SAMPLE_PDF)
    pipe = setup_rag(text)
    print(ask_question(pipe, "What is the policy number?"))
else:
    print(f"Skipping RAG demo: {SAMPLE_PDF} not found")