In [None]:

# 🛠️ Install required packages
!pip install google-generativeai langchain langchain-google-genai langchain-community faiss-cpu spacy pytesseract pdfplumber pillow
!python -m spacy download en_core_web_sm


In [None]:

import os
import re

import google.generativeai as genai
import pdfplumber
import pytesseract
import spacy
from PIL import Image

from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document

# Configure Gemini API.
# The key is read from the environment so that it is never stored in the
# notebook file (or its version history). Set GOOGLE_API_KEY before starting
# the kernel.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY environment variable is not set; "
        "export it before running this notebook."
    )
genai.configure(api_key=GOOGLE_API_KEY)

# spaCy English pipeline used by spacy_preprocess().
nlp = spacy.load("en_core_web_sm")

# Gemini model identifiers: chat/generation model and embedding model.
MODEL_NAME = "models/gemini-2.0-flash"
EMBEDDING_MODEL = "models/text-embedding-004"


In [None]:

def classify_query(query):
    """Classify a legal query as ``"general"`` or ``"analysis"`` using Gemini.

    Args:
        query: The user's legal question as free text.

    Returns:
        ``"general"`` or ``"analysis"`` when the model's reply contains exactly
        one of those labels (surrounding quotes, punctuation or extra words are
        tolerated). For any other reply, the stripped, lower-cased reply text
        is returned unchanged so the caller can decide how to handle it.
    """
    prompt = f"""
You are a legal assistant for Indian law.

Classify the following legal query as one of:
- "general": if it is a basic informational or knowledge-seeking question like:
    • What is IPC 302?
    • What is the punishment for theft?
    • Explain Article 21 of the Constitution.

- "analysis": if it contains a real-world or hypothetical situation that requires:
    • legal reasoning,
    • section applicability,
    • factual interpretation,
    • or identifying applicable acts/penal codes based on the scenario.

Examples of "analysis":
    • If a man slaps someone during an argument in public, what IPC section applies?
    • My landlord is not returning my deposit—what can I do?
    • What is the remedy if an employer doesn’t pay salary?

Reply ONLY with: general or analysis

Query: {query}
"""
    model = genai.GenerativeModel(model_name=MODEL_NAME)
    response = model.generate_content(prompt)
    raw_label = response.text.strip().lower()

    # The caller compares the result with == "general", so a reply such as
    # '"general"' or 'general.' must not fall through to the other branch.
    trimmed = raw_label.strip("\"'`*_.,:;!?()[] \t\r\n")
    if trimmed in ("general", "analysis"):
        return trimmed

    # Reply has extra words: accept it only if exactly one label is mentioned.
    mentioned = set(re.findall(r"\b(general|analysis)\b", raw_label))
    if len(mentioned) == 1:
        return mentioned.pop()

    # Ambiguous or unexpected reply: hand back the raw text, as before.
    return raw_label

def handle_general_query_with_gemini(query):
    """Send a general legal question to Gemini and return its answer text.

    Args:
        query: The user's question; it is embedded into a fixed prompt that
            asks for an answer in the Indian legal context.

    Returns:
        The model's response text with leading/trailing whitespace removed.
    """
    gemini = genai.GenerativeModel(model_name=MODEL_NAME)
    question_prompt = f"Answer the following legal question for India: {query}"
    answer = gemini.generate_content(question_prompt)
    return answer.text.strip()


In [None]:

def extract_text_from_image(image_path):
    """Run Tesseract OCR on an image file and return the recognised text.

    The Tesseract executable is chosen in this order:
      1. the ``TESSERACT_CMD`` environment variable, if set;
      2. the default Windows install location, if that file exists;
      3. otherwise pytesseract's own setting is left untouched.

    Args:
        image_path: Path to an image file readable by Pillow.

    Returns:
        The text returned by ``pytesseract.image_to_string``.
    """
    windows_default = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

    tesseract_cmd = os.environ.get("TESSERACT_CMD")
    if not tesseract_cmd and os.path.isfile(windows_default):
        tesseract_cmd = windows_default
    # Only override when a concrete executable is known; forcing the Windows
    # path unconditionally breaks OCR on every other platform.
    if tesseract_cmd:
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    # Context manager closes the underlying file handle once OCR is done.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF and join it with spaces.

    Args:
        pdf_path: Path to the PDF file, opened with ``pdfplumber``.

    Returns:
        A single string containing the text of all pages that yielded any
        text, separated by single spaces. Pages for which ``extract_text()``
        returns nothing are skipped; if no page yields text the result is
        an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Extract each page exactly once; the join runs inside the `with`
        # block so the lazy generator is consumed while the PDF is still open.
        page_texts = (page.extract_text() for page in pdf.pages)
        return " ".join(text for text in page_texts if text)

def basic_clean(text):
    """Normalise raw text before linguistic preprocessing.

    Steps, in order: lower-case the text, drop every character that is not
    a lowercase ASCII letter, digit, whitespace, period or comma, collapse
    each whitespace run into a single space, and trim both ends.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    allowed_only = re.sub(r'[^a-z0-9\s\.,]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', allowed_only)
    return single_spaced.strip()

def spacy_preprocess(text):
    """Lemmatise text with spaCy, dropping stop words and punctuation.

    Args:
        text: The string to process with the module-level ``nlp`` pipeline.

    Returns:
        The lemmas of the remaining tokens joined by single spaces.
    """
    lemmas = []
    for tok in nlp(text):
        # Skip tokens that carry no content for downstream embedding.
        if tok.is_stop or tok.is_punct:
            continue
        lemmas.append(tok.lemma_)
    return " ".join(lemmas)


In [None]:

def prepare_for_embedding(processed_text):
    """Chunk text, embed the chunks with Gemini and index them in FAISS.

    Args:
        processed_text: The preprocessed text to index.

    Returns:
        The FAISS vector store built from the chunk documents.

    Raises:
        RuntimeError: If the ``GOOGLE_API_KEY`` environment variable is not set.
        ValueError: If ``processed_text`` yields no chunks (e.g. it is empty).
    """
    # Read the key from the environment rather than embedding it in the file.
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "GOOGLE_API_KEY environment variable is not set; "
            "export it before embedding."
        )

    # Split on single spaces: the upstream cleaning collapses all whitespace,
    # so the splitter's default paragraph separator ("\n\n") never occurs and
    # the whole text would otherwise come back as one oversized chunk.
    splitter = CharacterTextSplitter(separator=" ", chunk_size=500, chunk_overlap=50)
    docs = [Document(page_content=chunk) for chunk in splitter.split_text(processed_text)]
    if not docs:
        # Fail with a clear message instead of passing an empty list to FAISS.
        raise ValueError("No text to embed: processed_text produced no chunks.")

    embeddings = GoogleGenerativeAIEmbeddings(
        model=EMBEDDING_MODEL,
        google_api_key=api_key,
        task_type="retrieval_document"
    )
    vectorstore = FAISS.from_documents(docs, embeddings)
    print("✅ Text embedded using Gemini and stored in FAISS.")
    return vectorstore


In [None]:

# Interactive driver: classify the query, then either answer it directly with
# Gemini or run the document pipeline (extract -> clean -> lemmatise -> embed).
user_query = input("Enter your legal query: ")
classification = classify_query(user_query)
print(f"🧾 Query classified as: {classification}")

if classification == "general":
    print("🔁 Routing to Gemini for general query...")
    response = handle_general_query_with_gemini(user_query)
    print(f"🧠 Gemini Response:\n{response}")

else:
    print("📄 Analysis query detected. Proceeding with OCR pipeline...")
    file_type = input("Is your file a PDF or image? (pdf/img): ").strip().lower()
    # Reject a bad file type before asking the user for anything else.
    if file_type not in ("pdf", "img"):
        raise ValueError("❌ Invalid file type.")

    # Paths pasted from a file manager are often wrapped in quotes.
    file_path = input("Enter full path to the file: ").strip().strip("\"'")

    if file_type == "pdf":
        raw_text = extract_text_from_pdf(file_path)
    else:
        raw_text = extract_text_from_image(file_path)

    cleaned_text = basic_clean(raw_text)
    processed_text = spacy_preprocess(cleaned_text)

    # Stop here with a clear message rather than failing inside the
    # embedding step when the file yielded no usable text.
    if not processed_text:
        raise ValueError("❌ No text could be extracted from the file.")

    print("\n✅ Preprocessed Text for NER/Embedding:")
    print(processed_text[:3000])

    # Keep the index so later cells can query it.
    # TODO(review): user_query is not yet run against the index.
    vectorstore = prepare_for_embedding(processed_text)
