In [1]:
!pip install faiss-cpu pymupdf sentence-transformers tiktoken langchain




[notice] A new release of pip is available: 24.2 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import os
import fitz  # PyMuPDF for PDF reading
import faiss
import numpy as np
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Path to folder containing judgments. The JUDGMENT_FOLDER environment variable
# overrides the default, so the notebook is not tied to one machine's layout;
# when it is unset the original location is used unchanged.
JUDGMENT_FOLDER = os.environ.get("JUDGMENT_FOLDER", r"C:\projects\legal_bot\judgements")

# Load embedding model (free Hugging Face model)
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        The text of each page (``page.get_text("text")``), with pages joined
        by a newline character.
    """
    # Open the document as a context manager so it is closed even when text
    # extraction raises; previously the document was never closed.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)

# Load all PDFs from the folder.
# - sorted(): os.listdir returns names in arbitrary order; sorting keeps the
#   chunk order (and so the saved index) reproducible between runs.
# - lower(): also accept files whose extension is written ".PDF".
pdf_files = [
    os.path.join(JUDGMENT_FOLDER, f)
    for f in sorted(os.listdir(JUDGMENT_FOLDER))
    if f.lower().endswith(".pdf")
]
documents = [extract_text_from_pdf(pdf) for pdf in pdf_files]

# Split text into chunks for better embedding
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = [chunk for doc in documents for chunk in text_splitter.split_text(doc)]

# Fail early with a clear message: an empty corpus cannot produce a usable index.
if not chunks:
    raise ValueError(f"No extractable text found in PDFs under {JUDGMENT_FOLDER!r}")

# Generate embeddings and store in FAISS
vector_store = FAISS.from_texts(chunks, embedding_model)

# Save FAISS index
vector_store.save_local("faiss_index")

# Load FAISS index for querying.
# NOTE: allow_dangerous_deserialization=True makes load_local unpickle the
# stored docstore. That is acceptable here only because the index was written
# by this notebook just above; never point this at an index from an untrusted source.
vector_store = FAISS.load_local("faiss_index", embedding_model, allow_dangerous_deserialization=True)

# Function to perform a query
def query_rag(query, k=3):
    """Retrieve the text of the chunks most similar to a query.

    Args:
        query: Query string passed to ``vector_store.similarity_search``.
        k: Number of chunks to retrieve. Defaults to 3, the value that was
            previously hard-coded, so existing calls behave the same.

    Returns:
        A list with the ``page_content`` of each retrieved document, in the
        order returned by the vector store.
    """
    docs = vector_store.similarity_search(query, k=k)
    return [doc.page_content for doc in docs]

def _print_chunks(hits):
    """Print each retrieved chunk under a 1-based "Result N" heading."""
    for i, res in enumerate(hits):
        print(f"\nResult {i+1}:\n{res}")


# Sample question used to exercise the retriever
query = "What was the judgment on corporate tax evasion?"
results = query_rag(query)

# Show what came back
_print_chunks(results)



Result 1:
Revenue. 
26. The appellant Trust, aggrieved by the judgment delivered by the single 
judge, filed an appeal in D.B. Special Appeal Writ No. 20/2012. The division 
bench of the High Court in its order dated 01.12.2014 observed that the 
appellant Trust had only reiterated the submissions made before the single 
judge. The said submissions were considered at length by the single judge 
and therefore, did not warrant any interference from the division bench.

Result 2:
Page 5 of 57 
 
12. In the meantime, the Registrar, Board of Revenue, Ajmer sent a letter dated 
22.03.1980 to the appellant Trust apprising them of the ongoing litigation 
before the Revenue Appellate Authority with respect to the land bearing 
Survey no. 229 and instructed the appellant Trust not to deposit the 
compensation amount till the final decision of the appeal.  
13. According to the appellant Trust, the memo of handing over of the

Result 3:
instead fall within the amplitude of Entry 11-A, List III. 

In [31]:
import os
import fitz  # PyMuPDF for PDF reading
import faiss
import numpy as np
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# Path to folder containing judgments. The JUDGMENT_FOLDER environment variable
# takes precedence when set, so the notebook can run on machines with a
# different layout; otherwise the original location is used unchanged.
JUDGMENT_FOLDER = os.environ.get("JUDGMENT_FOLDER", r"C:\projects\legal_bot\judgements")


# Load embedding model (free Hugging Face model)
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of a PDF, page by page.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A single string made of every page's ``get_text("text")`` output,
        with consecutive pages separated by a newline character.
    """
    # The with-block closes the document on exit, including on error, so the
    # handle is not left open once the text has been read.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]
    return "\n".join(page_texts)

# Load all PDFs from the folder.
# - sorted(): os.listdir returns names in arbitrary order; sorting keeps the
#   chunk order (and so the saved index) reproducible between runs.
# - lower(): also accept files whose extension is written ".PDF".
pdf_files = [
    os.path.join(JUDGMENT_FOLDER, f)
    for f in sorted(os.listdir(JUDGMENT_FOLDER))
    if f.lower().endswith(".pdf")
]

# The splitter configuration is the same for every file, so build it once
# instead of on each loop iteration.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

# Store document chunks with metadata (filename)
documents = []
for pdf_path in pdf_files:
    text = extract_text_from_pdf(pdf_path)
    file_name = os.path.basename(pdf_path)  # Extract only filename

    # Split text into chunks
    chunks = text_splitter.split_text(text)

    # Store chunks as LangChain Document objects; the "source" metadata key is
    # what the query helper reads back to report the originating file.
    for chunk in chunks:
        documents.append(Document(page_content=chunk, metadata={"source": file_name}))

# Fail early with a clear message: an empty corpus cannot produce a usable index.
if not documents:
    raise ValueError(f"No extractable text found in PDFs under {JUDGMENT_FOLDER!r}")

# Generate embeddings and store in FAISS
vector_store = FAISS.from_documents(documents, embedding_model)

# Save FAISS index
vector_store.save_local("faiss_index")

# Load FAISS index for querying.
# NOTE: allow_dangerous_deserialization=True makes load_local unpickle the
# stored docstore. That is acceptable here only because the index was written
# by this notebook just above; never point this at an index from an untrusted source.
vector_store = FAISS.load_local("faiss_index", embedding_model, allow_dangerous_deserialization=True)

# Function to perform a query and return text with source PDF file name
def query_rag(query, k=3):
    """Retrieve the chunks most similar to a query, with their source file.

    Args:
        query: Query string passed to ``vector_store.similarity_search``.
        k: Number of chunks to retrieve. Defaults to 3, the value that was
            previously hard-coded, so existing calls behave the same.

    Returns:
        A list of ``(page_content, source)`` tuples, one per retrieved
        document, in the order returned by the vector store. ``source`` is
        read from each document's ``metadata["source"]``.

    Raises:
        KeyError: If a retrieved document has no ``"source"`` metadata entry.
    """
    docs = vector_store.similarity_search(query, k=k)
    return [(doc.page_content, doc.metadata["source"]) for doc in docs]

def _print_chunks_with_source(hits):
    """Print each (text, source) pair under a 1-based heading naming the file."""
    for i, (text, source) in enumerate(hits):
        print(f"\nResult {i+1} (From: {source}):\n{text}")


# Sample question used to exercise the retriever
query = "What was the judgment on corporate tax evasion?"
results = query_rag(query)

# Show what came back, together with the originating PDF
_print_chunks_with_source(results)



Result 1 (From: 3369_2010_15_1502_58012_Judgement_13-Dec-2024.pdf):
Revenue. 
26. The appellant Trust, aggrieved by the judgment delivered by the single 
judge, filed an appeal in D.B. Special Appeal Writ No. 20/2012. The division 
bench of the High Court in its order dated 01.12.2014 observed that the 
appellant Trust had only reiterated the submissions made before the single 
judge. The said submissions were considered at length by the single judge 
and therefore, did not warrant any interference from the division bench.

Result 2 (From: 3369_2010_15_1502_58012_Judgement_13-Dec-2024.pdf):
Page 5 of 57 
 
12. In the meantime, the Registrar, Board of Revenue, Ajmer sent a letter dated 
22.03.1980 to the appellant Trust apprising them of the ongoing litigation 
before the Revenue Appellate Authority with respect to the land bearing 
Survey no. 229 and instructed the appellant Trust not to deposit the 
compensation amount till the final decision of the appeal.  
13. According to the app