In [1]:
from datasets import load_dataset
import pandas as pd
import re
import string
from sentence_transformers import SentenceTransformer
import faiss
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

ds = load_dataset("mratanusarkar/Indian-Laws")

ds['train'].to_csv('train.csv')

df = pd.read_csv('train.csv')

sentences = df['act_title'].tolist()

sentences.extend(df['section'].tolist())

sentences.extend(df['law'].tolist())

sentences = [word for word in list(set(sentences)) if type(word) is str]

def preprocess_legal_text(text: str) -> str:
    """Clean and preprocess legal text"""
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)

    # Remove page numbers and headers/footers
    text = re.sub(r'\n\s*\d+\s*\n', '\n', text)

    # Clean up common legal document artifacts
    text = re.sub(r'_+', '', text)  # Remove underscores
    text = re.sub(r'-{2,}', '', text)  # Remove multiple dashes

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    return text.strip()

sentences = [preprocess_legal_text(word) for word in list(set(sentences)) if type(word) is str]
print('Print preprocessed sentences')

# initialize sentence transformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2',device=device)
# create sentence embeddings
sentence_embeddings = model.encode(sentences,show_progress_bar=True,batch_size=64,convert_to_numpy=True,)
print(sentence_embeddings.shape)

dim = sentence_embeddings.shape[1]

# Build FAISS index
index = faiss.IndexFlatL2(dim)
index.add(sentence_embeddings)
print(index)
print('\n\nNow Try an query to be solved')

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


Creating CSV from Arrow format: 100%|██████████| 35/35 [00:01<00:00, 26.96ba/s]


Print preprocessed sentences


Batches: 100%|██████████| 549/549 [01:40<00:00,  5.48it/s]


(35115, 384)
<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000002778045D470> >


Now Try an query to be solved


In [2]:
k = 5
query_embedding = model.encode(["The aadhar act 1996"], convert_to_tensor=True)
query_embedding = query_embedding.cpu().detach().numpy()
# Search in FAISS
distances, indices = index.search(query_embedding, k)

# Option 1: retrieve from cleaned sentences
retrieved_chunks = [sentences[idx] for idx in indices[0]]

# Option 2: retrieve full dataframe row (comment out above if using this)
# retrieved_chunks = [df.iloc[idx].to_dict() for idx in indices[0]]

context = "\n".join(map(str, retrieved_chunks))
print("Retrieved Context:\n", context)

Retrieved Context:
 the aadhaar targeted delivery of financial and other subsidies benefits and services act   savings anything done or any action taken by the central government under the resolution of the government of india planning commission bearing notification number aadmin i dated the th january  or by the department of electronics and information technology under the cabinet secretariat notification bearing notification number so e dated the th september  as the case may be shall be deemed to have been validly done or taken under this act
the aadhaar targeted delivery of financial and other subsidies benefits and services act   act to apply for offence or contravention committed outside india  subject to the provisions of subsection  the provisions of this act shall apply also to any offence or contravention committed outside india by any person irrespective of his nationality  for the purposes of subsection  the provisions of this act shall apply to any offence or contraventi

In [4]:
from pdf_to_text import extract_text_from_pdf
pdf_path = "D:\College\Rent Agreement.pdf"
text = extract_text_from_pdf(pdf_path)

PDF has 6 pages.


  pdf_path = "D:\College\Rent Agreement.pdf"


In [6]:
from summarize import summarize_rent_agreement
summary = summarize_rent_agreement(text, file_name="Agreement.pdf")
summary

{'file_info': {'name': 'Agreement.pdf', 'type': 'General Agreement/Contract'},
 'date_of_document': 'Not Found',
 'parties': {'licensor': 'Not Found', 'tenant': 'Not Found'},
 'premises': 'Not Found',
 'deposit_amount': 'Not Found',
 'monthly_rent': 'Not Found',
 'duration': 'Not Found',
 'key_terms': [],
 'risk_assessment': {'summary': '⚠️ Potentially risky clauses identified.',
  'recommendation': '✅ **Recommendation:** These keywords indicate clauses that may have significant implications. You **should** thoroughly review these sections with legal counsel. You **should not** sign or agree to the document without fully understanding the impact of these clauses on your rights and obligations.'},
 'identified_risks': [{'keyword': 'termination',
   'context': 'to the Licensor as mentioned earlier.  12) Possession: That the immediately at on the expiration or termination or cancellation of this agreement the Licensees shall vacate the said premises without delay with a'}]}

In [2]:
import warnings
warnings.filterwarnings('ignore')
from transformers import pipeline

from google import genai

client = genai.Client(api_key="AIzaSyDuVsg-vyNkf6CrjgliUDDU2dXjvKSEfbg")
# Load a free HuggingFace text-generation / summarization pipeline
# qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")

def ask_legal_question(query, k=5):
    # 1. Encode query
    query_embedding = model.encode([query])

    # 2. Retrieve top-k results from FAISS
    distances, indices = index.search(query_embedding, k)
    retrieved_chunks = [sentences[idx] for idx in indices[0]]

    # 3. Build context
    context = "\n".join(retrieved_chunks)

    # 4. Build prompt for open-source model
    prompt = f"""
    You are a legal assistant.
    Use ONLY the following legal context to answer the query.
    If the answer cannot be found, reply: "Not found in the provided laws".

    Query: {query}
    Context:
    {context}
    Answer:
    """
    print("Retrieved Context:\n", context)
    # 5. Generate response
    # response = qa_pipeline(prompt, max_length=256, clean_up_tokenization_spaces=True)
    response = client.models.generate_content(model="gemini-2.5-flash", contents=prompt)
    # return response[0]["generated_text"]
    return response.text

# Example run
print('From the context above, answer the query:\n')
print(ask_legal_question("Abducted Persons"))


From the context above, answer the query:

Retrieved Context:
 abducted persons recovery and restoration continuance act
the abducted persons recovery and restoration continuance act   repeal of ordinance  of   the abducted persons recovery and restoration continuance ordinance   of  is hereby repealed
power to compel restoration of abducted females upon complaint made on oath of the abduction or unlawful detention of a woman or a female child under the age of eighteen years for any unlawful purpose a district magistrate subdivisional magistrate or magistrate of the first class may make an order for the immediate restoration of such woman to her liberty or of such female child to her husband parent guardian or other person having the lawful charge of such child and may compel compliance with such order using such force as may be necessary
the abducted persons recovery and restoration continuance act   amendment of section i in subsection  of section  of the abducted persons recovery an

In [25]:
df['act_title'].unique()

array(['Aadhaar (Targeted Delivery of Financial and other Subsidies, Benefits and Services) Act, 2016',
       'Abducted Persons (Recovery and Restoration) Continuance Act, 1955',
       'Absorbed Areas (Laws) Act, 1954', ...,
       'Workmens Compensation Act, 1923', 'Works of Defence Act, 1903',
       'Young Persons (Harmful Publications) Act, 1956'],
      shape=(1021,), dtype=object)

In [1]:
import torch
torch.cuda.get_device_capability()


(8, 6)