In [13]:
from datasets import load_dataset
import pandas as pd
import re
import string
from sentence_transformers import SentenceTransformer
import faiss
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

ds = load_dataset("mratanusarkar/Indian-Laws")

ds['train'].to_csv('train.csv')

df = pd.read_csv('train.csv')

sentences = df['act_title'].tolist()

sentences.extend(df['section'].tolist())

sentences.extend(df['law'].tolist())

sentences = [word for word in list(set(sentences)) if type(word) is str]

def preprocess_legal_text(text: str) -> str:
    """Clean and preprocess legal text"""
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)

    # Remove page numbers and headers/footers
    text = re.sub(r'\n\s*\d+\s*\n', '\n', text)

    # Clean up common legal document artifacts
    text = re.sub(r'_+', '', text)  # Remove underscores
    text = re.sub(r'-{2,}', '', text)  # Remove multiple dashes

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    return text.strip()

sentences = [preprocess_legal_text(word) for word in list(set(sentences)) if type(word) is str]
print('Print preprocessed sentences')

# initialize sentence transformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2',device=device)
# create sentence embeddings
sentence_embeddings = model.encode(sentences)
print(sentence_embeddings.shape)

dim = sentence_embeddings.shape[1]

# Build FAISS index
index = faiss.IndexFlatL2(dim)
index.add(sentence_embeddings)
print(index)
print('\n\nNow Try an query to be solved')



Using device: cuda


Creating CSV from Arrow format: 100%|██████████| 35/35 [00:02<00:00, 14.65ba/s]


Print preprocessed sentences
(35115, 384)
<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000001C4EF6EBD50> >


Now Try an query to be solved


In [17]:
k = 5
query_embedding = model.encode(["The aadhar act 1996"], convert_to_tensor=True)
query_embedding = query_embedding.cpu().detach().numpy()
# Search in FAISS
distances, indices = index.search(query_embedding, k)

# Option 1: retrieve from cleaned sentences
retrieved_chunks = [sentences[idx] for idx in indices[0]]

# Option 2: retrieve full dataframe row (comment out above if using this)
# retrieved_chunks = [df.iloc[idx].to_dict() for idx in indices[0]]

context = "\n".join(map(str, retrieved_chunks))
print("Retrieved Context:\n", context)

Retrieved Context:
 the aadhaar targeted delivery of financial and other subsidies benefits and services act   savings anything done or any action taken by the central government under the resolution of the government of india planning commission bearing notification number aadmin i dated the th january  or by the department of electronics and information technology under the cabinet secretariat notification bearing notification number so e dated the th september  as the case may be shall be deemed to have been validly done or taken under this act
the aadhaar targeted delivery of financial and other subsidies benefits and services act   act to apply for offence or contravention committed outside india  subject to the provisions of subsection  the provisions of this act shall apply also to any offence or contravention committed outside india by any person irrespective of his nationality  for the purposes of subsection  the provisions of this act shall apply to any offence or contraventi

In [20]:
import warnings
warnings.filterwarnings('ignore')
from transformers import pipeline

# Load a free HuggingFace text-generation / summarization pipeline
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")

def ask_legal_question(query, k=5):
    # 1. Encode query
    query_embedding = model.encode([query])

    # 2. Retrieve top-k results from FAISS
    distances, indices = index.search(query_embedding, k)
    retrieved_chunks = [sentences[idx] for idx in indices[0]]

    # 3. Build context
    context = "\n".join(retrieved_chunks)

    # 4. Build prompt for open-source model
    prompt = f"""
    You are a legal assistant.
    Use ONLY the following legal context to answer the query.
    If the answer cannot be found, reply: "Not found in the provided laws".

    Query: {query}
    Context:
    {context}
    Answer:
    """

    # 5. Generate response
    response = qa_pipeline(prompt, max_length=256, clean_up_tokenization_spaces=True)
    return response[0]["generated_text"]

# Example run
print(ask_legal_question("The aadhar act 1996"))


Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (541 > 512). Running this sequence through the model will result in indexing errors
Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Not found in the provided laws
