In [16]:
# STEP 1 — Install dependencies
!pip install sentence-transformers faiss-cpu



# Building API for the server

In [17]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline

In [18]:
# Titles of the ten Acts / Rules / Regulations expected in the corpus file.
# They are matched against the loaded text as case-insensitive substrings, so
# the casing used here does not matter but spelling and punctuation do.
# NOTE(review): the recorded presence-check output reports the last entry
# (CENTRAL ELECTRICITY AUTHORITY REGULATIONS, 2023) as missing — confirm how
# that title is actually written in the source file.
law_names = [
    "THE MINES ACT, 1952",
    "THE EXPLOSIVES ACT, 1884",
    "THE COAL BEARING AREAS (ACQUISITION AND DEVELOPMENT) ACT, 1957",
    "THE MINES AND MINERALS (DEVELOPMENT AND REGULATION) ACT, 1957",
    "THE MINES RESCUE RULES, 1985",
    "THE METALLIFEROUS MINES REGULATIONS, 1961",
    "Mines Vocational Training Rules, 1966",
    "THE MINES RULES, 1955",
    "THE MINES CRECHE RULES, 1966",
    "CENTRAL ELECTRICITY AUTHORITY REGULATIONS, 2023"
]

In [19]:
# Load text
# Reads the whole corpus into memory as one UTF-8 decoded string.
# NOTE(review): the absolute path points at /content, presumably the Colab
# working directory — adjust it when running anywhere else.
file_path = "/content/Mining Laws.txt"
with open(file_path, "r", encoding="utf-8") as f:
    text_data = f.read()


In [20]:
# Report which of the expected law titles occur in the corpus.
# The comparison is a case-insensitive substring test. The corpus is
# lower-cased once up front instead of once per title, since it does not
# change between iterations.
print("\nChecking laws in file:")
text_data_lower = text_data.lower()
for law in law_names:
    if law.lower() in text_data_lower:
        print(f"✅ Found: {law}")
    else:
        print(f"❌ Missing: {law}")



Checking laws in file:
✅ Found: THE MINES ACT, 1952
✅ Found: THE EXPLOSIVES ACT, 1884
✅ Found: THE COAL BEARING AREAS (ACQUISITION AND DEVELOPMENT) ACT, 1957
✅ Found: THE MINES AND MINERALS (DEVELOPMENT AND REGULATION) ACT, 1957
✅ Found: THE MINES RESCUE RULES, 1985
✅ Found: THE METALLIFEROUS MINES REGULATIONS, 1961
✅ Found: Mines Vocational Training Rules, 1966
✅ Found: THE MINES RULES, 1955
✅ Found: THE MINES CRECHE RULES, 1966
❌ Missing: CENTRAL ELECTRICITY AUTHORITY REGULATIONS, 2023


In [21]:
# Splitting the text by law and chunk it
def split_by_laws(text, laws):
    """Split ``text`` into one section per law title found in it.

    Each title in ``laws`` is located case-insensitively at its first
    occurrence in ``text``. A section runs from that title up to the first
    occurrence of the next located title, or to the end of ``text`` for the
    last located title. Titles that do not occur in ``text`` are skipped and
    never act as a section boundary.

    Args:
        text: Full corpus text.
        laws: Iterable of law title strings to look for.

    Returns:
        dict mapping each located title (spelled as given in ``laws``) to its
        whitespace-stripped section text, inserted in order of appearance.
    """
    # Offsets are found in the lower-cased copy and applied to the original;
    # this relies on lower-casing not changing the string length (true for
    # ASCII text).
    lower_text = text.lower()

    # Locate every title exactly once and keep only the ones that are present.
    located = []
    for law in laws:
        position = lower_text.find(law.lower())
        if position != -1:
            located.append((position, law))

    # Sort on position only: the sort is stable, so titles that share an
    # offset keep their input order.
    located.sort(key=lambda item: item[0])

    sections = {}
    for i, (start, law) in enumerate(located):
        # Bound the section by the next *located* title. Using a missing
        # title here would yield -1 and silently drop the final character.
        end = located[i + 1][0] if i + 1 < len(located) else len(text)
        sections[law] = text[start:end].strip()
    return sections

# Build {law title: section text} for every title located in the corpus.
law_sections = split_by_laws(text_data, law_names)

In [22]:
def split_into_chunks_with_metadata(law_name, text, chunk_size=500):
    """Cut ``text`` into consecutive word windows tagged with their law.

    The text is split on whitespace and regrouped into non-overlapping runs
    of at most ``chunk_size`` words, each re-joined with single spaces.

    Args:
        law_name: Value stored under the ``"law"`` key of every chunk.
        text: Section text to split.
        chunk_size: Maximum number of words per chunk.

    Returns:
        List of ``{"law": law_name, "text": chunk}`` dicts in document order;
        empty when ``text`` contains no words.
    """
    tokens = text.split()
    window_starts = range(0, len(tokens), chunk_size)
    return [
        {"law": law_name, "text": " ".join(tokens[begin:begin + chunk_size])}
        for begin in window_starts
    ]

# Flatten all law sections into one list of chunk dicts, keeping the
# section order of law_sections.
chunks_with_meta = []
for law, section_text in law_sections.items():
    law_chunks = split_into_chunks_with_metadata(law, section_text)
    chunks_with_meta += law_chunks

print(f"\nTotal chunks created: {len(chunks_with_meta)}")



Total chunks created: 433


In [9]:
# ==== STEP 4 — Embeddings ====
# Encode the text of every chunk. The texts are passed in chunks_with_meta
# order, and later lookups index chunks_with_meta by search-result position.
# NOTE(review): chunks are built with up to 500 words each; the
# all-MiniLM-L6-v2 model card states that longer inputs are truncated
# (256 word pieces by default), so the tail of each chunk is presumably not
# reflected in its embedding — confirm and consider a smaller chunk size.
model = SentenceTransformer('all-MiniLM-L6-v2')
chunk_texts = [entry["text"] for entry in chunks_with_meta]
chunk_embeddings = model.encode(chunk_texts, convert_to_numpy=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
# Store in FAISS
# Build a flat L2 index sized to the embedding width (second axis of
# chunk_embeddings) and add every chunk vector to it. Search results from
# this index are later used as positions into chunks_with_meta.
dimension = chunk_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(chunk_embeddings)


In [11]:
# Summarization pipeline used to condense each retrieved chunk.
# `pipeline` is already imported in the imports cell near the top of the
# notebook; the repeat here is redundant but harmless.
from transformers import pipeline
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


In [12]:
# ==== STEP 6 — Search function with summarization ====
def search_law(query, top_k=3):
    """Return the chunks nearest to ``query``, each with a short summary.

    The query is embedded with the notebook-level ``model``, looked up in the
    notebook-level FAISS ``index``, and every hit is summarised with the
    notebook-level ``summarizer``.

    Args:
        query: Natural-language question or search string.
        top_k: Maximum number of chunks to return.

    Returns:
        List of dicts, in the order returned by the index search, with keys
        ``"law"`` (title the chunk belongs to), ``"distance"`` (distance
        reported by the index as a plain ``float``; smaller is closer),
        ``"text"`` (full chunk text) and ``"summary"`` (generated summary).
        May contain fewer than ``top_k`` items when the index holds fewer
        vectors.
    """
    query_embedding = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, top_k)

    results = []
    for idx, dist in zip(indices[0], distances[0]):
        # FAISS pads the result with -1 when fewer than top_k vectors are
        # available. Without this guard, -1 would silently pick the last chunk.
        if idx < 0:
            continue
        chunk = chunks_with_meta[idx]
        chunk_text = chunk["text"]
        # truncation=True keeps over-long chunks within the summarizer's
        # input limit instead of failing; shorter inputs are unaffected.
        summary = summarizer(
            chunk_text,
            max_length=60,
            min_length=15,
            do_sample=False,
            truncation=True,
        )[0]['summary_text']
        results.append({
            "law": chunk["law"],
            # Plain float rather than a numpy scalar so the result can be
            # JSON-serialised by an API layer.
            "distance": float(dist),
            "text": chunk_text,
            "summary": summary,
        })
    return results


In [13]:
# ==== STEP 7 — Test query ====
# Smoke test: run one question through search_law and print, for each hit,
# the law it came from, its generated summary and its distance.
user_query = "What safety measures should be provided to miners working underground?"
results = search_law(user_query, top_k=3)

print("\n🔍 Relevant Sections:\n")
for r in results:
    print(f"📜 Law: {r['law']}")
    print(f"Summary: {r['summary']}")
    # The value printed as "Match score" is the distance returned by the
    # search on an IndexFlatL2 index, so a smaller number is a closer match.
    print(f"(Match score: {r['distance']:.4f})\n")


🔍 Relevant Sections:

📜 Law: THE MINES RULES, 1955
Summary:  The height of every main drive shall b not less than 1.8 metres . The dimensions of pillars or blocks formed in any vein, load, reef or mineral bed or deposit shall be such as to ensure stability of the workings during the development and stoping stages .
(Match score: 0.8308)

📜 Law: THE MINES RULES, 1955
Summary:  The mining mate or other competent person accompanied by such assistants as may be required shall inspect every part of the mine or district assigned to him, in which persons have to work or pass during the shift . The inspection shall be made within two hours before the connection of work in a shift .
(Match score: 0.8428)

📜 Law: THE MINES RULES, 1955
Summary:  Every entrance from a roadway in a mine to a part of the mine which is neither being worked nor being used for any purpose, by reason of any cause whatsoever, shall be provided with a fence, barrier or gate so designed and constructed as to prevent any p

# API Setup

In [15]:
import pickle
import faiss

# Save FAISS index
faiss.write_index(index, "mining_laws.index")

# Save the chunk metadata list (chunks_with_meta). Search-result ids from the
# index are used as positions into this list, so the two files belong together.
# NOTE(review): pickle files should only ever be loaded from a trusted source.
with open("chunks.pkl", "wb") as f:
    pickle.dump(chunks_with_meta, f)

# Save embedding model name (so we reload the same model later)
# NOTE(review): this literal repeats the name passed to SentenceTransformer
# in the embeddings cell — keep the two in sync.
with open("model_name.txt", "w") as f:
    f.write("all-MiniLM-L6-v2")
