"""
Mining Laws RAG System - Kaggle Notebook Version
================================================
Description: Question answering system for Indian mining laws using RAG
Dataset: https://www.kaggle.com/code/stacky123/embeddings-tokens

Instructions:
1. Add the embeddings dataset as input to your notebook
2. Install required packages (first cell)
3. Run all cells
4. Use interactive mode or modify sample questions
"""

In [1]:
# ============================================================================
# CELL 1: Install Required Packages
# ============================================================================
print(" Installing required packages...")
print("=" * 80)

# Uninstall conflicting versions first
!pip uninstall -y langchain langchain-community langchain-core -q

# Install compatible versions
!pip install -q langchain==0.1.20
!pip install -q langchain-community==0.0.38
!pip install -q langchain-core==0.1.52
!pip install -q chromadb==0.4.24
!pip install -q sentence-transformers==2.3.1
!pip install -q langchain-groq==0.1.3
!pip install -q groq

print(" Packages installed successfully!")
print("\n  Please RESTART the kernel after installation!")
print("   Kernel ‚Üí Restart & Run All\n")

 Installing required packages...
 Packages installed successfully!

  Please RESTART the kernel after installation!
   Kernel ‚Üí Restart & Run All



In [2]:
# ============================================================================
# CELL 2: Import Libraries
# ============================================================================
print("üìö Importing libraries...")

import os
import sys
import json
import warnings
from typing import List, Dict, Optional
from datetime import datetime

warnings.filterwarnings('ignore')

# LangChain imports
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.schema import Document

# LLM imports
from langchain_groq import ChatGroq

print("‚úÖ Libraries imported successfully!\n")

üìö Importing libraries...
‚úÖ Libraries imported successfully!



In [3]:
# ============================================================================
# CELL 3: Path Diagnostic and Configuration
# ============================================================================
import os
import glob

print("="*80)
print("üîç DIAGNOSING VECTOR STORE LOCATION")
print("="*80)

# Search for chroma.sqlite3 in the entire input directory
print("\n1Ô∏è‚É£ Searching for chroma.sqlite3 files...")
chroma_files = glob.glob("/kaggle/input/**/**/chroma.sqlite3", recursive=True)

if chroma_files:
    print(f"‚úÖ Found {len(chroma_files)} chroma.sqlite3 file(s):")
    for i, path in enumerate(chroma_files, 1):
        vector_store_path = os.path.dirname(path)
        print(f"\n   [{i}] {vector_store_path}")
        
        # Check what files are in this directory
        files = os.listdir(vector_store_path)
        print(f"       Files: {', '.join(files[:5])}")
        if len(files) > 5:
            print(f"       ... and {len(files)-5} more files")
    
    # Use the first one found
    DETECTED_PATH = os.path.dirname(chroma_files[0])
    print(f"\n‚úÖ Will use: {DETECTED_PATH}")
else:
    print("‚ùå No chroma.sqlite3 files found!")
    print("\n2Ô∏è‚É£ Let's check what's actually in /kaggle/input/...")
    
    # List the input directory
    if os.path.exists("/kaggle/input"):
        print("\nContents of /kaggle/input/:")
        for item in os.listdir("/kaggle/input"):
            print(f"   üìÅ {item}")
            
            # Check inside embeddingstokens
            if item == "embeddingstokens":
                emb_path = f"/kaggle/input/{item}"
                print(f"\n   Contents of {emb_path}:")
                for subitem in os.listdir(emb_path):
                    subpath = os.path.join(emb_path, subitem)
                    if os.path.isdir(subpath):
                        print(f"      üìÅ {subitem}/")
                        # Go one level deeper
                        try:
                            deeper = os.listdir(subpath)
                            for deep_item in deeper[:5]:
                                print(f"         - {deep_item}")
                            if len(deeper) > 5:
                                print(f"         ... and {len(deeper)-5} more items")
                        except:
                            pass
                    else:
                        print(f"      üìÑ {subitem}")
    
    DETECTED_PATH = None

print("\n" + "="*80)

# Now create the config class
class RAGConfig:
    """Configuration for RAG system"""
    
    def __init__(self, vectorstore_path=None):
        # Use detected path or find it automatically
        if vectorstore_path:
            self.VECTORSTORE_PATH = vectorstore_path
        else:
            # Try to find it automatically
            chroma_files = glob.glob("/kaggle/input/**/**/chroma.sqlite3", recursive=True)
            if chroma_files:
                base_path = os.path.dirname(chroma_files[0])
                
                # Check if we need to go into a UUID subdirectory
                items = os.listdir(base_path)
                
                # Look for directories that look like UUIDs (contain hyphens and are long)
                uuid_dirs = [d for d in items if os.path.isdir(os.path.join(base_path, d)) 
                            and '-' in d and len(d) > 30]
                
                if uuid_dirs:
                    # Check if the UUID directory has more vector store files
                    uuid_path = os.path.join(base_path, uuid_dirs[0])
                    uuid_files = os.listdir(uuid_path)
                    
                    # If UUID dir has data_level0.bin, it's the actual vector store
                    if 'data_level0.bin' in uuid_files:
                        self.VECTORSTORE_PATH = uuid_path
                        print(f"‚úÖ Using UUID subdirectory: {uuid_dirs[0]}\n")
                    else:
                        self.VECTORSTORE_PATH = base_path
                else:
                    self.VECTORSTORE_PATH = base_path
            else:
                raise FileNotFoundError(
                    "‚ùå Could not find vector store!\n"
                    "Please check the diagnostic output above."
                )
        
        # Embedding model (must match the one used to create embeddings)
        self.EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
        
        # LLM settings
        self.LLM_PROVIDER = "groq"  
        
        # Retrieval settings
        self.TOP_K_RESULTS = 5
        self.SEARCH_TYPE = "similarity"  
        
        # LLM configurations
        self.LLM_CONFIGS = {
            'groq': {
                'model': 'llama-3.1-8b-instant',
                'temperature': 0.3,
                'max_tokens': 1024
            },
            'openai': {
                'model': 'gpt-4-turbo-preview',
                'temperature': 0.3,
                'max_tokens': 1024
            }
        }
        
        # Output paths
        self.OUTPUT_DIR = "/kaggle/working"
        self.QUERIES_LOG = os.path.join(self.OUTPUT_DIR, "queries_log.json")
    
    def print_config(self):
        """Print configuration"""
        print("\n" + "="*80)
        print("üîß RAG SYSTEM CONFIGURATION")
        print("="*80)
        print(f"üìÅ Vector Store Path:")
        print(f"   {self.VECTORSTORE_PATH}")
        
        # Verify path exists
        if os.path.exists(self.VECTORSTORE_PATH):
            print(f"   ‚úÖ Path exists")
            
            # Count files
            files = os.listdir(self.VECTORSTORE_PATH)
            print(f"   ‚úÖ Contains {len(files)} files")
            
            # Check for key files
            required = ['chroma.sqlite3', 'data_level0.bin', 'header.bin']
            found = [f for f in required if os.path.exists(os.path.join(self.VECTORSTORE_PATH, f))]
            print(f"   ‚úÖ Found {len(found)}/{len(required)} required files")
        else:
            print(f"   ‚ùå Path does not exist!")
        
        print(f"\nü§ñ Embedding Model:    {self.EMBEDDING_MODEL}")
        print(f"üß† LLM Provider:       {self.LLM_PROVIDER}")
        print(f"üîç Top-K Results:      {self.TOP_K_RESULTS}")
        print(f"üéØ Search Type:        {self.SEARCH_TYPE}")
        
        if self.LLM_PROVIDER in self.LLM_CONFIGS:
            config = self.LLM_CONFIGS[self.LLM_PROVIDER]
            print(f"üí¨ LLM Model:          {config.get('model', 'N/A')}")
            print(f"üå°Ô∏è  Temperature:        {config.get('temperature', 'N/A')}")
        
        print("="*80 + "\n")


# Initialize configuration with auto-detected path
try:
    if DETECTED_PATH:
        config = RAGConfig(vectorstore_path=DETECTED_PATH)
        config.print_config()
    else:
        print("\n‚ùå Cannot initialize config - no vector store found!")
        print("Please check the diagnostic output above.\n")
except Exception as e:
    print(f"\n‚ùå Error initializing config: {e}\n")

üîç DIAGNOSING VECTOR STORE LOCATION

1Ô∏è‚É£ Searching for chroma.sqlite3 files...
‚úÖ Found 3 chroma.sqlite3 file(s):

   [1] /kaggle/input/embeddingstokens/mining_laws_vectorstore
       Files: chroma.sqlite3, 66b36f61-38a8-4106-8cce-42ce4d82bc07

   [2] /kaggle/input/embeddingstokens/mining_laws_vectorstore
       Files: chroma.sqlite3, 66b36f61-38a8-4106-8cce-42ce4d82bc07

   [3] /kaggle/input/embeddingstokens/mining_laws_vectorstore
       Files: chroma.sqlite3, 66b36f61-38a8-4106-8cce-42ce4d82bc07

‚úÖ Will use: /kaggle/input/embeddingstokens/mining_laws_vectorstore


üîß RAG SYSTEM CONFIGURATION
üìÅ Vector Store Path:
   /kaggle/input/embeddingstokens/mining_laws_vectorstore
   ‚úÖ Path exists
   ‚úÖ Contains 2 files
   ‚úÖ Found 1/3 required files

ü§ñ Embedding Model:    sentence-transformers/all-MiniLM-L6-v2
üß† LLM Provider:       groq
üîç Top-K Results:      5
üéØ Search Type:        similarity
üí¨ LLM Model:          llama-3.1-8b-instant
üå°Ô∏è  Temperature:     

In [4]:
# ============================================================================
# CELL 4: Set API Keys (IMPORTANT!)
# ============================================================================
print(" Setting up API keys...")
print("=" * 80)

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

try:
    # Get Groq API key from Kaggle Secrets
    GROQ_API_KEY = user_secrets.get_secret("GROQ_API_KEY")
    os.environ['GROQ_API_KEY'] = GROQ_API_KEY
    print(" Groq API key loaded from Kaggle Secrets")
except Exception as e:
    print(" Groq API key not found in Kaggle Secrets")
    print(f"   Error: {e}")
    print("\n Steps to add your API key:")
    print("1. Go to: https://console.groq.com/keys")
    print("2. Create/copy your API key")
    print("3. In Kaggle: Add-ons > Secrets > Add a new secret")
    print("4. Name it: GROQ_API_KEY")
    print("5. Toggle it ON (blue)")
    print("\n  Cannot proceed without API key!")
    raise

print()

 Setting up API keys...
 Groq API key loaded from Kaggle Secrets



In [12]:
# ============================================================================
# CELL 5: RAG System Class
# ============================================================================
class MiningLawsRAG:
    """Complete RAG system for mining laws"""
    
    def __init__(self, config: RAGConfig):
        self.config = config
        
        # Load embeddings model
        print(" Loading embedding model...")
        self.embeddings = HuggingFaceEmbeddings(
            model_name=config.EMBEDDING_MODEL,
            model_kwargs={'device': 'cpu'}
        )
        print(" Embedding model loaded\n")
        
        # Load vector store
        self.vectorstore = self._load_vectorstore()
        
        # Initialize LLM
        self.llm = self._initialize_llm()
        
        # Create RAG chain
        self.qa_chain = self._create_rag_chain()
        
        # Query history
        self.query_history = []
    
    def _load_vectorstore(self) -> Chroma:
        """Load the vector store"""
        print(f" Loading vector store from: {self.config.VECTORSTORE_PATH}")
        
        if not os.path.exists(self.config.VECTORSTORE_PATH):
            raise FileNotFoundError(
                f" Vector store not found at {self.config.VECTORSTORE_PATH}\n"
                "Please ensure the embeddings dataset is added as input!"
            )
        
        vectorstore = Chroma(
            persist_directory=self.config.VECTORSTORE_PATH,
            embedding_function=self.embeddings,
            collection_name="mining_laws"
        )
        
        # Test the vectorstore
        collection = vectorstore._collection
        count = collection.count()
        print(f" Vector store loaded successfully!")
        print(f" Total document chunks: {count}\n")
        
        return vectorstore
    
    def _initialize_llm(self):
        """Initialize LLM based on provider"""
        print(f" Initializing LLM: {self.config.LLM_PROVIDER}")
        
        provider = self.config.LLM_PROVIDER
        config_dict = self.config.LLM_CONFIGS.get(provider, {})
        
        if provider == 'groq':
            api_key = os.getenv('GROQ_API_KEY')
            if not api_key:
                raise ValueError(
                    " GROQ_API_KEY not found!\n"
                    "Please set it in Kaggle Secrets or in the API Keys cell."
                )
            
            llm = ChatGroq(
                model=config_dict['model'],
                temperature=config_dict['temperature'],
                max_tokens=config_dict['max_tokens'],
                groq_api_key=api_key
            )
        
        elif provider == 'openai':
            from langchain_openai import ChatOpenAI
            
            api_key = os.getenv('OPENAI_API_KEY')
            if not api_key:
                raise ValueError(
                    " OPENAI_API_KEY not found!\n"
                    "Please set it in Kaggle Secrets or in the API Keys cell."
                )
            
            llm = ChatOpenAI(
                model=config_dict['model'],
                temperature=config_dict['temperature'],
                max_tokens=config_dict['max_tokens'],
                openai_api_key=api_key
            )
        
        else:
            raise ValueError(f"Unsupported LLM provider: {provider}")
        
        print(f" LLM initialized: {config_dict['model']}\n")
        return llm
    
    def _create_rag_chain(self):
        """Create the RAG chain"""
        print(" Creating RAG chain...")
        
        # Custom prompt template
        template = """You are an expert AI assistant specialized in Indian mining laws and regulations.
        Your role is to provide accurate, comprehensive, and actionable legal analysis based ONLY on the mining legislation documents provided.
        
        Context from mining law documents:
        {context}
        
        Question: {question}
        
        CRITICAL INSTRUCTIONS:
        1. Answer ONLY based on the context provided above
        2. Provide a polished, professional analysis with clear structure
        3. Cite specific acts, sections, regulations, or circulars with exact names and numbers
        4. If the answer is not in the context, clearly state: "I don't have sufficient information about this in the available documents."
        5. Use proper legal terminology while keeping language clear and accessible
        
        RESPONSE STRUCTURE:
        
        For Safety Violation/Incident Analysis:
        
        **INCIDENT OVERVIEW**
        Provide a brief 2-3 line summary of the situation and its severity.
        
        **LEGAL VIOLATIONS IDENTIFIED**
        List each violation clearly with:
        - Specific regulation/section violated
        - What was required by law
        - What actually occurred (the breach)
        - Risk/consequence of the violation
        
        Format: [Regulation] Rule/Section [Number] - [Title/Subject]
        Required: [What the law mandates]
        Violation: [What was not done/wrongly done]
        Risk: [Safety/legal consequences]
        
        **APPLICABLE LEGAL FRAMEWORK**
        Organize by hierarchy:
        1. Primary Legislation (e.g., Mines Act 1952)
           ‚Ä¢ Relevant sections with brief descriptions
           
        2. Subordinate Regulations (e.g., MMR 1961, CMR 2017)
           ‚Ä¢ Relevant rules with specific requirements
           
        3. DGMS Circulars/Technical Guidelines (if mentioned)
           ‚Ä¢ Relevant circulars with procedural mandates
        
        **ROOT CAUSE ANALYSIS**
        Identify systemic failures:
        - Procedural gaps (what processes failed)
        - Supervisory lapses (who failed to ensure compliance)
        - Communication breakdowns (where information flow failed)
        - Training/competency issues (if applicable)
        
        **LEGAL CONSEQUENCES & LIABILITIES**
        - Employer/Management Responsibilities under the law
        - Specific penalty provisions (fines, imprisonment terms if available)
        - Potential charges under different sections
        - Liability of specific roles (Manager, Supervisor, Blaster, etc.)
        
        **CORRECTIVE ACTIONS REQUIRED**
        Provide specific, actionable recommendations:
        
        Immediate Actions:
        - [Specific action 1 with timeline]
        - [Specific action 2 with timeline]
        
        Systemic Improvements:
        - [Process improvement 1]
        - [Training/awareness initiative 2]
        - [Safety system enhancement 3]
        
        Compliance Verification:
        - [Audit/inspection requirement]
        - [Documentation to be maintained]
        - [Reporting obligations to DGMS]
        
        **PREVENTIVE MEASURES**
        Suggest forward-looking safety enhancements:
        - Standard Operating Procedures (SOPs) to implement
        - Safety equipment/systems to install
        - Training programs to conduct
        - Monitoring mechanisms to establish
        
        ---
        
        For General Legal Queries (Non-Violation):
        
        **DIRECT ANSWER**
        Provide clear, direct response to the question in 2-4 sentences.
        
        **LEGAL BASIS**
        - [Act/Regulation] - [Section Number]: [Specific provision]
        - [Supporting regulation/rule with explanation]
        
        **DETAILED EXPLANATION**
        - Elaborate on the legal requirements
        - Explain the rationale behind the provision
        - Clarify any technical terms or procedures
        
        **COMPLIANCE REQUIREMENTS**
        - What must be done (mandatory requirements)
        - Who is responsible
        - Timelines/frequencies if applicable
        - Documentation needed
        
        **PRACTICAL GUIDANCE**
        - Step-by-step compliance approach
        - Common pitfalls to avoid
        - Best practices from the regulations
        
        **RELATED PROVISIONS** (if applicable)
        - Cross-references to connected regulations
        - Additional considerations
        
        ---
        
        FORMATTING GUIDELINES:
        - Use **bold** for main headings
        - Use bullet points (‚Ä¢) for clarity
        - Keep paragraphs concise (3-4 lines max)
        - Distinguish between:
          ‚úì Mandatory requirements ("shall", "must") 
          ‚úì Recommended practices ("should", "may")
          ‚úì Prohibitions ("shall not", "must not")
        - Use tables for comparing multiple requirements (if needed)
        - Highlight critical safety points with ‚ö†Ô∏è WARNING where severe risks exist
        
        TONE:
        - Professional and authoritative
        - Clear and actionable
        - Balanced (cite law, explain risk, suggest solutions)
        - Avoid legal jargon where simpler terms suffice
        - Be empathetic to safety concerns while being legally precise
        
        If information is insufficient, acknowledge limitations clearly and suggest what additional documents/clarifications would help.
        
        Detailed Answer:"""
        
        prompt = PromptTemplate(
            template=template,
            input_variables=["context", "question"]
        )
        
        # Create retriever
        retriever = self.vectorstore.as_retriever(
            search_type=self.config.SEARCH_TYPE,
            search_kwargs={"k": self.config.TOP_K_RESULTS}
        )
        
        # Create QA chain
        qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True,
            chain_type_kwargs={"prompt": prompt}
        )
        
        print(" RAG chain created\n")
        return qa_chain
    
    def query(self, question: str, show_sources: bool = True) -> Dict:
        """Query the RAG system"""
        
        print("\n" + "="*80)
        print(f" QUESTION: {question}")
        print("="*80 + "\n")
        print(" Searching documents and generating answer...\n")
        
        try:
            # Get answer from RAG chain
            result = self.qa_chain({"query": question})
            
            answer = result['result']
            source_docs = result['source_documents']
            
            # Format answer
            print(" ANSWER:")
            print("-" * 80)
            print(answer)
            print("-" * 80 + "\n")
            
            # Show sources
            if show_sources:
                self._display_sources(source_docs)
            
            # Create result object
            query_result = {
                'question': question,
                'answer': answer,
                'sources': [
                    {
                        'source': doc.metadata.get('source', 'Unknown'),
                        'doc_type': doc.metadata.get('doc_type', 'Unknown'),
                        'subject': doc.metadata.get('subject', 'Unknown'),
                        'year': doc.metadata.get('year', 'Unknown'),
                        'chunk_id': doc.metadata.get('chunk_id', 'Unknown')
                    }
                    for doc in source_docs
                ],
                'timestamp': datetime.now().isoformat(),
                'success': True
            }
            
            # Save to history
            self.query_history.append(query_result)
            
            return query_result
            
        except Exception as e:
            error_msg = f"Error occurred: {str(e)}"
            print(f"\n {error_msg}\n")
            
            query_result = {
                'question': question,
                'answer': error_msg,
                'sources': [],
                'timestamp': datetime.now().isoformat(),
                'success': False,
                'error': str(e)
            }
            
            return query_result
    
    def _display_sources(self, source_docs: List[Document]):
        """Display source documents"""
        print(" SOURCES:")
        print("="*80)
        
        seen_sources = set()
        for idx, doc in enumerate(source_docs, 1):
            source_key = f"{doc.metadata.get('source', 'Unknown')}-{doc.metadata.get('chunk_id', idx)}"
            
            if source_key not in seen_sources:
                print(f"\n[{idx}] {doc.metadata.get('source', 'Unknown')}")
                print(f"    Type: {doc.metadata.get('doc_type', 'Unknown')}")
                print(f"    Subject: {doc.metadata.get('subject', 'Unknown')}")
                print(f"    Year: {doc.metadata.get('year', 'Unknown')}")
                print(f"    Chunk: {doc.metadata.get('chunk_id', '?')}/{doc.metadata.get('chunk_total', '?')}")
                print(f"    Preview: {doc.page_content[:150]}...")
                seen_sources.add(source_key)
        
        print("\n" + "="*80 + "\n")
    
    def get_statistics(self) -> Dict:
        """Get query statistics"""
        if not self.query_history:
            return {"message": "No queries yet"}
        
        successful = sum(1 for q in self.query_history if q['success'])
        failed = len(self.query_history) - successful
        
        return {
            'total_queries': len(self.query_history),
            'successful': successful,
            'failed': failed,
            'success_rate': f"{(successful/len(self.query_history)*100):.1f}%"
        }
    
    def save_query_log(self):
        """Save all queries to JSON file"""
        if not self.query_history:
            print("  No queries to save")
            return
        
        log_path = os.path.join(self.config.OUTPUT_DIR, "queries_log.json")
        with open(log_path, 'w') as f:
            json.dump(self.query_history, f, indent=2)
        
        print(f" Query log saved to: {log_path}")

In [13]:
# ============================================================================
# CELL 6: Initialize RAG System
# ============================================================================
print("\n" + "="*80)
print("  MINING LAWS RAG SYSTEM - INITIALIZING")
print("="*80)
print(f" Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

try:
    rag_system = MiningLawsRAG(config)
    print(" RAG system initialized successfully!\n")
except Exception as e:
    print(f" Failed to initialize RAG system: {e}")
    raise


  MINING LAWS RAG SYSTEM - INITIALIZING
 Started: 2025-11-14 19:13:23

 Loading embedding model...


ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


 Embedding model loaded

 Loading vector store from: /kaggle/input/embeddingstokens/mining_laws_vectorstore
 Vector store loaded successfully!
 Total document chunks: 11109

 Initializing LLM: groq
 LLM initialized: llama-3.1-8b-instant

 Creating RAG chain...
 RAG chain created

 RAG system initialized successfully!



In [14]:
# ============================================================================
# CELL 7: Sample Questions (Modify or add your own!)
# ============================================================================
print("\n" + "="*80)
print(" RUNNING SAMPLE QUESTIONS")
print("="*80 + "\n")

# Modify these questions or add your own
sample_questions = [
    "Analyze this mining safety incident: At an open-cast mine, blasting was conducted at 4:30 PM. Workers in the drilling zone were not notified through siren/horn signals. One contract worker was still within 100 meters of the blasting zone. What are the legal violations and applicable regulations?",
    
    "Evaluate this workplace safety issue: A supervisor instructs an operator to work on a bench that has become 85¬∞ steep due to rain collapse. No geotechnical inspection has been done. What laws and rules are being violated?",
    
    "Assess this employment compliance matter: A 16-year-old contract helper is assisting a drill operator in a deep opencast mine. Identify the legal violations and relevant sections of mining laws.",
    
    "Review this environmental compliance case: A mine is producing 1.5 MTPA limestone but expansion clearance (EC) hasn't been renewed for the last 1 year. What are the legal implications and violations?",
    
    "Examine this health and safety scenario: A mine employs 240 workers per shift but runs only one first-aid box. No first-aid room exists. What are the regulatory violations?",
    
    "Analyze this environmental violation: A mine temporarily diverts a natural seasonal stream to develop a haul road without forest/water authority approval. What laws are being breached?",
    
    "Evaluate this storage compliance issue: A mine stores 6,000 litres of diesel in barrels near workshop without PESO license. Identify the legal violations and applicable acts.",
    
    "Assess this underground mining safety incident: A miner enters an unsupported area while cleaning spillage in an underground room-and-pillar mine. What are the safety regulation violations?",
    
    "Review this occupational health matter: Operators using DTH drill machines are not provided ear protection even though noise level is 106 dB. What are the legal violations?",
    
    "Examine this dumping safety violation: Material is being dumped over the edge without maintaining the 10m safety barrier from dump edge. What rules and regulations are being violated?"
]


# Run all sample questions
results = []
for question in sample_questions:
    result = rag_system.query(question, show_sources=True)
    results.append(result)
    print("\n" + "="*80 + "\n")


 RUNNING SAMPLE QUESTIONS


 QUESTION: Analyze this mining safety incident: At an open-cast mine, blasting was conducted at 4:30 PM. Workers in the drilling zone were not notified through siren/horn signals. One contract worker was still within 100 meters of the blasting zone. What are the legal violations and applicable regulations?

 Searching documents and generating answer...

 ANSWER:
--------------------------------------------------------------------------------
**INCIDENT OVERVIEW**

At an open-cast mine, blasting was conducted at 4:30 PM without proper notification to workers in the drilling zone through siren/horn signals. A contract worker remained within 100 meters of the blasting zone, putting their life at risk.

**LEGAL VIOLATIONS IDENTIFIED**

1. **Regulation 160(1) of the Metalliferous Mines Regulations, 1961** - **Notification of Blasting Operations**
   Required: The charging of holes must be carried out by or under the personal supervision of a competent person/bla

In [15]:
# ============================================================================
# CELL 8: Statistics & Save Results
# ============================================================================
print("\n" + "="*80)
print(" SESSION STATISTICS")
print("="*80)

stats = rag_system.get_statistics()
for key, value in stats.items():
    print(f"{key}: {value}")

print("="*80 + "\n")

# Save query log
rag_system.save_query_log()

print(f" Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")


 SESSION STATISTICS
total_queries: 10
successful: 10
failed: 0
success_rate: 100.0%

 Query log saved to: /kaggle/working/queries_log.json
 Completed: 2025-11-14 19:16:53



In [16]:
# ============================================================================
# CELL 9: Interactive Query (Optional)
# ============================================================================



# Ask your own question
my_question = "What are the penalties for violating mining safety regulations?"

result = rag_system.query(my_question, show_sources=True)

# Access the answer
print("\nYour Answer:")
print(result['answer'])


 QUESTION: What are the penalties for violating mining safety regulations?

 Searching documents and generating answer...

 ANSWER:
--------------------------------------------------------------------------------
**INCIDENT OVERVIEW**

A mining accident occurred due to non-compliance with safety regulations, resulting in potential harm to mine workers. The incident highlights the importance of adhering to safety protocols to prevent accidents.

**LEGAL VIOLATIONS IDENTIFIED**

‚Ä¢ **Mines Act 1952** - **Section 21**: The mine management failed to ensure that all operations in the mine were done in accordance with the provisions of the Mines Act and Regulations, Rules, bye-laws and orders made there under.
  Required: Compliance with safety regulations
  Violation: Failure to ensure compliance
  Risk: Potential harm to mine workers

‚Ä¢ **Metalliferous Mines Regulations, 1961** - **Regulation 116**: The mine management failed to exercise personal supervision to ensure that all operatio

In [17]:
# ============================================================================
# CELL 10: Batch Query Function (Optional)
# ============================================================================


def batch_query(questions_list):
    """
    Query multiple questions at once
    
    Usage:
        my_questions = [
            "Question 1?",
            "Question 2?",
            "Question 3?"
        ]
        batch_results = batch_query(my_questions)
    """
    print(f"\n Running batch queries ({len(questions_list)} questions)...\n")
    
    batch_results = []
    for i, question in enumerate(questions_list, 1):
        print(f"\n{'='*80}")
        print(f"Query {i}/{len(questions_list)}")
        print(f"{'='*80}")
        
        result = rag_system.query(question, show_sources=True)
        batch_results.append(result)
    
    return batch_results

# Example usage :

custom_questions = [
    "What are the environmental regulations for mining?",
    "What are the requirements for mine closure?",
    "What are the provisions for compensation in case of mining accidents?"
]

batch_results = batch_query(custom_questions)


 Running batch queries (3 questions)...


Query 1/3

 QUESTION: What are the environmental regulations for mining?

 Searching documents and generating answer...

 ANSWER:
--------------------------------------------------------------------------------
**ENVIRONMENTAL REGULATIONS FOR MINING**

**DIRECT ANSWER**

The environmental regulations for mining in India are governed by various laws and regulations, including the Mines Act, 1952, and the Coal Mines (Conservation and Development) Rules, 1975. These regulations focus on environmental management, mine closure planning, rehabilitation and resettlement, and health and safety standards.

**LEGAL BASIS**

* The Mines Act, 1952 - Section 5: Requires mine owners to take measures for the prevention of accidents and the protection of the health and safety of workers.
* The Coal Mines (Conservation and Development) Rules, 1975 - Rule 12(1)(ii)(d): Regulates the construction of dams, and Rule 12(1)(ii)(e): Regulates artificial barriers.
* T

In [18]:
# ============================================================================
# END OF NOTEBOOK
# ============================================================================
print("\n" + "="*80)
print(" NOTEBOOK COMPLETED SUCCESSFULLY!")
print("="*80)
print("\n Next Steps:")
print("   1. Review the answers above")
print("   2. Modify sample questions in Cell 7")
print("   3. Use Cell 9 for single custom questions")
print("   4. Use Cell 10 for batch questions")
print("   5. Check /kaggle/working/queries_log.json for saved results")
print("\n" + "="*80 + "\n")


 NOTEBOOK COMPLETED SUCCESSFULLY!

 Next Steps:
   1. Review the answers above
   2. Modify sample questions in Cell 7
   3. Use Cell 9 for single custom questions
   4. Use Cell 10 for batch questions
   5. Check /kaggle/working/queries_log.json for saved results


