"""
Mining Laws RAG System - Kaggle Notebook Version
================================================
Description: Question answering system for Indian mining laws using RAG
Dataset: https://www.kaggle.com/code/stacky123/embeddings-tokens

Instructions:
1. Add the embeddings dataset as input to your notebook
2. Install required packages (first cell)
3. Run all cells
4. Use interactive mode or modify sample questions
"""

In [None]:
# ============================================================================
# CELL 1: Install Required Packages
# ============================================================================
print(" Installing required packages...")
print("=" * 80)

# Uninstall conflicting versions first
!pip uninstall -y langchain langchain-community langchain-core -q

# Install compatible versions
!pip install -q langchain==0.1.20
!pip install -q langchain-community==0.0.38
!pip install -q langchain-core==0.1.52
!pip install -q chromadb==0.4.24
!pip install -q sentence-transformers==2.3.1
!pip install -q langchain-groq==0.1.3
!pip install -q groq

print(" Packages installed successfully!")
print("\n  Please RESTART the kernel after installation!")
print("   Kernel â†’ Restart & Run All\n")

 Installing required packages...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [2]:
# ============================================================================
# CELL 2: Import Libraries
# ============================================================================
print("📚 Importing libraries...")

import os
import sys
import json
import warnings
from typing import List, Dict, Optional
from datetime import datetime

warnings.filterwarnings('ignore')

# Shell escapes such as `!pip` fork the kernel process. Once the tokenizers
# library has been used, each fork prints a "process just got forked" warning
# and disables parallelism anyway. Set the variable up front to keep the cell
# output clean, without overriding a value the user has already chosen.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# LangChain imports
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.schema import Document

# LLM imports
from langchain_groq import ChatGroq

print("✅ Libraries imported successfully!\n")

📚 Importing libraries...
✅ Libraries imported successfully!



In [12]:
# ============================================================================
# CELL 3: Configuration
# ============================================================================
class RAGConfig:
    """Settings bundle for the mining-laws RAG pipeline.

    Every value is a plain instance attribute assigned in ``__init__``, so a
    notebook user can override any of them on the instance after creation.
    """

    def __init__(self):
        # Location of the persisted vector store inside the Kaggle input
        # mount; change this if the dataset was attached under another name.
        self.VECTORSTORE_PATH = "/kaggle/input/embeddingstokens/mining_laws_vectorstore"

        # Has to be the same model that produced the stored embeddings.
        self.EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

        # Key into LLM_CONFIGS selecting the chat model backend.
        self.LLM_PROVIDER = "groq"

        # Retriever behaviour: number of chunks and search strategy
        # ("similarity" or "mmr").
        self.TOP_K_RESULTS = 5
        self.SEARCH_TYPE = "similarity"

        # Per-provider generation settings.
        self.LLM_CONFIGS = {
            'groq': dict(
                model='llama-3.1-8b-instant',
                temperature=0.3,
                max_tokens=1024,
            ),
            'openai': dict(
                model='gpt-4-turbo-preview',
                temperature=0.3,
                max_tokens=1024,
            ),
        }

        # Where generated artefacts are written.
        self.OUTPUT_DIR = "/kaggle/working"
        self.QUERIES_LOG = os.path.join(self.OUTPUT_DIR, "queries_log.json")

    def print_config(self):
        """Write a human-readable summary of the active settings to stdout.

        The model and temperature rows are only shown when LLM_PROVIDER has
        an entry in LLM_CONFIGS.
        """
        rule = "=" * 80
        lines = [
            "",
            rule,
            "  RAG SYSTEM CONFIGURATION",
            rule,
            f" Vector Store:       {self.VECTORSTORE_PATH}",
            f" Embedding Model:    {self.EMBEDDING_MODEL}",
            f" LLM Provider:       {self.LLM_PROVIDER}",
            f" Top-K Results:      {self.TOP_K_RESULTS}",
            f" Search Type:        {self.SEARCH_TYPE}",
        ]

        if self.LLM_PROVIDER in self.LLM_CONFIGS:
            llm_settings = self.LLM_CONFIGS[self.LLM_PROVIDER]
            lines.append(f" LLM Model:          {llm_settings.get('model', 'N/A')}")
            lines.append(f"  Temperature:        {llm_settings.get('temperature', 'N/A')}")

        # Trailing "\n" reproduces the blank line after the closing rule.
        lines.append(rule + "\n")
        print("\n".join(lines))


# Shared configuration instance used by the cells that follow
config = RAGConfig()
config.print_config()


  RAG SYSTEM CONFIGURATION
 Vector Store:       /kaggle/input/embeddingstokens/mining_laws_vectorstore
 Embedding Model:    sentence-transformers/all-MiniLM-L6-v2
 LLM Provider:       groq
 Top-K Results:      5
 Search Type:        similarity
 LLM Model:          llama-3.1-8b-instant
  Temperature:        0.3



In [13]:
# ============================================================================
# CELL 4: Set API Keys (IMPORTANT!)
# ============================================================================
print(" Setting up API keys...")
print("=" * 80)

from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()

try:
    # Read the key from Kaggle Secrets and publish it through the
    # environment, which is where the RAG class looks it up later.
    GROQ_API_KEY = user_secrets.get_secret("GROQ_API_KEY")
    os.environ['GROQ_API_KEY'] = GROQ_API_KEY
    print(" Groq API key loaded from Kaggle Secrets")
except Exception as e:
    print(" Groq API key not found in Kaggle Secrets")
    print(f"   Error: {e}")
    # Setup instructions, emitted as one write; each entry is one output line.
    print("\n".join((
        "\n Steps to add your API key:",
        "1. Go to: https://console.groq.com/keys",
        "2. Create/copy your API key",
        "3. In Kaggle: Add-ons > Secrets > Add a new secret",
        "4. Name it: GROQ_API_KEY",
        "5. Toggle it ON (blue)",
        "\n  Cannot proceed without API key!",
    )))
    # Re-raise so the notebook stops here instead of failing later.
    raise

print()

 Setting up API keys...
 Groq API key loaded from Kaggle Secrets



In [14]:
# ============================================================================
# CELL 5: RAG System Class
# ============================================================================
class MiningLawsRAG:
    """Question-answering pipeline over the persisted mining-laws vector store.

    Construction loads the embedding model, opens the Chroma collection,
    builds the chat model for the configured provider and wires them into a
    RetrievalQA chain. Every call to ``query`` (successful or not) is recorded
    in ``query_history``.
    """

    def __init__(self, config: "RAGConfig"):
        """Build all pipeline components from *config*, printing progress.

        Raises:
            FileNotFoundError: the vector store directory does not exist.
            ValueError: the provider is unsupported or its API key is unset.
        """
        self.config = config

        # One dict per call to `query`, in call order.
        self.query_history = []

        # Load embeddings model
        print(" Loading embedding model...")
        self.embeddings = HuggingFaceEmbeddings(
            model_name=config.EMBEDDING_MODEL,
            model_kwargs={'device': 'cpu'}
        )
        print(" Embedding model loaded\n")

        # Later steps depend on earlier ones: the store needs the embeddings,
        # the chain needs both the store and the LLM.
        self.vectorstore = self._load_vectorstore()
        self.llm = self._initialize_llm()
        self.qa_chain = self._create_rag_chain()

    def _load_vectorstore(self) -> "Chroma":
        """Open the persisted "mining_laws" collection and report its size.

        Raises:
            FileNotFoundError: ``config.VECTORSTORE_PATH`` does not exist.
        """
        print(f" Loading vector store from: {self.config.VECTORSTORE_PATH}")

        if not os.path.exists(self.config.VECTORSTORE_PATH):
            raise FileNotFoundError(
                f" Vector store not found at {self.config.VECTORSTORE_PATH}\n"
                "Please ensure the embeddings dataset is added as input!"
            )

        vectorstore = Chroma(
            persist_directory=self.config.VECTORSTORE_PATH,
            embedding_function=self.embeddings,
            collection_name="mining_laws"
        )

        # Sanity check: counting the stored chunks exercises the collection.
        # NOTE(review): `_collection` is a private attribute of the wrapper.
        count = vectorstore._collection.count()
        print(" Vector store loaded successfully!")
        print(f" Total document chunks: {count}\n")

        return vectorstore

    def _initialize_llm(self):
        """Create the chat model for ``config.LLM_PROVIDER``.

        Raises:
            ValueError: the provider is neither 'groq' nor 'openai', or the
                matching API key environment variable is not set.
        """
        print(f" Initializing LLM: {self.config.LLM_PROVIDER}")

        provider = self.config.LLM_PROVIDER
        config_dict = self.config.LLM_CONFIGS.get(provider, {})

        if provider == 'groq':
            api_key = os.getenv('GROQ_API_KEY')
            if not api_key:
                raise ValueError(
                    " GROQ_API_KEY not found!\n"
                    "Please set it in Kaggle Secrets or in the API Keys cell."
                )

            llm = ChatGroq(
                model=config_dict['model'],
                temperature=config_dict['temperature'],
                max_tokens=config_dict['max_tokens'],
                groq_api_key=api_key
            )

        elif provider == 'openai':
            # Imported lazily so the package is only needed when selected.
            from langchain_openai import ChatOpenAI

            api_key = os.getenv('OPENAI_API_KEY')
            if not api_key:
                raise ValueError(
                    " OPENAI_API_KEY not found!\n"
                    "Please set it in Kaggle Secrets or in the API Keys cell."
                )

            llm = ChatOpenAI(
                model=config_dict['model'],
                temperature=config_dict['temperature'],
                max_tokens=config_dict['max_tokens'],
                openai_api_key=api_key
            )

        else:
            raise ValueError(f"Unsupported LLM provider: {provider}")

        print(f" LLM initialized: {config_dict['model']}\n")
        return llm

    def _create_rag_chain(self):
        """Assemble the RetrievalQA chain from the store, LLM and prompt.

        The retriever uses ``config.SEARCH_TYPE`` and returns
        ``config.TOP_K_RESULTS`` chunks; the chain is configured to return
        its source documents alongside the answer.
        """
        print(" Creating RAG chain...")

        # Custom prompt template (text is sent to the model verbatim)
        template =   """You are an expert AI assistant specialized in Indian mining laws and regulations.
                        Your role is to provide accurate, detailed answers based ONLY on the mining legislation documents provided.
                        
                        Context from mining law documents:
                        {context}
                        
                        Question: {question}
                        
                        CRITICAL INSTRUCTIONS:
                        1. Answer ONLY based on the context provided above
                        2. Cite specific acts, sections, regulations, or circulars with exact names
                        3. If the answer is not in the context, clearly state: "I don't have information about this in the available documents."
                        4. Be precise and use proper legal terminology
                        5. Structure your answer clearly with:
                           - Direct answer first
                           - Relevant legal references
                           - Additional context if helpful
                        6. If multiple documents address the question, mention all relevant sources
                        7. For safety or compliance questions, emphasize the specific requirements
                        
                        Detailed Answer:"""

        prompt = PromptTemplate(
            template=template,
            input_variables=["context", "question"]
        )

        # Create retriever
        retriever = self.vectorstore.as_retriever(
            search_type=self.config.SEARCH_TYPE,
            search_kwargs={"k": self.config.TOP_K_RESULTS}
        )

        # Create QA chain
        qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True,
            chain_type_kwargs={"prompt": prompt}
        )

        print(" RAG chain created\n")
        return qa_chain

    @staticmethod
    def _summarize_source(doc) -> Dict:
        """Pick the metadata fields stored per source in a query record."""
        fields = ('source', 'doc_type', 'subject', 'year', 'chunk_id')
        return {name: doc.metadata.get(name, 'Unknown') for name in fields}

    def query(self, question: str, show_sources: bool = True) -> Dict:
        """Answer *question*, print the result and record it in the history.

        Args:
            question: Natural-language question passed to the chain.
            show_sources: When true, print the retrieved source chunks.

        Returns:
            A dict with 'question', 'answer', 'sources', 'timestamp' and
            'success'. On failure 'success' is False, 'sources' is empty,
            'answer' holds the error message and an extra 'error' key holds
            the exception text. Exceptions are reported, not propagated.
        """
        print("\n" + "="*80)
        print(f" QUESTION: {question}")
        print("="*80 + "\n")
        print(" Searching documents and generating answer...\n")

        try:
            # Calling the chain object directly is deprecated in
            # LangChain 0.1.x; `invoke` is the supported entry point.
            result = self.qa_chain.invoke({"query": question})

            answer = result['result']
            source_docs = result['source_documents']

            # Format answer
            print(" ANSWER:")
            print("-" * 80)
            print(answer)
            print("-" * 80 + "\n")

            # Show sources
            if show_sources:
                self._display_sources(source_docs)

            query_result = {
                'question': question,
                'answer': answer,
                'sources': [self._summarize_source(doc) for doc in source_docs],
                'timestamp': datetime.now().isoformat(),
                'success': True
            }

        except Exception as e:
            # Best-effort by design: one failing question must not abort a
            # whole batch, so the failure is reported in the result instead.
            error_msg = f"Error occurred: {str(e)}"
            print(f"\n {error_msg}\n")

            query_result = {
                'question': question,
                'answer': error_msg,
                'sources': [],
                'timestamp': datetime.now().isoformat(),
                'success': False,
                'error': str(e)
            }

        # Failures are recorded too; otherwise the 'failed' count in
        # get_statistics() could never be anything but zero.
        self.query_history.append(query_result)
        return query_result

    def _display_sources(self, source_docs: List["Document"]):
        """Print one entry per distinct (source, chunk_id) pair.

        Entries keep their position number from *source_docs*, so numbering
        may skip values when duplicates are suppressed.
        """
        print(" SOURCES:")
        print("="*80)

        seen_sources = set()
        for idx, doc in enumerate(source_docs, 1):
            meta = doc.metadata
            source_key = f"{meta.get('source', 'Unknown')}-{meta.get('chunk_id', idx)}"

            if source_key in seen_sources:
                continue
            seen_sources.add(source_key)

            print(f"\n[{idx}] {meta.get('source', 'Unknown')}")
            print(f"    Type: {meta.get('doc_type', 'Unknown')}")
            print(f"    Subject: {meta.get('subject', 'Unknown')}")
            print(f"    Year: {meta.get('year', 'Unknown')}")
            print(f"    Chunk: {meta.get('chunk_id', '?')}/{meta.get('chunk_total', '?')}")
            print(f"    Preview: {doc.page_content[:150]}...")

        print("\n" + "="*80 + "\n")

    def get_statistics(self) -> Dict:
        """Summarise the recorded queries.

        Returns:
            ``{"message": "No queries yet"}`` when the history is empty,
            otherwise counts of total/successful/failed queries and the
            success rate formatted as a percentage string.
        """
        if not self.query_history:
            return {"message": "No queries yet"}

        total = len(self.query_history)
        successful = sum(1 for q in self.query_history if q['success'])

        return {
            'total_queries': total,
            'successful': successful,
            'failed': total - successful,
            'success_rate': f"{(successful/total*100):.1f}%"
        }

    def save_query_log(self):
        """Write the query history as JSON to the configured log path.

        Uses ``config.QUERIES_LOG`` when the config provides it and falls
        back to ``queries_log.json`` inside ``config.OUTPUT_DIR``. Does
        nothing beyond printing a notice when the history is empty.
        """
        if not self.query_history:
            print("  No queries to save")
            return

        log_path = getattr(self.config, "QUERIES_LOG", None) or os.path.join(
            self.config.OUTPUT_DIR, "queries_log.json"
        )
        # The target directory may not exist yet outside Kaggle.
        os.makedirs(os.path.dirname(log_path) or ".", exist_ok=True)

        # Explicit UTF-8 with ensure_ascii=False keeps non-ASCII text in
        # answers readable in the saved file.
        with open(log_path, 'w', encoding='utf-8') as f:
            json.dump(self.query_history, f, indent=2, ensure_ascii=False)

        print(f" Query log saved to: {log_path}")

In [15]:
# ============================================================================
# CELL 6: Initialize RAG System
# ============================================================================
# Banner, emitted as a single write (leading "" yields the blank first line).
print("\n".join(["", "=" * 80, "  MINING LAWS RAG SYSTEM - INITIALIZING", "=" * 80]))
print(f" Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

try:
    # Builds the embeddings, vector store, LLM and chain in one go.
    rag_system = MiningLawsRAG(config)
    print(" RAG system initialized successfully!\n")
except Exception as init_error:
    # Report, then re-raise so later cells do not run without `rag_system`.
    print(f" Failed to initialize RAG system: {init_error}")
    raise


  MINING LAWS RAG SYSTEM - INITIALIZING
 Started: 2025-11-12 10:58:04

 Loading embedding model...
 Embedding model loaded

 Loading vector store from: /kaggle/input/embeddingstokens/mining_laws_vectorstore
 Vector store loaded successfully!
 Total document chunks: 11109

 Initializing LLM: groq
 LLM initialized: llama-3.1-8b-instant

 Creating RAG chain...
 RAG chain created

 RAG system initialized successfully!



In [16]:
# ============================================================================
# CELL 7: Sample Questions (Modify or add your own!)
# ============================================================================
# Section banner, emitted as a single write.
print("\n".join(["", "=" * 80, " RUNNING SAMPLE QUESTIONS", "=" * 80, ""]))

# Edit this list to try other questions.
sample_questions = [
    "What are the main provisions of the Mines Act 1952?",
    "What is the minimum age for employment in mines?",
    "What are the safety requirements for underground coal mines?",
    "What is DGMS and what are its functions?",
    "What are the provisions for mine workers' welfare?",
]

# Ask each question in turn, keeping every result dict in `results`.
results = []
for question in sample_questions:
    result = rag_system.query(question, show_sources=True)
    results.append(result)
    # Visual separator between consecutive questions.
    print("\n{}\n".format("=" * 80))


 RUNNING SAMPLE QUESTIONS


 QUESTION: What are the main provisions of the Mines Act 1952?

 Searching documents and generating answer...

 ANSWER:
--------------------------------------------------------------------------------
**Main Provisions of the Mines Act 1952:**

The Mines Act 1952 is an Act to amend and consolidate the law relating to the regulation of labour and safety in mines. The main provisions of the Act include:

1. **Regulation of Labour and Safety**: The Act aims to regulate labour and safety in mines to ensure the well-being of persons employed in the mines.
2. **Application of the Act**: The Act applies to the whole of India and comes into force on a date appointed by the Central Government (Section 1, Mines Act 1952).
3. **Definition of Mine**: A mine is defined as any operation for the purpose of searching for or obtaining minerals from the earth (Section 2, Mines Act 1952).
4. **Responsibility of Mine Owners**: Mine owners are responsible for ensuring the safet

In [17]:
# ============================================================================
# CELL 8: Statistics & Save Results
# ============================================================================
# Section banner, emitted as a single write.
print("\n".join(["", "=" * 80, " SESSION STATISTICS", "=" * 80]))

# One "name: value" line per statistic.
stats = rag_system.get_statistics()
for key, value in stats.items():
    print("{}: {}".format(key, value))

print("{}\n".format("=" * 80))

# Persist the recorded queries to the JSON log.
rag_system.save_query_log()

finished_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f" Completed: {finished_at}\n")


 SESSION STATISTICS
total_queries: 5
successful: 5
failed: 0
success_rate: 100.0%

 Query log saved to: /kaggle/working/queries_log.json
 Completed: 2025-11-12 10:58:33



In [19]:
# ============================================================================
# CELL 9: Interactive Query (Optional)
# ============================================================================
# Single-question example: edit `my_question` and re-run this cell.

my_question = "What are the penalties for violating mining safety regulations?"

# The call prints the answer and its sources, and also returns them.
result = rag_system.query(question=my_question, show_sources=True)

# The answer text is available under the 'answer' key of the result.
print("\nYour Answer:", result['answer'], sep="\n")



 QUESTION: What are the penalties for violating mining safety regulations?

 Searching documents and generating answer...

 ANSWER:
--------------------------------------------------------------------------------
**Penalties for Violating Mining Safety Regulations:**

**Direct Answer:**
The penalties for violating mining safety regulations include imprisonment for a term up to two years, a minimum fine of one lakh rupees per day, and a maximum fine of two lakh rupees for every day during which the failure continues.

**Relevant Legal References:**
This is as per Section 23 of the mining legislation documents, which states that if any person obstructs or causes any impediment in taking possession or in the management and operation of the Schedule I coal mines, fails to deliver books of account, registers, or any other document, destroys or misuses mine infrastructure or coal stock, or retains any property of such coal mine or removes or destroys it, they shall be punishable with the af

In [20]:
# ============================================================================
# CELL 10: Batch Query Function (Optional)
# ============================================================================
# Use this to query multiple questions at once

def batch_query(questions_list, show_sources=True, rag=None):
    """
    Query multiple questions at once

    Args:
        questions_list: Iterable of question strings; it is consumed once.
        show_sources: Forwarded to each ``query`` call (default True, the
            previous fixed behaviour).
        rag: Object exposing ``query(question, show_sources=...)``. When
            omitted, the notebook-level ``rag_system`` is used.

    Returns:
        A list with one result per question, in input order.

    Usage:
        my_questions = [
            "Question 1?",
            "Question 2?",
            "Question 3?"
        ]
        batch_results = batch_query(my_questions)
    """
    # Resolve the global lazily so an explicit `rag` works without it.
    engine = rag_system if rag is None else rag

    # Materialise once: the total is needed for the progress header, and
    # this lets generators be passed in as well as lists.
    questions = list(questions_list)
    total = len(questions)

    print(f"\n Running batch queries ({total} questions)...\n")

    batch_results = []
    for i, question in enumerate(questions, 1):
        print(f"\n{'='*80}")
        print(f"Query {i}/{total}")
        print(f"{'='*80}")

        batch_results.append(engine.query(question, show_sources=show_sources))

    return batch_results

# Example usage: run three follow-up questions through batch_query.

custom_questions = [
    "What are the environmental regulations for mining?",
    "What are the requirements for mine closure?",
    "What are the provisions for compensation in case of mining accidents?",
]

# One result dict per question, in the same order as the list above.
batch_results = batch_query(questions_list=custom_questions)



 Running batch queries (3 questions)...


Query 1/3

 QUESTION: What are the environmental regulations for mining?

 Searching documents and generating answer...

 ANSWER:
--------------------------------------------------------------------------------
**Environmental Regulations for Mining:**

The environmental regulations for mining are governed by the following provisions:

**Direct Answer:**

The environmental regulations for mining include mine environment monitoring and control, Environment Management Plan (EMP), mine closure plan, and rehabilitation and resettlement (R&R).

**Relevant Legal References:**

* The Mines Act, 1952
* Mines Rules 1955
* Coal Mine Regulation, 1957
* Mines Rescue Rules, 1985
* Provisions of Indian Electricity Rules, 1956 applicable to mines
* Vocational Training Rules, 1966
* Other rules and legislation applicable to opencast metalliferous mines and coal mines

**Additional Context:**

The Environmental Management Plan (EMP) is a crucial document that 

In [21]:
# ============================================================================
# END OF NOTEBOOK
# ============================================================================
# Closing banner and pointers, emitted as a single write. Empty entries
# produce the blank lines around the rules.
print("\n".join([
    "",
    "=" * 80,
    " NOTEBOOK COMPLETED SUCCESSFULLY!",
    "=" * 80,
    "",
    " Next Steps:",
    "   1. Review the answers above",
    "   2. Modify sample questions in Cell 7",
    "   3. Use Cell 9 for single custom questions",
    "   4. Use Cell 10 for batch questions",
    "   5. Check /kaggle/working/queries_log.json for saved results",
    "",
    "=" * 80,
    "",
]))


 NOTEBOOK COMPLETED SUCCESSFULLY!

 Next Steps:
   1. Review the answers above
   2. Modify sample questions in Cell 7
   3. Use Cell 9 for single custom questions
   4. Use Cell 10 for batch questions
   5. Check /kaggle/working/queries_log.json for saved results


