In [1]:
!pip install langchain-google-genai tqdm

Collecting langchain-google-genai
  Downloading langchain_google_genai-2.1.5-py3-none-any.whl.metadata (5.2 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain-google-genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting google-ai-generativelanguage<0.7.0,>=0.6.18 (from langchain-google-genai)
  Downloading google_ai_generativelanguage-0.6.18-py3-none-any.whl.metadata (9.8 kB)
Downloading langchain_google_genai-2.1.5-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Downloading google_ai_generativelanguage-0.6.18-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: filetype, google-ai-generativelanguage, langchain-google-genai
  Attempting uninstall: google-ai-generativelangu

In [2]:
import json
import re
from typing import List, Dict, Any
from tqdm import tqdm
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate

import os

# Secrets must not live in the notebook source: the key is read from the
# GOOGLE_API_KEY environment variable (set it before starting the kernel).
# NOTE(review): a literal key was previously committed on this line; it should
# be treated as leaked and revoked.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")

print("✅ Libraries imported successfully!")
if GOOGLE_API_KEY:
    print("🔑 API Key configured")
else:
    # Report the missing key up front instead of claiming it is configured.
    print("⚠️ GOOGLE_API_KEY is not set - export it before running the extraction cells")
print("📝 Ready for NER + Clause Extraction")

✅ Libraries imported successfully!
🔑 API Key configured
📝 Ready for NER + Clause Extraction


In [3]:
import json
import re
from typing import List, Dict, Any
from tqdm import tqdm
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate

class PureNERClauseExtractor:
    """Split a legal document into clauses and tag named entities in each clause.

    Both steps are delegated to a Gemini chat model through two prompt
    templates that ask for a bare JSON array; the model's reply is then
    unwrapped from any Markdown code fence and parsed.
    """

    def __init__(self, api_key: str):
        """Create the chat model and the two prompt templates.

        Args:
            api_key: Google API key passed to ChatGoogleGenerativeAI.
        """
        # NOTE(review): max_output_tokens caps the reply length, so a very long
        # document could yield a truncated (unparseable) clause array - confirm
        # the limit is adequate for the inputs you expect.
        self.llm = ChatGoogleGenerativeAI(
            model="gemini-2.0-flash",
            google_api_key=api_key,
            temperature=0.1,
            max_output_tokens=3000
        )

        # Pure clause extraction prompt
        self.clause_extraction_prompt = PromptTemplate(
            input_variables=["document"],
            template="""
Extract all legal clauses from this document. A clause is any distinct legal statement, provision, or section.

DOCUMENT:
{document}

Return ONLY a JSON array of clauses. Each clause should be a separate legal statement or provision.

Example format:
[
  "First legal clause text here",
  "Second legal clause text here",
  "Third legal clause text here"
]

Extract ALL clauses - don't miss any. Be comprehensive.
"""
        )

        # Entity extraction prompt; doubled braces are literal braces in the template.
        self.ner_prompt = PromptTemplate(
            input_variables=["text"],
            template="""
You are a legal NER and entity extraction engine.

Analyze the legal text below and extract **all named entities** including:

------------------------
🏷️ MANDATORY ENTITY TYPES TO EXTRACT:
------------------------
• Person (e.g., Mr. Arjun Mehra, Adv. R.K. Sharma, Harish Saini)
• Organization (e.g., ICICI Bank, Delhi High Court)
• Legal Section (e.g., Section 420 IPC)
• Law or Act (e.g., Indian Penal Code, Negotiable Instruments Act)
• Date (any format)
• Amount (any format, e.g., ₹7,50,000 or Seven Lakhs)
• Duration (e.g., 3 months, 15 days)
• Percentage (e.g., 12%)
• Payment Method (e.g., NEFT, Cheque)
• Reference Number, Account Number, Case Number
• Court (e.g., competent court)
• Legal Action (e.g., Criminal breach of trust, Civil Suit)
• Reason (e.g., Cheating, Dishonour, Hardship)
• Address, Email, Phone (if present)

------------------------
🧠 SPECIAL RULE FOR PERSON:
------------------------
- Extract **every full name** that is a person.
- Look for cues like: Mr., Mrs., Ms., Miss, Dr., Adv., Advocate, S/o, W/o, etc.
- Even if a name appears **outside the main clause** (e.g., salutation, signature, or sender block), include it.

🧠 BONUS:
If there are additional specific entities like:
- Law firm names
- Case numbers
- Signature names
- Sender/Receiver/Client/Advocate names
then **extract those too**, and categorize them with best-fit types like Person, Law Firm, or Client.

TEXT:
{text}

Respond with JSON array only:
[
  {{"text": "Mr. Arjun Mehra", "type": "Person"}},
  {{"text": "Adv. R. K. Sharma", "type": "Person"}},
  {{"text": "Section 420 IPC", "type": "Legal Section"}}
]
"""
        )

    @staticmethod
    def _parse_json_array(raw: str) -> List[Any]:
        """Pull the JSON array out of a model reply.

        Args:
            raw: The reply text, optionally wrapped in a Markdown code fence
                and/or surrounded by prose.

        Returns:
            The decoded array, or an empty list when the reply contains no
            bracketed span.

        Raises:
            json.JSONDecodeError: If the bracketed span is not valid JSON.
        """
        text = raw.strip()

        # Keep the CONTENT of a fenced block. A ```json fence is preferred; a
        # fence with another (or no) language tag is accepted as well. An
        # unclosed fence simply does not match and the raw text is searched.
        fenced = (
            re.search(r'```json\s*(.*?)\s*```', text, re.DOTALL)
            or re.search(r'```\w*\s*(.*?)\s*```', text, re.DOTALL)
        )
        if fenced:
            text = fenced.group(1)

        # Greedy on purpose: span from the first '[' to the last ']'.
        json_match = re.search(r'\[.*\]', text, re.DOTALL)
        if not json_match:
            return []
        return json.loads(json_match.group(0))

    def extract_clauses(self, document: str) -> List[str]:
        """Extract all clauses from document.

        Args:
            document: Full text of the legal document.

        Returns:
            Whitespace-trimmed, non-empty clause strings. Non-string array
            elements are skipped. Any failure (model call or parsing) is
            reported on stdout and yields an empty list.
        """
        try:
            chain = self.clause_extraction_prompt | self.llm
            response = chain.invoke({"document": document})

            items = self._parse_json_array(response.content)
            # One malformed element must not discard the whole clause list.
            return [
                item.strip()
                for item in items
                if isinstance(item, str) and item.strip()
            ]

        except Exception as e:
            # Deliberate best-effort: the caller continues with no clauses.
            print(f"❌ Clause extraction error: {e}")
            return []

    def extract_entities(self, text: str) -> List[Dict[str, str]]:
        """Extract all named entities from text.

        Args:
            text: The clause (or any text) to analyse.

        Returns:
            The dict elements of the model's array that carry both a 'text'
            and a 'type' key. Any failure is reported on stdout and yields an
            empty list.
        """
        try:
            chain = self.ner_prompt | self.llm
            response = chain.invoke({"text": text})

            items = self._parse_json_array(response.content)
            return [
                ent for ent in items
                if isinstance(ent, dict) and 'text' in ent and 'type' in ent
            ]

        except Exception as e:
            # Deliberate best-effort: the clause is kept with no entities.
            print(f"❌ NER error: {e}")
            return []

    def process_document(self, document: str) -> Dict[str, Any]:
        """Main processing - only NER and clause extraction.

        Args:
            document: Full text of the legal document.

        Returns:
            A dict with 'total_clauses', 'total_entities' (count after
            de-duplication), 'clauses_with_entities' (one record per clause
            with a 1-based 'clause_id') and 'all_unique_entities'.
        """
        print("🔍 Extracting clauses...")
        clauses = self.extract_clauses(document)

        print(f"📝 Found {len(clauses)} clauses")
        print("🏷️ Extracting entities from each clause...")

        results = []
        all_entities = []

        # One model call per clause.
        for i, clause in enumerate(tqdm(clauses, desc="Processing clauses")):
            entities = self.extract_entities(clause)

            results.append({
                "clause_id": i + 1,
                "clause_text": clause,
                "entities": entities
            })

            # Collect all entities
            all_entities.extend(entities)

        # Remove duplicate entities (case-insensitive on text and type; the
        # first occurrence wins). str() guards against the model returning a
        # non-string value such as a bare number.
        unique_entities = []
        seen = set()
        for entity in all_entities:
            key = (str(entity['text']).lower(), str(entity['type']).lower())
            if key not in seen:
                unique_entities.append(entity)
                seen.add(key)

        return {
            "total_clauses": len(clauses),
            "total_entities": len(unique_entities),
            "clauses_with_entities": results,
            "all_unique_entities": unique_entities
        }

print("✅ PureNERClauseExtractor class created successfully!")

✅ PureNERClauseExtractor class created successfully!


In [4]:
def display_results(results: Dict[str, Any]):
    """Print a human-readable report of an extraction result.

    Args:
        results: Mapping with the keys 'total_clauses', 'total_entities',
            'clauses_with_entities' (records holding 'clause_id',
            'clause_text' and 'entities') and 'all_unique_entities'
            (records holding 'text' and 'type').

    The report has three parts: a header with the totals, one section per
    clause listing its entities, and a final listing of the unique entities
    grouped by their type in first-seen order.
    """
    banner = '=' * 80
    wide_rule = '─' * 80

    # --- header -----------------------------------------------------------
    print(f"\n{banner}")
    print("📋 NER + CLAUSE EXTRACTION RESULTS")
    print(banner)
    print(f"📊 Total Clauses: {results['total_clauses']}")
    print(f"🏷️ Total Unique Entities: {results['total_entities']}")

    # --- per-clause sections ----------------------------------------------
    print("\n📋 CLAUSES WITH ENTITIES:")
    print(wide_rule)

    for item in results["clauses_with_entities"]:
        print(f"\n🔹 CLAUSE {item['clause_id']}:")
        print(f"📝 Text: {item['clause_text']}")

        found = item['entities']
        if not found:
            print("🏷️ No entities found")
        else:
            print(f"🏷️ Entities ({len(found)}):")
            for ent in found:
                print(f"   • {ent['text']} → {ent['type']}")

        print('─' * 60)

    # --- unique entities, bucketed by type --------------------------------
    print("\n🏷️ ALL UNIQUE ENTITIES:")
    print(wide_rule)

    by_type = {}
    for ent in results["all_unique_entities"]:
        by_type.setdefault(ent['type'], []).append(ent['text'])

    for label in by_type:
        print(f"\n📌 {label}:")
        for value in by_type[label]:
            print(f"   • {value}")

def save_results_to_json(results: Dict[str, Any], filename: str = "ner_clause_results.json"):
    """Save results to JSON file.

    Args:
        results: JSON-serialisable extraction result.
        filename: Destination path; an existing file is overwritten.

    Failures (serialisation or I/O) are reported on stdout rather than
    raised, so a failed save never aborts the surrounding run.
    """
    try:
        # Serialise BEFORE opening the file: opening with 'w' truncates it, so
        # a value that cannot be encoded would otherwise wipe an existing
        # result file and leave a partial JSON fragment behind.
        payload = json.dumps(results, indent=2, ensure_ascii=False)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(payload)
        print(f"💾 Results saved to {filename}")
    except Exception as e:
        print(f"❌ Failed to save results: {e}")

print("✅ Display and save functions created successfully!")

✅ Display and save functions created successfully!


In [5]:
def extract_ner_and_clauses(document_text: str, api_key: str = GOOGLE_API_KEY, save_to_file: bool = True):
    """Run the full pipeline on one document: extract, display, optionally save.

    Args:
        document_text: Text of the legal document to analyse.
        api_key: Google API key handed to PureNERClauseExtractor; defaults to
            the module-level GOOGLE_API_KEY as it was when this function was
            defined.
        save_to_file: When true, the result is also written via
            save_results_to_json using that function's default filename.

    Returns:
        The result dict produced by process_document, or None if any step
        raised (the error is printed instead of propagated).
    """
    try:
        print("🚀 Starting NER + Clause Extraction...")
        print("🤖 Initializing Gemini 2.0 Flash...")
        pipeline = PureNERClauseExtractor(api_key)

        print("📄 Processing document...")
        extraction = pipeline.process_document(document_text)

        print("📊 Displaying results...")
        display_results(extraction)

        if save_to_file:
            save_results_to_json(extraction)

    except Exception as err:
        print(f"❌ Extraction failed: {err!s}")
        return None

    return extraction

print("✅ Main extraction function ready!")
print("🎯 Ready to process legal documents!")

✅ Main extraction function ready!
🎯 Ready to process legal documents!


In [6]:
YOUR_DOCUMENT = """
LEGAL NOTICE
Under Section 52 of the Transfer of Property Act, 1882
Read with Sections 441, 447, and 506 of the Indian Penal Code, 1860

From:
Advocate Meenal Roy
Chamber No. 6, Civil Court, Allahabad High Court,
Prayagraj, Uttar Pradesh – 211001
Contact: +91-9415001122
Email: meenalroy.adv@gmail.com

To:
Mrs. Renu Verma
W/o Mr. Ajay Verma
R/o 32-A, Shanti Kunj Colony,
Gomti Nagar, Lucknow – 226010

Date: 17th June 2025

Subject: Legal Notice for Illegal Encroachment and Criminal Trespass of Ancestral Property situated at Plot No. 86, Village Barabanki, U.P.

Dear Mrs. Renu Verma,

Under instructions from and on behalf of my client Mr. Manish Verma, S/o Late Mr. Surendra Verma, R/o 14, Sector D, Aliganj, Lucknow – 226024, I address you as follows:

That my client is the legal co-owner of ancestral property bearing Plot No. 86, Khasra No. 112/3, Village Raipur, Tehsil Nawabganj, District Barabanki, measuring approx. 4,800 sq. ft., inherited from his late father Shri Surendra Verma, through lawful succession.

That on or around 26th April 2025, you and your family members unlawfully entered the aforementioned land without any legal right, and constructed a boundary wall overnight, thereby committing criminal trespass.

That you were previously informed verbally and through Panchayat mediation that the land is under pending partition proceedings in the District Civil Court, Case No. 382/2024, and that no permanent construction can be made until its final adjudication.

That your actions amount to the following cognizable offences:

Section 441 IPC – Criminal Trespass

Section 447 IPC – Punishment for Criminal Trespass

Section 506 IPC – Criminal Intimidation (as you allegedly threatened my client when he questioned your illegal encroachment)

That my client reserves his right to initiate civil action for injunction and possession, and criminal action before the local police and magistrate for the above offences.

Therefore, through this legal notice, you are hereby directed to:

Vacate the said property and remove all illegal constructions within 7 days from the date of receipt of this notice.

Cease and desist from entering, threatening, or interfering with the said land in any manner.

Submit a written apology and settlement proposal if any, through your legal representative within the given time.

Failing which, my client shall be constrained to approach the Hon’ble Court and local police authorities seeking urgent reliefs at your sole cost and consequences.

A copy of this notice is being retained in my office for record.

Yours faithfully,
(Advocate Meenal Roy)
Legal Counsel for Mr. Manish Verma
"""

print("\n💡 TO PROCESS YOUR DOCUMENT:")
print("1. Replace YOUR_DOCUMENT variable with your legal text")
print("2. Run: results = extract_ner_and_clauses(YOUR_DOCUMENT)")
print("3. Results will be displayed and saved to JSON file")

# Runs the extraction on YOUR_DOCUMENT (makes live Gemini API calls and, by default, writes ner_clause_results.json)
your_results = extract_ner_and_clauses(YOUR_DOCUMENT)



💡 TO PROCESS YOUR DOCUMENT:
1. Replace YOUR_DOCUMENT variable with your legal text
2. Run: results = extract_ner_and_clauses(YOUR_DOCUMENT)
3. Results will be displayed and saved to JSON file
🚀 Starting NER + Clause Extraction...
🤖 Initializing Gemini 2.0 Flash...
📄 Processing document...
🔍 Extracting clauses...
📝 Found 13 clauses
🏷️ Extracting entities from each clause...


Processing clauses: 100%|██████████| 13/13 [00:08<00:00,  1.54it/s]

📊 Displaying results...

📋 NER + CLAUSE EXTRACTION RESULTS
📊 Total Clauses: 13
🏷️ Total Unique Entities: 32

📋 CLAUSES WITH ENTITIES:
────────────────────────────────────────────────────────────────────────────────

🔹 CLAUSE 1:
📝 Text: Under Section 52 of the Transfer of Property Act, 1882
🏷️ Entities (2):
   • Section 52 → Legal Section
   • Transfer of Property Act, 1882 → Law or Act
────────────────────────────────────────────────────────────

🔹 CLAUSE 2:
📝 Text: Read with Sections 441, 447, and 506 of the Indian Penal Code, 1860
🏷️ Entities (2):
   • Sections 441, 447, and 506 → Legal Section
   • Indian Penal Code, 1860 → Law or Act
────────────────────────────────────────────────────────────

🔹 CLAUSE 3:
📝 Text: That my client is the legal co-owner of ancestral property bearing Plot No. 86, Khasra No. 112/3, Village Raipur, Tehsil Nawabganj, District Barabanki, measuring approx. 4,800 sq. ft., inherited from his late father Shri Surendra Verma, through lawful succession.
🏷️ Entit


