In [3]:
import pandas as pd
# Load the legal summaries CSV; later cells read its "input_text" column.
data=pd.read_csv("legal_summaries.csv")

In [5]:
import spacy
import re
from collections import defaultdict
import json

def clean_text(text):
    """Collapse every run of whitespace in *text* to one space and trim the ends.

    Args:
        text: The raw string to normalise.

    Returns:
        The input with each run of whitespace (spaces, tabs, newlines, ...)
        replaced by a single space and with no leading or trailing
        whitespace.
    """
    # One whitespace-run pass already covers newlines and tabs, so dedicated
    # newline/tab passes would only rescan the string without changing it.
    return re.sub(r'\s+', ' ', text).strip()

# The patterns below are compiled once at import time instead of on every
# call to extract_legal_entities.

# Judge-name heuristics; every pattern captures the name in group 1.
_JUDGE_PATTERNS = (
    # Patterns with designations
    re.compile(r"(?:Justice|Judge|J\.|Hon'ble|Chief Justice|C\.J\.|Justices)\s+([A-Z][a-z]+(?:\s+[A-Z]\.)?(?:\s+[A-Z][a-z]+)?)"),
    # Names followed by J. or Justice
    re.compile(r"([A-Z][a-z]+(?:\s+[A-Z]\.)?(?:\s+[A-Z][a-z]+)?),?\s+(?:J\.|Justice|C\.J\.)"),
    # Common Indian judge naming patterns
    re.compile(r"([A-Z][a-z]+\s+[A-Z][a-z]+)(?:\s+and|\s+with|\s+delivered|\s+observed|\s+held)"),
    # Indian naming pattern with initials
    re.compile(r"(?:Justice|Judge|J\.|Hon'ble)\s+([A-Z]\.\s*[A-Z]\.\s*[A-Z][a-z]+)"),
)

# Strings the judge heuristics are known to pick up that are not judges.
_NON_JUDGES = frozenset({"Municipal Act", "This", "Reliance", "Privy Council"})

# Case number detection focusing on assessment years. The hyphenated form
# ("1940-41") is matched as before; a whitespace-only separator ("1940 41")
# is also accepted because the source texts frequently lose their hyphens.
_CASE_NUMBER_PATTERNS = (
    re.compile(r"assessment\s+year\s+\d{4}(?:\s*[\-–]\s*\d{2}|\s+\d{2}\b)"),
)

# "section N[(x)(y)...] of <words> Act". Any number of parenthesised
# sub-clauses is allowed (e.g. "section 9 (1) (iv) of ..."), and the trailing
# \b keeps "Act" from matching the start of a longer word such as "Action".
_SECTION_PATTERN = re.compile(
    r"[Ss]ection\s+\d+(?:\s*\([0-9A-Za-z]+\))*\s+of\s+[A-Za-z\s]+?Act\b"
)

# One to three capitalised words followed by "Act", optionally "of <year>".
# The \b after "Act" stops "Civil Action" from being reported as "Civil Act".
_ACT_PATTERN = re.compile(r"\b(?:[A-Z][a-z]+\s+){1,3}Act\b(?:\s+of\s+\d{4})?")

# Generic phrases that match the act pattern but do not name a statute.
_GENERIC_ACTS = frozenset({"that act", "this act", "tax act"})

# "<party> v. <party>" citations; each party is a run of letters and spaces.
_PRECEDENT_PATTERN = re.compile(
    r"([A-Za-z\s]+)\s+(?:v\.|vs\.?|versus)\s+([A-Za-z\s]+)"
)


def extract_legal_entities(text):
    """Extract legal entities from *text* using rule-based regular expressions.

    The text is first normalised with ``clean_text`` and then scanned with
    the module-level patterns defined above. Extraction is purely regex
    based: no spaCy model is loaded or run, because no NLP result is used
    to build the output.

    Args:
        text: Raw legal text.

    Returns:
        A dict with the keys ``"Case_Numbers"``, ``"Judge_Names"``,
        ``"Section_References"``, ``"Acts_Mentioned"`` and ``"Precedents"``.
        Each value is a sorted list of unique strings, so the result can be
        passed straight to ``json.dumps``.
    """
    # Clean the text first
    text = clean_text(text)

    judge_names = {
        match.group(1).strip()
        for pattern in _JUDGE_PATTERNS
        for match in pattern.finditer(text)
    }
    # Filter out non-judge entries from Judge_Names
    judge_names -= _NON_JUDGES

    case_numbers = {
        match.group(0).strip()
        for pattern in _CASE_NUMBER_PATTERNS
        for match in pattern.finditer(text)
    }

    section_references = {
        match.group(0) for match in _SECTION_PATTERN.finditer(text)
    }

    acts_mentioned = set()
    for match in _ACT_PATTERN.finditer(text):
        act_name = match.group(0).strip()
        if act_name.lower() not in _GENERIC_ACTS:
            acts_mentioned.add(act_name)

    precedents = set()
    for match in _PRECEDENT_PATTERN.finditer(text):
        first_party = match.group(1).strip()
        second_party = match.group(2).strip()
        if "Commissioner of Income" in second_party:
            # Handle special case for tax cases: normalise the respondent
            # to one canonical spelling.
            second_party = "Commissioner of Income tax"
        precedents.add(f"{first_party} vs {second_party}")

    # Convert sets to sorted lists for JSON serialization
    entities = {
        "Case_Numbers": case_numbers,
        "Judge_Names": judge_names,
        "Section_References": section_references,
        "Acts_Mentioned": acts_mentioned,
        "Precedents": precedents,
    }
    return {key: sorted(values) for key, values in entities.items()}

def analyze_legal_text(text):
    """Return the entity dictionary that ``extract_legal_entities`` builds for *text*."""
    return extract_legal_entities(text)

# Example usage
if __name__ == "__main__":
    # Run the extractor on one entry of the loaded dataset and pretty-print
    # the resulting entity lists as JSON.
    extracted = analyze_legal_text(data["input_text"][7800])
    print(json.dumps(extracted, indent=2))

{
  "Case_Numbers": [],
  "Judge_Names": [
    "Altimo Holdings",
    "Berzon",
    "Between October",
    "Birss",
    "Competition Law",
    "Curiel",
    "European Social",
    "He",
    "International Ltd",
    "Judge Robart",
    "Kennedy",
    "Licensing Declaration",
    "Lord Kitchin",
    "Microsoft Inc",
    "Rader",
    "Reyna",
    "Robart",
    "Robarts",
    "South Korea",
    "Supreme Court",
    "Trade Commission",
    "Unwired Planet"
  ],
  "Section_References": [
    "section 50 of the Senior Courts Act"
  ],
  "Acts_Mentioned": [
    "English Companies Act",
    "Lord Cairnss Act",
    "Senior Courts Act"
  ],
  "Precedents": []
}


In [None]:
# Show the raw input text stored under index label 0.
data["input_text"][0]

'Appeal No. LXVI of 1949.\nAppeal from the High Court of judicature, Bombay, in a reference under section 66 of the Indian Income tax Act, 1022.\nK.M. Munshi (N. P. Nathvani, with him), for the appel lant. \' M.C. Setalvad, Attorney General for India (H. J. Umrigar, with him), for the respondent. 1950.\nMay 26.\nThe judgment of the Court was delivered by MEHR CHAND MAHAJAN J.\nThis is an appeal against a judgment of the High Court of Judicature at Bombay in an income tax matter and it raises the question whether munici pal property tax and urban immoveable property tax payable under the relevant Bombay Acts are allowable deductions under section 9 (1) (iv) of the Indian Income tax Act.\nThe assessee company is an investment company deriving its income from properties in the city of Bombay.\nFor the assessment year 1940 41 the net income of the assessee under the head "property" was computed by the Income tax Officer in the sum of Rs. 6,21,764 after deducting from gross rents certain pa