# Court Document Analyzer
This notebook implements a system for extracting structured information from Supreme Court judgment PDFs using advanced text extraction and pattern matching techniques.

## 1. Import Libraries
Import necessary libraries for PDF processing, data manipulation, and data modeling.

In [None]:
import os
import re
import fitz  # PyMuPDF
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tempfile
from typing import List, Dict, Optional, Any
from pydantic import BaseModel, Field
import json
from io import BytesIO
import time
import ipywidgets as widgets
from IPython.display import display, Markdown, HTML

# Notebook-wide plotting defaults: seaborn "whitegrid" theme and a (12, 8)
# default figure size for figures that do not pass their own figsize.
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (12, 8)

## 2. Define Data Models
Creating Pydantic models to represent structured data from court documents.

In [None]:
# Define structured output models using Pydantic
class CaseNumber(BaseModel):
    """Structured form of a single case-number citation found in a document.

    Every field except ``nature`` must be supplied; ``nature`` may be omitted
    and then defaults to ``None``.
    """
    type: str = Field(description="Type of case (e.g., Appeal, Civil Appeal)")
    # Explicit default: under Pydantic v2 an Optional[...] annotation alone does
    # not make a field optional, so without it callers would have to pass nature.
    nature: Optional[str] = Field(default=None, description="Nature of case (e.g., civil, criminal)")
    sequential_number: str = Field(description="Sequential number of the case")
    year: str = Field(description="Year of filing")
    full_citation: str = Field(description="Full citation as appears in document")

class Party(BaseModel):
    """A named party to the case together with the role it plays.

    ``description`` may be omitted and then defaults to ``None``.
    """
    name: str = Field(description="Name of the party")
    role: str = Field(description="Role of the party (e.g., Petitioner, Respondent)")
    # Explicit default: under Pydantic v2 an Optional[...] annotation alone does
    # not make a field optional, so without it callers would have to pass it.
    description: Optional[str] = Field(default=None, description="Additional description if available")

class ConsolidatedCase(BaseModel):
    """One case heard together with the main case: its citation and its two sides."""
    # All three fields are required; the Ellipsis marks that explicitly.
    case_number: str = Field(
        ...,
        description="Case number citation",
    )
    petitioner: str = Field(
        ...,
        description="Petitioner/appellant in the case",
    )
    respondent: str = Field(
        ...,
        description="Respondent in the case",
    )

## 3. PDF Text Extraction
Implementing functions to extract the plain text of a PDF document page by page, plus a helper for previewing the extracted text.

In [None]:
# Advanced text extraction from PDF
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The result of ``page.get_text("text")`` for each page, joined without
        any extra separator. An empty string if the document has no pages.

    Errors raised by ``fitz.open`` or ``get_text`` are not caught here.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text("text") for page in doc)

# Function to display sample text extraction
def display_text_sample(text, max_length=1000):
    """Print the length of ``text`` followed by a preview between two rules.

    The preview is the whole text when it fits in ``max_length`` characters;
    otherwise it is the first ``max_length`` characters followed by "...".
    """
    rule = "-" * 80

    if len(text) > max_length:
        preview = text[:max_length] + "..."
    else:
        preview = text

    print(f"Total text length: {len(text)} characters")
    print("\nSample text:")
    print(rule)
    print(preview)
    print(rule)

## 4. Case Information Extraction
Developing functions to extract various pieces of information from court documents using regex patterns.

In [None]:
# Extract case numbers using advanced regex patterns
def extract_case_numbers(text: str) -> Dict[str, Any]:
    """Find the primary case number and every related case number in ``text``.

    Args:
        text: Full text of the court document.

    Returns:
        A dict with two keys:
          * ``"primary"`` - a ``CaseNumber`` for the first
            "Appeal (civil) N of YYYY" citation, or ``None`` if there is none.
          * ``"related"`` - a list of ``CaseNumber`` objects for every match of
            any supported citation format, in pattern order then document
            order, excluding citations whose text equals the primary one.
    """
    # Each pattern carries the case type it denotes. The type is taken from the
    # pattern rather than re-derived from the matched text, because the
    # patterns accept any whitespace (including line breaks) between words and
    # a literal "Civil Appeal" substring test would miss such matches.
    typed_patterns = [
        ("Appeal", r"Appeal\s+\(civil\)\s+(\d+)\s+of\s+(\d{4})"),
        ("Civil Appeal", r"Civil\s+Appeal\s+No\.\s*(\d+(?:-\d+)?)\s+of\s+(\d{4})"),
        ("Transfer Case (Civil)", r"Transfer\s+Case\s+\(Civil\)\s+Nos\.\s*(\d+(?:-\d+)?)\s+of\s+(\d{4})"),
        ("Civil Appeal", r"Civil\s+Appeal\s+Nos\.\s*(\d+(?:-\d+)?)\s+of\s+(\d{4})"),
    ]

    # The first pattern doubles as the primary-citation pattern.
    primary_match = re.search(typed_patterns[0][1], text)
    primary = None
    if primary_match:
        number, year = primary_match.group(1), primary_match.group(2)
        primary = CaseNumber(
            type="Appeal",
            nature="civil",
            sequential_number=number,
            year=year,
            # Normalised form of the citation (single spaces between words).
            full_citation=f"Appeal (civil) {number} of {year}",
        )

    related = []
    for index, (case_type, pattern) in enumerate(typed_patterns):
        for match in re.finditer(pattern, text):
            # Do not list the primary citation again among the related cases.
            if index == 0 and primary_match and match.group(0) == primary_match.group(0):
                continue

            related.append(CaseNumber(
                type=case_type,
                nature="civil",
                sequential_number=match.group(1),
                year=match.group(2),
                full_citation=match.group(0),
            ))

    return {"primary": primary, "related": related}

# Extract parties using specialized pattern matching
def extract_parties(text: str) -> Dict[str, Any]:
    """Extract the main parties and any "X Versus Y" pairs from ``text``.

    Args:
        text: Full text of the court document.

    Returns:
        A dict with two keys:
          * ``"main"`` - ``Party`` objects built from the "PETITIONER:" /
            "APPELLANT:" and "RESPONDENT:" header sections (petitioner first),
            each present only if its header was found.
          * ``"consolidated"`` - a ``ConsolidatedCase`` for every single-line
            "<left> Versus <right>" or "<left> vs. <right>" occurrence where
            neither side is one of the main parties. ``case_number`` is the
            placeholder "Related Case".
    """
    parties = {"main": [], "consolidated": []}

    # Header sections: the name runs from the label up to the next label
    # (or the end of the text), possibly across several lines.
    header_patterns = (
        ("Petitioner", r"(?:PETITIONER|APPELLANT):\s*(.*?)(?=RESPONDENT:|$)"),
        ("Respondent", r"RESPONDENT:\s*(.*?)(?=DATE OF JUDGMENT:|$)"),
    )
    for role, pattern in header_patterns:
        header_match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
        if header_match:
            parties["main"].append(Party(
                name=header_match.group(1).strip(),
                role=role,
                description=None,
            ))

    # The main parties do not change while scanning, so collect their names once.
    main_petitioners = {p.name for p in parties["main"] if p.role == "Petitioner"}
    main_respondents = {p.name for p in parties["main"] if p.role == "Respondent"}

    versus_pattern = r"([^\n]+)\s+(?:Versus|vs\.)\s+([^\n]+)"
    for match in re.finditer(versus_pattern, text):
        petitioner = match.group(1).strip()
        respondent = match.group(2).strip()

        # Skip pairs that involve a party already recorded for the main case.
        if petitioner in main_petitioners or respondent in main_respondents:
            continue

        parties["consolidated"].append(ConsolidatedCase(
            case_number="Related Case",
            petitioner=petitioner,
            respondent=respondent,
        ))

    return parties

# Extract judgment date
def extract_judgment_date(text: str) -> str:
    """Return the judgment date found in ``text``, or "Date not found".

    Three label formats are tried in order (case-insensitively) and the date
    from the first one that matches is returned with surrounding whitespace
    removed.
    """
    candidates = (
        r"DATE OF JUDGMENT:\s*(\d{2}/\d{2}/\d{4})",
        r"Dated:\s*(\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]+,\s+\d{4})",
        r"judgment delivered on\s*:\s*(\d{1,2}(?:st|nd|rd|th)?\s+[A-Za-z]+,\s+\d{4})",
    )

    # Lazily search each format; stop at the first one that hits.
    searches = (re.search(candidate, text, re.IGNORECASE) for candidate in candidates)
    first_hit = next((hit for hit in searches if hit), None)

    if first_hit is None:
        return "Date not found"
    return first_hit.group(1).strip()

# Extract case background using advanced pattern matching
def extract_case_background(text: str) -> Dict[str, Any]:
    """Pull a background passage, the Acts it names and broad issue tags from ``text``.

    Returns:
        A dict with three keys:
          * ``"case_background"`` - the passage captured by the first matching
            background pattern (or by a looser fallback), stripped; "" if
            nothing matched.
          * ``"constitutional_issues"`` - fixed labels added when certain
            keywords occur anywhere in ``text``.
          * ``"challenged_acts"`` - every "<Name> Act, <year>" found inside the
            background passage.
    """
    section_patterns = (
        r"The Constitutional validity of(.*?)(?:\n\d+\.|\Z)",
        r"JUDGMENT\s+\n+(.*?)(?=\n[A-Z\s]+:|\Z)",
        r"(?:INTRODUCTION|BACKGROUND|FACTS)\s*\n+(.*?)(?=\n[A-Z\s]+:|\Z)",
    )

    # The first pattern that matches decides the background, even if the
    # captured passage turns out to be empty.
    background = ""
    for section_pattern in section_patterns:
        found = re.search(section_pattern, text, re.DOTALL)
        if found:
            background = found.group(1).strip()
            break

    # Fallback: the first paragraph that follows the line containing "JUDGMENT".
    if not background:
        fallback = re.search(r"JUDGMENT.*?\n+(.*?)(?=\n\n|\Z)", text, re.DOTALL)
        if fallback:
            background = fallback.group(1).strip()

    # Acts are looked for in the background passage only, not the whole text.
    acts = re.findall(
        r"((?:The\s+)?[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+Act,\s+\d{4})",
        background,
    )

    # Keyword-based issue tags are derived from the whole text.
    lowered = text.lower()
    issues = []
    if "constitutional validity" in lowered:
        issues.append("Constitutional validity of state legislation")
    if "jurisdiction" in lowered and "high court" in lowered:
        issues.append("Jurisdiction of High Courts")

    return {
        "case_background": background,
        "constitutional_issues": issues,
        "challenged_acts": acts,
    }

## 5. Complete Extraction and Formatting Functions
Combining all extraction functions into a single workflow and formatting the results.

In [None]:
# Main extraction function
def extract_court_case_info(pdf_path: str) -> Dict[str, Any]:
    """Run every extractor on the PDF at ``pdf_path`` and merge the results.

    Returns a flat dict with the keys ``primary_case_number``,
    ``related_case_numbers``, ``main_parties``, ``consolidated_cases``,
    ``judgment_date``, ``case_background``, ``constitutional_issues`` and
    ``challenged_acts``.
    """
    document_text = extract_text_from_pdf(pdf_path)

    numbers = extract_case_numbers(document_text)
    litigants = extract_parties(document_text)
    decided_on = extract_judgment_date(document_text)
    background = extract_case_background(document_text)

    merged = {
        "primary_case_number": numbers["primary"],
        "related_case_numbers": numbers["related"],
        "main_parties": litigants["main"],
        "consolidated_cases": litigants["consolidated"],
        "judgment_date": decided_on,
    }
    # The background extractor already uses the final key names.
    for key in ("case_background", "constitutional_issues", "challenged_acts"):
        merged[key] = background[key]
    return merged

# Format output to match the desired structure
def format_output(extraction_result: Dict[str, Any]) -> str:
    """Render an extraction result as a Markdown report.

    Args:
        extraction_result: Dict with the keys ``primary_case_number``,
            ``related_case_numbers``, ``main_parties``, ``consolidated_cases``,
            ``judgment_date``, ``case_background``, ``constitutional_issues``
            and ``challenged_acts``. Case-number and party entries are read
            through attributes only (``full_citation``, ``type``, ``nature``,
            ``sequential_number``, ``year``, ``role``, ``name``,
            ``petitioner``, ``respondent``).

    Returns:
        The report as one newline-joined string with the sections
        "Case Numbers", "Parties Names", "Hearing Dates" and "Case Background".
        When no main party or no judgment date was extracted, the report says
        so instead of claiming the document "clearly" states it.
    """
    output = []

    # Case Numbers section
    output.append("## Case Numbers")
    primary = extraction_result["primary_case_number"]
    if primary:
        output.extend([
            f'The primary case number is formatted as "{primary.full_citation}". This format includes:',
            f"- Type of case ({primary.type})",
            f"- Nature ({primary.nature})",
            f"- Sequential number ({primary.sequential_number})",
            f"- Year of filing ({primary.year})",
            "",
        ])

    related_cases = extraction_result["related_case_numbers"]
    if related_cases:
        output.append("The document also includes multiple related case numbers that were heard together:")
        output.extend(f"- {case.full_citation}" for case in related_cases)
        output.append("")

    # Parties Names section
    output.append("## Parties Names")
    main_parties = extraction_result["main_parties"]
    if main_parties:
        output.append("The document clearly identifies the parties to the litigation:")
        output.append("")
        output.append("**Main Case:**")
        output.extend(f"- {party.role}: {party.name}" for party in main_parties)
    else:
        # Nothing was extracted, so do not claim the parties are identified.
        output.append("No main parties could be identified in the document.")
    output.append("")

    consolidated_cases = extraction_result["consolidated_cases"]
    if consolidated_cases:
        output.append("**Consolidated Cases with Multiple Parties:**")
        output.extend(f"- {case.petitioner} vs. {case.respondent}" for case in consolidated_cases)
        output.append("")

    # Hearing Dates section
    output.append("## Hearing Dates")
    judgment_date = extraction_result["judgment_date"]
    # "Date not found" is the placeholder used when no date was extracted;
    # it must not be reported as if it were a date printed in the document.
    if judgment_date and judgment_date != "Date not found":
        output.append(
            f'The judgment date is clearly marked as "{judgment_date}". '
            "This represents the final date when the judgment was delivered, "
            "not necessarily when hearings took place."
        )
    else:
        output.append("No judgment date could be identified in the document.")
    output.append("")

    # Case Background section
    output.append("## Case Background")
    challenged_acts = extraction_result["challenged_acts"]
    if challenged_acts:
        output.append("The case involves constitutional challenges to several legislative acts:")
        output.extend(f"- {act}" for act in challenged_acts)
        output.append("")

    output.append(extraction_result["case_background"])
    output.append("")

    constitutional_issues = extraction_result["constitutional_issues"]
    if constitutional_issues:
        output.append("The case fundamentally concerns the following constitutional issues:")
        output.extend(f"- {issue}" for issue in constitutional_issues)

    return "\n".join(output)

# Convert Pydantic models to a dict that can be serialized to JSON
def pydantic_to_dict(obj):
    """Recursively convert models and containers into plain dicts and lists.

    Args:
        obj: Any value. Dicts (including subclasses) are converted value by
            value; lists and tuples become lists of converted items; any other
            object exposing ``__dict__`` becomes a dict of its converted
            attributes (the ``__initialised__`` entry is skipped); everything
            else is returned unchanged.

    Returns:
        A structure made of dicts, lists and leaf values.
    """
    # Containers are checked before the generic __dict__ branch: dict
    # subclasses such as collections.Counter also carry an (empty) instance
    # __dict__, and taking that branch first would discard their contents.
    if isinstance(obj, dict):
        return {k: pydantic_to_dict(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        # Tuples are emitted as lists so that models nested inside them are
        # converted too.
        return [pydantic_to_dict(v) for v in obj]
    if hasattr(obj, "__dict__"):
        return {k: pydantic_to_dict(v) for k, v in obj.__dict__.items() if k != "__initialised__"}
    return obj

## 6. Visualize Results
Creating visualizations to better understand the extracted data.

In [None]:
def visualize_case_data(extraction_result: Dict[str, Any], title: str = "Case Analysis"):
    """Draw a two-panel summary figure for one extraction result and show it.

    Left panel: number of parties per role, split into main-case and
    consolidated-case parties. Right panel: histogram of the filing years of
    the primary and related case numbers, with a dashed line at the judgment
    year when one can be read from the judgment date. Each panel shows a
    placeholder message when it has no data.
    """
    def _first_word(full_name):
        # Only the first word of a name is kept; blank names become "Unknown".
        words = full_name.split()
        return words[0] if words else "Unknown"

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))
    fig.suptitle(title, fontsize=16)

    # --- Left panel: parties -------------------------------------------------
    rows = [
        {"Name": _first_word(party.name), "Type": party.role, "Category": "Main Case"}
        for party in extraction_result["main_parties"]
    ]
    for joined_case in extraction_result["consolidated_cases"]:
        rows.append({"Name": _first_word(joined_case.petitioner),
                     "Type": "Petitioner",
                     "Category": "Consolidated"})
        rows.append({"Name": _first_word(joined_case.respondent),
                     "Type": "Respondent",
                     "Category": "Consolidated"})

    if not rows:
        ax1.text(0.5, 0.5, "No party data available", ha='center', va='center')
    else:
        parties_df = pd.DataFrame(rows)
        sns.countplot(x="Type", hue="Category", data=parties_df, ax=ax1)
        ax1.set_title("Parties by Role and Case Type")
        ax1.set_xlabel("Party Role")
        ax1.set_ylabel("Count")

    # --- Right panel: filing years -------------------------------------------
    primary = extraction_result["primary_case_number"]
    if not primary:
        ax2.text(0.5, 0.5, "No case number data available", ha='center', va='center')
    else:
        years = [int(primary.year)]
        years.extend(int(case.year) for case in extraction_result["related_case_numbers"])

        if years:
            # One bin per distinct filing year.
            ax2.hist(years, bins=len(set(years)), alpha=0.7, color='green')
            ax2.set_title("Distribution of Case Filing Years")
            ax2.set_xlabel("Year")
            ax2.set_ylabel("Number of Cases")

            # Mark the judgment year when the date string contains a 19xx/20xx year.
            judgment_date = extraction_result["judgment_date"]
            if judgment_date not in ["Date not found", ""]:
                year_match = re.search(r"\b(19|20)\d{2}\b", judgment_date)
                if year_match:
                    judgment_year = int(year_match.group(0))
                    ax2.axvline(x=judgment_year, color='red', linestyle='--',
                                label=f'Judgment ({judgment_year})')
                    ax2.legend()
        else:
            ax2.text(0.5, 0.5, "No year data available", ha='center', va='center')

    # Leave room at the top of the figure for the suptitle.
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()

## 7. Process Sample Documents
Processing example court documents and displaying the results.

In [None]:
# This cell would typically load and process a sample PDF document
# Since we don't have actual PDF documents here, we'll create a simulated example

def simulate_sample_document() -> str:
    """Return a hard-coded text that stands in for an extracted court judgment.

    The text contains a primary "Appeal (civil)" citation, two "Civil Appeal No."
    citations under "WITH", PETITIONER / RESPONDENT / DATE OF JUDGMENT headers
    and a short judgment body, so it can be fed to the extraction functions
    without a PDF. The literal is returned verbatim, including its leading
    newline and the indentation of each line.
    """
    sample_text = """
    IN THE SUPREME COURT OF INDIA
    CIVIL APPELLATE JURISDICTION
    
    Appeal (civil) 6756 of 2004
    
    WITH
    Civil Appeal No. 6757 of 2004
    Civil Appeal No. 6758 of 2004
    
    PETITIONER:
    State of Maharashtra & Ors.
    
    RESPONDENT:
    Dr. Praful B. Desai & Anr.
    
    DATE OF JUDGMENT: 14/04/2005
    
    JUDGMENT
    
    The Constitutional validity of Section 45 of the Evidence Act, as amended by Act 21 of 2000 is under challenge in this Writ Petition.
    
    This case fundamentally concerns the admissibility of evidence recorded through video-conferencing. The High Court of Bombay 
    has held that evidence can be recorded by video-conferencing. This judgment is under challenge.
    
    The Maharashtra Medical Council Act, 1965 and the Indian Medical Council Act, 1956 both provide for regulation of medical practice in India.
    """
    
    return sample_text

# Process the simulated document
def process_simulated_document():
    """Run the whole pipeline on the simulated document and return the result.

    Extracts case numbers, parties, judgment date and background from the
    simulated text, displays the formatted Markdown report, draws the summary
    figure, and returns the compiled extraction dict.
    """
    document_text = simulate_sample_document()

    print("Processing simulated Supreme Court document...\n")

    # Run each extractor on the same text.
    numbers = extract_case_numbers(document_text)
    litigants = extract_parties(document_text)
    decided_on = extract_judgment_date(document_text)
    background = extract_case_background(document_text)

    extraction_result = {
        "primary_case_number": numbers["primary"],
        "related_case_numbers": numbers["related"],
        "main_parties": litigants["main"],
        "consolidated_cases": litigants["consolidated"],
        "judgment_date": decided_on,
        "case_background": background["case_background"],
        "constitutional_issues": background["constitutional_issues"],
        "challenged_acts": background["challenged_acts"],
    }

    # Heading first, then the report body, each as its own Markdown display.
    report = format_output(extraction_result)
    for markdown_source in ("## Extracted Information", report):
        display(Markdown(markdown_source))

    visualize_case_data(extraction_result, "Sample Document Analysis")

    return extraction_result

# Run the pipeline on the simulated document when this cell executes; the
# returned extraction dict is kept in sample_result for the export cell below.
sample_result = process_simulated_document()

## 8. Export Results
Implementing functions to export the structured data to various formats.

In [None]:
def export_to_json(extraction_result, filename="court_case_data.json"):
    """Write ``extraction_result`` to ``filename`` as indented JSON.

    Models inside the result are first flattened with ``pydantic_to_dict``.
    Returns the JSON text that was written.
    """
    json_str = json.dumps(pydantic_to_dict(extraction_result), indent=2)

    with open(filename, mode='w', encoding='utf-8') as handle:
        handle.write(json_str)

    print(f"Data exported to {filename}")
    return json_str

def export_to_excel(extraction_result, filename="court_case_data.xlsx", source_name="Sample Document"):
    """Write a one-row summary of ``extraction_result`` to an Excel file.

    Args:
        extraction_result: Dict as produced by the extraction pipeline
            (keys ``primary_case_number``, ``related_case_numbers``,
            ``main_parties``, ``judgment_date``, ``case_background``).
        filename: Path of the Excel file to write.
        source_name: Label stored in the "Filename" column, identifying the
            document the row was extracted from. Defaults to
            "Sample Document".

    Returns:
        The single-row ``DataFrame`` that was written.
    """
    def names_with_role(role):
        # Comma-separated names of the main parties holding the given role.
        return ", ".join(
            party.name for party in extraction_result["main_parties"] if party.role == role
        )

    primary = extraction_result["primary_case_number"]
    background = extraction_result["case_background"]
    # Keep the spreadsheet cell readable: cap the background at 500 characters.
    if len(background) > 500:
        background = background[:500] + "..."

    data = {
        "Filename": [source_name],
        "Primary Case Number": [primary.full_citation if primary else ""],
        # Joining an empty list already yields "", so no special case is needed.
        "Related Cases": [", ".join(case.full_citation for case in extraction_result["related_case_numbers"])],
        "Petitioners": [names_with_role("Petitioner")],
        "Respondents": [names_with_role("Respondent")],
        "Judgment Date": [extraction_result["judgment_date"]],
        "Case Background": [background],
    }

    df = pd.DataFrame(data)
    df.to_excel(filename, index=False)

    print(f"Data exported to {filename}")
    return df

def export_to_markdown(formatted_output, filename="court_case_report.md"):
    """Save the already-formatted report text to ``filename`` and return it unchanged."""
    report_file = open(filename, 'w', encoding='utf-8')
    with report_file:
        report_file.write(formatted_output)

    print(f"Report exported to {filename}")
    return formatted_output

# Example export: write the sample extraction result to JSON and Excel, and the
# formatted report to Markdown. Files are created relative to the current
# working directory; each call's return value is kept for inspection.
print("Exporting sample data to various formats:")
json_output = export_to_json(sample_result, "sample_court_case.json")
excel_output = export_to_excel(sample_result, "sample_court_case.xlsx")
markdown_output = export_to_markdown(format_output(sample_result), "sample_court_case.md")

## 9. Conclusion
This notebook has demonstrated a complete workflow for analyzing court documents. The techniques include:

1. Text extraction from PDF documents
2. Structured data extraction using regex patterns
3. Modeling the data using Pydantic classes
4. Visualizing the extracted information
5. Exporting to various formats for further analysis

The system can be extended to handle more document types or extract additional information by adding new extraction functions and data models.