## Best One

The code extracts case data from a DOCX file by combining style analysis and text pattern recognition. It uses styles to identify "Part" and "Topic" headings, then extracts key-value pairs (like "Name of the case") using a combination of line-based extraction and bold text analysis with regex matching. Data is merged from different extraction methods and refined to ensure accuracy and consistency.

Key Insights (Short)

A combined approach (styles + regex) is essential.
Line-based key extraction improves accuracy.
Handling variations in formatting is necessary.
Iterative refinement is crucial for high accuracy.
Data cleaning and post-processing are vital.








In [5]:
import docx
import re
import json
from collections import defaultdict

def clean_text(text):
    """Cleans up text by removing extra whitespace and formatting."""
    if not text:
        return None
    # Remove asterisks that indicate bold formatting
    text = text.replace("**", "")
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def extract_case_data(docx_file):
    """
    Extracts case data from DOCX file into simplified flat structure.
    Each case will have a Part field indicating which constitutional part it belongs to.
    """
    doc = docx.Document(docx_file)
    
    # Get full text first to help with regex matching
    full_text = "\n".join([para.text for para in doc.paragraphs])
    
    # Extract parts structure to map topics to parts
    parts_map = {}
    current_part = None
    part_pattern = re.compile(r'Part\s+([IVX]+)\s+(.*?)(?:\(Article\s+(\d+)\s*[–-]\s*(\d+)\))?')
    
    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue
            
        part_match = part_pattern.search(text)
        if part_match:
            current_part = {
                "number": part_match.group(1),
                "name": part_match.group(2).strip(),
                "full_name": f"Part {part_match.group(1)} {part_match.group(2).strip()}"
            }
            
            if part_match.group(3) and part_match.group(4):
                current_part["article_range"] = f"{part_match.group(3)} - {part_match.group(4)}"
        
        # Check if this is a topic
        if text.startswith("Topic:") and current_part:
            topic_name = text[6:].strip()
            parts_map[topic_name] = current_part
    
    # Now extract cases using regex approach - more reliable for fragmented content
    cases = []
    
    # Split by Topic markers
    topic_pattern = re.compile(r"Topic:\s*(.*?)(?=Topic:|$)", re.DOTALL)
    topic_blocks = topic_pattern.findall(full_text)
    
    for block in topic_blocks:
        if not block.strip():
            continue
            
        # Initialize a case dictionary with all fields
        case = {
            "Topic": None,
            "Part": None,
            "Part_Name": None,
            "Name of the case": None,
            "Issue": None,
            "Bench": None,
            "Fact of the Case": None,
            "Ratio": None,
            "Judgment": None,
            "case_details": None  # New field for uncategorized content
        }
        
        # Extract topic name (everything before the first line break or special marker)
        topic_name_match = re.match(r"(.*?)(?=\n|\*\*|Name of the case:|Issue:|Bench:|Fact of the Case:|Ratio:|Judgment:)", block, re.DOTALL)
        if topic_name_match:
            topic = clean_text(topic_name_match.group(1))
            case["Topic"] = topic
            
            # Try to find which part this topic belongs to
            for topic_key, part_info in parts_map.items():
                if topic and topic_key in topic:
                    case["Part"] = part_info["number"]
                    case["Part_Name"] = part_info["name"]
                    break
        
        # Extract case name
        case_name_match = re.search(r"\*\*Name of the case:\s*(.*?)\*\*(?=\*\*|\n|Issue:|Bench:|Fact of the Case:|Ratio:|Judgment:|$)", block, re.DOTALL)
        if case_name_match:
            case["Name of the case"] = clean_text(case_name_match.group(1))
        else:
            # Try alternate pattern with colon
            case_name_match = re.search(r"Name of the case:\s*(.*?)(?=\*\*|\n|Issue:|Bench:|Fact of the Case:|Ratio:|Judgment:|$)", block, re.DOTALL)
            if case_name_match:
                case["Name of the case"] = clean_text(case_name_match.group(1))
            else:
                # Try to extract from topic text if it contains "v." pattern
                if case["Topic"]:
                    case_in_topic = re.search(r"(.*?)\s*v\.\s*(.*?)(?=,|\(|$)", case["Topic"])
                    if case_in_topic:
                        case["Name of the case"] = f"{case_in_topic.group(1).strip()} v. {case_in_topic.group(2).strip()}"
                    else:
                        # Look for AIR citation pattern which often follows case name
                        air_match = re.search(r"(.*?)(?:,\s*AIR\s*\d{4}|$)", case["Topic"])
                        if air_match:
                            case["Name of the case"] = clean_text(air_match.group(1))
        
        # Extract issue - capturing multiline content
        issue_match = re.search(r"\*\*Issue:\*\*\s*(.*?)(?=\*\*Bench|\*\*Fact of the Case|\*\*Ratio|\*\*Judgment|\n\n|$)", block, re.DOTALL)
        if issue_match:
            case["Issue"] = clean_text(issue_match.group(1))
        else:
            issue_match = re.search(r"Issue:\s*(.*?)(?=Bench:|Fact of the Case:|Ratio:|Judgment:|\n\n|$)", block, re.DOTALL)
            if issue_match:
                case["Issue"] = clean_text(issue_match.group(1))
        
        # Extract bench - capturing multiline content
        bench_match = re.search(r"\*\*Bench\*\*\s*(.*?)(?=\*\*Fact of the Case|\*\*Ratio|\*\*Judgment|\*\*Issue|\n\n|$)", block, re.DOTALL)
        if not bench_match:
            bench_match = re.search(r"\*\*Bench:\*\*\s*(.*?)(?=\*\*Fact of the Case|\*\*Ratio|\*\*Judgment|\*\*Issue|\n\n|$)", block, re.DOTALL)
        if not bench_match:
            bench_match = re.search(r"Bench:\s*(.*?)(?=Fact of the Case:|Ratio:|Judgment:|Issue:|\n\n|$)", block, re.DOTALL)
        if bench_match:
            case["Bench"] = clean_text(bench_match.group(1))
        
        # Extract facts - capturing multiline content
        facts_match = re.search(r"\*\*Fact of the Case:\s*\*\*(.*?)(?=\*\*Ratio|\*\*Judgment|\*\*Issue|\*\*Bench|\n\n|$)", block, re.DOTALL)
        if not facts_match:
            facts_match = re.search(r"\*\*Fact of the Case:\*\*\s*(.*?)(?=\*\*Ratio|\*\*Judgment|\*\*Issue|\*\*Bench|\n\n|$)", block, re.DOTALL)
        if not facts_match:
            facts_match = re.search(r"Fact of the Case:\s*(.*?)(?=Ratio:|Judgment:|Issue:|Bench:|\n\n|$)", block, re.DOTALL)
        if facts_match:
            case["Fact of the Case"] = clean_text(facts_match.group(1))
        
        # Extract ratio - capturing multiline content
        ratio_match = re.search(r"\*\*Ratio:\*\*\s*(.*?)(?=\*\*Judgment|\*\*Issue|\*\*Bench|\*\*Fact|\n\n|$)", block, re.DOTALL)
        if not ratio_match:
            ratio_match = re.search(r"Ratio:\s*(.*?)(?=Judgment:|Issue:|Bench:|Fact of the Case:|\n\n|$)", block, re.DOTALL)
        if ratio_match:
            case["Ratio"] = clean_text(ratio_match.group(1))
        
        # Extract judgment - capturing multiline content
        judgment_match = re.search(r"\*\*Judgment:\s*\*\*(.*?)(?=\*\*Issue|\*\*Bench|\*\*Fact|\*\*Ratio|\n\n|$)", block, re.DOTALL)
        if not judgment_match:
            judgment_match = re.search(r"\*\*Judgment:\*\*\s*(.*?)(?=\*\*Issue|\*\*Bench|\*\*Fact|\*\*Ratio|\n\n|$)", block, re.DOTALL)
        if not judgment_match:
            judgment_match = re.search(r"Judgment:\s*(.*?)(?=Topic:|Issue:|Bench:|Fact of the Case:|Ratio:|\n\n|$)", block, re.DOTALL)
        if judgment_match:
            case["Judgment"] = clean_text(judgment_match.group(1))
        
        # Find any uncategorized content
        categorized_text = ""
        if case["Topic"]:
            categorized_text += case["Topic"]
        if case["Name of the case"]:
            categorized_text += " " + case["Name of the case"]
        if case["Issue"]:
            categorized_text += " " + case["Issue"]
        if case["Bench"]:
            categorized_text += " " + case["Bench"]
        if case["Fact of the Case"]:
            categorized_text += " " + case["Fact of the Case"]
        if case["Ratio"]:
            categorized_text += " " + case["Ratio"]
        if case["Judgment"]:
            categorized_text += " " + case["Judgment"]
            
        # Clean up the block and see if there's any uncategorized content
        clean_block = clean_text(block)
        if clean_block and clean_block != categorized_text:
            # Find text that isn't in any of the categorized fields
            uncategorized = clean_block
            
            # Remove categorized content - this is approximate
            for field in ["Name of the case", "Issue", "Bench", "Fact of the Case", "Ratio", "Judgment"]:
                if case[field]:
                    uncategorized = uncategorized.replace(case[field], "")
            
            # Clean up again and store if anything remains
            uncategorized = clean_text(uncategorized)
            if uncategorized and uncategorized != case["Topic"]:
                case["case_details"] = uncategorized
        
        # Include the case if it has at least topic information
        if case["Topic"]:
            cases.append(case)
    
    return cases

def extract_case_data_with_styles(docx_file):
    """
    Extracts case data using paragraph styles and formatting.
    Preserves multiline text for each field.
    """
    doc = docx.Document(docx_file)
    
    cases = []
    current_case = None
    current_part = None
    current_field = None
    
    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue
        
        # Check if paragraph is a Part header
        part_match = re.match(r'Part\s+([IVX]+)\s+(.*?)(?:\(Article\s+(\d+)\s*[–-]\s*(\d+)\))?', text)
        if part_match or (hasattr(para, 'style') and para.style.name == 'Heading 1'):
            if part_match:
                current_part = {
                    "number": part_match.group(1),
                    "name": part_match.group(2).strip() if part_match.group(2) else text,
                }
            else:
                # Try to extract part info from text when style is Heading 1
                alt_match = re.search(r'Part\s+([IVX]+)\s+(.*?)(?:\(Article|$)', text)
                if alt_match:
                    current_part = {
                        "number": alt_match.group(1),
                        "name": alt_match.group(2).strip(),
                    }
                else:
                    current_part = {
                        "number": "Unknown",
                        "name": text,
                    }
            
            # Add the current case if it exists
            if current_case and current_case.get("Topic"):
                cases.append(current_case)
                current_case = None
        
        # Check if paragraph is a Topic
        elif text.startswith("Topic:") or (hasattr(para, 'style') and para.style.name == 'Heading 2'):
            # Save the current case if it exists
            if current_case and current_case.get("Topic"):
                cases.append(current_case)
            
            # Start a new case
            current_case = {
                "Topic": text[6:].strip() if text.startswith("Topic:") else text,
                "Part": current_part["number"] if current_part else None,
                "Part_Name": current_part["name"] if current_part else None,
                "Name of the case": None,
                "Issue": None,
                "Bench": None,
                "Fact of the Case": None,
                "Ratio": None,
                "Judgment": None,
                "case_details": None
            }
            current_field = None
        
        # Check for other fields using bold formatting
        elif current_case:
            has_bold = False
            bold_text = ""
            
            # Identify bold runs which might indicate field names
            for run in para.runs:
                if run.bold:
                    has_bold = True
                    bold_text += run.text
            
            # Check if the bold text matches known field names
            if has_bold:
                field_mapping = {
                    "Name of the case:": "Name of the case",
                    "Issue:": "Issue",
                    "Bench:": "Bench",
                    "Fact of the Case:": "Fact of the Case",
                    "Ratio:": "Ratio",
                    "Judgment:": "Judgment"
                }
                
                new_field_found = False
                for field_name, field_key in field_mapping.items():
                    if bold_text.startswith(field_name) or field_name in text:
                        current_field = field_key
                        new_field_found = True
                        # Extract value - remove the field name
                        value = text.replace(bold_text, "").strip()
                        if not value:  # If we couldn't extract it, try removing just the field name
                            value = text.replace(field_name, "").strip()
                        
                        if value:
                            current_case[current_field] = value
                        break
                
                # If we didn't match a new field but have bold text in the current paragraph
                if not new_field_found and current_field:
                    # It's probably continuation of the current field with some bold text
                    if current_case[current_field]:
                        current_case[current_field] += " " + text
                    else:
                        current_case[current_field] = text
            
            # If not a field but we have a current field, append to it
            elif current_field:
                if current_case[current_field]:
                    current_case[current_field] += " " + text
                else:
                    current_case[current_field] = text
            
            # If we don't have a current field but have text, it might be part of Topic or uncategorized content
            elif not current_field:
                if current_case["case_details"]:
                    current_case["case_details"] += " " + text
                else:
                    current_case["case_details"] = text
    
    # Add the last case if it exists
    if current_case and current_case.get("Topic"):
        cases.append(current_case)
    
    return cases

def merge_case_data(regex_cases, style_cases):
    """
    Merges cases extracted using both regex and style-based approaches.
    Preserves all content by adding to case_details if it doesn't fit elsewhere.
    """
    merged_cases = []
    
    # First, try to match cases between the two approaches
    for regex_case in regex_cases:
        matched = False
        for style_case in style_cases:
            # Check if they're likely the same case
            if regex_case["Topic"] and style_case["Topic"] and \
               (regex_case["Topic"] in style_case["Topic"] or style_case["Topic"] in regex_case["Topic"]):
                matched = True
                # Create a new case with the best information from both
                merged_case = {
                    "Topic": regex_case["Topic"] if len(regex_case["Topic"] or "") > len(style_case["Topic"] or "") else style_case["Topic"],
                    "Part": regex_case["Part"] or style_case["Part"],
                    "Part_Name": regex_case["Part_Name"] or style_case["Part_Name"],
                    "Name of the case": regex_case["Name of the case"] or style_case["Name of the case"],
                    "Issue": regex_case["Issue"] or style_case["Issue"],
                    "Bench": regex_case["Bench"] or style_case["Bench"],
                    "Fact of the Case": regex_case["Fact of the Case"] or style_case["Fact of the Case"],
                    "Ratio": regex_case["Ratio"] or style_case["Ratio"],
                    "Judgment": regex_case["Judgment"] or style_case["Judgment"],
                    "case_details": None
                }
                
                # Combine case_details from both methods
                details = []
                if regex_case["case_details"]:
                    details.append(regex_case["case_details"])
                if style_case["case_details"]:
                    details.append(style_case["case_details"])
                
                if details:
                    merged_case["case_details"] = " ".join(details)
                
                merged_cases.append(merged_case)
                break
        
        if not matched:
            merged_cases.append(regex_case)
    
    # Add any style cases that weren't matched
    for style_case in style_cases:
        matched = False
        for merged_case in merged_cases:
            if style_case["Topic"] and merged_case["Topic"] and \
               (style_case["Topic"] in merged_case["Topic"] or merged_case["Topic"] in style_case["Topic"]):
                matched = True
                break
        
        if not matched:
            merged_cases.append(style_case)
    
    # Post-processing: attempt to extract case names from topics if still missing
    for case in merged_cases:
        if not case["Name of the case"] and case["Topic"]:
            # Look for name patterns in the topic
            name_match = re.search(r"([^,]+)\s+v\.\s+([^,]+)", case["Topic"])
            if name_match:
                case["Name of the case"] = f"{name_match.group(1).strip()} v. {name_match.group(2).strip()}"
            else:
                # Look for citations which often follow case names
                citation_match = re.search(r"([^,]+),\s*(?:AIR|SC|SCC)\s+\d{4}", case["Topic"])
                if citation_match:
                    case["Name of the case"] = citation_match.group(1).strip()
    
    return merged_cases

def extract_and_merge(docx_file):
    """
    Main function to extract and merge case data using both approaches.
    Ensures all content is preserved by adding to case_details if necessary.
    """
    # Get cases using regex approach
    regex_cases = extract_case_data(docx_file)
    
    # Get cases using style-based approach
    style_cases = extract_case_data_with_styles(docx_file)
    
    # Merge the results
    merged_cases = merge_case_data(regex_cases, style_cases)
    
    # Final cleanup and formatting
    for case in merged_cases:
        # Clean up any remaining formatting
        for key in case:
            if case[key]:
                case[key] = clean_text(case[key])
        
        # If Topic contains embedded fields, extract them
        if case["Topic"] and any(field in case["Topic"] for field in [
            "Name of the case:", "Issue:", "Bench:", "Fact of the Case:", "Ratio:", "Judgment:"
        ]):
            # Extract each field from the topic
            for field_name, field_key in {
                "Name of the case:": "Name of the case",
                "Issue:": "Issue",
                "Bench:": "Bench",
                "Fact of the Case:": "Fact of the Case",
                "Ratio:": "Ratio",
                "Judgment:": "Judgment"
            }.items():
                if field_name in case["Topic"] and not case[field_key]:
                    field_pattern = re.compile(f"{field_name}(.*?)(?=Name of the case:|Issue:|Bench:|Fact of the Case:|Ratio:|Judgment:|$)", re.DOTALL)
                    match = field_pattern.search(case["Topic"])
                    if match:
                        case[field_key] = clean_text(match.group(1))
            
            # Clean up the topic by removing extracted fields
            for field in ["Name of the case:", "Issue:", "Bench:", "Fact of the Case:", "Ratio:", "Judgment:"]:
                if field in case["Topic"]:
                    topic_parts = case["Topic"].split(field)
                    case["Topic"] = clean_text(topic_parts[0])
                    break
        
        # Check if we have any duplicate content between fields and case_details
        if case["case_details"]:
            for field_key in ["Topic", "Name of the case", "Issue", "Bench", "Fact of the Case", "Ratio", "Judgment"]:
                if case[field_key] and case[field_key] in case["case_details"]:
                    # Remove the duplicate content from case_details
                    case["case_details"] = clean_text(case["case_details"].replace(case[field_key], ""))
            
            # If after removing all duplicates, we have an empty case_details, set to None
            if not case["case_details"] or case["case_details"].strip() == "":
                case["case_details"] = None
    
    return merged_cases


docx_file = "../../Raw_data/100LandmarkCases.docx"

output_file = "../../Processed_Data/PreviousCases/landmark_cases.json"

# Extract and merge case data
cases = extract_and_merge(docx_file)

# Save to JSON
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(cases, f, indent=4, ensure_ascii=False)

print(f"Successfully processed {len(cases)} cases. Data saved to {output_file}")

Successfully processed 143 cases. Data saved to landmark_cases.json


## Divide in Multiple files

In [11]:
import os
import re
from docx import Document
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_CENTER, TA_JUSTIFY

def clean_filename(text):
    """Clean text to make it suitable for a filename"""
    # Remove invalid filename characters including newlines
    clean = re.sub(r'[\\/*?:"<>|\n\r\t]', " ", text)
    # Replace multiple spaces with a single space
    clean = re.sub(r'\s+', " ", clean)
    # Replace spaces with underscores
    clean = clean.strip().replace(' ', '_')
    # Limit filename length
    if len(clean) > 100:
        clean = clean[:100]
    return clean

def count_words(content_items):
    """Count total words in content items, excluding headings"""
    word_count = 0
    for item in content_items:
        if item["style"] == "BodyText":  # Only count body text, not headings
            word_count += len(item["text"].split())
    return word_count

def extract_and_save_sections(docx_path, output_folder, min_word_count=30):
    """Extract sections from Word document and save as separate PDFs"""
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Load the document
    doc = Document(docx_path)
    
    # Initialize variables
    current_h1 = None
    current_h2 = None
    content = []
    sections = {}  # Dictionary to store content of each section
    
    # Process paragraphs
    for para in doc.paragraphs:
        # Skip empty paragraphs
        if not para.text.strip():
            continue
        
        # Check paragraph style
        style_name = para.style.name if para.style else "Normal"
        
        if style_name == "Heading 1":
            current_h1 = para.text.strip()
            # Start fresh content list for new H1
            content = []
        elif style_name == "Heading 2" and current_h1:
            # If we already had an H2, save its content
            if current_h2 and content:
                section_name = f"{clean_filename(current_h2)}_{clean_filename(current_h1)}"
                sections[section_name] = content
            
            # Start new H2 section
            current_h2 = para.text.strip()
            content = [{"text": current_h2, "style": "H2Style"}]
        elif current_h1 and current_h2:
            # Add content to current section
            if style_name == "Heading 3":
                content.append({"text": para.text, "style": "H3Style"})
            else:
                # Handle normal text or other styles
                content.append({"text": para.text, "style": "BodyText"})
    
    # Save the last section
    if current_h1 and current_h2 and content:
        section_name = f"{clean_filename(current_h2)}_{clean_filename(current_h1)}"
        sections[section_name] = content
    
    # Create PDFs for each section that has enough content
    skipped_count = 0
    created_count = 0
    
    for section_name, section_content in sections.items():
        # Check if the section has enough content
        word_count = count_words(section_content)
        
        if word_count >= min_word_count:
            create_pdf(section_name, section_content, output_folder)
            created_count += 1
        else:
            skipped_count += 1
            print(f"Skipping {section_name}.pdf - only {word_count} words (minimum is {min_word_count})")
    
    print(f"Created {created_count} PDFs, skipped {skipped_count} sections with insufficient content")

def create_pdf(filename, content, output_folder):
    """Create a PDF file with the given content"""
    pdf_path = os.path.join(output_folder, f"{filename}.pdf")
    
    try:
        # Create PDF document
        doc = SimpleDocTemplate(pdf_path, pagesize=letter)
        
        # Set up styles
        styles = getSampleStyleSheet()
        
        # Create custom styles with unique names
        h2_style = ParagraphStyle(
            name='H2Style',
            fontName='Helvetica-Bold',
            fontSize=14,
            alignment=TA_CENTER,
            spaceAfter=12
        )
        
        h3_style = ParagraphStyle(
            name='H3Style',
            fontName='Helvetica-Bold',
            fontSize=12,
            spaceAfter=10
        )
        
        body_style = ParagraphStyle(
            name='BodyText',
            fontName='Helvetica',
            fontSize=10,
            alignment=TA_JUSTIFY,
            spaceAfter=8
        )
        
        # Create PDF content
        pdf_content = []
        
        for item in content:
            if item["style"] == "H2Style":
                pdf_content.append(Paragraph(item["text"], h2_style))
                pdf_content.append(Spacer(1, 12))
            elif item["style"] == "H3Style":
                pdf_content.append(Paragraph(item["text"], h3_style))
                pdf_content.append(Spacer(1, 8))
            else:
                pdf_content.append(Paragraph(item["text"], body_style))
                pdf_content.append(Spacer(1, 6))
        
        # Build PDF
        doc.build(pdf_content)
        print(f"Created: {pdf_path}")
    except Exception as e:
        print(f"Error creating {pdf_path}: {str(e)}")

if __name__ == "__main__":
    # File paths
    docx_file = "../../../Raw_data/100LandmarkCases.docx"
    output_folder = "../../../Processed_Data/PreviousCases/landmark_cases"
    
    # Minimum word count for content (excluding headings)
    min_words = 30
    
    # Extract and save sections
    extract_and_save_sections(docx_file, output_folder, min_words)
    print(f"Successfully processed document and saved PDFs to {output_folder}")

Created: ../../../Processed_Data/PreviousCases/landmark_cases/Topic_Reorganisation_Of_States_(_Article_3)_Part_I_The_Union_and_its_Territory_(Article_1_–_4).pdf
Created: ../../../Processed_Data/PreviousCases/landmark_cases/Topic_Development_of_The_Concept_Of_State_Under_Article_12_Part_III_Fundamental_Rights_(Article_12_–_35).pdf
Created: ../../../Processed_Data/PreviousCases/landmark_cases/Topic_Personal_Law_(Article_13)_Part_III_Fundamental_Rights_(Article_12_–_35).pdf
Created: ../../../Processed_Data/PreviousCases/landmark_cases/Topic_Doctrine_Of_Eclipse_Part_III_Fundamental_Rights_(Article_12_–_35).pdf
Created: ../../../Processed_Data/PreviousCases/landmark_cases/Topic_Doctrine_Of_Severability_Part_III_Fundamental_Rights_(Article_12_–_35).pdf
Created: ../../../Processed_Data/PreviousCases/landmark_cases/Topic_Doctrine_Of_Waiver_Part_III_Fundamental_Rights_(Article_12_–_35).pdf
Created: ../../../Processed_Data/PreviousCases/landmark_cases/Topic_Evolution_Of_Concept_Of_Equality_(Arti

## Works better

In [8]:
import docx
import re
import json

def clean_text(text):
    """
    Cleans up extra whitespace and newlines from text.
    """
    return " ".join(text.split())

def extract_data_from_docx(docx_file):
    """
    Extracts structured data from a DOCX file.

    Args:
        docx_file: Path to the DOCX file.

    Returns:
        A list of dictionaries, where each dictionary represents a topic.
    """

    doc = docx.Document(docx_file)
    full_text = ""
    for paragraph in doc.paragraphs:
        full_text += paragraph.text + "\n"

    topics = []
    topic_pattern = re.compile(r"Topic:\s*(.*?)(?=Topic:|\Z)", re.DOTALL)
    topic_matches = topic_pattern.findall(full_text)

    for topic_text in topic_matches:
        topic_dict = {"Topic": None, "Name of the case": None, "Issue": None, "Bench": None, "Fact of the Case": None, "Ratio": None, "Judgment": None}
        lines = topic_text.strip().split('\n')

        for line in lines:
            line = line.strip()
            if line.startswith("Topic:"):
                topic_dict["Topic"] = clean_text(line.replace("Topic:", "").strip())
            elif line.startswith("Name of the case:"):
                topic_dict["Name of the case"] = clean_text(line.replace("Name of the case:", "").strip())
            elif line.startswith("Issue:"):
                topic_dict["Issue"] = clean_text(line.replace("Issue:", "").strip())
            elif line.startswith("Bench:"):
                topic_dict["Bench"] = clean_text(line.replace("Bench:", "").strip())
            elif line.startswith("Fact of the Case:"):
                topic_dict["Fact of the Case"] = clean_text(line.replace("Fact of the Case:", "").strip())
            elif line.startswith("Ratio:"):
                topic_dict["Ratio"] = clean_text(line.replace("Ratio:", "").strip())
            elif line.startswith("Judgment:"):
                topic_dict["Judgment"] = clean_text(line.replace("Judgment:", "").strip())
            else:
                # If it's not a known field, it might be part of a larger text block (like Fact of the Case or Ratio)
                # We'll append it to existing content if available
                if topic_dict["Fact of the Case"] is not None:
                    topic_dict["Fact of the Case"] += " " + clean_text(line)
                elif topic_dict["Ratio"] is not None:
                    topic_dict["Ratio"] += " " + clean_text(line)
                elif topic_dict["Judgment"] is not None:
                    topic_dict["Judgment"] += " " + clean_text(line)
                # Handle cases where Topic is spread across multiple lines
                elif topic_dict["Topic"] is not None:
                    topic_dict["Topic"] += " " + clean_text(line)
                # If no field has been identified, but there is content, assume it to be topic.
                elif topic_dict["Topic"] is None and line:
                    topic_dict["Topic"] = clean_text(line)


        topics.append(topic_dict)

    return topics

# --- Example Usage ---
# docx_file = "100LandmarkCases.docx"  # Replace with the actual path to your DOCX file
docx_file = "../../Raw_data/100LandmarkCases.docx"

structured_data = extract_data_from_docx(docx_file)

print(json.dumps(structured_data, indent=4))

[
    {
        "Topic": "Preamble As Part Of The Constitution Name of the Case: Berubari Union Case, AIR 1960 SC 845 Fact of the case: In the Berubari Union Case, the President consulted with the Supreme Court of India regarding the Nehru-Noon Agreement that was signed between the Prime Minister of India and the Prime Minister of Pakistan. The case concerned the disputed territory of Berubari, which was located on the border between India and Pakistan. The dispute was that the West Bengal State Government did not want to give any territory of Berubari to Pakistan.",
        "Name of the case": null,
        "Issue": null,
        "Bench": "Justice B. Sinha, Justice A.S. Shah, Justice K. Dasgupta, Justice K.S. Rao, Justice M. Hidayatullah, Justice P. Gajendragadkar, Justice S Das",
        "Fact of the Case": null,
        "Ratio": "The Supreme Court held that since preamble is a part of Constitution, the court stated that the opinion tendered by it in the Beru Bari Union (1960) in thi

In [2]:
import pandas as pd

df = pd.read_json("hf://datasets/nisaar/Articles_Constitution_3300_Instruction_Set/Article_12_14_15_19_21_Instructionset_train.jsonl", lines=True)


df.to_json("../../../EvalutionDataset/Articles_Constitution_3300_Instruction_Set.jsonl", orient="records", lines=True)


In [3]:
import pandas as pd

df = pd.read_json("hf://datasets/nisaar/Lawyer_GPT_India/150_lawergpt_dataset_qna_v1_train.jsonl", lines=True)


df.to_json("../../../150_lawergpt_dataset_qna_v1_train.json", orient="records", lines=True)
