In [None]:
import os
import json
import time
import re
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import logging
from concurrent.futures import ProcessPoolExecutor, as_completed
from functools import partial
# Root logging configuration: every record at INFO or above is written both
# to the file 'scraping.log' and to the console stream handler.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
_LOG_HANDLERS = [logging.FileHandler('scraping.log'), logging.StreamHandler()]
logging.basicConfig(handlers=_LOG_HANDLERS, format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger shared by the classes and functions in this file.
logger = logging.getLogger(__name__)
class IndianKanoonScraper:
    """Collect judgment pages from indiankanoon.org into per-court folders.

    Search result pages are walked with Selenium to gather ``/doc/`` links;
    each judgment is then fetched with ``requests`` and written to
    ``<base_dir>/<court_folder>/<PREFIX>_<YEAR>_<NNNN>.html``.
    """

    # Court folder -> search doctype key, file-name prefix and years to crawl.
    # Calcutta 2012 and Allahabad 2009 are deliberately left out.
    COURTS = {
        "supreme_court": {
            "url_key": "supremecourt", "prefix": "SC",
            "years": range(2000, 2025)},
        "delhi_high_court": {
            "url_key": "delhi", "prefix": "DHC",
            "years": range(2000, 2025)},
        "bombay_high_court": {
            "url_key": "bombay", "prefix": "BHC",
            "years": range(2000, 2025)},
        "calcutta_high_court": {
            "url_key": "kolkata", "prefix": "CHC",
            "years": [y for y in range(2000, 2025) if y != 2012]},
        "allahabad_high_court": {
            "url_key": "allahabad", "prefix": "AHC",
            "years": [y for y in range(2000, 2025) if y != 2009]},
        "madras_high_court": {
            "url_key": "chennai", "prefix": "MHC",
            "years": range(2000, 2025)}}

    # XPath probes for a "next page" anchor, tried in this order.
    _NEXT_XPATHS = (
        "//a[contains(text(), 'Next') or contains(text(), 'next')]",
        "//a[contains(text(), '›') or contains(text(), '→')]",
    )

    def __init__(self, base_dir: str = "dataset_raw"):
        """Build one search configuration per court/year and create the folders.

        Args:
            base_dir: Root directory that receives one sub-folder per court.
        """
        self.base_dir = Path(base_dir)
        self.base_url = "https://indiankanoon.org"
        # Keyed "<court_folder>_<year>"; each value holds url, folder, prefix.
        self.search_configs = {}
        for court_folder, config in self.COURTS.items():
            for year in config["years"]:
                self.search_configs[f"{court_folder}_{year}"] = {
                    "url": f"{self.base_url}/search/?formInput=doctypes:{config['url_key']}%20year:{year}",
                    "folder": court_folder,
                    "prefix": f"{config['prefix']}_{year}"}
        self.setup_directories()

    def setup_directories(self):
        """Create the base directory and one sub-folder per configured court."""
        # parents=True so a nested base_dir such as "data/raw" also works.
        self.base_dir.mkdir(parents=True, exist_ok=True)
        for court_folder in self.COURTS:
            (self.base_dir / court_folder).mkdir(exist_ok=True)
        logger.info(f"Created raw directory structure in {self.base_dir}")

    def init_driver(self):
        """Start and return a Chrome WebDriver configured for scraping."""
        options = webdriver.ChromeOptions()
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
        service = Service(ChromeDriverManager().install())
        return webdriver.Chrome(service=service, options=options)

    @staticmethod
    def _merge_new_links(collected: List[str], seen: set, links: List[str]) -> int:
        """Append links not yet in *seen* to *collected*; return how many were new."""
        added = 0
        for link in links:
            if link not in seen:
                seen.add(link)
                collected.append(link)
                added += 1
        return added

    def get_case_links_from_page(self, driver) -> List[str]:
        """Return the distinct ``/doc/`` links on the currently loaded page.

        Result-title anchors are tried first; if none yield a link, every
        anchor on the page is scanned. When nothing is found the page source
        is dumped to ``debug_page.html``. Errors are logged, never raised.
        """
        links: List[str] = []
        seen = set()
        try:
            time.sleep(2)
            try:
                for result in driver.find_elements(By.CLASS_NAME, "result_title"):
                    try:
                        link = result.find_element(By.TAG_NAME, "a").get_attribute("href")
                    except Exception:
                        # Best effort: a result without a usable anchor is skipped.
                        continue
                    if link and '/doc/' in link:
                        self._merge_new_links(links, seen, [link])
            except Exception as e:
                logger.debug(f"Result-title lookup failed, falling back to all anchors: {e}")
            if not links:
                for element in driver.find_elements(By.TAG_NAME, "a"):
                    href = element.get_attribute("href")
                    if href and 'indiankanoon.org/doc/' in href:
                        self._merge_new_links(links, seen, [href])
            if not links:
                logger.warning("No links found on page. Saving page source for debugging.")
                with open("debug_page.html", "w", encoding="utf-8") as f:
                    f.write(driver.page_source)
                logger.warning("Page source saved to debug_page.html")
        except Exception as e:
            logger.error(f"Error extracting links from page: {e}")
        return links

    def click_next_page(self, driver) -> bool:
        """Try to advance to the next result page.

        Returns:
            True when a "next" link was clicked, False when none was found
            or an error occurred.
        """
        try:
            time.sleep(2)
            for xpath in self._NEXT_XPATHS:
                # `except Exception` (not a bare except) so Ctrl-C during the
                # sleeps below still interrupts the run.
                try:
                    next_links = driver.find_elements(By.XPATH, xpath)
                    if next_links:
                        next_links[0].click()
                        time.sleep(3)
                        return True
                except Exception as e:
                    logger.debug(f"Next-page probe {xpath!r} failed: {e}")
            try:
                page_links = driver.find_elements(By.CSS_SELECTOR, "div.browse a, .pagination a")
                current_url = driver.current_url
                for link in page_links:
                    link_text = link.text.strip()
                    if 'next' in link_text.lower() or '›' in link_text:
                        link.click()
                        time.sleep(3)
                        # Only report success if the click changed the URL.
                        if driver.current_url != current_url:
                            return True
            except Exception as e:
                logger.debug(f"Pagination-link probe failed: {e}")
            logger.warning("Could not find 'Next' button")
            return False
        except Exception as e:
            logger.error(f"Error clicking next page: {e}")
            return False

    def download_case_html(self, case_url: str, retry_count: int = 3) -> Optional[str]:
        """Fetch one judgment page, retrying only on HTTP 429.

        Args:
            case_url: Absolute URL of the judgment page.
            retry_count: Maximum number of attempts.

        Returns:
            The response text, or None on any non-429 error or once the
            retries are exhausted.
        """
        for attempt in range(retry_count):
            try:
                response = requests.get(case_url, timeout=30)
                response.raise_for_status()
                return response.text
            except requests.exceptions.HTTPError as e:
                status = e.response.status_code if e.response is not None else None
                if status != 429:
                    logger.error(f"HTTP Error downloading {case_url}: {e}")
                    return None
                # Linear back-off: 5s, 10s, 15s, ...
                wait_time = (attempt + 1) * 5
                logger.warning(f"Rate limited. Waiting {wait_time}s before retry {attempt + 1}/{retry_count}")
                time.sleep(wait_time)
            except Exception as e:
                logger.error(f"Error downloading {case_url}: {e}")
                return None
        logger.error(f"Failed to download {case_url} after {retry_count} retries")
        return None

    def scrape_court_year(self, config_key: str, max_cases: int = 1000):
        """Collect and download up to *max_cases* judgments for one court/year.

        Args:
            config_key: Key into ``search_configs`` ("<court_folder>_<year>").
            max_cases: Upper bound on the number of distinct cases collected.

        Returns:
            Number of judgment files written.

        Raises:
            KeyError: If *config_key* is not a known configuration.
        """
        config = self.search_configs[config_key]
        logger.info(f"Starting scraping for {config_key}")
        driver = self.init_driver()
        case_links: List[str] = []
        seen_links = set()
        try:
            driver.get(config['url'])
            logger.info(f"Navigated to {config['url']}")
            time.sleep(5)
            page_num = 1
            while len(case_links) < max_cases:
                logger.info(f"Scraping page {page_num} for {config_key}")
                links = self.get_case_links_from_page(driver)
                new_count = self._merge_new_links(case_links, seen_links, links)
                logger.info(f"Found {len(links)} cases on page {page_num} ({new_count} new). Total: {len(case_links)}")
                if page_num == 1 and not links:
                    logger.error(f"No results found on first page for {config_key}. Check debug_page.html")
                    break
                if len(case_links) >= max_cases:
                    case_links = case_links[:max_cases]
                    break
                if new_count == 0:
                    # A page with nothing new means pagination did not advance;
                    # stop instead of re-reading the same results.
                    logger.info(f"No new cases on page {page_num} for {config_key}; stopping pagination")
                    break
                if not self.click_next_page(driver):
                    logger.info(f"No more pages available for {config_key}")
                    break
                page_num += 1
                time.sleep(3)
        finally:
            driver.quit()
        logger.info(f"Downloading {len(case_links)} cases for {config_key}")
        successful_downloads = 0
        for idx, link in enumerate(case_links, 1):
            html_content = self.download_case_html(link)
            if html_content:
                # idx is the link's position, so numbering has gaps on failures.
                filename = f"{config['prefix']}_{idx:04d}.html"
                filepath = self.base_dir / config['folder'] / filename
                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(html_content)
                successful_downloads += 1
                logger.info(f"Saved {filename} ({successful_downloads}/{len(case_links)})")
            time.sleep(2)  # pause between consecutive downloads
        logger.info(f"Completed scraping {config_key}: {successful_downloads}/{len(case_links)} cases downloaded successfully")
        return successful_downloads
class LegalCasePreprocessor:
    """Turn raw judgment HTML files into metadata plus cleaned plain text."""

    # Raw/processed sub-folder name -> display name of the court.
    COURT_NAMES = {
        'supreme_court': 'Supreme Court of India',
        'delhi_high_court': 'Delhi High Court',
        'bombay_high_court': 'Bombay High Court',
        'calcutta_high_court': 'Calcutta High Court',
        'allahabad_high_court': 'Allahabad High Court',
        'madras_high_court': 'Madras High Court'}

    # href pattern identifying links to other documents on the site.
    citation_pattern = r'/doc/\d+'

    # Formats tried in order by normalize_date.
    _DATE_FORMATS = ('%d %B %Y', '%d %b %Y', '%d-%m-%Y', '%d/%m/%Y', '%Y-%m-%d', '%Y/%m/%d')

    def __init__(self, raw_dir: str = "dataset_raw", processed_dir: str = "dataset_processed"):
        """Remember the input/output roots and create the output folders.

        Args:
            raw_dir: Directory holding one folder of ``*.html`` files per court.
            processed_dir: Directory that receives the per-court JSON output.
        """
        self.raw_dir = Path(raw_dir)
        self.processed_dir = Path(processed_dir)
        self.setup_directories()

    def setup_directories(self):
        """Create the processed directory and one sub-folder per known court."""
        # parents=True so a nested processed_dir also works.
        self.processed_dir.mkdir(parents=True, exist_ok=True)
        for court_folder in self.COURT_NAMES:
            (self.processed_dir / court_folder).mkdir(exist_ok=True)
        logger.info(f"Created processed directory structure in {self.processed_dir}")

    def extract_metadata(self, soup: "BeautifulSoup", html_content: str) -> Dict:
        """Extract title, court, date, citations and parties from one page.

        Args:
            soup: Parsed document.
            html_content: The same document as raw text, used for regex fallbacks.

        Returns:
            Dict with keys ``title``, ``court``, ``date``, ``citations``,
            ``petitioner`` and ``respondent``; fields not found stay empty.
        """
        metadata = {
            'title': '', 'court': '', 'date': '',
            'citations': [], 'petitioner': '', 'respondent': ''}
        metadata['title'] = self._extract_title(soup, html_content)
        metadata['court'] = self._extract_court(soup, html_content)
        metadata['date'] = self._extract_date(metadata['title'], html_content)
        metadata['citations'] = self._extract_citations(soup)
        if metadata['title']:
            vs_pattern = r'^(.+?)\s+(?:vs?\.?|versus)\s+(.+?)\s+on\s+\d'
            match = re.search(vs_pattern, metadata['title'], re.IGNORECASE)
            if match:
                metadata['petitioner'] = match.group(1).strip()
                metadata['respondent'] = match.group(2).strip()
        return metadata

    def _extract_title(self, soup: "BeautifulSoup", html_content: str) -> str:
        """Return the case title from the heading, the <title> tag, or a regex."""
        title_elem = soup.find('h1', class_='docsource_main')
        if title_elem is None:
            title_elem = soup.find('title')
        if title_elem is not None:
            return title_elem.get_text(strip=True)
        # No usable tag: look for "A vs B on <date>" near the top of the raw HTML.
        pattern = r'([A-Z][A-Za-z\s&,\.]+)\s+(?:vs?\.?|versus)\s+([A-Z][A-Za-z\s&,\.]+)\s+on\s+\d{1,2}\s+\w+,?\s+\d{4}'
        match = re.search(pattern, html_content[:2000])
        return match.group(0) if match else ''

    def _extract_court(self, soup: "BeautifulSoup", html_content: str) -> str:
        """Guess the court from the <title> tag, then from header text patterns."""
        page_title = soup.find('title')
        if page_title is not None:
            title_text = page_title.get_text()
            if 'Delhi High Court' in title_text:
                return 'Delhi High Court'
            if 'Supreme Court' in title_text:
                return 'Supreme Court of India'
        court_patterns = [
            (r'IN\s+THE\s+HIGH\s+COURT\s+OF\s+DELHI\s+AT\s+NEW\s+DELHI', 'Delhi High Court'),
            (r'HIGH\s+COURT\s+OF\s+DELHI\s+AT\s+NEW\s+DELHI', 'Delhi High Court'),
            (r'DELHI\s+HIGH\s+COURT', 'Delhi High Court'),
            (r'IN\s+THE\s+SUPREME\s+COURT\s+OF\s+INDIA', 'Supreme Court of India'),
            (r'SUPREME\s+COURT\s+OF\s+INDIA', 'Supreme Court of India'),]
        search_text = html_content[:2000]
        for pattern, court_name in court_patterns:
            if re.search(pattern, search_text, re.IGNORECASE):
                return court_name
        return ''

    def _extract_date(self, title: str, html_content: str) -> str:
        """Return the judgment date from the title, else from the page header."""
        if title:
            date_in_title = re.search(r'on\s+(\d{1,2}\s+\w+,?\s+\d{4})', title)
            if date_in_title:
                return self.normalize_date(date_in_title.group(1).replace(',', ''))
        date_patterns = [
            r'(?:Decided\s+On|Decided|Date\s+of\s+Judgment|Judgment\s+Date):\s*(\d{1,2}[/-]\d{1,2}[/-]\d{4})',
            r'(?:Decided\s+On|Decided|Date\s+of\s+Judgment):\s*(\d{1,2}\s+\w+,?\s+\d{4})',
            r'(\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December),?\s+\d{4})']
        for pattern in date_patterns:
            match = re.search(pattern, html_content[:3000], re.IGNORECASE)
            if match:
                return self.normalize_date(match.group(1))
        return ''

    @staticmethod
    def _is_case_citation(citation: str) -> bool:
        """Return True if link text looks like a "A vs B" case reference."""
        if not citation or not 15 < len(citation) < 200:
            return False
        # Statute references link to /doc/ pages too but are not case citations.
        if citation.startswith(('Section', 'Article')):
            return False
        lowered = citation.lower()
        return 'vs' in lowered or 'v.' in lowered

    def _extract_citations(self, soup: "BeautifulSoup") -> list:
        """Collect up to 20 distinct case citations from /doc/ links."""
        citations = []
        seen_citations = set()
        for elem in soup.find_all('a', href=re.compile(self.citation_pattern)):
            citation = elem.get_text(strip=True)
            if citation in seen_citations or not self._is_case_citation(citation):
                continue
            citations.append(citation)
            seen_citations.add(citation)
            if len(citations) >= 20:
                break
        return citations

    def normalize_date(self, date_str: str) -> str:
        """Convert a date string to ``YYYY-MM-DD``.

        Commas and repeated whitespace are ignored, so "12 January, 2020"
        parses. If no known format matches, a warning is logged and the
        input is returned unchanged.
        """
        cleaned = ' '.join(date_str.replace(',', ' ').split())
        for fmt in self._DATE_FORMATS:
            try:
                return datetime.strptime(cleaned, fmt).strftime('%Y-%m-%d')
            except ValueError:
                continue
        logger.warning(f"Could not parse date: {date_str}")
        return date_str

    def clean_text(self, soup: "BeautifulSoup") -> str:
        """Return the judgment text with page chrome and site promos removed.

        Note: this removes elements from *soup* in place.
        """
        for script in soup(['script', 'style', 'header', 'footer', 'nav']):
            script.decompose()
        for promo in soup.find_all(['div', 'p'], class_=['ad', 'promo', 'advertisement']):
            promo.decompose()
        main_content = soup.find('div', class_='judgments')
        if not main_content:
            main_content = soup.find('div', id='div1')
        if not main_content:
            main_content = soup.find('body')
        if main_content:
            text = main_content.get_text(separator='\n')
        else:
            text = soup.get_text(separator='\n')
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        text = '\n'.join(lines)
        # No MULTILINE flag: '^' anchors these to the very start of the text.
        promo_start_patterns = [
            r'^Take notes as you read a judgment using our.*?free trial for one month\.',
            r'^Take notes as you read a judgment.*?one month\.',]
        for pattern in promo_start_patterns:
            text = re.sub(pattern, '', text, flags=re.DOTALL | re.IGNORECASE)
        # The last two patterns use short, common words ("cite", "print",
        # "email"). Their gaps are bounded so a match cannot stretch across
        # unrelated judgment text and delete it.
        promo_patterns = [
            r'Take notes as you read a judgment.*?Try out our Premium Member Services.*?one month\.',
            r'Virtual Legal Assistant.*?Query Alert Service.*?Premium Member Services', r'Try out our Premium Member.*?free trial.*?one month',
            r'Sign up today and get free trial for one month', r'Premium Member Services\s*--\s*Sign up today',
            r'Print Page.{0,200}?Email Page', r'Cite.{0,200}?Print.{0,200}?Email',]
        for pattern in promo_patterns:
            text = re.sub(pattern, '', text, flags=re.DOTALL | re.IGNORECASE)
        text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
        text = re.sub(r' +', ' ', text)
        return text.strip()

    def process_case(self, html_path: Path) -> Optional[Dict]:
        """Parse one raw HTML file into a case record.

        The court is taken from the parent folder name, not from the page.

        Returns:
            The case record, or None if reading or parsing failed (logged).
        """
        try:
            with open(html_path, 'r', encoding='utf-8') as f:
                html_content = f.read()
            soup = BeautifulSoup(html_content, 'html.parser')
            metadata = self.extract_metadata(soup, html_content)
            metadata['court'] = self.COURT_NAMES.get(html_path.parent.name, 'Unknown')
            clean_text = self.clean_text(soup)
            return {
                'file_name': html_path.name, 'metadata': metadata,
                'text': clean_text, 'text_length': len(clean_text),
                'word_count': len(clean_text.split()), 'processed_date': datetime.now().isoformat()}
        except Exception as e:
            logger.error(f"Error processing {html_path}: {e}")
            return None

    def process_all(self):
        """Process every ``*.html`` file under each court folder of raw_dir.

        Returns:
            Dict with ``total_processed``, ``errors``, ``by_court`` (successes
            per folder) and ``by_year`` (successes per year, read from the
            ``_<year>_`` part of the file name).
        """
        stats = {
            'total_processed': 0, 'by_court': {}, 'by_year': {}, 'errors': 0}
        for court_dir in sorted(self.raw_dir.iterdir()):
            if not court_dir.is_dir():
                continue
            court_name = court_dir.name
            logger.info(f"Processing {court_name}")
            html_files = sorted(court_dir.glob('*.html'))
            processed_count = 0
            for html_file in html_files:
                try:
                    success = process_single_case(html_file, self.processed_dir)
                except Exception as e:
                    logger.error(f"Error processing {html_file}: {e}")
                    success = False
                if not success:
                    stats['errors'] += 1
                    continue
                processed_count += 1
                stats['total_processed'] += 1
                year_match = re.search(r'_(20\d{2})_', html_file.name)
                if year_match:
                    year = year_match.group(1)
                    stats['by_year'][year] = stats['by_year'].get(year, 0) + 1
                if processed_count % 100 == 0:
                    logger.info(f"Processed {processed_count}/{len(html_files)} cases from {court_name}")
            stats['by_court'][court_name] = processed_count
            logger.info(f"Completed processing {processed_count} cases from {court_name}")
        return stats
# Raw/processed sub-folder name -> display name of the court.
_COURT_DISPLAY_NAMES = {
    'supreme_court': 'Supreme Court of India',
    'delhi_high_court': 'Delhi High Court',
    'bombay_high_court': 'Bombay High Court',
    'calcutta_high_court': 'Calcutta High Court',
    'allahabad_high_court': 'Allahabad High Court',
    'madras_high_court': 'Madras High Court'}

# One preprocessor per output directory, so that processing thousands of files
# does not re-create (and re-log) the directory structure for every file.
_PREPROCESSOR_CACHE: Dict[str, "LegalCasePreprocessor"] = {}


def process_single_case(html_path: Path, processed_dir: Path) -> bool:
    """Convert one raw HTML judgment into a JSON record.

    The record is written to ``<processed_dir>/<court_folder>/<stem>.json``,
    where ``court_folder`` is the name of the HTML file's parent directory.

    Args:
        html_path: Path of the raw HTML file.
        processed_dir: Root directory for the JSON output.

    Returns:
        True when the JSON file was written, False on any failure (logged).
    """
    try:
        html_path = Path(html_path)
        processed_dir = Path(processed_dir)
        with open(html_path, 'r', encoding='utf-8') as f:
            html_content = f.read()
        soup = BeautifulSoup(html_content, 'lxml')
        court_folder = html_path.parent.name
        court_name = _COURT_DISPLAY_NAMES.get(court_folder, 'Unknown')
        cache_key = str(processed_dir)
        preprocessor = _PREPROCESSOR_CACHE.get(cache_key)
        if preprocessor is None:
            # Pass the caller's directories instead of the class defaults, so
            # no stray "dataset_processed" tree appears in the working directory.
            preprocessor = LegalCasePreprocessor(
                raw_dir=str(html_path.parent.parent), processed_dir=cache_key)
            _PREPROCESSOR_CACHE[cache_key] = preprocessor
        metadata = preprocessor.extract_metadata(soup, html_content)
        # The folder is authoritative for the court, not the page content.
        metadata['court'] = court_name
        clean_text = preprocessor.clean_text(soup)
        case_data = {
            'file_name': html_path.name, 'metadata': metadata,
            'text': clean_text, 'text_length': len(clean_text),
            'word_count': len(clean_text.split()),
            'processed_date': datetime.now().isoformat()}
        json_path = processed_dir / court_folder / (html_path.stem + '.json')
        # Also covers folders that are not in the known-court list.
        json_path.parent.mkdir(parents=True, exist_ok=True)
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(case_data, f, indent=2, ensure_ascii=False)
        return True
    except Exception as e:
        # Log the reason: callers only see the boolean result.
        logger.error(f"Failed to process {html_path}: {e}")
        return False
class DatasetAnalyzer:
    """Compute summary statistics over the processed JSON case files."""

    def __init__(self, processed_dir: str = "dataset_processed"):
        """
        Args:
            processed_dir: Directory holding one folder of ``*.json`` files per court.
        """
        self.processed_dir = Path(processed_dir)

    def generate_stats(self) -> Dict:
        """Scan the processed directory and return aggregate counts.

        Returns:
            Dict with ``total_cases``, ``by_court`` (files per folder),
            ``by_year`` (files per year, read from the ``_<year>_`` part of
            the file name) and ``avg_word_count`` (integer mean of the
            ``word_count`` fields). Court and year keys are in sorted order.
            A missing directory yields all-zero statistics.
        """
        stats = {
            'total_cases': 0, 'by_court': {},
            'by_year': {}, 'avg_word_count': 0}
        if not self.processed_dir.is_dir():
            return stats
        total_words = 0
        year_counts = {}
        # Sorted so the result does not depend on file-system listing order.
        for court_dir in sorted(self.processed_dir.iterdir()):
            if not court_dir.is_dir():
                continue
            court_name = court_dir.name
            stats['by_court'][court_name] = 0
            for json_file in sorted(court_dir.glob('*.json')):
                try:
                    with open(json_file, 'r', encoding='utf-8') as f:
                        case_data = json.load(f)
                    stats['total_cases'] += 1
                    stats['by_court'][court_name] += 1
                    match = re.search(r'_(20\d{2})_', json_file.name)
                    if match:
                        year = match.group(1)
                        year_counts[year] = year_counts.get(year, 0) + 1
                    total_words += case_data.get('word_count', 0)
                except Exception as e:
                    logger.error(f"Error analyzing {json_file}: {e}")
        stats['by_year'] = dict(sorted(year_counts.items()))
        if stats['total_cases'] > 0:
            stats['avg_word_count'] = total_words // stats['total_cases']
        return stats

    def save_stats_report(self, output_file: str = "dataset_stats.json"):
        """Write freshly generated statistics to *output_file* and return them."""
        stats = self.generate_stats()
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(stats, f, indent=2, ensure_ascii=False)
        logger.info(f"Statistics saved to {output_file}")
        return stats

    def print_summary(self):
        """Print freshly generated statistics to stdout."""
        stats = self.generate_stats()
        print("\n")
        print("Dataset Statistics Summary:")
        print(f"\nTotal Cases: {stats['total_cases']}")
        print("\nCases by Court:")
        for court, count in stats['by_court'].items():
            print(f"  {court}: {count}")
        print("\nCases by Year:")
        for year, count in stats['by_year'].items():
            print(f"  {year}: {count}")
        print(f"\nAverage Word Count: {stats['avg_word_count']}")
        print("\n")
def print_menu():
    """Print the main menu; each entry below becomes one print() call."""
    menu_entries = (
        "\n",
        "\n[1] Dataset Extraction",
        "    - Extract legal case documents from the Indian Kanoon website",
        "\n[2] Preprocessing",
        "    - Process raw HTML files into structured JSON",
        "\n[3] Generate Statistics",
        "    - Analyze and view dataset statistics",
        "\n[4] Complete Pipeline",
        "    - Run extraction, preprocessing, and statistics",
        "\n[0] Exit",
        "\n",
    )
    for entry in menu_entries:
        print(entry)
def extraction_menu():
    """Print the extraction sub-menu; each entry becomes one print() call."""
    menu_entries = (
        "\n",
        "Dataset Extraction Options:",
        "\n[1] Extract from SPECIFIC COURT and YEAR",
        "[2] Extract from SPECIFIC COURT (all years)",
        "[3] Extract from ALL COURTS (all years)",
        "[0] Back to Main Menu",
        "\n",
    )
    for entry in menu_entries:
        print(entry)
def court_selection_menu():
    """Print the numbered list of courts the user can choose from."""
    court_lines = (
        "\nAvailable Courts:",
        "[1] Supreme Court of India",
        "[2] Delhi High Court",
        "[3] Bombay High Court",
        "[4] Calcutta High Court",
        "[5] Allahabad High Court",
        "[6] Madras High Court",
    )
    for line in court_lines:
        print(line)
def get_court_folder(choice: int) -> Optional[str]:
    """Map a menu number (1-6) to its court folder name, or None if unknown."""
    folders_in_menu_order = (
        "supreme_court",
        "delhi_high_court",
        "bombay_high_court",
        "calcutta_high_court",
        "allahabad_high_court",
        "madras_high_court",
    )
    # Menu numbering starts at 1; a dict lookup keeps unknown keys returning None.
    by_number = dict(enumerate(folders_in_menu_order, start=1))
    return by_number.get(choice)
def _prompt_max_cases(prompt: str, default: int = 1000) -> Optional[int]:
    """Ask for a per-year case limit; return None (after telling the user) if invalid."""
    raw = input(prompt).strip()
    if not raw:
        return default
    try:
        value = int(raw)
    except ValueError:
        value = 0
    if value <= 0:
        print("Invalid input. Please enter a positive whole number.")
        return None
    return value


def _prompt_court() -> Optional[str]:
    """Show the court list and return the chosen folder, or None if invalid."""
    court_selection_menu()
    raw = input("\nSelect court (1-6): ").strip()
    try:
        number = int(raw)
    except ValueError:
        print("Invalid input. Please enter valid numbers.")
        return None
    court_folder = get_court_folder(number)
    if not court_folder:
        print("Invalid court selection.")
    return court_folder


def _parse_years(court_folder: str, year_input: str) -> Optional[List[int]]:
    """Parse comma-separated years, skipping out-of-range or unavailable ones.

    Returns None (after telling the user) if any entry is not a number.
    """
    years = []
    for part in year_input.split(','):
        try:
            year = int(part.strip())
        except ValueError:
            print("Invalid year format. Please enter valid numbers.")
            return None
        if year < 2000 or year > 2024:
            print(f"Invalid year {year}! Must be between 2000-2024")
            continue
        if court_folder == "calcutta_high_court" and year == 2012:
            print(f"Skipping {year}: Data not available for Calcutta High Court 2012")
            continue
        if court_folder == "allahabad_high_court" and year == 2009:
            print(f"Skipping {year}: Data not available for Allahabad High Court 2009")
            continue
        years.append(year)
    return years


def run_extraction(scraper: IndianKanoonScraper):
    """Run the interactive extraction sub-menu until the user picks 0.

    Invalid input at any prompt prints a message and returns to the
    sub-menu instead of raising.

    Args:
        scraper: Scraper whose ``search_configs`` and ``scrape_court_year``
            are used for the selected court/year combinations.
    """
    while True:
        extraction_menu()
        choice = input("\nEnter your choice: ").strip()
        if choice == '0':
            break
        elif choice == '1':
            court_folder = _prompt_court()
            if not court_folder:
                continue
            year_input = input("Enter year(s) (2000-2024, comma-separated for multiple): ").strip()
            years = _parse_years(court_folder, year_input)
            if years is None:
                continue
            if not years:
                print("No valid years to extract.")
                continue
            max_cases = _prompt_max_cases("Maximum cases per year (default 1000): ")
            if max_cases is None:
                continue
            print(f"\nStarting extraction for {court_folder} - Years: {years}")
            stats = {}
            try:
                for year in years:
                    print(f"\nExtracting {year}...")
                    stats[year] = scraper.scrape_court_year(f"{court_folder}_{year}", max_cases)
            except KeyError:
                print("Configuration not found.")
                continue
            print(f"\nExtraction complete for {court_folder}!")
            print("Summary:", stats)
        elif choice == '2':
            court_folder = _prompt_court()
            if not court_folder:
                continue
            max_cases = _prompt_max_cases("Maximum cases per year (default 1000): ")
            if max_cases is None:
                continue
            print(f"\nStarting extraction for all years of {court_folder}...")
            stats = {}
            for year in range(2000, 2025):
                config_key = f"{court_folder}_{year}"
                # Years without data are simply absent from search_configs.
                if config_key not in scraper.search_configs:
                    continue
                print(f"\nExtracting {year}...")
                stats[year] = scraper.scrape_court_year(config_key, max_cases)
            print(f"\nExtraction completed for {court_folder}.")
            print("Summary:", stats)
        elif choice == '3':
            max_cases = _prompt_max_cases("Maximum cases per year per court (default 1000): ")
            if max_cases is None:
                continue
            print("\nStarting extraction for ALL courts and years...")
            confirm = input("Continue? (yes/no): ").strip().lower()
            if confirm != 'yes':
                print("Extraction cancelled.")
                continue
            stats = {}
            for config_key in scraper.search_configs:
                print(f"\nExtracting {config_key}...")
                stats[config_key] = scraper.scrape_court_year(config_key, max_cases)
            print("\nExtraction complete for all courts.")
            print("Summary:", stats)
        else:
            print("Invalid choice. Please try again.")
def run_preprocessing():
    """Interactively convert the raw HTML dataset to JSON and print a summary.

    Returns early (after printing a hint) when the raw directory is missing
    or contains no HTML files, or when the user does not confirm with "yes".
    """
    for banner_line in ("\n", "Preprocessing Raw Dataset", "\n"):
        print(banner_line)
    preprocessor = LegalCasePreprocessor()
    raw_root = preprocessor.raw_dir
    if not raw_root.exists():
        print("\nError: 'dataset_raw' directory not found.")
        print("Please run extraction first.")
        return
    # Count the HTML files inside each court folder.
    raw_count = 0
    for court_dir in raw_root.iterdir():
        if court_dir.is_dir():
            raw_count += len(list(court_dir.glob('*.html')))
    if raw_count == 0:
        print("\nNo raw HTML files found in 'dataset_raw' directory.")
        print("Please run extraction first.")
        return
    print(f"\nFound {raw_count} raw HTML files to process.")
    answer = input("\nStart preprocessing? (yes/no): ").strip().lower()
    if answer != 'yes':
        print("Preprocessing cancelled.")
        return
    print("\nStarting preprocessing...")
    stats = preprocessor.process_all()
    print("\n")
    print("Preprocessing Completed.")
    print(f"\nTotal Processed: {stats['total_processed']}")
    print(f"Errors: {stats['errors']}")
    print("\nProcessed by Court:")
    for court in stats['by_court']:
        print(f"  {court}: {stats['by_court'][court]}")
    print("\n")
def run_statistics():
    """Save and print dataset statistics for the processed JSON files.

    Returns early (after printing a hint) when the processed directory is
    missing or contains no JSON files.
    """
    for banner_line in ("\n", "Dataset Statistics", "\n"):
        print(banner_line)
    analyzer = DatasetAnalyzer()
    processed_root = analyzer.processed_dir
    if not processed_root.exists():
        print("\nError: 'dataset_processed' directory not found.")
        print("Please run preprocessing first.")
        return
    # Count the JSON files inside each court folder.
    processed_count = 0
    for court_dir in processed_root.iterdir():
        if court_dir.is_dir():
            processed_count += len(list(court_dir.glob('*.json')))
    if processed_count == 0:
        print("\nNo processed JSON files found in 'dataset_processed' directory.")
        print("Please run preprocessing first.")
        return
    print(f"\nFound {processed_count} processed files.")
    print("\nGenerating statistics...")
    analyzer.save_stats_report()
    analyzer.print_summary()
def main():
    """Run the interactive main menu until the user chooses to exit.

    Option 4 runs extraction for every configured court/year, then
    preprocessing, then statistics. An invalid case limit returns to the
    menu instead of raising.
    """
    scraper = IndianKanoonScraper()
    while True:
        print_menu()
        choice = input("\nEnter your choice: ").strip()
        if choice == '0':
            print("\nThe system ends.")
            break
        elif choice == '1':
            run_extraction(scraper)
        elif choice == '2':
            run_preprocessing()
        elif choice == '3':
            run_statistics()
        elif choice == '4':
            print("\n")
            print("Complete Pipeline -")
            print("\nThis will run:")
            print("1. Dataset Extraction")
            print("2. Preprocessing")
            print("3. Statistics Generation")
            confirm = input("\nContinue with complete pipeline? (yes/no): ").strip().lower()
            if confirm != 'yes':
                print("Pipeline cancelled.")
                continue
            max_cases_raw = input("\nMaximum cases per year per court (default 1000): ").strip()
            try:
                max_cases = int(max_cases_raw) if max_cases_raw else 1000
            except ValueError:
                max_cases = 0
            if max_cases <= 0:
                # Covers both non-numeric and non-positive input.
                print("Invalid input. Please enter a positive whole number.")
                continue
            print("\n[STEP 1/3] Starting extraction for ALL courts and years...")
            stats = {}
            for config_key in scraper.search_configs:
                print(f"\nExtracting {config_key}...")
                stats[config_key] = scraper.scrape_court_year(config_key, max_cases)
            print("\nExtraction completed.")
            print("Summary:", stats)
            print("\n[STEP 2/3] Starting preprocessing...")
            preprocessor = LegalCasePreprocessor()
            processing_stats = preprocessor.process_all()
            print(f"\nPreprocessing completed. Processed: {processing_stats['total_processed']}, Errors: {processing_stats['errors']}")
            print("\n[STEP 3/3] Generating statistics...")
            analyzer = DatasetAnalyzer()
            analyzer.save_stats_report()
            analyzer.print_summary()
            print("\nComplete Pipeline Finished Successfully.")
        else:
            print("Invalid choice. Please try again.")
# Start the interactive menu only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()