In [1]:
%pip install selenium
%pip install webdriver-manager





In [2]:
import os
import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager

In [3]:
# Define directories for downloads and metadata
download_dir = "E:\\SCP Judgements\\Small Corpus (Metadata incl.)\\PDFs"
metadata_dir = "E:\\SCP Judgements\\Small Corpus (Metadata incl.)\\Metadata"

# Make sure both output locations exist before the browser is started, so the
# download target handed to Chrome below is a real directory.
os.makedirs(download_dir, exist_ok=True)
os.makedirs(metadata_dir, exist_ok=True)
metadata_file_path = os.path.join(metadata_dir, "judgements_metadata.csv")

# Set up Chrome options: point downloads at download_dir, disable the
# "save as" prompt and ask Chrome to hand PDFs to the download machinery
# instead of its built-in viewer.
options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', {
    'download.default_directory': download_dir,
    'download.prompt_for_download': False,
    'plugins.always_open_pdf_externally': True
})

# Start the WebDriver (webdriver-manager resolves the chromedriver binary)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
URL = "https://www.supremecourt.gov.pk/judgement-search/#1573035933449-63bb4a39-ac81"
driver.get(URL)
# Implicit wait applied to subsequent element lookups on this driver.
# NOTE(review): the functions below also use explicit WebDriverWait calls;
# Selenium's documentation advises against mixing the two - confirm intent.
driver.implicitly_wait(3)

# Append a single metadata record to the CSV file
def write_metadata_to_csv(metadata, file_path):
    """Append ``metadata`` as one row to the CSV file at ``file_path``.

    Args:
        metadata: Sequence of field values that make up the row.
        file_path: Path of the CSV file. It is opened in append mode with
            UTF-8 encoding, so it is created when missing and existing rows
            are preserved.
    """
    # newline="" leaves line-ending handling entirely to the csv module
    with open(file_path, "a", encoding="utf-8", newline="") as csv_sink:
        csv.writer(csv_sink).writerow(metadata)

# Seed a brand-new metadata CSV with its header row; a file that already
# exists is left untouched so earlier runs are appended to, not overwritten.
if not os.path.exists(metadata_file_path):
    write_metadata_to_csv(
        ['PDF Title', 'Case Subject', 'Case No', 'Case Title', 'Author Judge', 'Judgment Date'],
        metadata_file_path,
    )

# Apply the search-form filters that stay fixed for the whole run
def select_filters():
    """Choose the "yes" option in the dropdown whose element id is ``reported``.

    Uses the module-level ``driver``. After the selection the function pauses
    for one second. Any exception raised along the way is caught and printed;
    nothing is returned and nothing is re-raised.
    """
    try:
        # Select "Reported" as "Yes"
        Select(driver.find_element(By.ID, "reported")).select_by_value("yes")

        # Give the form a moment to react to the selection
        time.sleep(1)
    except Exception as err:
        print(f"Error selecting 'Reported' filter: {err}")

# Function to process pages based on the year
def process_pages(year, total_pages):
    """Search one year's judgments and scrape up to ``total_pages`` result pages.

    Selects ``year`` in the ``case_year`` dropdown, clicks the "Search Result"
    button and then, for every result page, reads the metadata cells of each
    row of ``resultsTable``, clicks the row's PDF link to trigger the download
    and appends the metadata to ``metadata_file_path``.

    Args:
        year: Year to select; converted with ``str()`` and matched against the
            dropdown's option values.
        total_pages: Number of result pages to walk through, starting at page 1.

    Returns:
        None. Failures are reported with ``print`` instead of being raised:
        a row that cannot be processed is skipped (and named in the output),
        while any other error ends the processing of this year.
    """
    try:
        # Select the year
        year_dropdown = Select(driver.find_element(By.ID, "case_year"))
        year_dropdown.select_by_value(str(year))

        # Wait for page to load after selecting the year
        time.sleep(2)

        # Click the "Search Result" button
        search_button = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, "//input[@type='button' and @value='Search Result']"))
        )
        search_button.click()
        time.sleep(5)  # Wait for search results to load

        # Loop through pages
        for page_number in range(1, total_pages + 1):
            print(f"Processing page {page_number} for year {year}...")

            # Locate the rows in the results table
            rows = driver.find_elements(By.XPATH, "//table[@id='resultsTable']/tbody/tr")
            if not rows:
                # Make an empty page visible instead of passing over it silently
                print(f"No result rows found on page {page_number} for year {year}.")

            for row_number, row in enumerate(rows, start=1):
                try:
                    # Extract metadata from each row
                    case_subject = row.find_element(By.XPATH, "./td[2]").text
                    case_no = row.find_element(By.XPATH, "./td[3]").text
                    case_title = row.find_element(By.XPATH, "./td[4]").text
                    author_judge = row.find_element(By.XPATH, "./td[5]").text
                    judgment_date = row.find_element(By.XPATH, "./td[7]").text

                    # Locate the PDF download link
                    pdf_link = row.find_element(By.XPATH, "./td[10]/a")
                    pdf_title = pdf_link.get_attribute("href").split('/')[-1]  # Extract the PDF file name from URL
                    pdf_link.click()  # Trigger the download

                    # Write metadata to CSV only after the download was triggered
                    write_metadata_to_csv([pdf_title, case_subject, case_no, case_title, author_judge, judgment_date], metadata_file_path)

                    time.sleep(0.5)  # Short delay between downloads

                except Exception as e:
                    # Still best-effort (one bad row must not stop the page),
                    # but report which row was lost; only the first line of the
                    # message is shown to keep the output readable.
                    reason_lines = str(e).strip().splitlines()
                    reason = reason_lines[0] if reason_lines else ""
                    print(
                        f"Skipping row {row_number} on page {page_number} for year {year}: "
                        f"{type(e).__name__}: {reason}"
                    )
                    continue

            # Navigate to the next page if more pages exist
            if page_number < total_pages:
                next_page_link = WebDriverWait(driver, 20).until(
                    EC.element_to_be_clickable((By.LINK_TEXT, str(page_number + 1)))
                )
                next_page_link.click()
                time.sleep(5)  # Wait for the new page to load
    except Exception as e:
        print(f"Error processing year {year} pages: {e}")

# Run the scrape: apply the fixed filter once, then walk each (year, page count) pair
try:
    select_filters()  # Apply 'Reported' filter
    for target_year, page_count in ((2024, 1), (2023, 3), (2022, 5)):
        process_pages(target_year, page_count)
finally:
    # NOTE(review): driver.quit() is disabled, so the browser session stays
    # open after this cell finishes - confirm this is intended.
    # driver.quit()  # Close the driver after completion
    print("Script completed. Metadata saved to:", metadata_file_path)

Processing page 1 for year 2024...
Processing page 1 for year 2023...
Processing page 2 for year 2023...
Processing page 3 for year 2023...
Processing page 1 for year 2022...
Processing page 2 for year 2022...
Processing page 3 for year 2022...
Processing page 4 for year 2022...
Processing page 5 for year 2022...
Script completed. Metadata saved to: E:\SCP Judgements\Small Corpus (Metadata incl.)\Metadata\judgements_metadata.csv
