In [1]:
import re  # Import regex for cleaning unwanted text
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Base URL of the Supreme Court landmark-judgment summaries page. The scraper
# POSTs to this address once per year with a "judgment_year" form field.
BASE_URL: str = "https://www.sci.gov.in/landmark-judgment-summaries/"

# Accumulates one row per scraped judgment. Each row is a list in the order:
# serial number, date of judgment, case title, subject, judgment summary.
judgments: list = []

# Function to clean unwanted text
def clean_text(text):
    """Return scraped cell text with link labels and the site footer removed.

    The cleaning steps are:
      * trim surrounding whitespace,
      * delete every "Read More" / "View Judgment" label (case-insensitive),
      * delete everything from "© Supreme Court of India" to the end of the
        text, across line breaks.

    Args:
        text: Raw text of a table cell; may be None or empty.

    Returns:
        The cleaned text, or the placeholder "N/A" when the input is
        None/empty or when nothing is left after cleaning.
    """
    if not text:
        return "N/A"

    text = text.strip()
    # Remove "Read More" and "View Judgment" link labels
    text = re.sub(r"(Read More|View Judgment)", "", text, flags=re.IGNORECASE)
    # Remove the copyright footer and anything that follows it
    text = re.sub(r"© Supreme Court of India.*", "", text, flags=re.DOTALL)

    # A cell holding only whitespace or only removable labels would otherwise
    # come back as "", while an empty cell comes back as "N/A"; use the same
    # placeholder for both so the output column is consistent.
    return text.strip() or "N/A"

# Seconds to wait for the server before giving up on a single request.
REQUEST_TIMEOUT = 30

# Loop through years from 2000 to 2025
for year in range(2000, 2026):
    print(f"Fetching data for year {year}...")

    # Send a request with the selected year. The timeout keeps one stalled
    # connection from hanging the whole run, and raise_for_status() turns an
    # HTTP error page into an explicit failure instead of an empty result.
    try:
        response = requests.post(
            BASE_URL,
            data={"judgment_year": str(year)},
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"  Skipping {year}: request failed ({exc})")
        continue

    soup = BeautifulSoup(response.text, "html.parser")

    # Locate the judgment summary table
    container = soup.find("div", class_="judgment_summary")
    table = container.find("table") if container is not None else None
    if table is None:
        print(f"  No judgment table found for {year}.")
        continue

    # find() returns None when the markup has no <tbody>; in that case read
    # the rows straight from the table instead of failing on None.
    body = table.find("tbody")
    if body is None:
        body = table

    for row in body.find_all("tr"):
        columns = row.find_all("td")
        # Header rows and malformed rows have fewer than five data cells
        if len(columns) < 5:
            continue

        # Cells in order: serial number, date of judgment, case title,
        # subject, judgment summary. clean_text supplies "N/A" for empty cells.
        judgments.append([clean_text(cell.text) for cell in columns[:5]])

# Convert the extracted data into a DataFrame
columns = ["Serial Number", "Date of Judgment", "Cause Title/Case No.", "Subject", "Judgment Summary"]
df = pd.DataFrame(judgments, columns=columns)

# The completion message below is printed regardless of how much was scraped,
# so flag an empty result explicitly.
if df.empty:
    print("Warning: no judgments were scraped; the CSV will contain only the header row.")

# Save to CSV file. Create the target folder first so that a missing directory
# does not make the write fail after all the scraping work is already done.
output_path = Path("../../Processed_Data/PreviousCases/supreme_court_judgments_cleaned.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False, encoding='utf-8')

print("Scraping completed. Data saved to supreme_court_judgments_cleaned.csv")


Fetching data for year 2000...
Fetching data for year 2001...
Fetching data for year 2002...
Fetching data for year 2003...
Fetching data for year 2004...
Fetching data for year 2005...
Fetching data for year 2006...
Fetching data for year 2007...
Fetching data for year 2008...
Fetching data for year 2009...
Fetching data for year 2010...
Fetching data for year 2011...
Fetching data for year 2012...
Fetching data for year 2013...
Fetching data for year 2014...
Fetching data for year 2015...
Fetching data for year 2016...
Fetching data for year 2017...
Fetching data for year 2018...
Fetching data for year 2019...
Fetching data for year 2020...
Fetching data for year 2021...
Fetching data for year 2022...
Fetching data for year 2023...
Fetching data for year 2024...
Fetching data for year 2025...
Scraping completed. Data saved to supreme_court_judgments_cleaned.csv
