<a href="https://colab.research.google.com/github/kavivarshini25/Kavivarshini_INFO5731_FALL2025/blob/main/Web_Scraping_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Index page listing the U.S. Supreme Court cases for 2025
base_url = "https://supreme.justia.com"
url = f"{base_url}/cases/federal/us/year/2025.html"

# Seconds to wait for the server before giving up. Without a timeout,
# requests.get() can block indefinitely on a stalled connection.
INDEX_TIMEOUT_SECONDS = 30

# Fetch the page and fail loudly on an HTTP error (4xx/5xx), so an error page
# is never parsed as if it were the case listing.
response = requests.get(url, timeout=INDEX_TIMEOUT_SECONDS)
response.raise_for_status()
response.encoding = "utf-8"  # force UTF-8 when decoding response.text
soup = BeautifulSoup(response.text, "html.parser")

# Container for scraped data (one dict per case)
cases_data = []

# Every direct child <div> of the "results" container is treated as one case block
cases = soup.select("div.results.zebra.has-negative-sides-30.-overflow-hidden > div")

# Seconds to wait for each individual case page before giving up, so that one
# stalled connection cannot hang the whole scrape.
CASE_PAGE_TIMEOUT_SECONDS = 30

for case in cases:
    try:
        # Case title: text of the first link inside the case block
        title_tag = case.find("a")
        case_title = title_tag.get_text(strip=True) if title_tag else None

        # Case URL: urljoin resolves relative hrefs against base_url and leaves
        # absolute hrefs untouched (plain concatenation would corrupt those).
        case_url = None
        if title_tag and title_tag.has_attr("href"):
            case_url = urljoin(base_url, title_tag["href"])

        # Docket number and court name are read from the first and second
        # <strong> tags of the block (assumed layout of the listing page).
        strongs = case.find_all("strong")
        docket_number = strongs[0].get_text(strip=True) if strongs else None
        court_name = strongs[1].get_text(strip=True) if len(strongs) > 1 else None

        # Date: taken from a <span class="date"> when present
        date_tag = case.find("span", class_="date")
        date = date_tag.get_text(strip=True) if date_tag else None

        # Visit the case page to get the Justia Opinion Summary. A failed fetch
        # only costs the summary; the metadata collected above is still kept.
        summary_text = None
        if case_url:
            try:
                case_page = requests.get(case_url, timeout=CASE_PAGE_TIMEOUT_SECONDS)
                case_page.raise_for_status()
            except requests.RequestException as fetch_error:
                print(f"Could not fetch summary from {case_url}: {fetch_error}")
            else:
                case_page.encoding = "utf-8"
                case_soup = BeautifulSoup(case_page.text, "html.parser")

                summary_tag = case_soup.find("div", class_="case-syllabus")
                if summary_tag:
                    # Join text nodes with a space so words from adjacent
                    # elements do not run together, then collapse whitespace.
                    summary_text = re.sub(r"\s+", " ", summary_tag.get_text(" ", strip=True))

        # Store in dictionary
        case_info = {
            "Case Title": case_title,
            "Docket Number": docket_number,
            "Court Name": court_name,
            "Date": date,
            "Case URL": case_url,
            "Justia Opinion Summary": summary_text
        }

        # Clean text fields: non-breaking spaces become regular spaces, the
        # "�" character is dropped, and surrounding whitespace is trimmed.
        for key, value in case_info.items():
            if isinstance(value, str):
                case_info[key] = value.replace("\xa0", " ").replace("�", "").strip()

        cases_data.append(case_info)

    except Exception as e:
        # Best effort: report the problem and move on to the next case
        print(f"Error scraping case: {e}")
        continue

# Write every collected case record to disk as indented, UTF-8 encoded JSON
output_file = "supreme_court_cases_2025.json"
with open(output_file, mode="w", encoding="utf-8") as json_file:
    # ensure_ascii=False keeps non-ASCII characters readable instead of \u-escaped
    json.dump(obj=cases_data, fp=json_file, indent=4, ensure_ascii=False)

# Report how many records were written and where
case_count = len(cases_data)
print(f"Scraping complete. Saved {case_count} cases to {output_file}")


Scraping complete. Saved 63 cases to supreme_court_cases_2025.json


In [2]:
# Offer the generated JSON file as a browser download when running in Google Colab.
# The google.colab package is not installed in a regular local Jupyter kernel,
# so the import is guarded to keep "Restart & Run All" working there too.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; supreme_court_cases_2025.json was left in the working directory.")
else:
    files.download("supreme_court_cases_2025.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>