<a href="https://colab.research.google.com/github/ianellisjones/usn/blob/main/LHA_LHD_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
US Navy Amphibious Fleet Scraper (LHA/LHD)

A utility to scrape deployment history from uscarriers.net and determine the
latest physical location and status of US Navy Amphibious Assault Ships.

Bypasses standard browser truncation by fetching raw HTML and parsing
bottom-up context to handle verbose historical logs.
"""

import csv
import re
import sys
from pathlib import Path
from typing import List, Tuple, Dict, Optional

import requests
from bs4 import BeautifulSoup

# Configuration
# History pages to scrape, one per ship. main() iterates this list in order
# and derives the hull label (e.g. "LHD1", "LHA7") from each URL, so every
# entry must contain an "lhd<N>" or "lha<N>" fragment to be labelled.
SHIP_URLS: List[str] = [
    "http://uscarriers.net/lhd1history.htm", # USS Wasp
    "http://uscarriers.net/lhd3history.htm", # USS Kearsarge
    "http://uscarriers.net/lhd5history.htm", # USS Bataan
    "http://uscarriers.net/lhd7history.htm", # USS Iwo Jima
    "http://uscarriers.net/lhd2history.htm", # USS Essex
    "http://uscarriers.net/lhd4history.htm", # USS Boxer
    "http://uscarriers.net/lhd8history.htm", # USS Makin Island
    "http://uscarriers.net/lha7history.htm", # USS Tripoli
    "http://uscarriers.net/lha6history.htm", # USS America
]

# CSV report written by main(); a relative path, so it resolves against the
# current working directory.
OUTPUT_FILENAME = "amphib_status.csv"
# Sent as the User-Agent header on every request made by fetch_history_text().
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

def fetch_history_text(url: str, char_limit: int = 50000) -> str:
    """
    Fetches the raw HTML content, strips tags, and returns the tail of the text.

    Args:
        url: The URL to scrape.
        char_limit: Number of characters to retrieve from the end of the file.
            A value of zero or less yields an empty string.

    Returns:
        Cleaned text string from the bottom of the page (blank lines removed,
        each line stripped), or a string of the form "ERROR: <details>" when
        the HTTP request fails.
    """
    # Only the network call can raise RequestException, so keep the try tight.
    try:
        response = requests.get(url, headers={'User-Agent': USER_AGENT}, timeout=20)
        response.raise_for_status()
    except requests.RequestException as e:
        return f"ERROR: {e}"

    # Pass bytes so BeautifulSoup performs its own encoding detection.
    soup = BeautifulSoup(response.content, 'html.parser')
    full_text = soup.get_text(separator='\n')

    # Normalize whitespace: strip every line and drop the empty ones.
    stripped = (line.strip() for line in full_text.split('\n'))
    clean_text = '\n'.join(line for line in stripped if line)

    # Guard non-positive limits explicitly: text[-0:] is the WHOLE string and
    # a negative limit would slice from the front instead of keeping the tail.
    if char_limit <= 0:
        return ""

    # A negative-start slice longer than the string simply returns all of it.
    return clean_text[-char_limit:]


def parse_status_entry(text_block: str) -> Tuple[str, str]:
    """
    Locate the most recent status line in a block of history text.

    Every line is tagged with a year: the year of the closest preceding line
    that begins with 2023-2026, or a fallback year derived from the whole
    block when no such header has been seen yet. The lines are then scanned
    from the bottom up and the first one that belongs to 2024-2026 and
    mentions a naval movement keyword is returned.

    Args:
        text_block: Newline-separated text to search.

    Returns:
        A (year, line) tuple for the match, or (fallback_year,
        "No status found.") when nothing qualifies. The fallback year is
        "Unknown" if the block mentions no year in 2023-2026.
    """
    # Fallback year: the last 2024/2025 mention wins over any other year so
    # that forward-looking dates do not override the operational year.
    mentioned_years = re.findall(r'(202[3-6])', text_block)
    preferred_years = [y for y in mentioned_years if y in ['2024', '2025']]
    if preferred_years:
        fallback_year = preferred_years[-1]
    elif mentioned_years:
        fallback_year = mentioned_years[-1]
    else:
        fallback_year = "Unknown"

    # Walk top-down, carrying the year of the latest year-prefixed line.
    tagged_lines = []
    active_year = fallback_year
    for raw_line in text_block.split('\n'):
        header = re.search(r'^202[3-6]', raw_line)
        if header is not None:
            active_year = header.group(0)
        tagged_lines.append((active_year, raw_line))

    # Naval movement keywords
    movement_terms = (
        "moored", "anchored", "underway", "arrived", "departed",
        "transited", "operations", "returned", "participated", "conducted",
        "moved to", "visited", "pulled into", "sea trials", "flight deck certification",
    )
    recent_years = ("2024", "2025", "2026")

    # Walk bottom-up so the first hit is the latest entry.
    for line_year, line_text in reversed(tagged_lines):
        if line_year not in recent_years:
            continue

        lowered = line_text.lower()
        if not any(term in lowered for term in movement_terms):
            continue

        # Skip summary ranges (e.g. "From Jan - Mar").
        is_range_summary = lowered.strip().startswith("from ") and " - " in lowered
        if is_range_summary:
            continue

        return line_year, line_text

    return fallback_year, "No status found."


def categorize_location(text: str) -> str:
    """
    Map a detailed status sentence to a coarse location label.

    Matching is a case-insensitive substring search. The table is checked in
    order, ports before regions, and the first label with a matching keyword
    is returned; "Underway / Unknown" is returned when nothing matches.
    """
    # Ordered (label, keywords) pairs; order decides ties between entries.
    location_table = (
        # Ports
        ("Norfolk / Portsmouth", ("norfolk", "portsmouth", "virginia beach", "nassco")),
        ("San Diego", ("san diego", "north island", "camp pendleton")),
        ("Bremerton / Kitsap", ("bremerton", "kitsap")),
        ("Newport News", ("newport news",)),
        ("Yokosuka", ("yokosuka",)),
        ("Pearl Harbor", ("pearl harbor",)),
        ("Mayport", ("mayport",)),
        ("Everett", ("everett",)),
        ("Singapore", ("singapore", "changi")),
        ("Bahrain", ("bahrain", "manama")),
        ("Dubai", ("dubai", "jebel ali")),
        ("Busan", ("busan",)),
        ("Guam", ("guam", "apra")),
        ("Sasebo", ("sasebo", "juliet basin")),
        ("Malaysia", ("malaysia", "klang")),
        ("Philippines", ("philippines", "manila", "subic")),
        ("Pascagoula", ("pascagoula",)),
        # Regions
        ("South China Sea", ("south china sea",)),
        ("Philippine Sea", ("philippine sea", "okinawa")),
        ("Red Sea", ("red sea",)),
        ("Persian Gulf", ("persian gulf", "arabian gulf")),
        ("Mediterranean", ("mediterranean",)),
        ("Caribbean", ("caribbean", "st. croix", "trinidad", "tobago")),
        ("North Sea", ("north sea",)),
        ("Atlantic Ocean", ("atlantic",)),
        ("Pacific Ocean", ("pacific",)),
        ("Indian Ocean", ("indian ocean",)),
    )

    haystack = text.lower()
    matching_labels = (
        label
        for label, needles in location_table
        if any(needle in haystack for needle in needles)
    )
    # First match in table order, or the catch-all label.
    return next(matching_labels, "Underway / Unknown")


def extract_date(text: str) -> str:
    """
    Extracts the last specific date (Month Day) mentioned in the text.

    A date is a full or abbreviated month name (optionally followed by a
    period), whitespace, and a one- or two-digit day, matched
    case-insensitively. "Month Year" forms such as "March 2025" are not
    treated as dates.

    Args:
        text: Status sentence to search.

    Returns:
        The last matching "Month Day" substring exactly as it appears in the
        text, or "Date Unspecified" when there is none.
    """
    # Spell the month names out instead of "<prefix>[a-z]*" so that arbitrary
    # words beginning with a month prefix ("Maybe", "Decommissioned") and
    # words merely containing one ("Summary") are not mistaken for months.
    months = (
        r'Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|'
        r'Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?'
    )
    # (?!\d) stops a four-digit year from being truncated into a day number
    # ("March 2025" must not yield "March 20").
    pattern = r'\b(?:' + months + r')\.?\s+\d{1,2}(?!\d)'
    matches = re.findall(pattern, text, re.IGNORECASE)
    return matches[-1] if matches else "Date Unspecified"


def main():
    """
    Scrape every ship in SHIP_URLS, print one status line per hull, and save
    the results to OUTPUT_FILENAME as CSV.

    A failed fetch is recorded as a row whose location and date are "Error"
    and whose status sentence carries the error message. A PermissionError
    while writing the CSV is reported on the console instead of raised.
    """
    print(f"{'='*90}")
    print("US NAVY AMPHIBIOUS FLEET - LATEST LOCATION TRACKER")
    print(f"{'='*90}\n")

    results: List[Dict[str, str]] = []

    for url in SHIP_URLS:
        # Extract hull number from URL (Matches LHD or LHA)
        hull_match = re.search(r'(lh[ad]\d+)', url)
        hull = hull_match.group(1).upper() if hull_match else "UNK"

        raw_text = fetch_history_text(url)

        # fetch_history_text signals failure with an "ERROR: " prefix. Test the
        # prefix rather than a bare substring so that a page whose own text
        # happens to contain "ERROR" is still parsed normally.
        if raw_text.startswith("ERROR: "):
            status, loc_tag, date_str = raw_text, "Error", "Error"
        else:
            year, status = parse_status_entry(raw_text)
            loc_tag = categorize_location(status)
            date_str = extract_date(status)

            # Use Year as fallback if date is unspecified
            if date_str == "Date Unspecified":
                date_str = year

        results.append({
            "Hull": hull,
            "Location": loc_tag,
            "Date": date_str,
            "Status Sentence": status,
            "Source URL": url
        })

        # Console Output
        print(f"[{hull}] [{loc_tag}] [{date_str}] {status}")

    # Write to CSV
    try:
        output_path = Path(OUTPUT_FILENAME)
        # newline='' lets the csv module control line endings itself.
        with output_path.open(mode='w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ["Hull", "Location", "Date", "Status Sentence", "Source URL"]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(results)

        print(f"\n{'='*90}")
        print(f"SUCCESS: Report saved to '{output_path.absolute()}'")
        print(f"{'='*90}")

    except PermissionError:
        print(f"\nERROR: Could not write to {OUTPUT_FILENAME}. Is the file open?")

if __name__ == "__main__":
    main()

US NAVY AMPHIBIOUS FLEET - LATEST LOCATION TRACKER

[LHD1] [Norfolk / Portsmouth] [May 19] April 14, USS Wasp moored at Berth 6, Pier 11 on Naval Station Norfolk after a 10-day underway for deck landing qualifications, in the Virginia Capes Op. Area; Underway again on April 28; Moored at Berth 6, Pier 6 on May 9; Moved "dead-stick" to Pier 1 in BAE Systems shipyard on May 19.
[LHD3] [Norfolk / Portsmouth] [Nov. 14] September 26, USS Kearsarge moved from Pier 12 to Berth 3, Pier 14 on Naval Station Norfolk; Moved  to Berth 4, Pier 14 on Sept. 28; Underway again on Oct. 8; Moored at Berth 1, Pier 12 on Oct. 10; Underway in the Virginia Capes Op Area from Oct. 16-22 and Nov. 14-21.
[LHD5] [Norfolk / Portsmouth] [2025] July ?, 2025 USS Bataan undocked and moored at  Berth 2E on NASSCO shipyard.
[LHD7] [Caribbean] [Nov. 18] November 6, The Iwo Jima moored at Ann E. Abramson Marine Facility in Frederiksted,  St. Croix, U.S. Virgin Islands, for a four-day port call; Conducted a replenishment-