<a href="https://colab.research.google.com/github/ianellisjones/usn/blob/main/USNI_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
%%shell
# 1. Clean up old/broken installations
sudo apt-get remove chromium-browser chromium-chromedriver -y > /dev/null 2>&1

# 2. Install dependencies
sudo apt-get update
sudo apt-get install -y wget unzip ca-certificates fonts-liberation libappindicator3-1 libasound2 libatk-bridge2.0-0 libnspr4 libnss3 lsb-release xdg-utils

# 3. Install Official Google Chrome (Stable)
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
sudo dpkg -i google-chrome-stable_current_amd64.deb
sudo apt-get -f install -y  # Fix any missing deps

# 4. Install Python libs
pip install selenium webdriver-manager beautifulsoup4

0% [Working]            Hit:1 http://security.ubuntu.com/ubuntu jammy-security InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.83)] [Connected to cloud.r-pr                                                                               Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 https://cli.github.com/packages stable InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:6 https://dl.google.com/linux/chrome/deb stable InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubunt



In [10]:
pip install selenium-stealth



In [11]:
"""
US NAVY FLEET TRACKER - USNI NEWS MODULE (v6.1 - Bugfix)

A specialized scraper that reads the "USNI News Fleet and Marine Tracker".
Uses Selenium-Stealth to bypass Cloudflare.
Uses Dictionary Matching to correctly identify locations.

Fixes v6.1:
- Fixed AttributeError by renaming 'header_triggers' to 'location_triggers'.
"""

import re
import csv
import sys
import time
import os
from typing import List, Dict

# External dependencies
try:
    from bs4 import BeautifulSoup
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager
    from selenium_stealth import stealth
except ImportError:
    print("CRITICAL ERROR: Missing dependencies.")
    print("Please run: pip install selenium webdriver-manager selenium-stealth")
    sys.exit(1)

# --- CONFIGURATION ---
TARGET_URL = "https://news.usni.org/2025/11/17/usni-news-fleet-and-marine-tracker-nov-17-2025"
OUTPUT_FILENAME = "usni_fleet_status.csv"

# --- LOCATION DICTIONARY ---
KNOWN_LOCATIONS = {
    # Specific Ports
    "Sasebo": ["sasebo"],
    "Yokosuka": ["yokosuka"],
    "Okinawa": ["okinawa"],
    "Busan": ["busan"],
    "Guam": ["guam", "apra"],
    "Pearl Harbor": ["pearl harbor", "hawaii"],
    "San Diego": ["san diego", "north island"],
    "Bremerton": ["bremerton", "kitsap"],
    "Everett": ["everett"],
    "Norfolk": ["norfolk", "portsmouth", "virginia beach", "little creek"],
    "Mayport": ["mayport"],
    "Rota": ["rota", "spain"],
    "Bahrain": ["bahrain", "manama"],
    "Souda Bay": ["souda bay", "crete"],
    "Singapore": ["singapore", "changi"],
    "Manila": ["manila", "subic"],
    "Duqm": ["duqm", "oman"],

    # Regions / Seas
    "Philippine Sea": ["philippine sea"],
    "South China Sea": ["south china sea", "spratly", "paracel"],
    "Western Pacific": ["western pacific", "westpac"],
    "Eastern Pacific": ["eastern pacific"],
    "Atlantic Ocean": ["atlantic"],
    "Mediterranean": ["mediterranean", "ionian", "adriatic"],
    "Red Sea": ["red sea", "bab el-mandeb"],
    "Persian Gulf": ["persian gulf", "arabian gulf"],
    "Gulf of Oman": ["gulf of oman"],
    "Caribbean Sea": ["caribbean", "st. croix", "virgin islands"],
    "North Sea": ["north sea"],
    "Norwegian Sea": ["norwegian sea"]
}

class USNIParser:
    """Parses narrative news text into structured fleet data."""

    def __init__(self):
        # Matches "USS Name (Hull)" or just "USS Name" if Hull is missing
        # Captures: Group 1 = Name, Group 2 = Hull (Optional)
        self.ship_pattern = re.compile(r"USS\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)(?:\s+\(([A-Z]{2,4}-\d+)\))?")

        # Triggers for Section Headers
        # FIXED: Renamed from header_triggers to match usage below
        self.location_triggers = ["In the", "In ", "Off the coast", "Carrier Strike Group", "Amphibious Ready Group"]

    def clean_text(self, text: str) -> str:
        return " ".join(text.split()).strip()

    def identify_location_in_text(self, text: str) -> str:
        """Scans text for known locations from the dictionary."""
        text_lower = text.lower()
        best_match = None
        best_index = -1

        for loc_label, keywords in KNOWN_LOCATIONS.items():
            for k in keywords:
                index = text_lower.rfind(k)
                if index > best_index:
                    best_index = index
                    best_match = loc_label

        return best_match

    def extract_ships_from_section(self, location_context: str, text_block: str) -> List[Dict]:
        ships_found = []

        # Find all ships in the paragraph
        for match in self.ship_pattern.finditer(text_block):
            ship_name = match.group(1)
            hull = match.group(2) if match.group(2) else "Unknown"

            # Get the specific sentence
            sentence = self.get_surrounding_sentence(text_block, match.start())

            # STRATEGY:
            # 1. Look for a specific location in the sentence (e.g. "arrived in Sasebo")
            # 2. If none, use the Section Header (e.g. "In Japan")

            specific_loc = self.identify_location_in_text(sentence)

            if specific_loc:
                final_loc = specific_loc
            else:
                # Clean up the header context (remove "In ", "In the ")
                final_loc = re.sub(r'^(In the |In )', '', location_context).strip()

            # Final fallback for "Global"
            if final_loc in ["Unspecified / Global", "Japan"]:
                if "sasebo" in sentence.lower(): final_loc = "Sasebo"
                if "yokosuka" in sentence.lower(): final_loc = "Yokosuka"

            ships_found.append({
                "Hull": hull,
                "Ship": f"USS {ship_name}",
                "Location": final_loc,
                "Status Sentence": sentence
            })
        return ships_found

    def get_surrounding_sentence(self, text: str, match_index: int) -> str:
        # Find start of sentence
        start = text.rfind('.', 0, match_index) + 1
        # Find end of sentence
        end = text.find('.', match_index)
        if end == -1: end = len(text)
        return text[start:end].strip()

def fetch_with_selenium(url: str) -> str:
    """Launches Headless Chrome with Stealth extensions."""
    print("[*] Launching Google Chrome (Stealth Mode)...")

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    try:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        stealth(driver,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
        )

        print(f"[*] Navigating to: {url}")
        driver.get(url)

        print("    > Waiting for Cloudflare check...")
        time.sleep(8) # Wait for redirect

        # Check title to see if we passed
        if "Just a moment" in driver.title:
            print("[!] Still seeing Cloudflare page. Waiting longer...")
            time.sleep(10)

        page_source = driver.page_source
        return page_source

    except Exception as e:
        print(f"[!] Selenium Error: {e}")
        return None
    finally:
        if 'driver' in locals():
            driver.quit()

def parse_article(html_content) -> List[Dict]:
    soup = BeautifulSoup(html_content, 'html.parser')
    parser = USNIParser()
    fleet_data = []
    current_location_context = "Unspecified / Global"

    # Debug
    title = soup.title.string if soup.title else "No Title"
    print(f"    > Page Title: {title}")

    article_body = soup.find('div', class_='entry-content')
    if not article_body: article_body = soup.body

    if article_body:
        # USNI uses <strong> tags inside <p> tags for location headers
        # or plain 3/4 tags. We scan everything.
        all_elements = article_body.find_all(['p', 'h3', 'h4', 'strong'])

        for element in all_elements:
            text = parser.clean_text(element.get_text())
            if not text: continue

            # Check if this element is a LOCATION HEADER
            if any(text.startswith(t) for t in parser.location_triggers) and len(text) < 60:
                current_location_context = text
                continue

            # Extract ships using current context
            ships = parser.extract_ships_from_section(current_location_context, text)
            if ships: fleet_data.extend(ships)

    return fleet_data

def main():
    print(f"{'='*90}")
    print(f"USNI NEWS FLEET TRACKER (STEALTH ENGINE v6.1)")
    print(f"{'='*90}\n")

    html = fetch_with_selenium(TARGET_URL)
    if not html:
        return

    fleet_list = parse_article(html)

    if not fleet_list:
        print("[!] No ships found.")
        return

    # Deduplication logic
    unique_fleet = {}
    for ship in fleet_list:
        # Keep the entry with the most descriptive location (not "Unspecified")
        if ship['Hull'] not in unique_fleet:
            unique_fleet[ship['Hull']] = ship
        else:
            current = unique_fleet[ship['Hull']]
            # If current is unspecified but new one is specific, update it
            if current['Location'] in ["Unspecified / Global", "Japan"] and ship['Location'] not in ["Unspecified / Global", "Japan"]:
                unique_fleet[ship['Hull']] = ship

    final_list = list(unique_fleet.values())

    print(f"[*] Extraction Complete. Found {len(final_list)} unique ships.")

    print(f"\n{'HULL':<10} | {'LOCATION':<25} | {'STATUS SNIPPET'}")
    print(f"{'-'*10}-+-{'-'*25}-+-{'-'*50}")

    for ship in final_list:
        # Clean snippet for display
        snippet = (ship['Status Sentence'][:75] + '..') if len(ship['Status Sentence']) > 75 else ship['Status Sentence']
        print(f"{ship['Hull']:<10} | {ship['Location']:<25} | {snippet}")

    try:
        with open(OUTPUT_FILENAME, 'w', newline='', encoding='utf-8') as f:
            fieldnames = ["Hull", "Ship", "Location", "Status Sentence"]
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(final_list)
        print(f"\n[+] Report saved to '{OUTPUT_FILENAME}'")
    except PermissionError:
        print(f"\n[!] ERROR: Could not write to CSV. Is the file open?")

if __name__ == "__main__":
    main()

USNI NEWS FLEET TRACKER (STEALTH ENGINE v6.1)

[*] Launching Google Chrome (Stealth Mode)...
[*] Navigating to: https://news.usni.org/2025/11/17/usni-news-fleet-and-marine-tracker-nov-17-2025
    > Waiting for Cloudflare check...
    > Page Title: USNI News Fleet and Marine Tracker: Nov. 17, 2025 - USNI News
[*] Extraction Complete. Found 19 unique ships.

HULL       | LOCATION                  | STATUS SNIPPET
-----------+---------------------------+---------------------------------------------------
LHA-7      | Sasebo                    | Amphibious warship USS Tripoli (LHA-7) is in port in Sasebo, Japan
LSD-47     | Okinawa                   | Tripoli will operate with Amphibious Squadron 11 ships USS Rushmore (LSD-47..
LPD-18     | Okinawa                   | Tripoli will operate with Amphibious Squadron 11 ships USS Rushmore (LSD-47..
LPD-22     | Okinawa                   | Tripoli will operate with Amphibious Squadron 11 ships USS Rushmore (LSD-47..
CVN-73     | Philippine Sea 