In [None]:
# Eclipse Data Collection & Organization (RAG Source)
This notebook scrapes NASA eclipse table data, parses it into structured dictionaries organized by **year** and **type**, and exports it as a JSON file for the main project to consume.

In [14]:
import requests
from bs4 import BeautifulSoup
import json
import re
import os

In [18]:
# ============================================================
# PASTE YOUR NASA / WIKIPEDIA ECLIPSE TABLE URL BELOW
# ============================================================
URL = "https://eclipse.gsfc.nasa.gov/SEcat5/SE2001-2100.html"  # <-- Put your link here

# Seconds to wait for the server before giving up. Without a timeout,
# requests.get() can block forever on an unresponsive host, which would
# stall a Restart & Run All of this notebook.
REQUEST_TIMEOUT = 30

# Fetch the page; raise_for_status() turns HTTP 4xx/5xx into an exception
# so a bad link fails here instead of producing an empty parse later.
response = requests.get(URL, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Preview the page title to confirm the link works.
# get_text() is used instead of .string because .string is None when the
# <title> tag contains more than one child node; split/join collapses the
# newlines and runs of spaces the raw title may carry.
page_title = " ".join(soup.title.get_text().split()) if soup.title else ""
print("Page title:", page_title or "No title found")

Page title: 
Catalog of Solar Eclipses:   2001 to   2100
   


In [19]:
# --- Work out how the page stores its data: HTML <table> or <pre> text ---
# Both collections are reused by the parsing cell further down.
tables = soup.find_all("table")
pre_blocks = soup.find_all("pre")

print(f"Found {len(tables)} HTML table(s)")
print(f"Found {len(pre_blocks)} <pre> block(s)\n")

# HTML tables: only report the ones that actually have <th> header cells,
# showing at most the first 10 header texts.
for table_no, table_tag in enumerate(tables):
    header_texts = []
    for th in table_tag.find_all("th"):
        header_texts.append(th.get_text(strip=True))
    if not header_texts:
        continue
    print(f"  HTML Table {table_no}: {header_texts[:10]}")

# <pre> blocks (NASA catalogs use this format): report the number of
# non-blank lines and echo the first 10 of them as a preview.
for block_no, pre_tag in enumerate(pre_blocks):
    non_blank = [ln for ln in pre_tag.get_text().splitlines() if ln.strip()]
    print(f"\n  <pre> Block {block_no}: {len(non_blank)} lines")
    for preview_line in non_blank[:10]:
        print(f"    {preview_line}")

Found 9 HTML table(s)
Found 9 <pre> block(s)

  HTML Table 5: ['21st century CE']
  HTML Table 6: ['Centuries']

  <pre> Block 0: 3 lines
                          Number of Years with 2 Eclipses: 82
                          Number of Years with 3 Eclipses: 12
                          Number of Years with 4 Eclipses:  6

  <pre> Block 1: 2 lines
          4 Saros Series begin [Year/Saros]:   2011/156  2058/157  2069/158  2098/164
          2 Saros Series end   [Year/Saros]:   2054/117  2083/118

  <pre> Block 2: 0 lines

  <pre> Block 3: 0 lines

  <pre> Block 4: 54 lines
                          TD of
    Catalog  Calendar   Greatest          Luna Saros Ecl.               Ecl.            Sun Path  Central
    Number     Date      Eclipse    ΔT     Num  Num  Type QLE  Gamma    Mag.   Lat Long Alt Width   Dur.
                                     s                                          °    °    °   km
    09511  2001 Jun 21  12:04:46     64     18  127   T   -p  -0.5701  1.0495  

In [22]:
# ============================================================
# PARSE THE ECLIPSE DATA FROM <pre> BLOCKS
# ============================================================
# NASA catalog pages store eclipse data as fixed-width text
# inside <pre> tags. The data is spread across MULTIPLE <pre>
# blocks, and each line starts with a catalog number before
# the date.
#
# Line format example:
#   09511  2001 Jun 21  12:04:46  64  18  127  T  -p  -0.5701  1.0495  11S  3E  55  200  04m57s
#
# If your page uses an HTML table instead, set USE_PRE = False.
#
# Result: `raw_rows` (list of dicts, one per eclipse) and `headers`
# (list of the column names used as dict keys).
# ============================================================

USE_PRE = True       # True for NASA catalog pages
TABLE_INDEX = 0      # Only used if USE_PRE = False

raw_rows = []

if USE_PRE:
    # --- Combine ALL <pre> blocks and find eclipse data lines ---
    # The data is split across multiple <pre> blocks on the page.
    all_lines = []
    for pre in pre_blocks:
        all_lines.extend(pre.get_text().splitlines())

    print(f"Total lines across all <pre> blocks: {len(all_lines)}")

    # Eclipse data lines contain a catalog number then a date:
    #   09511  2001 Jun 21  12:04:46 ...
    # Pattern: catalog number (digits), then year, month, day.
    # The year accepts an optional leading minus so that this filter
    # agrees with parse_year(), which also accepts signed years.
    date_pattern = re.compile(
        r'^\s*\d+\s+(-?\d{4})\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+(\d{1,2})\s+'
    )

    data_lines = [line for line in all_lines if date_pattern.match(line)]

    print(f"Found {len(data_lines)} eclipse data lines.\n")
    print("Sample lines (first 5):")
    for line in data_lines[:5]:
        print(f"  {line}")

    # -------------------------------------------------------
    # NASA SE catalog columns (from the header):
    #   Catalog Number | Calendar Date | Greatest Eclipse (time) |
    #   ΔT | Luna Num | Saros Num | Ecl. Type | QLE |
    #   Gamma | Ecl. Mag. | Lat | Long | Sun Alt |
    #   Path Width (km) | Central Dur.
    # -------------------------------------------------------
    # Token indices after whitespace split:
    #   0: Catalog Number    (e.g. "09511")
    #   1: Year              (e.g. "2001")
    #   2: Month             (e.g. "Jun")
    #   3: Day               (e.g. "21")
    #   4: Time              (e.g. "12:04:46")
    #   5: ΔT               (e.g. "64")
    #   6: Luna Num          (e.g. "18")
    #   7: Saros Num         (e.g. "127")
    #   8: Eclipse Type      (e.g. "T", "A", "An", "P", "H")
    #   9: QLE               (e.g. "-p", "nn", "t-")
    #  10: Gamma             (e.g. "-0.5701")
    #  11: Ecl. Magnitude    (e.g. "1.0495")
    #  12: Latitude          (e.g. "11S")
    #  13: Longitude         (e.g. "3E")
    #  14: Sun Altitude      (e.g. "55")
    #  15: Path Width km     (e.g. "200" or "-")
    #  16: Central Duration  (e.g. "04m57s" or "-")
    # -------------------------------------------------------

    # Keys for tokens 4..16, in token order.
    TRAILING_FIELDS = (
        "time_greatest_eclipse",
        "delta_t",
        "lunation_num",
        "saros",
        "type",
        "qle",
        "gamma",
        "magnitude",
        "latitude",
        "longitude",
        "sun_alt",
        "path_width_km",
        "central_duration",
    )
    FIRST_TRAILING_TOKEN = 4   # tokens 0..3 are catalog number + year/month/day
    MIN_TOKENS = 9             # a usable line reaches at least the eclipse type

    skipped_lines = []
    for line in data_lines:
        parts = line.split()
        if len(parts) < MIN_TOKENS:
            # Too short to contain an eclipse type; remember it so the
            # drop is reported below instead of happening silently.
            skipped_lines.append(line)
            continue

        # Pad short lines with "" so every trailing field gets a value;
        # a negative pad count yields an empty list, and zip() ignores
        # any tokens beyond the last known column.
        pad = FIRST_TRAILING_TOKEN + len(TRAILING_FIELDS) - len(parts)
        padded = parts + [""] * pad

        row = {
            "catalog_number": padded[0],
            "date": f"{padded[1]} {padded[2]} {padded[3]}",
        }
        row.update(zip(TRAILING_FIELDS, padded[FIRST_TRAILING_TOKEN:]))

        # Geographic region: not on this page's data lines.
        # NASA catalog pages list geographic visibility on separate
        # linked pages per eclipse, not inline.
        row["geographic_region"] = ""

        raw_rows.append(row)

    if skipped_lines:
        print(f"\n⚠ Skipped {len(skipped_lines)} line(s) with fewer than {MIN_TOKENS} columns:")
        for line in skipped_lines[:5]:
            print(f"  {line}")

    # Set headers for compatibility with later cells
    headers = list(raw_rows[0].keys()) if raw_rows else []

else:
    # --- Parse HTML table (for Wikipedia or other sites) ---
    try:
        target_table = tables[TABLE_INDEX]
    except IndexError:
        # Same exception type as before, but say what went wrong.
        raise IndexError(
            f"TABLE_INDEX={TABLE_INDEX} is out of range: "
            f"the page has {len(tables)} HTML table(s)."
        ) from None

    headers = [th.get_text(strip=True) for th in target_table.find_all("th")]
    mismatched_rows = 0
    for tr in target_table.find_all("tr"):
        cells = tr.find_all("td")
        if not cells:
            continue  # header-only or empty row
        row_data = [cell.get_text(strip=True) for cell in cells]
        # Only rows whose cell count lines up with the headers can be
        # zipped into a dict; count the rest so the loss is visible.
        if len(row_data) == len(headers):
            raw_rows.append(dict(zip(headers, row_data)))
        else:
            mismatched_rows += 1

    if mismatched_rows:
        print(f"⚠ Skipped {mismatched_rows} table row(s) whose cell count "
              f"did not match the {len(headers)} header(s).")

print(f"\n✓ Parsed {len(raw_rows)} total eclipse records.")
print(f"  Columns: {headers}\n")
# Preview first 5
for row in raw_rows[:5]:
    print(row)

Total lines across all <pre> blocks: 293
Found 224 eclipse data lines.

Sample lines (first 5):
  09511  2001 Jun 21  12:04:46     64     18  127   T   -p  -0.5701  1.0495  11S   3E  55  200  04m57s
  09512  2001 Dec 14  20:53:01     64     24  132   A   -n   0.4089  0.9681   1N 131W  66  126  03m53s
  09513  2002 Jun 10  23:45:22     64     30  137   A   nn   0.1993  0.9962  35N 179W  78   13  00m23s
  09514  2002 Dec 04  07:32:16     64     36  142   T   n-  -0.3020  1.0244  39S  60E  72   87  02m04s
  09515  2003 May 31  04:09:22     64     42  147   An  t-   0.9960  0.9384  67N  24W   3   -   03m37s

✓ Parsed 224 total eclipse records.
  Columns: ['catalog_number', 'date', 'time_greatest_eclipse', 'delta_t', 'lunation_num', 'saros', 'type', 'qle', 'gamma', 'magnitude', 'latitude', 'longitude', 'sun_alt', 'path_width_km', 'central_duration', 'geographic_region']

{'catalog_number': '09511', 'date': '2001 Jun 21', 'time_greatest_eclipse': '12:04:46', 'delta_t': '64', 'lunation_num': 

In [23]:
# ============================================================
# COLUMN MAP — maps standard keys to your table's column names
# ============================================================
# Left-hand names are the standard keys the later cells look up;
# right-hand values are the column names present in each parsed row.
# The values below are pre-filled for NASA SE catalog pages — change
# them if your source uses different column names.

COLUMN_MAP = dict(
    date="date",                    # Calendar date of the eclipse
    type="type",                    # Eclipse type (T, A, H, P, etc.)
    location="geographic_region",   # Geographic region of visibility
    magnitude="magnitude",          # Eclipse magnitude
    duration="central_duration",    # Central duration (e.g. "04m57s")
    saros="saros",                  # Saros series number
)

# Echo the mapping so a wrong column name is easy to spot.
print("Column mapping set:")
for standard_key in COLUMN_MAP:
    print(f"  {standard_key:12s} → {COLUMN_MAP[standard_key]}")

Column mapping set:
  date         → date
  type         → type
  location     → geographic_region
  magnitude    → magnitude
  duration     → central_duration
  saros        → saros


In [25]:
# ============================================================
# BUILD STRUCTURED ECLIPSE LIST + PARSE DATES / LOCATIONS
# ============================================================

# NASA type abbreviation lookup.
# The first character is the eclipse type (T, A, H, P); an optional
# second character is a qualifier that does not change the basic type,
# so every qualified code maps to the same label as its first letter.
# "Tn"/"Ts" are included alongside the existing "An"/"As": without them
# those codes would fall through the lookup unexpanded and show up as
# their own groups when eclipses are organized by type.
TYPE_LABELS = {
    "T":  "Total",
    "A":  "Annular",
    "H":  "Hybrid",
    "P":  "Partial",
    "Pb": "Partial",
    "Pe": "Partial",
    "T+": "Total",
    "T-": "Total",
    "Tm": "Total",
    "Tn": "Total",
    "Ts": "Total",
    "A+": "Annular",
    "A-": "Annular",
    "Am": "Annular",
    "An": "Annular",
    "As": "Annular",
    "Hm": "Hybrid",
    "H2": "Hybrid",
    "H3": "Hybrid",
}


def parse_year(date_string):
    """Extract the first 4-digit year from a date string.

    Args:
        date_string: Text such as "2001 Jun 21", "2001-12-14" or
            "-1999 Jun 12". Non-string values are accepted and yield None.

    Returns:
        The year as an int (negative when written with a leading minus
        sign), or None if no standalone 4-digit number is present.
    """
    if not isinstance(date_string, str):
        return None

    # Exactly four digits, not part of a longer digit run, so "123456"
    # is not misread as the year 1234.
    match = re.search(r'(?<!\d)\d{4}(?!\d)', date_string)
    if match is None:
        return None

    year = int(match.group())

    # A '-' directly before the digits is a sign only when it is not glued
    # to a preceding letter or digit; otherwise it is a date separator
    # ("14-12-2001", "Dec-2001") and the year stays positive.
    start = match.start()
    has_sign = (
        start > 0
        and date_string[start - 1] == "-"
        and (start == 1 or not date_string[start - 2].isalnum())
    )
    return -year if has_sign else year


def parse_locations(location_string):
    """Break a free-text visibility string into individual place names.

    The text is cut at every comma, semicolon, slash or hyphen; each piece
    is trimmed and blank pieces are discarded.

    Args:
        location_string: Raw location text; may be empty or None.

    Returns:
        List of non-empty, whitespace-trimmed names in their original order
        (an empty list when the input is falsy).
    """
    if not location_string:
        return []

    names = []
    for chunk in re.split(r'[,;/\-]', location_string):
        cleaned = chunk.strip()
        if cleaned:
            names.append(cleaned)
    return names


def expand_type(abbrev):
    """Translate a NASA eclipse type code into its full label.

    Args:
        abbrev: Type code string; surrounding whitespace is ignored.

    Returns:
        The label from TYPE_LABELS for the trimmed code, or the trimmed
        code itself when it is not in the table.
    """
    code = abbrev.strip()
    if code in TYPE_LABELS:
        return TYPE_LABELS[code]
    return code


# Build the clean eclipse list: one normalized dict per parsed row.

def _mapped_value(record, field, fallback=""):
    """Return record's value for the column COLUMN_MAP assigns to `field`.

    `fallback` is returned when the mapping for `field` is empty/falsy or
    when the mapped column is absent from `record`.
    """
    column = COLUMN_MAP[field]
    if not column:
        return fallback
    return record.get(column, fallback)


eclipse_list = []

for source_row in raw_rows:
    # Date text and the year extracted from it
    date_text = _mapped_value(source_row, "date")
    year_value = parse_year(date_text)

    # Type: keep the trimmed code and its expanded label
    type_text = _mapped_value(source_row, "type", "Unknown")
    type_code = type_text.strip()
    type_label = expand_type(type_text)

    # Location / visibility: raw text plus the split-out names
    region_text = _mapped_value(source_row, "location")
    region_names = parse_locations(region_text)

    # Magnitude as a float; anything unparseable becomes None
    try:
        magnitude_value = float(_mapped_value(source_row, "magnitude"))
    except (ValueError, TypeError):
        magnitude_value = None

    eclipse_list.append({
        "date_raw": date_text,
        "year": year_value,
        "type_code": type_code,
        "type": type_label,
        "location_raw": region_text,
        "locations": region_names,
        "magnitude": magnitude_value,
        "duration": _mapped_value(source_row, "duration"),
        "saros": _mapped_value(source_row, "saros"),
        # Keep all original columns as well
        "_raw": source_row,
    })

print(f"✓ Built {len(eclipse_list)} structured eclipse records.\n")
print("Sample records:")
for sample in eclipse_list[:5]:
    print(f"  {sample['date_raw']:15s}  Type: {sample['type']:10s}  Mag: {sample['magnitude']}  "
          f"Saros: {sample['saros']:4s}  Region: {sample['location_raw'][:50]}")

✓ Built 224 structured eclipse records.

Sample records:
  2001 Jun 21      Type: Total       Mag: 1.0495  Saros: 127   Region: 
  2001 Dec 14      Type: Annular     Mag: 0.9681  Saros: 132   Region: 
  2002 Jun 10      Type: Annular     Mag: 0.9962  Saros: 137   Region: 
  2002 Dec 04      Type: Total       Mag: 1.0244  Saros: 142   Region: 
  2003 May 31      Type: Annular     Mag: 0.9384  Saros: 147   Region: 


In [26]:
# ============================================================
# ORGANIZE INTO DICTIONARIES — BY YEAR and BY TYPE
# ============================================================

# --- Group by year (records whose year could not be parsed are left out) ---
eclipses_by_year = {}
for record in eclipse_list:
    year_key = record["year"]
    if year_key is None:
        continue
    if year_key not in eclipses_by_year:
        eclipses_by_year[year_key] = []
    eclipses_by_year[year_key].append(record)

# Rebuild the dict with its keys in ascending order
eclipses_by_year = {yr: eclipses_by_year[yr] for yr in sorted(eclipses_by_year)}

print("=== Eclipses by Year ===")
for yr, group in eclipses_by_year.items():
    print(f"  {yr}: {len(group)} eclipse(s)")

# --- Group by type label (an empty label is filed under "Unknown") ---
eclipses_by_type = {}
for record in eclipse_list:
    label = record["type"]
    label = label.strip() if label else "Unknown"
    if label not in eclipses_by_type:
        eclipses_by_type[label] = []
    eclipses_by_type[label].append(record)

# Rebuild the dict with its keys in alphabetical order
eclipses_by_type = {name: eclipses_by_type[name] for name in sorted(eclipses_by_type)}

print("\n=== Eclipses by Type ===")
for name, group in eclipses_by_type.items():
    print(f"  {name}: {len(group)} eclipse(s)")

=== Eclipses by Year ===
  2001: 2 eclipse(s)
  2002: 2 eclipse(s)
  2003: 2 eclipse(s)
  2004: 2 eclipse(s)
  2005: 2 eclipse(s)
  2006: 2 eclipse(s)
  2007: 2 eclipse(s)
  2008: 2 eclipse(s)
  2009: 2 eclipse(s)
  2010: 2 eclipse(s)
  2011: 4 eclipse(s)
  2012: 2 eclipse(s)
  2013: 2 eclipse(s)
  2014: 2 eclipse(s)
  2015: 2 eclipse(s)
  2016: 2 eclipse(s)
  2017: 2 eclipse(s)
  2018: 3 eclipse(s)
  2019: 3 eclipse(s)
  2020: 2 eclipse(s)
  2021: 2 eclipse(s)
  2022: 2 eclipse(s)
  2023: 2 eclipse(s)
  2024: 2 eclipse(s)
  2025: 2 eclipse(s)
  2026: 2 eclipse(s)
  2027: 2 eclipse(s)
  2028: 2 eclipse(s)
  2029: 4 eclipse(s)
  2030: 2 eclipse(s)
  2031: 2 eclipse(s)
  2032: 2 eclipse(s)
  2033: 2 eclipse(s)
  2034: 2 eclipse(s)
  2035: 2 eclipse(s)
  2036: 3 eclipse(s)
  2037: 2 eclipse(s)
  2038: 3 eclipse(s)
  2039: 2 eclipse(s)
  2040: 2 eclipse(s)
  2041: 2 eclipse(s)
  2042: 2 eclipse(s)
  2043: 2 eclipse(s)
  2044: 2 eclipse(s)
  2045: 2 eclipse(s)
  2046: 2 eclipse(s)
  2047: 4

In [27]:
# ============================================================
# EXPORT TO JSON — for main code to pull from
# ============================================================
# The payload carries the flat record list plus the by-year and by-type
# groupings, so the consumer can pick whichever view it needs.

OUTPUT_FILE = "eclipse_data.json"

# JSON object keys must be strings, so the integer years are converted.
by_year_for_json = {}
for year_key, year_records in eclipses_by_year.items():
    by_year_for_json[str(year_key)] = year_records

export_payload = dict(
    source_url=URL,
    total_eclipses=len(eclipse_list),
    eclipse_list=eclipse_list,
    by_year=by_year_for_json,
    by_type=eclipses_by_type,
)

# default=str makes json fall back to str() for any value it cannot
# serialize natively.
with open(OUTPUT_FILE, "w") as out_file:
    json.dump(export_payload, out_file, indent=2, default=str)

file_size = os.path.getsize(OUTPUT_FILE)
summary_lines = [
    f"Saved {OUTPUT_FILE}  ({file_size:,} bytes)",
    f"  → {export_payload['total_eclipses']} eclipses",
    f"  → {len(eclipses_by_year)} unique year(s)",
    f"  → {len(eclipses_by_type)} unique type(s)",
    "\nYour main code can now load this with:",
    f'  with open("{OUTPUT_FILE}") as f:',
    '      data = json.load(f)',
]
for summary_line in summary_lines:
    print(summary_line)

Saved eclipse_data.json  (520,727 bytes)
  → 224 eclipses
  → 100 unique year(s)
  → 4 unique type(s)

Your main code can now load this with:
  with open("eclipse_data.json") as f:
      data = json.load(f)
