### NOAA Solar Proton Event Scraper and Formatter

This section defines a complete pipeline that extracts solar proton event data from NOAA’s online HTML page, parses its irregular date and flare formats, and uploads the cleaned results to an AWS S3 bucket. Output filenames carry only the current UTC date (no hour/minute), so re-running the pipeline on the same day writes to the same S3 keys; this keeps daily automated updates simple and repeatable.

---

#### 1. Date Conversion Utility (`convert_to_iso8601`)
- Converts various NOAA date string formats into ISO 8601 format (`YYYY-MM-DDTHH:MM:SS`).
- Handles edge cases like:
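  - Missing time components (`YYYY MM/DD` is read as 00:00)
  - Misaligned day/month formats
  - Crammed time strings (`YYYY MM/DD/HHMM`, with the time attached to the date by a slash)
- Returns the original (whitespace-trimmed) string if parsing fails.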

---

#### 2. Flare Information Parser (`parse_flare_max`)
- Parses inconsistent flare maximum strings like:
  - `"M4/1B 09/1858"` or `"X1 10/25 0816"`
- Extracts:
  - `Flare Class` (e.g., "M4")
  - `Optical Class` (e.g., "1B")
  - `Flare Peak Time (UTC)` as ISO 8601
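- Entries that are empty, `N/A`, or exactly `farside event` are not parsed at all (the main scraper checks for them before calling this function): `Flare Class` and `Optical Class` are set to `N/A` and the peak time is left blank.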

---

#### 3. Main Scraper Function (`update_solar_proton_event_list`)
- **Scrapes the NOAA SEP page** using `requests` and `BeautifulSoup`.
- Parses the HTML table containing solar proton events.
- Enhances the table by:
  - Replacing the original `Flare Maximum (UTC)` column with three new columns: `Flare Class`, `Optical Class`, and `Flare Peak Time (UTC)`
  - Parsing dates and flare entries for standardization
- Compiles all records into a **pandas DataFrame**.

---

#### 4. Upload to AWS S3
- Converts the DataFrame to both CSV and HTML formats.
- Uses the current **UTC date only** (`%Y-%m-%d`) in the filename, allowing easy overwriting or updating per day.
- Uploads both files to a specified S3 bucket using `boto3`.
- Prints the direct, date-stamped S3 URLs of both uploaded files.

---

This script enables a consistent and daily-refreshable solar proton event catalog directly from NOAA’s public dataset — ready for scientific processing or web-based visualization.



In [3]:
import io
from datetime import datetime, timezone

import boto3
import pandas as pd
import requests
from bs4 import BeautifulSoup

# -------------------------------
#  Helper Function 1: Date Parser
# -------------------------------
def convert_to_iso8601(date_str):
    """Convert a NOAA event date string to ISO 8601 (``YYYY-MM-DDTHH:MM:SS``).

    Recognised layouts (fields separated by whitespace):

    * ``YYYY MM/DD HHMM`` - year, date and time as three fields
    * ``YYYY MM/DD/HHMM`` - time attached to the date with a slash
    * ``YYYY MM/DD``      - no time given; midnight is assumed

    Times shorter than four digits are left-padded with zeros.

    Args:
        date_str: Raw cell text. Non-string values (e.g. ``None``) are
            returned unchanged instead of raising.

    Returns:
        The ISO 8601 timestamp on success. On any failure the
        whitespace-trimmed input is returned so the caller can keep the raw
        value in its output.
    """
    if not isinstance(date_str, str):
        # Non-text cells cannot be parsed; hand them back untouched rather
        # than failing on .strip() in the error path.
        return date_str

    date_str = date_str.strip().replace("  ", " ")
    parts = date_str.split()

    try:
        if len(parts) == 2:
            year, rest = parts
            pieces = rest.split("/")
            if len(pieces) == 3:
                month, day, time_str = pieces
            elif len(pieces) == 2:
                month, day = pieces
                time_str = "0000"
            else:
                return date_str
        elif len(parts) == 3:
            year, date_part, time_str = parts
            # Raises ValueError when the date field is not exactly "MM/DD".
            month, day = date_part.split("/")
        else:
            return date_str

        time_str = time_str.zfill(4)
        dt = datetime.strptime(f"{year}-{month}-{day} {time_str}", "%Y-%m-%d %H%M")
        return dt.strftime("%Y-%m-%dT%H:%M:%S")

    except ValueError as e:
        print(f" Skipping invalid date: {date_str} → {e}")
        return date_str

# -------------------------------
#  Helper Function 2: Flare Parser
# -------------------------------

def parse_flare_max(flare_raw, year="2025", default_month=None):
    """Split a NOAA "flare maximum" cell into class, optical class and peak time.

    Accepted layouts (fields separated by whitespace):

    * ``CLASS[/OPTICAL] DD/HHMM`` - two fields; the month comes from
      ``default_month``. If the part after the slash has two characters
      instead of four, it is ignored and the time defaults to 0000.
    * ``CLASS[/OPTICAL] MM/DDHH`` - two fields, used only when
      ``default_month`` is None: the two characters before the slash are the
      month, and the four after it are read as day then hour (minutes = 00).
    * ``CLASS[/OPTICAL] MM/DD HHMM`` - three fields; the date may also be
      written without the slash (``MMDD``).

    Args:
        flare_raw: Raw cell text, e.g. ``"M4/1B 09/1858"``.
        year: Year used to build the timestamp.
        default_month: Month number to assume for the two-field layout, or
            None to fall back to the ``MM/DDHH`` interpretation.

    Returns:
        A tuple ``(flare_class, optical_class, flare_iso)``. Parsing is
        best-effort: on failure a message is printed and whatever was
        extracted so far is returned, with ``flare_class`` defaulting to
        ``""``, ``optical_class`` to ``"N/A"`` and ``flare_iso`` to ``""``.
    """
    flare_class = ""
    optical_class = "N/A"
    flare_iso = ""

    try:
        parts = flare_raw.strip().split()
        if len(parts) not in (2, 3):
            raise ValueError("Invalid flare format")

        # The first field is "CLASS" or "CLASS/OPTICAL" in every layout.
        flare_class, slash, optical = parts[0].partition("/")
        if slash:
            optical_class = optical

        if len(parts) == 2:
            date_time_str = parts[1]
            if "/" not in date_time_str:
                raise ValueError("Missing slash in crammed date/time")
            # Raises ValueError when there is more than one slash.
            date_part, time_part = date_time_str.split("/")

            time_str = None  # stays None until one of the branches decides it
            if default_month is not None:
                # Field before the slash is the day; the month is supplied.
                month = str(default_month).zfill(2)
                day = date_part.zfill(2)
            elif len(date_part) == 2 and len(time_part) == 4:
                # No month hint: read the field as MM/DDHH with minutes = 00.
                month = date_part
                day = time_part[:2]
                time_str = time_part[2:] + "00"
            else:
                raise ValueError("Month missing and no default_month provided")

            if time_str is None:
                if len(time_part) == 4:
                    time_str = time_part
                elif len(time_part) == 2:
                    time_str = "0000"
                else:
                    raise ValueError("Unexpected time_part format")
        else:
            date_part, time_part = parts[1], parts[2]
            if "/" in date_part:
                month, day = [s.zfill(2) for s in date_part.split("/")]
            else:
                month, day = date_part[:2], date_part[2:]
            time_str = time_part.zfill(4)

        # Final ISO timestamp
        dt = datetime.strptime(f"{year}-{month}-{day} {time_str}", "%Y-%m-%d %H%M")
        flare_iso = dt.strftime("%Y-%m-%dT%H:%M:%S")

    except (AttributeError, TypeError, ValueError) as e:
        # AttributeError/TypeError cover non-string input such as None.
        print(f" Error parsing: {flare_raw} → {e}")

    return flare_class, optical_class, flare_iso

# -------------------------------
#  Main Function: NOAA Proton List to S3
# -------------------------------
def update_solar_proton_event_list(bucket="helioconverter-web-application", s3_prefix="sep_catalogs/"):
    """Scrape NOAA's solar proton event table, normalise it and upload it to S3.

    Steps:
      1. Download the NOAA page and locate its first ``<table>``.
      2. For every body row, convert the first two cells with
         ``convert_to_iso8601`` and replace the ``Flare Maximum (UTC)`` cell
         with three cells: flare class, optical class and ISO peak time.
      3. Upload the result as CSV and HTML under
         ``{s3_prefix}{YYYY-MM-DD}_solar_proton_events.{csv,html}``, where the
         date is the current UTC date.

    Args:
        bucket: Name of the destination S3 bucket.
        s3_prefix: Key prefix prepended to both object names.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
        ValueError: If the page has no table with ``<thead>``/``<tbody>``, or
            the ``Flare Maximum (UTC)`` header is missing.
    """
    url = "https://www.ngdc.noaa.gov/stp/space-weather/interplanetary-data/solar-proton-events/SEP%20page%20code.html"
    flare_header = "Flare Maximum (UTC)"

    # Without a timeout a stalled connection would hang the job forever, and
    # without the status check an error page would be parsed as if it were data.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html5lib")
    table = soup.find("table")
    if table is None:
        raise ValueError("No <table> element found on the NOAA page")
    thead = table.find("thead")
    tbody = table.find("tbody")
    if thead is None or tbody is None:
        raise ValueError("NOAA table is missing its <thead> or <tbody> section")

    raw_headers = [th.get_text(strip=True) for th in thead.find_all("th")]
    # Locate the flare column by name so header and cell replacement always
    # target the same position (raises ValueError if the header is absent).
    flare_idx = raw_headers.index(flare_header)
    headers = (
        raw_headers[:flare_idx]
        + ["Flare Class", "Optical Class", "Flare Peak Time (UTC)"]
        + raw_headers[flare_idx + 1:]
    )

    rows = []
    for tr in tbody.find_all("tr"):
        tds = tr.find_all("td")

        # The row must at least reach the flare column.
        if len(tds) <= flare_idx:
            print(f" Skipping broken row with only {len(tds)} cells")
            continue

        values = [td.get_text(separator=" ", strip=True).replace("\n", " ") for td in tds]

        # Force every row to the header width: pad short rows, trim long ones.
        # A single over-long row would otherwise make the DataFrame
        # constructor raise and abort the whole run.
        if len(values) < len(raw_headers):
            values.extend(["N/A"] * (len(raw_headers) - len(values)))
        elif len(values) > len(raw_headers):
            print(f" Truncating row with {len(values)} cells to {len(raw_headers)} columns")
            values = values[:len(raw_headers)]

        values[0] = convert_to_iso8601(values[0])
        values[1] = convert_to_iso8601(values[1])

        # The flare cell may carry only a day, so take the month from the
        # (already converted) first cell when it parsed cleanly.
        try:
            default_month = datetime.strptime(values[0], "%Y-%m-%dT%H:%M:%S").month
        except (TypeError, ValueError):
            default_month = None

        flare_raw = values[flare_idx]
        if flare_raw in ("N/A", "farside event", ""):
            flare_fields = ("N/A", "N/A", "")
        else:
            flare_fields = parse_flare_max(flare_raw, year=values[0][:4], default_month=default_month)

        # Swap the single raw flare cell for the three parsed cells.
        values[flare_idx:flare_idx + 1] = flare_fields
        rows.append(values)

    df = pd.DataFrame(rows, columns=headers)

    csv_text = df.to_csv(index=False)
    # Cell contents are plain text scraped from an external page, so let
    # pandas escape them instead of emitting them as raw HTML.
    html_text = df.to_html(index=False, escape=True)

    s3 = boto3.client("s3")
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    csv_key = f"{s3_prefix}{timestamp}_solar_proton_events.csv"
    html_key = f"{s3_prefix}{timestamp}_solar_proton_events.html"

    s3.put_object(Bucket=bucket, Key=csv_key, Body=csv_text.encode("utf-8"), ContentType="text/csv")
    s3.put_object(Bucket=bucket, Key=html_key, Body=html_text.encode("utf-8"), ContentType="text/html")

    print(f" Solar proton event list uploaded to S3: {csv_key} and {html_key}")
    print(f" CSV: https://{bucket}.s3.amazonaws.com/{csv_key}")
    print(f" HTML: https://{bucket}.s3.amazonaws.com/{html_key}")

In [None]:
# Run the full pipeline once with the default bucket and key prefix.
update_solar_proton_event_list()