In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Collect the relative link of every page referenced from the archive index.
url = "https://www.lmsal.com/solarsoft/latest_events_archive.html"
# An explicit timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")

table = soup.find("table")

# Only the first row of the first table is scanned for links. Anchors that have
# no href attribute are skipped so that `urls` never contains None (a None entry
# would break the string concatenation that builds the page URL later on).
urls = [anchor["href"] for anchor in table.find("tr").find_all("a", href=True)]

    

In [None]:
# Download every page listed in `urls` and gather the rows of its last table
# into a single DataFrame named `ssw`.
columns = ["event_num", "event_name", "start_time", "end_time", "peak_time", "label", "Derived Position"]

# One DataFrame per successfully parsed page. They are concatenated once after
# the loop; concatenating inside the loop re-copies all accumulated rows on
# every iteration.
daily_frames = []

for url in urls:
    url = "https://www.lmsal.com/solarsoft/" + url

    try:
        # An explicit timeout keeps a single stalled request from blocking the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        # The last <table> on the page is the one that gets parsed.
        last_table = soup.find_all("table")[-1]

        data = []
        for row in last_table.find_all("tr"):
            # Keep at most the first seven <td> cells; rows without any <td> are skipped.
            cells = row.find_all("td")[:7]
            if cells:
                data.append([cell.text.strip() for cell in cells])

        # Pages that yielded no rows contribute nothing, so they are not added.
        if data:
            daily_frames.append(pd.DataFrame(data, columns=columns))
    except Exception as e:
        # Best effort: report the failing page and carry on with the next one.
        print(url, e)

if daily_frames:
    ssw = pd.concat(daily_frames, ignore_index=True)
else:
    ssw = pd.DataFrame(columns=columns)

In [21]:
# Remove exact duplicate rows and renumber the index from 0.
ssw = ssw.drop_duplicates(ignore_index=True)

In [23]:
# Build full timestamps for the end and peak columns by prefixing them with the
# first 10 characters (the date part) of the start timestamp, then parse all
# three columns as datetimes.
start_date = ssw['start_time'].str[:10]
ssw['end_time'] = start_date + " " + ssw['end_time']
ssw['peak_time'] = start_date + " " + ssw['peak_time']

ssw['start_time'] = pd.to_datetime(ssw['start_time'])
ssw['end_time'] = pd.to_datetime(ssw['end_time'])
ssw['peak_time'] = pd.to_datetime(ssw['peak_time'])

# if peak/end time < start time, then peak/end time += 1 day
# Done column-wise with a boolean mask instead of a Python loop over rows, so
# the result does not depend on the index labels matching row positions.
one_day = pd.Timedelta(days=1)
for column in ('peak_time', 'end_time'):
    before_start = ssw[column] < ssw['start_time']
    ssw[column] = ssw[column].mask(before_start, ssw[column] + one_day)

In [None]:
import re

def extract_content_in_parentheses(input_string):
    """Return the text inside the first ``(...)`` pair of ``input_string``.

    The match is non-greedy, so it stops at the first closing parenthesis.
    Surrounding whitespace inside the parentheses is kept as-is.

    Returns:
        The enclosed text (possibly an empty string), or ``None`` when the
        string contains no parenthesised group.
    """
    found = re.search(r'\((.*?)\)', input_string)
    if found is None:
        return None
    return found.group(1)

In [32]:
# Pull the parenthesised part out of every 'Derived Position' value.
noaa_ar = []
for i, position in enumerate(ssw['Derived Position']):
    try:
        noaa_ar.append(extract_content_in_parentheses(position))
    except TypeError:
        # Raised by the regex search when the value is not a string (e.g. a
        # missing cell). Report the row and record it as missing. A bare
        # `except:` would also swallow KeyboardInterrupt and genuine bugs.
        print(i)
        print(position)
        noaa_ar.append(None)

In [33]:
# Strip surrounding whitespace from each extracted region string and prefix it
# with '1' (presumably restoring the leading digit of the full region number;
# confirm against the data source).
# Entries with no region (None, or parentheses that held only whitespace) are
# kept as missing values rather than being turned into the string '1', so that
# a subsequent fillna() can still recognise and replace them.
noaa_ar = pd.Series(noaa_ar, dtype=object).map(
    lambda region: '1' + region.strip()
    if isinstance(region, str) and region.strip()
    else None
)

In [35]:
# Attach the cleaned region values to the table as the 'noaa_ar' column.
ssw = ssw.assign(noaa_ar=noaa_ar)

In [36]:
# Store the first character of each 'label' value in a new 'cls' column.
ssw['cls'] = ssw['label'].str.get(0)

In [37]:
# Replace missing (NaN) noaa_ar values with 0 so the column aligns with other data sets.
ssw['noaa_ar'] = ssw['noaa_ar'].fillna(0)

In [38]:
# Drop the event-number column. Reassigning (instead of inplace=True) together
# with errors='ignore' lets this cell be re-run without raising KeyError once
# the column has already been removed.
ssw = ssw.drop(columns=['event_num'], errors='ignore')

In [42]:
# Order the rows by start time and renumber the index from 0.
ssw = ssw.sort_values('start_time', ignore_index=True)

In [44]:
def to_stonyhurst(location):
    """Convert a position string such as ``'S18W77'`` into signed integers.

    The string is read by fixed position: character 0 is the latitude
    direction, characters 1-2 the latitude value, character 3 the longitude
    direction and everything from character 4 onwards the longitude value.

    Args:
        location: Position string laid out as described above.

    Returns:
        A ``(latitude, longitude)`` tuple of ints. South latitudes and East
        longitudes are negative; North and West are positive.

    Raises:
        IndexError: If the string is too short to contain both direction letters.
        ValueError: If a numeric part is not an integer, or a direction letter
            is not one of N/S (latitude) or E/W (longitude).
    """
    # Direction letters sit at fixed offsets in the string.
    ns_flag, ew_flag = location[0], location[3]

    # Numeric parts are parsed before the direction letters are validated.
    lat_value = int(location[1:3])
    lon_value = int(location[4:])

    if ns_flag not in ('N', 'S'):
        raise ValueError("Latitude direction must be 'N' or 'S'.")
    if ns_flag == 'S':
        lat_value = -lat_value  # South is negative

    if ew_flag not in ('E', 'W'):
        raise ValueError("Longitude direction must be 'E' or 'W'.")
    if ew_flag == 'E':
        lon_value = -lon_value  # East is negative

    return lat_value, lon_value

In [47]:
# Keep the part of each 'Derived Position' value that precedes the first space.
ssw['ssw_location'] = ssw['Derived Position'].map(lambda position: position.partition(' ')[0])

In [48]:
# Convert every location string to a signed latitude; rows that cannot be
# parsed are reported and recorded as None.
lats = []
for i, location in enumerate(ssw['ssw_location']):
    try:
        lats.append(to_stonyhurst(location)[0])
    except (ValueError, IndexError, TypeError):
        # Only the failures the parser can raise for malformed input are caught
        # (bad number/direction, string too short, non-string value). A bare
        # `except:` would also hide KeyboardInterrupt and unrelated bugs.
        lats.append(None)
        print(i)
        print(location)

In [49]:
# Attach the parsed latitudes to the table as the 'ssw_latitudes' column.
ssw = ssw.assign(ssw_latitudes=lats)

In [51]:
# Convert every location string to a signed longitude; rows that cannot be
# parsed are reported and recorded as None.
longs = []
for i, location in enumerate(ssw['ssw_location']):
    try:
        longs.append(to_stonyhurst(location)[1])
    except (ValueError, IndexError, TypeError):
        # Only the failures the parser can raise for malformed input are caught
        # (bad number/direction, string too short, non-string value). A bare
        # `except:` would also hide KeyboardInterrupt and unrelated bugs.
        print(i)
        print(location)
        longs.append(None)

In [52]:
# Attach the parsed longitudes to the table as the 'ssw_longitude' column.
ssw = ssw.assign(ssw_longitude=longs)

In [None]:
#ssw.to_csv("ssw_20100101_20251115.csv", index=False)