In [1]:
import ftplib
import os
import tarfile
import pandas as pd

In [2]:
def download_file_from_ftp(ftp_server, ftp_path, local_file):
    ftp = ftplib.FTP(ftp_server)
    ftp.login()
    
    ftp.cwd(os.path.dirname(ftp_path))
    
    with open(local_file, "wb") as f:
        ftp.retrbinary(f"RETR " + os.path.basename(ftp_path), f.write)

    ftp.quit()
    print(f"Downloaded {local_file}")

def extract_txt_files(tar_file, extract_to_path):
    if tarfile.is_tarfile(tar_file):
        with tarfile.open(tar_file) as tar:
            tar.extractall(path=extract_to_path)
            print(f"Extracted all files to {extract_to_path}")
            
            txt_files = [member.name for member in tar.getmembers() if member.isfile() and member.name.endswith(".txt")]
            return txt_files
    else:
        print(f"{tar_file} is not a valid tar.gz file")
        return []

In [13]:
def pattern1(txt_files,extract_to_path, year):
    # for year 2010-2024 
    if txt_files:
        start_time = []
        peak_time = []
        end_time = []
        cls = []
        noaa_ar = []
        obs = []
        for txt_file in txt_files:
            # read txt file
            if txt_file == 'readme.txt':
                continue
            if year == 2025:
                path = os.path.join(extract_to_path, '2025_events', txt_file)
            else:
                path = os.path.join(extract_to_path, txt_file)
            with open(path) as f:
                data = f.readlines()
                # get flare events info
                date = data[2][7:-1].replace(' ', '/')
                if data[13:][0] == 'NO EVENT REPORTS.\n' :
                    continue
                for line in data[13:]:
                # skip the new line character
                    if line == '\n':
                        continue
                    # split the line by space
                    line = line.replace('+', '').split()
                    if len(line) < 9:
                        continue
                    # if obs not start with G, skip the line
                    if not line[4].startswith('G'):
                        continue
                    start = date + ' ' + line[1][:2] + ':' + line[1][2:]
                    peak = date + ' ' + line[2][:2] + ':' + line[2][2:]
                    end = date + ' ' + line[3][:2] + ':' + line[3][2:]
                    start_time.append(start)
                    peak_time.append(peak)
                    end_time.append(end)
                    cls.append(line[8])
                    obs.append(line[4])
                    if len(line) == 11:
                        noaa_ar.append('1'+line[-1])
                    else:
                        noaa_ar.append(0)
        # save to csv
        df = pd.DataFrame({'start_time': start_time, 'peak_time': peak_time, 'end_time': end_time, 'class': cls, 'noaa_ar': noaa_ar, 'obs': obs})
        # reordering by start_time
        #df['start_time'] = pd.to_datetime(df['start_time'])
        #df = df.sort_values(by='start_time')
        df.to_csv(f'ftp_flares_{year}.csv', index=False)
        print(f"Save ftp_flares_{year}.csv")
    
def get_ftp_files(year):
    # this function get flare events info by year from ftp://ftp.swpc.noaa.gov/pub/warehouse/
    # return a df and save a csv file
    ftp_server = "ftp.swpc.noaa.gov"
    ftp_path = f"/pub/warehouse/{year}/{year}_events.tar.gz"
    local_file = f"{year}_events.tar.gz"
    extract_to_path = f"./extracted_txt_files_{year}"

    
    #if year == 2024:
        #txt_files = download_2024()
        #pattern1(txt_files, extract_to_path, year)
    if year in [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 ,2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]:
        download_file_from_ftp(ftp_server, ftp_path, local_file)
        txt_files = extract_txt_files(local_file, extract_to_path)
        pattern1(txt_files, extract_to_path, year)
    if year in [1996, 1997, 1998, 1999]:
        download_file_from_ftp(ftp_server, ftp_path, local_file)
        txt_files = extract_txt_files(local_file, extract_to_path)
        pattern2(txt_files, extract_to_path, year)
    

In [14]:
get_ftp_files(2024)

Downloaded 2024_events.tar.gz
Extracted all files to ./extracted_txt_files_2024


  tar.extractall(path=extract_to_path)


Save ftp_flares_2024.csv


In [10]:
from datetime import datetime
import re
from pathlib import Path

def download_ftp_flares(
    ftp_server="ftp.swpc.noaa.gov",
    ftp_path="/pub/warehouse/2025/2025_events",
    local_dir="./extracted_txt_files_2025/2025_events",
    cutoff_date="2025-10-31"
):
    """
    Download SWPC event text files newer than cutoff_date (YYYY-MM-DD).
    Example file name pattern: 20251031events.txt
    """
    os.makedirs(local_dir, exist_ok=True)
    cutoff_dt = datetime.strptime(cutoff_date, "%Y-%m-%d")

    ftp = ftplib.FTP(ftp_server)
    ftp.login()
    ftp.cwd(ftp_path)

    files = ftp.nlst()
    pattern = re.compile(r"(\d{8})events\.txt", re.IGNORECASE)

    downloaded = []
    for file in files:
        m = pattern.match(file)
        if not m:
            continue

        file_date = datetime.strptime(m.group(1), "%Y%m%d")
        if file_date <= cutoff_dt:
            continue  # skip older files

        local_file = os.path.join(local_dir, file)
        if os.path.exists(local_file):
            print(f"Skip existing: {file}")
            continue

        print(f"Downloading {file} ...")
        with open(local_file, "wb") as f:
            ftp.retrbinary(f"RETR {file}", f.write)
        downloaded.append(file)

    ftp.quit()
    print(f"Downloaded {len(downloaded)} new files.")
    return True

def list_swpc_txt_files(txt_root, year=2025, cutoff_yyyymmdd="20251031"):
    """
    Return a list of txt filenames (not paths) under `txt_root` to parse.
    Keeps only files with names like YYYYMMDDevents.txt and later than cutoff.
    """
    txt_root = Path(txt_root)
    patt = re.compile(r"(\d{8})events\.txt$", re.IGNORECASE)

    keep = []
    for f in sorted(txt_root.glob("*.txt")):
        if f.name.lower() == "readme.txt":
            continue
        m = patt.match(f.name)
        if not m:
            continue
        if m.group(1) > cutoff_yyyymmdd:
            keep.append(f.name)
    return keep

def extract_xrs_to_csv(txt_files, extract_to_path, year=2025, out_csv="ftp_flares_2025.csv"):
    """
    Read the given daily 'events' txt files and extract GOES/XRS (flare) rows.
    Overwrites `out_csv` on each run.

    Parameters
    ----------
    txt_files : list[str]
        Filenames (not paths) returned by `list_swpc_txt_files(...)`.
    extract_to_path : str or Path
        Parent folder containing the daily txt files.
        For year 2025 we expect files in {extract_to_path}/2025_events/.
    year : int
        Year of these daily files (used only for building the subfolder name).
    out_csv : str
        Output CSV path; will be overwritten.
    """
    extract_to_path = Path(extract_to_path)
    start_time, peak_time, end_time = [], [], []
    cls, noaa_ar, obs = [], [], []

    if not txt_files:
        # nothing to parse; write empty CSV with headers for reproducibility
        pd.DataFrame(columns=["start_time","peak_time","end_time","class","noaa_ar","obs"]).to_csv(out_csv, index=False)
        print(f"Saved (empty) {out_csv}")
        return out_csv

    for txt_file in txt_files:
        if txt_file.lower() == "readme.txt":
            continue

        # daily file lives in {extract_to_path}/2025_events/ or just extract_to_path/
        if year == 2025:
            path = extract_to_path / "2025_events" / txt_file
        else:
            path = extract_to_path / txt_file

        if not path.exists():
            # if the files were saved directly under extract_to_path, try that
            alt = extract_to_path / txt_file
            if alt.exists():
                path = alt
            else:
                print(f"Skip missing: {path}")
                continue

        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            lines = f.readlines()

        if len(lines) < 14:
            continue

        raw_date = lines[2].strip()
        # Extract YYYY.MM.DD (or variants) and normalize to YYYY/MM/DD
        date_part = (
            raw_date.replace("Event Date:", "")
                    .replace("-", ".").replace("/", ".").replace(" ", ".")
                    .strip(".")
        )
        parts = [p for p in date_part.split(".") if p.isdigit()]
        if len(parts) >= 3:
            date_str = f"{parts[0]}/{parts[1]}/{parts[2]}"  # YYYY/MM/DD
        else:
            # fallback: skip file if date header is not as expected
            continue

        # If the table says "NO EVENT REPORTS." then skip
        rest = lines[13:]
        if not rest or rest[0].strip().upper().startswith("NO EVENT REPORTS"):
            continue

        for line in rest:
            if not line.strip():
                continue
            # remove '+' signs, split on whitespace
            tokens = line.replace("+", "").split()
            if len(tokens) < 9:
                continue

            # tokens[4] should be obs, and we only keep GOES rows (starting with 'G')
            if not tokens[4].startswith("G"):
                continue

            # times are HHMM in tokens[1], [2], [3]
            try:
                s_hhmm = tokens[1]; p_hhmm = tokens[2]; e_hhmm = tokens[3]
                s = f"{date_str} {s_hhmm[:2]}:{s_hhmm[2:]:0>2}"
                p = f"{date_str} {p_hhmm[:2]}:{p_hhmm[2:]:0>2}"
                e = f"{date_str} {e_hhmm[:2]}:{e_hhmm[2:]:0>2}"
            except Exception:
                continue

            start_time.append(s)
            peak_time.append(p)
            end_time.append(e)

            # class at tokens[8]
            cls.append(tokens[8])
            obs.append(tokens[4])

            # NOAA AR present when len == 11 â†’ last token; else 0
            ar = tokens[-1] if len(tokens) == 11 and tokens[-1].isdigit() else "0"
            if ar == "0":
                noaa_ar.append("0")
            else:
                noaa_ar.append('1'+ar)

    df = pd.DataFrame({
        "start_time": start_time,
        "peak_time":  peak_time,
        "end_time":   end_time,
        "class":      cls,
        "noaa_ar":    noaa_ar,
        "obs":        obs
    })

    # sort by start_time 
    # df["start_time"] = pd.to_datetime(df["start_time"])
    # df = df.sort_values("start_time").reset_index(drop=True)

    # Overwrite output CSV on each run
    Path(out_csv).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_csv, index=False)
    print(f"Saved {out_csv} with {len(df)} XRS flare rows.")
    return out_csv

In [11]:
ftp_server="ftp.swpc.noaa.gov"
ftp_path="/pub/warehouse/2025/2025_events"
local_dir="D:\\2024_S1\\ML_SEP_2402\\swpc_ftp\\extracted_txt_files_2025" # Local directory to save downloaded files
cutoff_date="2024-12-30" 
flare_csv = "D:\\2024_S1\\ML_SEP_2402\\swpc_ftp\\ftp_flares_2025.csv" # Output CSV file for flares

In [12]:
download_ftp_flares(ftp_server, ftp_path, local_dir, cutoff_date)
files_to_parse = list_swpc_txt_files(local_dir, year=2025, cutoff_yyyymmdd=re.sub(r"-", "", cutoff_date))
extract_xrs_to_csv(files_to_parse, extract_to_path=local_dir, year=2025, out_csv=flare_csv)

Downloading 20250216events.txt ...
Downloading 20241231events.txt ...
Downloading 20250217events.txt ...
Downloading 20250228events.txt ...
Downloading 20250218events.txt ...
Downloading 20250101events.txt ...
Downloading 20250219events.txt ...
Downloading 20250102events.txt ...
Downloading 20250220events.txt ...
Downloading 20250103events.txt ...
Downloading 20250221events.txt ...
Downloading 20250104events.txt ...
Downloading 20250301events.txt ...
Downloading 20250105events.txt ...
Downloading 20250302events.txt ...
Downloading 20250106events.txt ...
Downloading 20250303events.txt ...
Downloading 20250107events.txt ...
Downloading 20250304events.txt ...
Downloading 20250108events.txt ...
Downloading 20250305events.txt ...
Downloading 20250109events.txt ...
Downloading 20250306events.txt ...
Downloading 20250110events.txt ...
Downloading 20250307events.txt ...
Downloading 20250111events.txt ...
Downloading 20250308events.txt ...
Downloading 20250112events.txt ...
Downloading 20250309

'D:\\2024_S1\\ML_SEP_2402\\swpc_ftp\\ftp_flares_2025.csv'

In [None]:
ftp24 = pd.read_csv("ftp_flares_2024.csv")

In [16]:
ftp24['noaa_ar'] = ftp24['noaa_ar'].apply(lambda x: "1"+str(x) if str(x) != '0' else '0')

In [18]:
ftp25 = pd.read_csv("D:\\2024_S1\\ML_SEP_2402\\swpc_ftp\\ftp_flares_2025.csv")

In [23]:
ftp_updated = pd.concat([ftp24, ftp25], ignore_index=True)
ftp_updated.drop_duplicates(inplace=True)

In [26]:
ftp_updated.rename(columns={'class': 'label'}, inplace=True)

In [28]:
cls = ftp_updated['label'].apply(lambda x: str(x)[0])

In [29]:
ftp_updated['cls'] = cls

In [19]:
ftp_old = pd.read_csv("D:\\2024_S1\\ML_SEP_2402\\swpc_ftp\\v2_ftp_flares_1997_2024.csv")

In [21]:
# keep 2010-2023
ftp_old = ftp_old[ftp_old['start_time'].str.startswith(('2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023'))]

In [25]:
ftp_old = ftp_old.drop(columns=['unique_id'])

In [31]:
ftp_new = pd.concat([ftp_old, ftp_updated], ignore_index=True)
ftp_new.drop_duplicates(inplace=True)

In [33]:
# reordering by start_time
remove = []
for i in range(ftp_new.shape[0]):
    try:
        pd.to_datetime(ftp_new['start_time'][i])
    except:
        remove.append(i)
        print(ftp_new.iloc[i])

start_time    2024/06/01 A1:903
peak_time      2024/06/01 19:39
end_time      2024/06/01 A2:028
label                      M7.3
noaa_ar                   13697
obs                         G16
cls                           M
Name: 25413, dtype: object
start_time    2024/10/21 B0:840
peak_time     2024/10/21 U1:001
end_time      2024/10/21 B1:139
label                      C6.2
noaa_ar                       0
obs                         G16
cls                           C
Name: 26823, dtype: object
start_time    2024/11/15 B0:138
peak_time     2024/11/15 U0:146
end_time      2024/11/15 B0:208
label                      M1.1
noaa_ar                   13893
obs                         G16
cls                           M
Name: 27055, dtype: object


In [34]:
# remove the rows with invalid start_time
ftp_new = ftp_new.drop(remove, axis=0)
ftp_new = ftp_new.sort_values(by='start_time').reset_index(drop=True)

In [38]:
# delete the ftp rows where invalid class
remove = []
for i in range(ftp_new['label'].shape[0]):
    if ftp_new['label'].iloc[i][0] not in ['X', 'M', 'C','B','A']:
        remove.append(i)
        print(ftp_new['label'].iloc[i], ftp_new['start_time'].iloc[i])

In [41]:
ftp_new.to_csv("ftp_flares_20100101_20251115.csv", index=False)