In [None]:
# ============================================
# LIBRARIES & DIRECTORIES
# =============================================
import os
import sys
import pandas as pd
import numpy as np

# Relative path to the directory holding the processed data files read by
# the cells below.
PROCESSED_DIR = os.path.join("..", "data", "processed")

In [11]:
# ============================================
# LOADS LABELED DATA
# =============================================
# Read the labeled flux table, parse the timestamp column, and put the rows
# in chronological order with a fresh 0..n-1 index.
df = (
    pd.read_csv(os.path.join(PROCESSED_DIR, "flux_with_labels.csv"))
    .assign(time=lambda frame: pd.to_datetime(frame["time"]))
    .sort_values("time")
    .reset_index(drop=True)
)

print(df.shape)

(4307040, 14)


In [12]:
# ============================================
# COMPUTES AND ADDS NEW FEATURES (FEATURES EXPLAINED IN PRINT STATEMENTS)
# =============================================

print("Adding log-transformed flux column... ", end = "")
# Zeros are masked to NaN before taking the logarithm; every NaN in the result
# is then replaced by a floor value of -10.
df["xrsb_flux_log"] = np.log10(df["xrsb_flux"].replace(0, np.nan)).fillna(-10)
print("Done.")

print("Adding 5 minute rolling difference (the difference in flux from 5 minutes ago)...", end = "")
# The first 5 rows have no row 5 steps back and are left as NaN.
df["flux_diff_5min"] = df["xrsb_flux"].diff(periods=5)
print("Done.")

print("Adding flux rate of change (gradient)...", end ="")
# Row-to-row difference; the first row has no predecessor and is set to 0.
df["flux_rate"] = df["xrsb_flux"].diff().fillna(0)
print("Done.")

print("Adding gradient change of flux (second derivative of flux curve)...", end = "")
df["flux_acceleration"] = df["flux_rate"].diff().fillna(0)
print("Done.")

print("Adding 5 minute rolling maximum window (the max jump in flux over the last 5 minutes)...", end = "")
# Range (max - min) of each 5-row window. The built-in rolling max/min run in
# compiled code and give the same values as applying a Python lambda to every
# window: both forms yield NaN until 5 non-NaN samples are available.
df["flux_max_diff_5min"] = (
    df["xrsb_flux"].rolling(window=5).max() - df["xrsb_flux"].rolling(window=5).min()
)
print("Done.")

print("Adding mean, min, max, and standard deviation over last 5/15/30/60 minutes...", end = "")
for window in [5, 15, 30, 60]:
    # Build the window once per size and reuse it for all four statistics.
    # min_periods=1 lets partial windows at the start of the series produce
    # values (the std of a single sample is still NaN).
    rolling_flux = df["xrsb_flux"].rolling(window=window, min_periods=1)
    df[f"rolling_avg_{window}min"] = rolling_flux.mean()
    df[f"rolling_min_{window}min"] = rolling_flux.min()
    df[f"rolling_max_{window}min"] = rolling_flux.max()
    df[f"rolling_std_{window}min"] = rolling_flux.std()
print("Done.")

def rolling_slope(series, window=30):
    """Least-squares slope of a straight line fitted to each trailing window.

    The value at position ``i`` is the slope fitted to the ``window`` samples
    that precede it (positions ``i - window`` .. ``i - 1``; the sample at
    ``i`` itself is not part of the fit), with x taken as 0, 1, ..., window-1.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to fit. Its index is reused for the result.
    window : int, default 30
        Number of samples per fit. Must be at least 2.

    Returns
    -------
    pandas.Series
        Slopes aligned to ``series.index``. The first ``window`` entries are
        NaN, as is any entry whose window contains a NaN. A series with
        ``window`` or fewer samples yields an all-NaN result.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 2 (a line cannot be fitted).
    """
    if window < 2:
        raise ValueError("window must be at least 2 to fit a line")

    print(f"Adding rolling slope (slope of flux over last {window} minutes)...", end="")

    values = series.to_numpy(dtype=float)
    slopes = np.full(len(values), np.nan)

    if len(values) > window:
        # Closed-form simple-regression slope: sum((x - mean(x)) * y) / sum((x - mean(x))**2).
        # Because x is the same 0..window-1 ramp for every window, the slope is
        # a fixed weighted sum of y and can be evaluated for all windows at once.
        x_centered = np.arange(window) - (window - 1) / 2.0
        weights = x_centered / np.sum(x_centered ** 2)

        # fitted[k] is the slope over values[k : k + window]. A NaN inside a
        # window propagates to that window's result only.
        fitted = np.correlate(values, weights, mode="valid")

        # Position i reports the window ending at i - 1, so the last fitted
        # window (which ends on the final sample) is not reported.
        slopes[window:] = fitted[:-1]

    print("Done.")
    return pd.Series(slopes, index=series.index)
df["rolling_slope_30min"] = rolling_slope(df["xrsb_flux"], window=30)

print("Adding flux above background flag (binary, if flux is above NOAA-defined background threshold)...", end = "")
# 1 where the flux exceeds 1e-6, otherwise 0.
# NOTE(review): the 1e-6 threshold is taken as given here; confirm its source.
df["flux_above_background"] = df["xrsb_flux"].gt(1e-6).astype(int)
print("Done.")

print("Adding flux rising (binary) & streak (how many minutes in a row flux has been rising)...", end = "")
# 1 where the flux is strictly higher than on the previous row.
df["flux_rising"] = df["xrsb_flux"].diff().gt(0).astype(int)
# A new run starts wherever the rising flag differs from the previous row;
# the cumulative sum of those change points gives each run its own id.
run_id = df["flux_rising"].ne(df["flux_rising"].shift()).cumsum()
# 1-based position within the run; multiplying by the flag zeroes out the
# non-rising runs.
position_in_run = df["flux_rising"].groupby(run_id).cumcount() + 1
df["rising_streak"] = df["flux_rising"] * position_in_run
print("Done.")

Adding log-transformed flux column... Done.
Adding 5 minute rolling difference (the difference in flux from 5 minutes ago)...Done.
Adding flux rate of change (gradient)...Done.
Adding gradient change of flux (second derivative of flux curve)...Done.
Adding 5 minute rolling maximum window (the max jump in flux over the last 5 minutes)...Done.
Adding mean, min, max, and standard deviation over last 5/15/30/60 minutes...Done.
Adding rolling slope (slope of flux over last 30 minutes): 4307039/4307040 (100.00%)...Done.
Adding flux above background flag (binary, if flux is above NOAA-defined background threshold)...Done.
Adding flux rising (binary) & streak (how many minutes in a row flux has been rising)...Done.


In [None]:
# ============================================
# ADDS LAG FEATURES AND FUTURE TARGETS
# ============================================

import numpy as np

# Lagged copies of the flux column, one per selected offset (in rows)
print("Adding selected lag features only...")

selected_lags = [1, 2, 3, 5, 10, 15, 30, 60, 90, 120, 180, 240, 300]
for lag in selected_lags:
    lagged_flux = df["xrsb_flux"].shift(lag)
    # Stored as float32 to halve the memory footprint of the lag columns
    df[f"xrsb_flux_lag_{lag}"] = lagged_flux.astype(np.float32)
print("Done.")

print("Generating future flare flags for 6h, 12h, 18h, and 24h...")
# Works on a plain binary array to make the processing below (much) faster
df = df.sort_values("time").reset_index(drop=True)
# Reversed so that a backward-looking .rolling window on this array looks
# forward in time on the original ordering
flare = df["binary_label"].values[::-1]

# Look-ahead horizons expressed in rows
# NOTE(review): treats one row as one minute - confirm the data has a fixed
# 1-minute cadence with no gaps
future_windows = {f"{hours}h": hours * 60 for hours in (6, 12, 18, 24)}

# Rolling max over each horizon, stored as a new target column
# NOTE(review): each window starts at the current row, so a row that is
# itself labeled 1 is always flagged - confirm that this is intended
for label, window in future_windows.items():
    print(f" - flare_in_next_{label} (window={window} min)")
    # 1 if any row in the window is labeled 1
    reversed_max = pd.Series(flare).rolling(window=window, min_periods=1).max()
    # rolling().max() returns floats: fill any NaN with 0, cast back to int,
    # then flip the result back to forward time
    future_flare = reversed_max.fillna(0).astype(int).values[::-1]
    df[f"flare_in_next_{label}"] = future_flare

print("Done.")

# Checks the balance of each class
print("Class distributions:")
for label in future_windows:
    print(df[f"flare_in_next_{label}"].value_counts().sort_index())

Adding selected lag features only...
Done. 13 lag features created.
Adding future flux values (5–30 min ahead)...Done.
Generating future flare flags for 6h, 12h, 18h, and 24h...
  ➤ flare_in_next_6h (window=360 min)
  ➤ flare_in_next_12h (window=720 min)
  ➤ flare_in_next_18h (window=1080 min)
  ➤ flare_in_next_24h (window=1440 min)
Done.
Class distributions:
flare_in_next_6h
0    2436031
1    1871009
Name: count, dtype: int64
flare_in_next_12h
1    2256145
0    2050895
Name: count, dtype: int64
flare_in_next_18h
1    2457427
0    1849613
Name: count, dtype: int64
flare_in_next_24h
1    2605509
0    1701531
Name: count, dtype: int64


In [16]:
# ============================================
# SAVES DATA
# =============================================
# Output directory for the feature table; created if it does not exist yet.
FEATURE_DIR = os.path.join("..", "data", "processed")
os.makedirs(FEATURE_DIR, exist_ok=True)

# Write the full feature table without the positional index column.
features_path = os.path.join(FEATURE_DIR, "flux_with_features.csv")
df.to_csv(features_path, index=False)
print("Saved features to flux_with_features.csv")

Saved features to flux_with_features.csv
