**Purpose:** Perform Feature Extraction using TSFRESH for Kepler Exoplanet Lightcurves

**Goals:**
1. Filter Flux Observations to remove outliers
2. De-Trend the lightcurves using polynomial smoothing (Savitzky-Golay Filter)
3. Normalize the lightcurves
4. Extract Features using the TSFRESH library

**Input:** Exoplanet Lightcurves chosen by availability and data quality metrics

**Output:** CSV File containing a list of Time Series Features for use by AI/ML Models.

In [None]:
###################################################################################
# Feature Extraction using TSFRESH for Kepler Exoplanet Lightcurves
###################################################################################
!pip install lightkurve astroquery pandas numpy tqdm  xgboost
!bash pip install --upgrade astroquery
#!pip install tsfel
!pip install stumpy
!pip install tsfresh
!pip install batman-package


In [None]:
import os
import pandas as pd
import numpy as np
import tsfresh
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters, EfficientFCParameters
#import tsfel
#from tsfel import time_series_features_extractor, get_features_by_domain
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from pathlib import Path
import lightkurve as lk
from lightkurve import LightCurve
#import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
import stumpy
from multiprocessing import Process, Queue
import time



#import lightkurve as lk
#from lightkurve import LightCurve



In [None]:
# Define paths and constants

# All inputs and outputs live under this Google Drive folder.
ROOT  = Path("/content/drive/MyDrive/Berkeley_AIML/Capstone/lightcurves")
#EXOPLANET_INPUT_CURVES_PATH = ROOT / "chosen_exoplanet_curves"
SIMULATED_CURVES_PATH       = ROOT / "simcurves2"
ANCILLARY                   = ROOT / "ancillary"
OUTPUT_PATH                 = ROOT / "orbital_prediction_simcurve_XGB_model1"
CHOSEN_EXOPLANET_CURVES     = ROOT / "chosen_exoplanet_curves"
CHOSEN_EXOPLANET_CLEAN_CURVES     = ROOT / "chosen_exoplanet_clean_curves"

# Kepler long cadence parameters for number of observations per quarter
CADENCE_MIN = 29.4  # minutes
CADENCE_SEC = CADENCE_MIN * 60  # seconds
DAYS_PER_QUARTER = 90
POINTS_PER_QUARTER = int(DAYS_PER_QUARTER * 24 * 60 / CADENCE_MIN)  # int(129600 / 29.4) = 4,408
NUM_QUARTERS = 4
TOTAL_POINTS = POINTS_PER_QUARTER * NUM_QUARTERS  # 17,632
# Array of time values for 4 quarters of 90 days each at a cadence of 29.4 minutes approx.
TIME_ARRAY = np.linspace(0, DAYS_PER_QUARTER * NUM_QUARTERS, TOTAL_POINTS)


# Parameters of Log-normal distribution of ORBITAL PERIOD (from EDA_ZERO.ipynb) fitted to known Kepler long cadence lightcurves with exoplanets
ORB_PERIOD_LOGNORM_SHAPE = 1.3330          # sigma
ORB_PERIOD_LOGNORM_LOC = 0.0               # location, must be 0
ORB_PERIOD_LOGNORM_SCALE = 11.5893         # exp(mu)

# Parameters of Log-normal distribution of STELLAR RADII (from EDA_ZERO.ipynb) fitted to known Kepler long cadence lightcurves with exoplanets
ST_RAD_LOGNORM_SHAPE = 0.298937          # sigma
ST_RAD_LOGNORM_LOC = 0.0               # location, must be 0
ST_RAD_LOGNORM_SCALE = 0.961700         # exp(mu)

# Parameters of Normal distribution of STELLAR MASSES (from EDA_ZERO.ipynb) fitted to known Kepler long cadence lightcurves with exoplanets
ST_MASS_LOC = 0.963588   # mu
ST_MASS_SCALE = 0.193144 # sigma

# Create every directory this notebook writes into. The clean-curve folder is
# included because the cleaning cells save CSV files there and to_csv() does
# not create missing parent directories.
os.makedirs(OUTPUT_PATH, exist_ok=True)
os.makedirs(SIMULATED_CURVES_PATH, exist_ok=True)
os.makedirs(CHOSEN_EXOPLANET_CLEAN_CURVES, exist_ok=True)

In [None]:
# Clean and store resampled Kepler lightcurves
# NOTE: process_lightcurve() must already be defined (run its cell first).
raw_suffix = "_4q_curve.csv"
# Count only the raw curve files so the progress totals are meaningful.
raw_files = [f for f in os.listdir(CHOSEN_EXOPLANET_CURVES) if f.endswith(raw_suffix)]
max_files = len(raw_files)
os.makedirs(CHOSEN_EXOPLANET_CLEAN_CURVES, exist_ok=True)

i = 0  # number of curves successfully cleaned and written
for filename in raw_files:
    file_path = os.path.join(CHOSEN_EXOPLANET_CURVES, filename)

    # Load CSV into DataFrame
    lc_df = pd.read_csv(file_path)

    # The file stem keeps underscores; the display name uses spaces.
    file_stem = filename[:-len(raw_suffix)]
    planet_name = file_stem.replace("_", " ")

    # Normalize and De-Trend the Lightcurve
    lc_df = process_lightcurve(planet_name, lc_df)
    if lc_df.empty:
        # process_lightcurve() returns an empty frame on failure; do not
        # write a header-only CSV that would later look like a finished curve.
        print(f"Failed to process {planet_name} (no data returned)")
        continue

    lc_df = lc_df.rename(columns={'flux': 'value'})
    lc_df['planet_name'] = planet_name  # required for column_id

    # Name the output after the underscore stem so the resume cell, which
    # checks for "<stem>_cleaned_4q_curve.csv", recognises finished curves.
    cleaned_path = os.path.join(CHOSEN_EXOPLANET_CLEAN_CURVES, f"{file_stem}_cleaned_4q_curve.csv")
    lc_df.to_csv(cleaned_path, index=False)

    i += 1
    if i % 20 == 0:
        print(f"Processed {i} Planets of {max_files} ")

print(f"Completed {i} Planets of {max_files} ")

**NOTE:** The above code timed out repeatedly due to Google Drive quota violations and processed only a portion of the light curves. Hence, I had to write code that checks whether a cleaned curve already exists and, if it does not, repeats the cleaning process so that features can be extracted immediately afterwards.

In [None]:
from lightkurve import LightCurve
import numpy as np
import pandas as pd
from scipy.interpolate import interp1d

def process_lightcurve(planet_name: str, lc_df: pd.DataFrame) -> pd.DataFrame:
    """Clean, de-trend and normalize one light curve for feature extraction.

    The flux column is coerced to numeric and unusable rows are dropped, then
    the curve is flattened, sigma-clipped, normalized and gap-filled with
    lightkurve. If the resulting time steps are not uniform to within 1% of
    the median step, the flux is linearly interpolated onto a uniform grid.

    Args:
        planet_name: Label used only in progress messages.
        lc_df: Frame with 'time' and 'flux' columns. It is not modified.

    Returns:
        The cleaned curve as produced by ``LightCurve.to_pandas()`` with the
        index reset, or an empty DataFrame if any step raises.
    """
    try:
        # Clean flux column on a new frame so the caller's data is untouched.
        numeric_df = lc_df.assign(flux=pd.to_numeric(lc_df['flux'], errors='coerce'))
        numeric_df = numeric_df.dropna(subset=['flux']).sort_values('time')

        # Build LightCurve object
        lc = LightCurve(time=numeric_df['time'].values, flux=numeric_df['flux'].values)
        lc = lc.remove_nans()
        lc = lc.flatten(window_length=401, sigma=2)
        lc = lc.remove_outliers(sigma=5)
        lc = lc.normalize().remove_nans()
        lc = lc.fill_gaps()

        # Check for uniform cadence
        dt = np.diff(lc.time.value)
        median_dt = np.median(dt)
        if not np.allclose(dt, median_dt, rtol=1e-2):
            # Interpolation needed
            print(f"           Resampling {planet_name} to uniform cadence...")

            # Grid starts at the first sample and stops before the last one
            # (np.arange excludes its end point).
            uniform_time = np.arange(lc.time.value[0], lc.time.value[-1], median_dt)
            interp_func = interp1d(lc.time.value, lc.flux.value, kind='linear', bounds_error=False, fill_value='extrapolate')
            uniform_flux = interp_func(uniform_time)

            lc = LightCurve(time=uniform_time, flux=uniform_flux)

        # Final clean DataFrame for TSFresh
        cleaned_df = lc.to_pandas().reset_index()

        return cleaned_df

    except Exception as e:
        # Best-effort by design: callers treat an empty frame as "failed".
        print(f"Error processing lightcurve: {e}")
        return pd.DataFrame()


In [None]:
def process_with_timeout(planet_name, lc_df, queue):
    """Worker entry point: clean one light curve and hand the result back.

    Runs process_lightcurve() on ``lc_df``, tags the result with a
    'planet_name' column, renames 'flux' to 'value', and puts the frame on
    ``queue``. If anything raises, the error is printed and ``None`` is put
    on the queue instead.
    """
    try:
        tagged = process_lightcurve(planet_name, lc_df).assign(planet_name=planet_name)
        queue.put(tagged.rename(columns={'flux': 'value'}))
    except Exception as err:
        print(f"Error during processing {planet_name}: {err}")
        queue.put(None)

In [None]:


# Resume-safe cleaning of the raw Kepler curves.
# Assume these are set: CHOSEN_EXOPLANET_CURVES, CHOSEN_EXOPLANET_CLEAN_CURVES
# Each curve is cleaned in a child process so a stuck curve can be abandoned
# after a fixed timeout; curves that already have a cleaned CSV are skipped.
from queue import Empty  # raised by Queue.get() when the timeout expires

i = 0  # number of curves cleaned and written during this run
raw_files = [f for f in os.listdir(CHOSEN_EXOPLANET_CURVES) if f.endswith("_4q_curve.csv")]
max_files = len(raw_files)
os.makedirs(CHOSEN_EXOPLANET_CLEAN_CURVES, exist_ok=True)

for filename in raw_files:
    # Get raw planet name
    planet_name = filename.replace("_4q_curve.csv", "")
    raw_planet_name = planet_name.replace("_", " ")

    # Check if cleaned file already exists. The first cleaning cell names its
    # output after the space-separated planet name, so accept that spelling too.
    cleaned_filename = f"{planet_name}_cleaned_4q_curve.csv"
    cleaned_path = os.path.join(CHOSEN_EXOPLANET_CLEAN_CURVES, cleaned_filename)
    spaced_path = os.path.join(CHOSEN_EXOPLANET_CLEAN_CURVES, f"{raw_planet_name}_cleaned_4q_curve.csv")
    if os.path.exists(cleaned_path) or os.path.exists(spaced_path):
        print(f"Skipping {planet_name} (already processed)")
        continue

    file_path = os.path.join(CHOSEN_EXOPLANET_CURVES, filename)

    try:
        lc_df = pd.read_csv(file_path)
    except Exception as e:
        print(f"Error reading file {filename}: {e}")
        continue

    try:
        # Timeout-safe processing
        result_queue = Queue()
        p = Process(target=process_with_timeout, args=(raw_planet_name, lc_df, result_queue))
        p.start()

        # Read the result BEFORE joining: a child that has put a large object
        # on a multiprocessing Queue cannot exit until that data is consumed,
        # so joining first would always wait out the full timeout.
        try:
            result = result_queue.get(timeout=40)  # timeout in seconds
        except Empty:
            print(f"Timeout while processing {planet_name}, skipping.")
            p.terminate()
            p.join()
            continue
        p.join()

        if result is not None and not result.empty:
            # Write to a temporary name and rename afterwards, so an
            # interrupted write is not mistaken for a finished curve.
            tmp_path = cleaned_path + ".tmp"
            result.to_csv(tmp_path, index=False)
            os.replace(tmp_path, cleaned_path)
            i += 1
            if i % 20 == 0:
                print(f"Processed {i} Planets of {max_files}")
        else:
            print(f"Failed to process {planet_name} (no data returned)")

    except Exception as e:
        # Best-effort: report the failure and move on to the next curve.
        print(f"Error processing {planet_name}: {e}")

print(f"Completed {i} Planets of {max_files}")


In [None]:
##########################
# Feature Extraction Code
##########################
import logging
from scipy.stats import lognorm
# Silence tsfresh's settings logger below ERROR level.
logging.getLogger('tsfresh.feature_extraction.settings').setLevel(logging.ERROR)

batch_size = 50  # number of planets accumulated before each write to disk

i = 0          # number of planets whose features have been extracted
features = []  # per-planet feature frames waiting to be written

# NOTE(review): this points at the same file as the features CSV below and is
# not used in this cell; kept in case another cell refers to it.
kepler_curve_periods = os.path.join(OUTPUT_PATH, "kepler_clean_curve_tsfresh_features.csv")
kepler_curve_features_filename = os.path.join(OUTPUT_PATH, "kepler_clean_curve_tsfresh_features.csv")
print(kepler_curve_features_filename)

# Ensure fresh output files
if os.path.exists(kepler_curve_features_filename):
    os.remove(kepler_curve_features_filename)


def _append_feature_batch(batch, csv_path):
    """Append a list of per-planet feature frames to csv_path.

    The header row is written only when the file does not exist yet, so the
    CSV contains a single header no matter how many batches are appended.
    Does nothing for an empty batch.
    """
    if not batch:
        return
    batch_df = pd.concat(batch)
    batch_df.reset_index(inplace=True)  # makes 'planet_name' a column
    batch_df.to_csv(csv_path, mode='a', header=not os.path.exists(csv_path), index=False)


print("Extracting TSFresh features for Chosen Cleaned Kepler Exoplanets")
for filename in os.listdir(CHOSEN_EXOPLANET_CLEAN_CURVES):
    if not filename.endswith("_cleaned_4q_curve.csv"):
        continue
    file_path = os.path.join(CHOSEN_EXOPLANET_CLEAN_CURVES, filename)

    # Load CSV into DataFrame
    lc_df = pd.read_csv(file_path)
    # Extract planet name
    planet_name = filename.replace("_cleaned_4q_curve.csv", "")
    planet_name = planet_name.replace("_", " ")

    if lc_df.empty:
        # Header-only file: there is nothing to extract features from.
        print(f"Skipping {planet_name} (no observations)")
        continue

    if 'flux' in lc_df.columns:
        lc_df = lc_df.rename(columns={'flux': 'value'})

    # column_id is required by tsfresh; add it if the cleaned file lacks it.
    if 'planet_name' not in lc_df.columns:
        lc_df['planet_name'] = planet_name

    feats = extract_features(
        lc_df,
        column_id='planet_name',
        column_sort='time',
        column_value='value',
        default_fc_parameters=EfficientFCParameters(),
        disable_progressbar=True,
        show_warnings=False,
        n_jobs=0
    )
    feats.index.name = 'planet_name'
    features.append(feats)
    i += 1

    # Process and save batch
    if i % batch_size == 0:
        print(f"Processing batch ending at Planet {i}")
        _append_feature_batch(features, kepler_curve_features_filename)

        # Clear batch memory
        features.clear()

# Write whatever is left after the last full batch.
_append_feature_batch(features, kepler_curve_features_filename)
features.clear()

print(f"Finished Generating {i} TSFresh features for chosen Kepler curves")
print(f"Features saved to {kepler_curve_features_filename}")

/content/drive/MyDrive/Berkeley_AIML/Capstone/lightcurves/orbital_prediction_simcurve_XGB_model1/kepler_clean_curve_tsfresh_features.csv
Extracting TSFresh features for Chosen Cleaned Kepler Exoplanets
Processing batch ending at Planet 50
Processing batch ending at Planet 100
Processing batch ending at Planet 150
Processing batch ending at Planet 200
Processing batch ending at Planet 250
Processing batch ending at Planet 300
Processing batch ending at Planet 350
Processing batch ending at Planet 400
Processing batch ending at Planet 450
Processing batch ending at Planet 500
Processing batch ending at Planet 550
Processing batch ending at Planet 600
Processing batch ending at Planet 650
Processing batch ending at Planet 700
Processing batch ending at Planet 750
Processing batch ending at Planet 800
Finished Generating 802 TSFresh features for chosen Kepler curves
Features saved to /content/drive/MyDrive/Berkeley_AIML/Capstone/lightcurves/orbital_prediction_simcurve_XGB_model1/kepler_cle