In [1]:
# Import statements and main classes
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import os
import pickle
from tqdm import tqdm
from celara import KeplerLCPreprocessor, KeplerFeatureExtractor
from celara_utils import *

2025-10-05 03:44:00.220453: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-05 03:44:00.276141: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-10-05 03:44:00.276186: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-05 03:44:00.278636: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-10-05 03:44:00.290678: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-05 03:44:00.292162: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [2]:
# Notebook configuration: everything a reader might want to tune lives here.
DATA_PATH = "data"  # data directory, relative to the working directory

# True when the KOI catalog CSV is already on disk. Set to False to force re-download.
KOI_DOWNLOADED_ALREADY = os.path.exists(os.path.join(DATA_PATH, "kepler_koi_dr25.csv"))

# Total number of light curves to sample for download (split evenly between the
# two classes). A value of 0 skips the download step.
LIGHT_CURVES_TO_DOWNLOAD = 500

# True when a previously saved balanced subset CSV is already on disk.
BALANCED_SUBSET_DOWNLOADED = os.path.exists(os.path.join(DATA_PATH, "koi_balanced_subset.csv"))

# 1. Load Data (KOI, Lightcurves)

In [3]:
# Load the KOI (Kepler Objects of Interest) DR25 catalog. It is fetched from the
# NASA Exoplanet Archive TAP service only when no local copy is available, and
# the download is cached as a CSV under DATA_PATH for later runs.
url = "https://exoplanetarchive.ipac.caltech.edu/TAP/sync?query=select+*+from+q1_q17_dr25_koi&format=csv"
koi_catalog_path = f"{DATA_PATH}/kepler_koi_dr25.csv"

# The cache decision depends only on the catalog file itself. The earlier
# condition also required the balanced subset to be absent, so a missing catalog
# combined with an existing subset fell through to reading a file that does not
# exist, and a manual KOI_DOWNLOADED_ALREADY = False could be silently ignored.
if KOI_DOWNLOADED_ALREADY:
    koi_df = pd.read_csv(koi_catalog_path)
else:
    # Make sure the target directory exists before writing the cached copy.
    os.makedirs(DATA_PATH, exist_ok=True)
    koi_df = pd.read_csv(url)
    koi_df.to_csv(koi_catalog_path, index=False)

koi_df.head()

Unnamed: 0,kepid,kepoi_name,kepler_name,ra,ra_err,ra_str,dec,dec_err,dec_str,koi_gmag,...,koi_fpflag_co,koi_fpflag_ec,koi_insol,koi_insol_err1,koi_insol_err2,koi_srho,koi_srho_err1,koi_srho_err2,koi_fittype,koi_score
0,10811496,K00753.01,,297.00482,0.0,19h48m01.16s,48.134129,0.0,+48d08m02.9s,15.943,...,0,0,39.3,31.04,-10.49,7.29555,35.03293,-2.75453,LS+MCMC,0.0
1,10848459,K00754.01,,285.53461,0.0,19h02m08.31s,48.28521,0.0,+48d17m06.8s,16.1,...,0,0,891.96,668.95,-230.35,0.2208,0.00917,-0.01837,LS+MCMC,0.0
2,10854555,K00755.01,Kepler-664 b,288.75488,0.0,19h15m01.17s,48.2262,0.0,+48d13m34.3s,16.015,...,0,0,926.16,874.33,-314.24,1.98635,2.71141,-1.74541,LS+MCMC,1.0
3,10872983,K00756.01,Kepler-228 d,296.28613,0.0,19h45m08.67s,48.22467,0.0,+48d13m28.8s,16.234,...,0,0,114.81,112.85,-36.7,0.67324,0.33286,-0.38858,LS+MCMC,1.0
4,10872983,K00756.02,Kepler-228 c,296.28613,0.0,19h45m08.67s,48.22467,0.0,+48d13m28.8s,16.234,...,0,0,427.65,420.33,-136.7,0.37377,0.74768,-0.26357,LS+MCMC,1.0


In [4]:
# Restrict the catalog to rows with a definitive disposition (confirmed planet or
# false positive); everything else is set aside in `removed_candidates`.
kept_dispositions = ["CONFIRMED", "FALSE POSITIVE"]
has_final_disposition = koi_df["koi_disposition"].isin(kept_dispositions)
removed_candidates = koi_df[~has_final_disposition]
koi_df = koi_df[has_final_disposition].copy()

# Numeric target column: 1 = confirmed planet, 0 = false positive
label_map = {"CONFIRMED": 1, "FALSE POSITIVE": 0}
koi_df["label"] = koi_df["koi_disposition"].map(label_map)

# Summary of what was dropped and what remains
print("Removed candidates (not confirmed or false positive):", len(removed_candidates))
print(koi_df["koi_disposition"].value_counts())
print("Labeled dataset ready, shape:", koi_df.shape)

# Persist the labeled catalog next to the raw one
koi_df.to_csv(f"{DATA_PATH}/kepler_koi_dr25_cleaned.csv", index=False)

Removed candidates (not confirmed or false positive): 1360
koi_disposition
FALSE POSITIVE    3965
CONFIRMED         2729
Name: count, dtype: int64
Labeled dataset ready, shape: (6694, 154)


In [None]:
# Build a class-balanced random subset of the labeled KOI table (equal numbers of
# confirmed planets and false positives), save it under DATA_PATH, and download
# the light curves for the sampled targets.
# Setting LIGHT_CURVES_TO_DOWNLOAD to 0 skips sampling and downloading entirely.
if LIGHT_CURVES_TO_DOWNLOAD == 0:
    if os.path.exists(f"{DATA_PATH}/lightcurves"):
        print(f"\nLightcurves already downloadeded and expected at '{DATA_PATH}/lightcurves', skipping download step.")
    else:
        # Downloading is disabled in this branch, so a missing directory means
        # there is nothing on disk for the later processing step.
        print(f"\nLightcurve download is disabled (LIGHT_CURVES_TO_DOWNLOAD = 0), but {DATA_PATH}/lightcurves does not exist. Please check.")
else:
    # Split the labeled catalog by class
    confirmed = koi_df[koi_df["koi_disposition"] == "CONFIRMED"]
    false_positive = koi_df[koi_df["koi_disposition"] == "FALSE POSITIVE"]

    # Half of the requested total per class, capped at the smaller class size:
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement).
    samples_per_class = min(
        LIGHT_CURVES_TO_DOWNLOAD // 2,
        len(confirmed),
        len(false_positive),
    )

    print("Available data:")
    print(f"  Confirmed planets: {len(confirmed)}")
    print(f"  False positives: {len(false_positive)}")
    print("\nBalanced sampling:")
    print(f"  Taking {samples_per_class} from each class = {samples_per_class * 2} total")

    # Fixed random_state keeps the selection reproducible across runs
    confirmed_sample = confirmed.sample(samples_per_class, random_state=42)
    fp_sample = false_positive.sample(samples_per_class, random_state=42)

    # Combine and shuffle so the two classes are interleaved
    balanced_subset = pd.concat([confirmed_sample, fp_sample]).sample(frac=1, random_state=42).reset_index(drop=True)

    # Verify balance
    print("\nBalanced subset verification:")
    print(balanced_subset["koi_disposition"].value_counts())
    print(f"Total samples: {len(balanced_subset)}")

    # Save balanced subset for the model training phase. The path is derived from
    # DATA_PATH so it stays in sync with the rest of the notebook's files.
    balanced_subset_path = f"{DATA_PATH}/koi_balanced_subset.csv"
    balanced_subset.to_csv(balanced_subset_path, index=False)
    print(f"Saved balanced dataset: {balanced_subset_path}")

    # Download lightcurves for the balanced subset (fetch_lightcurves comes from
    # the celara_utils star import)
    subset_results = fetch_lightcurves(balanced_subset["kepid"].tolist(), sleep=1, overwrite=False)
    print("\nDownload results:")
    print(subset_results["success"].value_counts())
    if "error" in subset_results.columns:
        print("Error breakdown:")
        print(subset_results['error'].value_counts())

Available data:
  Confirmed planets: 2729
  False positives: 3965

Balanced sampling:
  Taking 250 from each class = 500 total

Balanced subset verification:
koi_disposition
FALSE POSITIVE    250
CONFIRMED         250
Name: count, dtype: int64
Total samples: 500
Saved balanced dataset: data/koi_balanced_subset.csv


Fetching Kepler light curves:   0%|          | 0/500 [00:00<?, ?it/s]

KeyboardInterrupt: 

# 2. Processing (Masking, normalisation, detrending, feature extraction)

In [None]:
# Extract features from the balanced subset: process_kepler_dataset (from the
# celara_utils star import) is given the subset table and the light-curve
# directory, and returns the feature matrix X and label vector y.
# Its printed output reports saving the result as an .npz file under DATA_PATH.

lightcurve_dir = f"{DATA_PATH}/lightcurves"
balanced_subset_csv = f"{DATA_PATH}/koi_balanced_subset.csv"

# Pick the dataset to process. Prefer the in-memory balanced_subset; otherwise
# fall back to the CSV written by the sampling step, so this cell also works
# after a kernel restart or when the download step was skipped.
if 'balanced_subset' in globals():
    dataset_to_process = balanced_subset
    print(f"Using balanced_subset: {dataset_to_process.shape}")
    print(f"Class balance: {dataset_to_process['koi_disposition'].value_counts().to_dict()}")
elif os.path.exists(balanced_subset_csv):
    dataset_to_process = pd.read_csv(balanced_subset_csv)
    print(f"Loaded balanced subset from {balanced_subset_csv}: {dataset_to_process.shape}")
    print(f"Class balance: {dataset_to_process['koi_disposition'].value_counts().to_dict()}")
else:
    print("❌ No balanced_subset found. Run the balanced-sampling download cell first to create balanced dataset.")
    dataset_to_process = None

if dataset_to_process is not None:
    # Run processing on the balanced dataset
    print(f"\n🔧 Processing {len(dataset_to_process)} samples from balanced dataset...")
    X, y = process_kepler_dataset(dataset_to_process, lightcurve_dir, path=DATA_PATH, save_name="features_balanced")

    # The helper may return None for X; only summarize when features exist
    if X is not None:
        print(f"\n📊 Processing Results:")
        print(f"   Features shape: {X.shape}")
        print(f"   Labels: {np.bincount(y)} (0=FP, 1=Confirmed)")
        print(f"   Ready for neural network training!")

        # To process only a subset for demo, use:
        # X_demo, y_demo = process_kepler_dataset(dataset_to_process, lightcurve_dir, max_samples=3, path=DATA_PATH, save_name="features_demo")
else:
    print("Cannot process - no dataset available")

Using balanced_subset: (16, 154)
Class balance: {'CONFIRMED': 8, 'FALSE POSITIVE': 8}

🔧 Processing 16 samples from balanced dataset...
Processing 16 samples from dataset...


Building file index:   0%|          | 0/16 [00:00<?, ?it/s]

File index: 16 files
Available lightcurves: 16/16


Processing lightcurves:   0%|          | 0/16 [00:00<?, ?it/s]

✅ Successfully processed 16 samples
   Shape: (16, 2206)
   Labels: [8 8] (0=FP, 1=Confirmed)
   Saved: data/features_balanced.npz

📊 Processing Results:
   Features shape: (16, 2206)
   Labels: [8 8] (0=FP, 1=Confirmed)
   Ready for neural network training!
