In [2]:
import pandas as pd 

url1 = 'KOI.csv'
url2 = 'TOI.csv'
url3 = 'K2P.csv'

kepler = pd.read_csv(url1, comment='#')
tess = pd.read_csv(url2, comment='#')
k2 = pd.read_csv(url3, comment='#')

In [3]:
import numpy as np

kepler_map = {
    "koi_period": "period_d",
    "koi_prad": "rp_re",
    "koi_insol": "insol_eflux",
    "koi_teq": "eq_temp",
    "koi_steff": "teff_k",
    "koi_slogg": "logg_cgs",
    "koi_srad": "rstar_rsun",
    "ra": "ra_deg",
    "dec": "dec_deg",
}

tess_map = {
    "pl_orbper": "period_d",
    "pl_rade": "rp_re",
    "pl_insol": "insol_eflux",
    "pl_eqt": "eq_temp",
    "st_teff": "teff_k",
    "st_logg": "logg_cgs",
    "st_rad": "rstar_rsun",
    "ra": "ra_deg",
    "dec": "dec_deg",
}

k2_map = {
    "pl_orbper": "period_d",
    "pl_rade": "rp_re",
    "pl_insol": "insol_eflux",
    "pl_eqt": "eq_temp",
    "st_teff": "teff_k",
    "st_logg": "logg_cgs",
    "st_rad": "rstar_rsun",
    "ra": "ra_deg",
    "dec": "dec_deg",
}

def map_label_kepler(s):
    s = str(s).strip().upper()
    if s == "CONFIRMED": return 1
    if s == "FALSE POSITIVE": return 0
    return np.nan  

def map_label_tess(s):
    s = str(s).strip().upper()
    if s == "KP" or s == "CP": return 1
    if s == "FP" or s == "FA": return 0
    return np.nan

def map_label_k2(s):
    s = str(s).strip().upper()
    if s == "CONFIRMED": return 1
    if s == "FALSE POSITIVE": return 0
    return np.nan

def harmonize_and_split(df, cmap, mission, label_col, label_func, candidate_values):
    d = df.rename(columns=cmap).copy()
    d["mission"] = mission
    d["raw_label"] = d[label_col].astype(str).str.strip().str.upper()
    d["label"] = d[label_col].map(label_func)

    # star_key (≈1 arcsec bins)
    d["star_key"] = d["ra_deg"].round(5).astype(str) + "_" + d["dec_deg"].round(5).astype(str)

    cand_mask = d["raw_label"].isin({s.upper() for s in candidate_values})
    candidates_df = d[cand_mask].copy()
    labeled_df = d[~cand_mask & d["label"].isin([0, 1])].copy()
    return labeled_df, candidates_df

k_kepler, k_kepler_c = harmonize_and_split(
    kepler, kepler_map, "KEPLER", "koi_disposition", map_label_kepler,
    candidate_values=["CANDIDATE"]
)

k_tess, k_tess_c = harmonize_and_split(
    tess, tess_map, "TESS", "tfopwg_disp", map_label_tess,
    candidate_values=["PC", "APC"]
)

k_k2, k_k2_c = harmonize_and_split(
    k2, k2_map, "K2", "disposition", map_label_k2,
    candidate_values=["CANDIDATE"]
)

In [4]:
k_kepler["label"].isnull().sum()/k_kepler["label"].value_counts()

label
0.0    0.0
1.0    0.0
Name: count, dtype: float64

In [5]:
strict_cols = [
    "period_d", "rp_re", "insol_eflux", "eq_temp", "teff_k", "logg_cgs", "rstar_rsun",
    "ra_deg", "dec_deg", "mission", "label", "star_key"
] 

kepler_u = k_kepler[[c for c in strict_cols if c in k_kepler.columns]]
tess_u = k_tess[[c for c in strict_cols if c in k_tess.columns]]
k2_u = k_k2[[c for c in strict_cols if c in k_k2.columns]]

kepler_c_u = k_kepler_c[[c for c in strict_cols if c in k_kepler_c.columns]]
tess_c_u = k_tess_c[[c for c in strict_cols if c in k_tess_c.columns]]
k2_c_u = k_k2_c[[c for c in strict_cols if c in k_k2_c.columns]]

all_u = pd.concat([kepler_u, tess_u, k2_u], ignore_index=True)
all_c_u = pd.concat([kepler_c_u, tess_c_u, k2_c_u], ignore_index=True)

print(f"Rows before dropping: {all_c_u.shape[0]}")
all_u = all_u.dropna(subset=["label"]).copy()
print(f"Rows in the middle of dropping: {all_c_u.shape[0]}")
all_u = all_u.drop_duplicates(subset=[c for c in strict_cols if c in all_u.columns])
print(f"Rows after dropping: {all_c_u.shape[0]}")

feat_cols = [
    "period_d", "rp_re", "insol_eflux", "eq_temp", "teff_k", "logg_cgs", "rstar_rsun"         
]
groups_all = all_u["star_key"]

all_u

Rows before dropping: 8482
Rows in the middle of dropping: 8482
Rows after dropping: 8482


Unnamed: 0,period_d,rp_re,insol_eflux,eq_temp,teff_k,logg_cgs,rstar_rsun,ra_deg,dec_deg,mission,label,star_key
0,9.488036,2.26000,93.5900,793.00,5455.0,4.46700,0.927000,291.934230,48.141651,KEPLER,1.0,291.93423_48.14165
1,54.418383,2.83000,9.1100,443.00,5455.0,4.46700,0.927000,291.934230,48.141651,KEPLER,1.0,291.93423_48.14165
2,1.736952,33.46000,891.9600,1395.00,5805.0,4.56400,0.791000,285.534610,48.285210,KEPLER,0.0,285.53461_48.28521
3,2.525592,2.75000,926.1600,1406.00,6031.0,4.43800,1.046000,288.754880,48.226200,KEPLER,1.0,288.75488_48.2262
4,11.094321,3.90000,114.8100,835.00,6046.0,4.48600,0.972000,296.286130,48.224670,KEPLER,1.0,296.28613_48.22467
...,...,...,...,...,...,...,...,...,...,...,...,...
12707,2.484197,,,,,,,342.385899,-10.675469,K2,1.0,342.3859_-10.67547
12716,6.001180,2.03000,69.6000,805.00,4716.0,4.62000,0.690000,206.846198,-6.139337,K2,1.0,206.8462_-6.13934
12717,6.001270,2.04300,,790.00,4716.0,4.62000,0.689000,206.846198,-6.139337,K2,1.0,206.8462_-6.13934
12718,,1.75752,12.6807,481.29,4759.0,4.70975,0.641805,206.846198,-6.139337,K2,1.0,206.8462_-6.13934


In [6]:
all_c_u["label"].value_counts()

Series([], Name: count, dtype: int64)

In [7]:
all_u["label"].value_counts()

label
0.0    6422
1.0    6244
Name: count, dtype: int64

In [8]:
all_u.isnull().sum() / all_u.shape[0]

period_d       0.004342
rp_re          0.073583
insol_eflux    0.178430
eq_temp        0.169825
teff_k         0.063793
logg_cgs       0.106427
rstar_rsun     0.037265
ra_deg         0.000079
dec_deg        0.000079
mission        0.000000
label          0.000000
star_key       0.000000
dtype: float64

In [9]:
X = all_u[feat_cols]
y = all_u["label"].astype('int')

In [10]:
from sklearn.model_selection import GroupShuffleSplit 
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups_all))

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
groups_train = groups_all.iloc[train_idx]

imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=feat_cols)
X_test = pd.DataFrame(imputer.fit_transform(X_test), columns=feat_cols)

scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=feat_cols)
X_test = pd.DataFrame(scaler.fit_transform(X_test), columns=feat_cols)

In [11]:
import json

X_train.to_csv("merged/X_train_imputed_scaled.csv")
X_test.to_csv("merged/X_test_imputed_scaled.csv")
y_train.to_csv("merged/y_train.csv", index=True, header=["label"])
y_test.to_csv("merged/y_test.csv",  index=True, header=["label"])
groups_train.to_csv("merged/groups.csv", index=True, header=["star_key"])
all_c_u.to_csv("candidates/merged_candidates.csv")

meta = {
    "imputer": {"strategy": imputer.strategy},
    "feat_cols": feat_cols,
    "group_key": "star_key",
    "test_size": 0.2,
    "random_state": 42,
}
json.dump(meta, open("merged/split_impute_meta.json","w"), indent=2)