In [243]:
import pandas as pd 

# Load the KOI table from the working directory. `comment='#'` tells pandas
# not to parse anything following a '#', so '#'-prefixed lines in the file are
# skipped rather than read as data.
url1 = 'KOI.csv'
kepler = pd.read_csv(filepath_or_buffer=url1, comment='#')

In [244]:
# Columns removed before modelling (the notebook does not record why each one
# is excluded).
drop = [
    'kepoi_name',
    'kepler_name',
    'koi_pdisposition',
    'koi_score',
    'koi_fpflag_nt',
    'koi_fpflag_ss',
    'koi_fpflag_co',
    'koi_fpflag_ec',
    'koi_teq_err1',
    'koi_teq_err2',
    'koi_tce_plnt_num',
    'koi_tce_delivname',
    'ra',
    'dec',
    'koi_kepmag',
]

# DataFrame.drop returns a new frame, so `kepler` itself is left untouched.
kepler_df = kepler.drop(columns=drop)

kepler_df

Unnamed: 0,kepid,koi_disposition,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,...,koi_model_snr,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2
0,10797460,CONFIRMED,9.488036,2.775000e-05,-2.775000e-05,170.538750,0.002160,-0.002160,0.146,0.318,...,35.8,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061
1,10797460,CONFIRMED,54.418383,2.479000e-04,-2.479000e-04,162.513840,0.003520,-0.003520,0.586,0.059,...,25.8,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061
2,10811496,CANDIDATE,19.899140,1.494000e-05,-1.494000e-05,175.850252,0.000581,-0.000581,0.969,5.126,...,76.3,5853.0,158.0,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078
3,10848459,FALSE POSITIVE,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,-0.000115,1.276,0.115,...,505.6,5805.0,157.0,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067
4,10854555,CONFIRMED,2.525592,3.761000e-06,-3.761000e-06,171.595550,0.001130,-0.001130,0.701,0.235,...,40.9,6031.0,169.0,-211.0,4.438,0.070,-0.210,1.046,0.334,-0.133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9559,10090151,FALSE POSITIVE,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,-0.000170,1.252,0.051,...,453.3,5638.0,139.0,-166.0,4.529,0.035,-0.196,0.903,0.237,-0.079
9560,10128825,CANDIDATE,1.739849,1.780000e-05,-1.780000e-05,133.001270,0.007690,-0.007690,0.043,0.423,...,10.6,6119.0,165.0,-220.0,4.444,0.056,-0.224,1.031,0.341,-0.114
9561,10147276,FALSE POSITIVE,0.681402,2.434000e-06,-2.434000e-06,132.181750,0.002850,-0.002850,0.147,0.309,...,12.3,6173.0,193.0,-236.0,4.447,0.056,-0.224,1.041,0.341,-0.114
9562,10155286,CANDIDATE,333.486169,4.235000e-03,-4.235000e-03,153.615010,0.005070,-0.005070,0.214,0.255,...,14.0,4989.0,39.0,-128.0,2.992,0.030,-0.027,7.824,0.223,-1.896


In [245]:
# Normalise the disposition text once (trim whitespace, upper-case) and reuse
# it for every comparison below.
disposition = kepler_df["koi_disposition"].str.strip().str.upper()
is_candidate = disposition == "CANDIDATE"
is_labeled = disposition.isin(["CONFIRMED", "FALSE POSITIVE"])

# Rows still marked CANDIDATE are kept aside; they have no training label.
kepler_candidates_df = kepler_df[is_candidate].copy()

# Rows with a settled disposition become the supervised data set:
# CONFIRMED -> 1, FALSE POSITIVE -> 0.
kepler_labeled_df = kepler_df[is_labeled].copy()
kepler_labeled_df["label"] = disposition[is_labeled].map(
    {"CONFIRMED": 1, "FALSE POSITIVE": 0}
)

# The text column is now redundant with `label`.
kepler_labeled_df = kepler_labeled_df.drop(columns=["koi_disposition"])

In [246]:
# `kepid` is used only as the grouping key and `label` is the target; every
# other remaining column is treated as a model feature.
group_all = kepler_labeled_df["kepid"]
y = kepler_labeled_df["label"].astype(int)

feat_cols = [
    col for col in kepler_labeled_df.columns
    if col not in ('kepid', 'label')
]
X = kepler_labeled_df.drop(columns=['kepid', 'label'])

In [247]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler

# Single group-aware shuffle split: rows sharing a `kepid` always land on the
# same side of the split. The seed is fixed so the split is reproducible.
gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair of
# positional index arrays.
[(train_idx, test_idx)] = gss.split(X, y, groups=group_all)

X_train = X.iloc[train_idx]
X_test = X.iloc[test_idx]

y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]

# Group ids for the training rows only.
groups_train = group_all.iloc[train_idx]

In [248]:
# Preprocessing is fitted on the TRAINING split only. The held-out test split
# and the unlabeled candidates are then transformed with those same training
# statistics. Calling fit_transform on each subset (as this cell used to do)
# imputes and standardises every subset with its own means/variances, which
# leaks test information into evaluation and puts the three sets on different
# scales from the one the model is trained on.
#
# The original row index is carried through each rebuilt DataFrame so that the
# saved feature files stay aligned with y_train / y_test / groups_train (which
# keep the original index) and candidate rows can be traced back to kepler_df.

# --- Mean imputation -------------------------------------------------------
imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(
    imputer.fit_transform(X_train), columns=feat_cols, index=X_train.index
)
X_test = pd.DataFrame(
    imputer.transform(X_test), columns=feat_cols, index=X_test.index
)
# NOTE: from here on `kepler_candidates_df` holds only the imputed feature
# columns (same as before this fix); `kepid` and the disposition are dropped.
kepler_candidates_df = pd.DataFrame(
    imputer.transform(kepler_candidates_df[feat_cols]),
    columns=feat_cols,
    index=kepler_candidates_df.index,
)

# --- Standardisation -------------------------------------------------------
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=feat_cols, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=feat_cols, index=X_test.index
)
kepler_candidates_df_scaled = pd.DataFrame(
    scaler.transform(kepler_candidates_df[feat_cols]),
    columns=feat_cols,
    index=kepler_candidates_df.index,
)

In [249]:
import json
from pathlib import Path

# Create the output folders up front so this cell also works on a fresh
# checkout where they do not exist yet.
for out_dir in ("kepler", "candidates"):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

# Preprocessed feature matrices and their targets.
X_train.to_csv("kepler/X_train_imputed_scaled.csv")
X_test.to_csv("kepler/X_test_imputed_scaled.csv")
y_train.to_csv("kepler/y_train.csv", index=True, header=["label"])
y_test.to_csv("kepler/y_test.csv",  index=True, header=["label"])

# Unlabeled candidates: scaled version and the imputed-but-unscaled version.
kepler_candidates_df_scaled.to_csv("candidates/kepler_candidates_scaled.csv")
kepler_candidates_df.to_csv("candidates/kepler_candidates.csv")

# Group ids of the training rows.
groups_train.to_csv("kepler/groups.csv", index=True, header=["kepid"])

# Record how the split and imputation were configured.
# NOTE(review): test_size and random_state are repeated here by hand; keep
# them in sync with the GroupShuffleSplit call if that ever changes.
meta = {
    "imputer": {"strategy": imputer.strategy},
    "feat_cols": feat_cols,
    "group_key": "kepid",
    "test_size": 0.2,
    "random_state": 42,
}

# Use a context manager so the file is flushed and closed deterministically
# (the bare open() passed to json.dump was never closed).
with open("kepler/split_impute_meta.json", "w") as meta_file:
    json.dump(meta, meta_file, indent=2)