In [19]:
import pandas as pd 

# Load the raw KOI table into `kepler`.
# comment='#' tells pandas not to parse anything after a '#'; lines that start
# with '#' are skipped entirely.
# NOTE(review): absolute, machine-specific path — this cell only runs where this
# exact directory exists; consider a configurable data directory.
url1 = '/Users/janma/Desktop/SpaceAppsExoplanets/src/backend/datasets/KOI.csv'
kepler = pd.read_csv(url1, comment='#')

In [20]:
# Columns removed by name before the feature table is built.
drop = [
    'kepoi_name', 'kepler_name', 'koi_pdisposition', 'koi_score',
    'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
    'koi_teq_err1', 'koi_teq_err2',
    'koi_tce_plnt_num', 'koi_tce_delivname',
    'ra', 'dec', 'koi_kepmag',
]

# Step 1: drop the explicitly listed columns.
# Step 2: drop every remaining column whose name ends in _err1 or _err2.
# Each .drop() returns a new frame, so `kepler` itself is left untouched.
kepler_df = (
    kepler
    .drop(columns=drop)
    .pipe(lambda frame: frame.drop(columns=frame.filter(regex="(_err1|_err2)$").columns))
)
kepler_df

Unnamed: 0,kepid,koi_disposition,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_steff,koi_slogg,koi_srad
0,10797460,CONFIRMED,9.488036,170.538750,0.146,2.95750,615.8,2.26,793.0,93.59,35.8,5455.0,4.467,0.927
1,10797460,CONFIRMED,54.418383,162.513840,0.586,4.50700,874.8,2.83,443.0,9.11,25.8,5455.0,4.467,0.927
2,10811496,CANDIDATE,19.899140,175.850252,0.969,1.78220,10829.0,14.60,638.0,39.30,76.3,5853.0,4.544,0.868
3,10848459,FALSE POSITIVE,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395.0,891.96,505.6,5805.0,4.564,0.791
4,10854555,CONFIRMED,2.525592,171.595550,0.701,1.65450,603.3,2.75,1406.0,926.16,40.9,6031.0,4.438,1.046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9559,10090151,FALSE POSITIVE,0.527699,131.705093,1.252,3.22210,1579.2,29.35,2088.0,4500.53,453.3,5638.0,4.529,0.903
9560,10128825,CANDIDATE,1.739849,133.001270,0.043,3.11400,48.5,0.72,1608.0,1585.81,10.6,6119.0,4.444,1.031
9561,10147276,FALSE POSITIVE,0.681402,132.181750,0.147,0.86500,103.6,1.07,2218.0,5713.41,12.3,6173.0,4.447,1.041
9562,10155286,CANDIDATE,333.486169,153.615010,0.214,3.19900,639.1,19.30,557.0,22.68,14.0,4989.0,2.992,7.824


In [21]:
# Normalise the disposition text once (trim whitespace, upper-case) and reuse it
# for both row selections and for the label mapping.
label_by_disposition = {"CONFIRMED": 1, "FALSE POSITIVE": 0}
disposition = kepler_df["koi_disposition"].str.strip().str.upper()
is_candidate = disposition == "CANDIDATE"
is_labeled = disposition.isin(list(label_by_disposition))

# Rows still marked CANDIDATE: kept aside, with no label column.
kepler_candidates_df = kepler_df[is_candidate].copy()

# Rows with a final disposition: CONFIRMED -> 1, FALSE POSITIVE -> 0.
# The text column is dropped once the numeric label has been attached.
kepler_labeled_df = (
    kepler_df[is_labeled]
    .assign(label=disposition[is_labeled].map(label_by_disposition))
    .drop(columns=["koi_disposition"])
)

In [22]:
# 'kepid' is used only as the grouping key and 'label' is the target;
# every other column of the labeled frame is a model feature.
non_feature_cols = ['kepid', 'label']

X = kepler_labeled_df.drop(columns=non_feature_cols)
feat_cols = X.columns.tolist()  # feature names, in frame order
y = kepler_labeled_df["label"].astype('int')
group_all = kepler_labeled_df["kepid"]

In [23]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler

# Single 80/20 split grouped on kepid (passed as `groups`), so rows that share
# a kepid never end up on both sides of the split.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
split_iter = gss.split(X, y, groups=group_all)
train_idx, test_idx = next(split_iter)  # positional row indices

# Slice features, labels and group ids with the same positional indices.
X_train = X.iloc[train_idx]
X_test = X.iloc[test_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]
groups_train = group_all.iloc[train_idx]

In [24]:
# Impute and standardise the features.
#
# Both transformers are fitted on the TRAINING rows only and then applied
# unchanged to the test rows and to the unlabeled candidates. Re-fitting on each
# subset (fit_transform everywhere) would fill and scale every subset with its own
# statistics, so the three feature spaces would not be comparable and the test
# evaluation would not reflect how the model is really applied.

# Rebuild the raw splits from X so the cell gives the same result when re-run:
# X_train / X_test are overwritten below with their transformed versions, and
# fitting the scaler on already-scaled data would leave the candidates unscaled.
X_train_raw = X.iloc[train_idx]
X_test_raw = X.iloc[test_idx]
candidates_raw = kepler_candidates_df[feat_cols]

# Mean imputation: column means are learned from the training rows.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train_raw), columns=feat_cols)
X_test_imputed = pd.DataFrame(imputer.transform(X_test_raw), columns=feat_cols)
# kepler_candidates_df is replaced by its imputed, feature-only version
# (imputed but not scaled); all non-feature columns are dropped here.
kepler_candidates_df = pd.DataFrame(imputer.transform(candidates_raw), columns=feat_cols)

# Standardisation: mean/std are learned from the imputed training rows.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train_imputed), columns=feat_cols)
X_test = pd.DataFrame(scaler.transform(X_test_imputed), columns=feat_cols)
kepler_candidates_df_scaled = pd.DataFrame(scaler.transform(kepler_candidates_df), columns=feat_cols)

# Note: frames built from the transformers' arrays get a fresh 0..n-1 index, so
# they line up with y_train / y_test by row position, not by index label.

In [25]:
import json

# Persist the processed splits, the candidate tables and the split metadata.
# NOTE(review): absolute, machine-specific output location — consider making it
# configurable. The target directories must already exist.
datasets_dir = "/Users/janma/Desktop/SpaceAppsExoplanets/src/backend/datasets"
kepler_dir = f"{datasets_dir}/kepler"
candidates_dir = f"{datasets_dir}/candidates"

# Labeled data: features (imputed + scaled), labels, and training group ids.
X_train.to_csv(f"{kepler_dir}/X_train_imputed_scaled.csv")
X_test.to_csv(f"{kepler_dir}/X_test_imputed_scaled.csv")
y_train.to_csv(f"{kepler_dir}/y_train.csv", index=True, header=["label"])
y_test.to_csv(f"{kepler_dir}/y_test.csv", index=True, header=["label"])
groups_train.to_csv(f"{kepler_dir}/groups.csv", index=True, header=["kepid"])

# Unlabeled candidates: one scaled copy and one imputed-only copy.
kepler_candidates_df_scaled.to_csv(f"{candidates_dir}/kepler_candidates_scaled.csv")
kepler_candidates_df.to_csv(f"{candidates_dir}/kepler_candidates.csv")

# Settings needed to reproduce the split and the imputation strategy.
meta = {
    "imputer": {"strategy": imputer.strategy},
    "feat_cols": feat_cols,
    "group_key": "kepid",
    "test_size": 0.2,
    "random_state": 42,
}
# Use a context manager so the file is flushed and closed deterministically;
# passing a bare open(...) to json.dump leaves the handle open.
with open(f"{kepler_dir}/split_impute_meta.json", "w") as meta_file:
    json.dump(meta, meta_file, indent=2)