In [129]:
#Import libraries
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [130]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

# enable_iterative_imputer must be imported before IterativeImputer.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [131]:
#Import tensor flow libraries
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

In [132]:
# Show every column when a frame is displayed; this table is too wide for
# the default truncated view.
pd.set_option('display.max_columns', None)

# Load the raw Kepler dataset and preview the first rows.
kepler_df = pd.read_csv("Resources/kepler_dataset.csv")
kepler_df.head()

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_vet_stat,koi_vet_date,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_disp_prov,koi_comment,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_time0,koi_time0_err1,koi_time0_err2,koi_eccen,koi_eccen_err1,koi_eccen_err2,koi_longp,koi_longp_err1,koi_longp_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,koi_duration_err1,koi_duration_err2,koi_ingress,koi_ingress_err1,koi_ingress_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_ror,koi_ror_err1,koi_ror_err2,koi_srho,koi_srho_err1,koi_srho_err2,koi_fittype,koi_prad,koi_prad_err1,koi_prad_err2,koi_sma,koi_sma_err1,koi_sma_err2,koi_incl,koi_incl_err1,koi_incl_err2,koi_teq,koi_teq_err1,koi_teq_err2,koi_insol,koi_insol_err1,koi_insol_err2,koi_dor,koi_dor_err1,koi_dor_err2,koi_limbdark_mod,koi_ldm_coeff4,koi_ldm_coeff3,koi_ldm_coeff2,koi_ldm_coeff1,koi_parm_prov,koi_max_sngle_ev,koi_max_mult_ev,koi_model_snr,koi_count,koi_num_transits,koi_tce_plnt_num,koi_tce_delivname,koi_quarters,koi_bin_oedp_sig,koi_trans_mod,koi_model_dof,koi_model_chisq,koi_datalink_dvr,koi_datalink_dvs,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_smet,koi_smet_err1,koi_smet_err2,koi_srad,koi_srad_err1,koi_srad_err2,koi_smass,koi_smass_err1,koi_smass_err2,koi_sage,koi_sage_err1,koi_sage_err2,koi_sparprov,ra,dec,koi_kepmag,koi_gmag,koi_rmag,koi_imag,koi_zmag,koi_jmag,koi_hmag,koi_kmag,koi_fwm_stat_sig,koi_fwm_sra,koi_fwm_sra_err,koi_fwm_sdec,koi_fwm_sdec_err,koi_fwm_srao,koi_fwm_srao_err,koi_fwm_sdeco,koi_fwm_sdeco_err,koi_fwm_prao,koi_fwm_prao_err,koi_fwm_pdeco,koi_fwm_pdeco_err,koi_dicco_mra,koi_dicco_mra_err,koi_dicco_mdec,koi_dicco_mdec_err,koi_dicco_msky,koi_dicco_msky_err,koi_dikco_mra,koi_dikco_mra_err,koi_dikco_mdec,koi_dikco_mdec_err,koi_dikco_msky,koi_dikco_msky_err
0,10797460,K00752.01,Kepler-227 b,CONFIRMED,Done,2018-08-16,CANDIDATE,1.0,0,0,0,0,q1_q17_dr25_sup_koi,NO_COMMENT,9.488036,2.78e-05,-2.78e-05,170.53875,0.00216,-0.00216,2455003.539,0.00216,-0.00216,0.0,,,,,,0.146,0.318,-0.146,2.9575,0.0819,-0.0819,,,,616.0,19.5,-19.5,0.022344,0.000832,-0.000528,3.20796,0.33173,-1.09986,LS+MCMC,2.26,0.26,-0.15,0.0853,,,89.66,,,793.0,,,93.59,29.45,-16.65,24.81,2.6,-2.6,Claret (2011 A&A 529 75) ATLAS LS,0.0,0.0,0.2291,0.4603,q1_q17_dr25_koi,5.135849,28.47082,35.8,2,142.0,1.0,q1_q17_dr25_tce,11111111111111111000000000000000,0.6864,Mandel and Agol (2002 ApJ 580 171),,,010/010797/010797460/dv/kplr010797460-20160209...,010/010797/010797460/dv/kplr010797460-001-2016...,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.14,0.15,-0.15,0.927,0.105,-0.061,0.919,0.052,-0.046,,,,q1_q17_dr25_stellar,291.93423,48.141651,15.347,15.89,15.27,15.114,15.006,14.082,13.751,13.648,0.002,19.462294,1.4e-05,48.14191,0.00013,0.43,0.51,0.94,0.48,-0.0002,0.00032,-0.00055,0.00031,-0.01,0.13,0.2,0.16,0.2,0.17,0.08,0.13,0.31,0.17,0.32,0.16
1,10797460,K00752.02,Kepler-227 c,CONFIRMED,Done,2018-08-16,CANDIDATE,0.969,0,0,0,0,q1_q17_dr25_sup_koi,NO_COMMENT,54.418383,0.000248,-0.000248,162.51384,0.00352,-0.00352,2454995.514,0.00352,-0.00352,0.0,,,,,,0.586,0.059,-0.443,4.507,0.116,-0.116,,,,875.0,35.5,-35.5,0.027954,0.00908,-0.00135,3.02368,2.20489,-2.49638,LS+MCMC,2.83,0.32,-0.19,0.2734,,,89.57,,,443.0,,,9.11,2.87,-1.62,77.9,28.4,-28.4,Claret (2011 A&A 529 75) ATLAS LS,0.0,0.0,0.2291,0.4603,q1_q17_dr25_koi,7.027669,20.109507,25.8,2,25.0,2.0,q1_q17_dr25_tce,11111111111111111000000000000000,0.0023,Mandel and Agol (2002 ApJ 580 171),,,010/010797/010797460/dv/kplr010797460-20160209...,010/010797/010797460/dv/kplr010797460-002-2016...,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.14,0.15,-0.15,0.927,0.105,-0.061,0.919,0.052,-0.046,,,,q1_q17_dr25_stellar,291.93423,48.141651,15.347,15.89,15.27,15.114,15.006,14.082,13.751,13.648,0.003,19.462265,2e-05,48.14199,0.00019,-0.63,0.72,1.23,0.68,0.00066,0.00065,-0.00105,0.00063,0.39,0.36,0.0,0.48,0.39,0.36,0.49,0.34,0.12,0.73,0.5,0.45
2,10811496,K00753.01,,CANDIDATE,Done,2018-08-16,CANDIDATE,0.0,0,0,0,0,q1_q17_dr25_sup_koi,DEEP_V_SHAPED,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,2455008.85,0.000581,-0.000581,0.0,,,,,,0.969,5.126,-0.077,1.7822,0.0341,-0.0341,,,,10800.0,171.0,-171.0,0.154046,5.03,-0.0422,7.29555,35.03293,-2.75453,LS+MCMC,14.6,3.92,-1.31,0.1419,,,88.96,,,638.0,,,39.3,31.04,-10.49,53.5,25.7,-25.7,Claret (2011 A&A 529 75) ATLAS LS,0.0,0.0,0.2711,0.3858,q1_q17_dr25_koi,37.159767,187.4491,76.3,1,56.0,1.0,q1_q17_dr25_tce,11111101110111011000000000000000,0.6624,Mandel and Agol (2002 ApJ 580 171),,,010/010811/010811496/dv/kplr010811496-20160209...,010/010811/010811496/dv/kplr010811496-001-2016...,5853.0,158.0,-176.0,4.544,0.044,-0.176,-0.18,0.3,-0.3,0.868,0.233,-0.078,0.961,0.11,-0.121,,,,q1_q17_dr25_stellar,297.00482,48.134129,15.436,15.943,15.39,15.22,15.166,14.254,13.9,13.826,0.278,19.800321,1.9e-06,48.13412,2e-05,-0.021,0.069,-0.038,0.071,0.0007,0.0024,0.0006,0.0034,-0.025,0.07,-0.034,0.07,0.042,0.072,0.002,0.071,-0.027,0.074,0.027,0.074
3,10848459,K00754.01,,FALSE POSITIVE,Done,2018-08-16,FALSE POSITIVE,0.0,0,1,0,0,q1_q17_dr25_sup_koi,MOD_ODDEVEN_DV---MOD_ODDEVEN_ALT---DEEP_V_SHAPED,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,2455003.308,0.000115,-0.000115,0.0,,,,,,1.276,0.115,-0.092,2.40641,0.00537,-0.00537,,,,8080.0,12.8,-12.8,0.387394,0.109,-0.085,0.2208,0.00917,-0.01837,LS+MCMC,33.46,8.5,-2.83,0.0267,,,67.09,,,1395.0,,,891.96,668.95,-230.35,3.278,0.136,-0.136,Claret (2011 A&A 529 75) ATLAS LS,0.0,0.0,0.2865,0.3556,q1_q17_dr25_koi,39.06655,541.8951,505.6,1,621.0,1.0,q1_q17_dr25_tce,11111110111011101000000000000000,0.0,Mandel and Agol (2002 ApJ 580 171),,,010/010848/010848459/dv/kplr010848459-20160209...,010/010848/010848459/dv/kplr010848459-001-2016...,5805.0,157.0,-174.0,4.564,0.053,-0.168,-0.52,0.3,-0.3,0.791,0.201,-0.067,0.836,0.093,-0.077,,,,q1_q17_dr25_stellar,285.53461,48.28521,15.597,16.1,15.554,15.382,15.266,14.326,13.911,13.809,0.0,19.035638,8.6e-07,48.28521,7e-06,-0.111,0.031,0.002,0.027,0.00302,0.00057,-0.00142,0.00081,-0.249,0.072,0.147,0.078,0.289,0.079,-0.257,0.072,0.099,0.077,0.276,0.076
4,10854555,K00755.01,Kepler-664 b,CONFIRMED,Done,2018-08-16,CANDIDATE,1.0,0,0,0,0,q1_q17_dr25_sup_koi,NO_COMMENT,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,2455004.596,0.00113,-0.00113,0.0,,,,,,0.701,0.235,-0.478,1.6545,0.042,-0.042,,,,603.0,16.9,-16.9,0.024064,0.00375,-0.00152,1.98635,2.71141,-1.74541,LS+MCMC,2.75,0.88,-0.35,0.0374,,,85.41,,,1406.0,,,926.16,874.33,-314.24,8.75,4.0,-4.0,Claret (2011 A&A 529 75) ATLAS LS,0.0,0.0,0.2844,0.3661,q1_q17_dr25_koi,4.749945,33.1919,40.9,1,515.0,1.0,q1_q17_dr25_tce,01111111111111111000000000000000,0.309,Mandel and Agol (2002 ApJ 580 171),,,010/010854/010854555/dv/kplr010854555-20160209...,010/010854/010854555/dv/kplr010854555-001-2016...,6031.0,169.0,-211.0,4.438,0.07,-0.21,0.07,0.25,-0.3,1.046,0.334,-0.133,1.095,0.151,-0.136,,,,q1_q17_dr25_stellar,288.75488,48.2262,15.509,16.015,15.468,15.292,15.241,14.366,14.064,13.952,0.733,19.250326,9.7e-06,48.22626,0.0001,-0.01,0.35,0.23,0.37,8e-05,0.0002,-7e-05,0.00022,0.03,0.19,-0.09,0.18,0.1,0.14,0.07,0.18,0.02,0.16,0.07,0.2


In [133]:
# Column definitions for this dataset (NASA Exoplanet Archive, Kepler Objects of Interest table):
# https://exoplanetarchive.ipac.caltech.edu/docs/API_kepcandidate_columns.html

In [134]:
# Inspect the dtype pandas inferred for each column of the raw table.
kepler_df.dtypes

kepid                   int64
kepoi_name             object
kepler_name            object
koi_disposition        object
koi_vet_stat           object
koi_vet_date           object
koi_pdisposition       object
koi_score             float64
koi_fpflag_nt           int64
koi_fpflag_ss           int64
koi_fpflag_co           int64
koi_fpflag_ec           int64
koi_disp_prov          object
koi_comment            object
koi_period            float64
koi_period_err1       float64
koi_period_err2       float64
koi_time0bk           float64
koi_time0bk_err1      float64
koi_time0bk_err2      float64
koi_time0             float64
koi_time0_err1        float64
koi_time0_err2        float64
koi_eccen             float64
koi_eccen_err1        float64
koi_eccen_err2        float64
koi_longp             float64
koi_longp_err1        float64
koi_longp_err2        float64
koi_impact            float64
koi_impact_err1       float64
koi_impact_err2       float64
koi_duration          float64
koi_durati

In [135]:
# Dimensions of the raw table as (rows, columns).
kepler_df.shape

(9564, 140)

In [136]:
# Fraction of missing values per column: the mean of the boolean NaN mask
# equals (number of NaNs) / (number of rows).
kepler_df.isna().mean()

kepid                 0.000000
kepoi_name            0.000000
kepler_name           0.712986
koi_disposition       0.000000
koi_vet_stat          0.000000
koi_vet_date          0.000000
koi_pdisposition      0.000000
koi_score             0.157884
koi_fpflag_nt         0.000000
koi_fpflag_ss         0.000000
koi_fpflag_co         0.000000
koi_fpflag_ec         0.000000
koi_disp_prov         0.000000
koi_comment           0.126412
koi_period            0.000000
koi_period_err1       0.047470
koi_period_err2       0.047470
koi_time0bk           0.000000
koi_time0bk_err1      0.047470
koi_time0bk_err2      0.047470
koi_time0             0.000000
koi_time0_err1        0.047470
koi_time0_err2        0.047470
koi_eccen             0.037955
koi_eccen_err1        1.000000
koi_eccen_err2        1.000000
koi_longp             1.000000
koi_longp_err1        1.000000
koi_longp_err2        1.000000
koi_impact            0.037955
koi_impact_err1       0.047470
koi_impact_err2       0.047470
koi_dura

In [137]:
# Number of rows that contain at least one missing value.
kepler_df.isna().any(axis="columns").sum()

9564

In [138]:
# Boolean Series indexed by column name: True where every value in that
# column is missing.
empty_columns = kepler_df.isna().all(axis="index")
print(empty_columns)

# Names of the columns flagged above, as a plain Python list.
completely_nan_columns = list(empty_columns.index[empty_columns])
print(f'Columns that are completely NaN: {completely_nan_columns}')

kepid                 False
kepoi_name            False
kepler_name           False
koi_disposition       False
koi_vet_stat          False
koi_vet_date          False
koi_pdisposition      False
koi_score             False
koi_fpflag_nt         False
koi_fpflag_ss         False
koi_fpflag_co         False
koi_fpflag_ec         False
koi_disp_prov         False
koi_comment           False
koi_period            False
koi_period_err1       False
koi_period_err2       False
koi_time0bk           False
koi_time0bk_err1      False
koi_time0bk_err2      False
koi_time0             False
koi_time0_err1        False
koi_time0_err2        False
koi_eccen             False
koi_eccen_err1         True
koi_eccen_err2         True
koi_longp              True
koi_longp_err1         True
koi_longp_err2         True
koi_impact            False
koi_impact_err1       False
koi_impact_err2       False
koi_duration          False
koi_duration_err1     False
koi_duration_err2     False
koi_ingress         

In [139]:
# Drop columns that cannot be used as features:
#   * every column whose values are all NaN -- derived from the data instead
#     of a hand-maintained list, so it cannot drift out of sync with the CSV;
#   * 'kepler_name', which is not empty but is missing for ~71% of the rows.
fully_empty_cols = kepler_df.columns[kepler_df.isna().all()].tolist()
kepler_df_cleaned = kepler_df.drop(columns=fully_empty_cols + ['kepler_name'])

# Preview the cleaned frame and confirm how many columns remain.
display(kepler_df_cleaned.head())
print(kepler_df_cleaned.shape)

Unnamed: 0,kepid,kepoi_name,koi_disposition,koi_vet_stat,koi_vet_date,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_disp_prov,koi_comment,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_time0,koi_time0_err1,koi_time0_err2,koi_eccen,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,koi_duration_err1,koi_duration_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_ror,koi_ror_err1,koi_ror_err2,koi_srho,koi_srho_err1,koi_srho_err2,koi_fittype,koi_prad,koi_prad_err1,koi_prad_err2,koi_sma,koi_incl,koi_teq,koi_insol,koi_insol_err1,koi_insol_err2,koi_dor,koi_dor_err1,koi_dor_err2,koi_limbdark_mod,koi_ldm_coeff4,koi_ldm_coeff3,koi_ldm_coeff2,koi_ldm_coeff1,koi_parm_prov,koi_max_sngle_ev,koi_max_mult_ev,koi_model_snr,koi_count,koi_num_transits,koi_tce_plnt_num,koi_tce_delivname,koi_quarters,koi_bin_oedp_sig,koi_trans_mod,koi_datalink_dvr,koi_datalink_dvs,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_smet,koi_smet_err1,koi_smet_err2,koi_srad,koi_srad_err1,koi_srad_err2,koi_smass,koi_smass_err1,koi_smass_err2,koi_sparprov,ra,dec,koi_kepmag,koi_gmag,koi_rmag,koi_imag,koi_zmag,koi_jmag,koi_hmag,koi_kmag,koi_fwm_stat_sig,koi_fwm_sra,koi_fwm_sra_err,koi_fwm_sdec,koi_fwm_sdec_err,koi_fwm_srao,koi_fwm_srao_err,koi_fwm_sdeco,koi_fwm_sdeco_err,koi_fwm_prao,koi_fwm_prao_err,koi_fwm_pdeco,koi_fwm_pdeco_err,koi_dicco_mra,koi_dicco_mra_err,koi_dicco_mdec,koi_dicco_mdec_err,koi_dicco_msky,koi_dicco_msky_err,koi_dikco_mra,koi_dikco_mra_err,koi_dikco_mdec,koi_dikco_mdec_err,koi_dikco_msky,koi_dikco_msky_err
0,10797460,K00752.01,CONFIRMED,Done,2018-08-16,CANDIDATE,1.0,0,0,0,0,q1_q17_dr25_sup_koi,NO_COMMENT,9.488036,2.78e-05,-2.78e-05,170.53875,0.00216,-0.00216,2455003.539,0.00216,-0.00216,0.0,0.146,0.318,-0.146,2.9575,0.0819,-0.0819,616.0,19.5,-19.5,0.022344,0.000832,-0.000528,3.20796,0.33173,-1.09986,LS+MCMC,2.26,0.26,-0.15,0.0853,89.66,793.0,93.59,29.45,-16.65,24.81,2.6,-2.6,Claret (2011 A&A 529 75) ATLAS LS,0.0,0.0,0.2291,0.4603,q1_q17_dr25_koi,5.135849,28.47082,35.8,2,142.0,1.0,q1_q17_dr25_tce,11111111111111111000000000000000,0.6864,Mandel and Agol (2002 ApJ 580 171),010/010797/010797460/dv/kplr010797460-20160209...,010/010797/010797460/dv/kplr010797460-001-2016...,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.14,0.15,-0.15,0.927,0.105,-0.061,0.919,0.052,-0.046,q1_q17_dr25_stellar,291.93423,48.141651,15.347,15.89,15.27,15.114,15.006,14.082,13.751,13.648,0.002,19.462294,1.4e-05,48.14191,0.00013,0.43,0.51,0.94,0.48,-0.0002,0.00032,-0.00055,0.00031,-0.01,0.13,0.2,0.16,0.2,0.17,0.08,0.13,0.31,0.17,0.32,0.16
1,10797460,K00752.02,CONFIRMED,Done,2018-08-16,CANDIDATE,0.969,0,0,0,0,q1_q17_dr25_sup_koi,NO_COMMENT,54.418383,0.000248,-0.000248,162.51384,0.00352,-0.00352,2454995.514,0.00352,-0.00352,0.0,0.586,0.059,-0.443,4.507,0.116,-0.116,875.0,35.5,-35.5,0.027954,0.00908,-0.00135,3.02368,2.20489,-2.49638,LS+MCMC,2.83,0.32,-0.19,0.2734,89.57,443.0,9.11,2.87,-1.62,77.9,28.4,-28.4,Claret (2011 A&A 529 75) ATLAS LS,0.0,0.0,0.2291,0.4603,q1_q17_dr25_koi,7.027669,20.109507,25.8,2,25.0,2.0,q1_q17_dr25_tce,11111111111111111000000000000000,0.0023,Mandel and Agol (2002 ApJ 580 171),010/010797/010797460/dv/kplr010797460-20160209...,010/010797/010797460/dv/kplr010797460-002-2016...,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.14,0.15,-0.15,0.927,0.105,-0.061,0.919,0.052,-0.046,q1_q17_dr25_stellar,291.93423,48.141651,15.347,15.89,15.27,15.114,15.006,14.082,13.751,13.648,0.003,19.462265,2e-05,48.14199,0.00019,-0.63,0.72,1.23,0.68,0.00066,0.00065,-0.00105,0.00063,0.39,0.36,0.0,0.48,0.39,0.36,0.49,0.34,0.12,0.73,0.5,0.45
2,10811496,K00753.01,CANDIDATE,Done,2018-08-16,CANDIDATE,0.0,0,0,0,0,q1_q17_dr25_sup_koi,DEEP_V_SHAPED,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,2455008.85,0.000581,-0.000581,0.0,0.969,5.126,-0.077,1.7822,0.0341,-0.0341,10800.0,171.0,-171.0,0.154046,5.03,-0.0422,7.29555,35.03293,-2.75453,LS+MCMC,14.6,3.92,-1.31,0.1419,88.96,638.0,39.3,31.04,-10.49,53.5,25.7,-25.7,Claret (2011 A&A 529 75) ATLAS LS,0.0,0.0,0.2711,0.3858,q1_q17_dr25_koi,37.159767,187.4491,76.3,1,56.0,1.0,q1_q17_dr25_tce,11111101110111011000000000000000,0.6624,Mandel and Agol (2002 ApJ 580 171),010/010811/010811496/dv/kplr010811496-20160209...,010/010811/010811496/dv/kplr010811496-001-2016...,5853.0,158.0,-176.0,4.544,0.044,-0.176,-0.18,0.3,-0.3,0.868,0.233,-0.078,0.961,0.11,-0.121,q1_q17_dr25_stellar,297.00482,48.134129,15.436,15.943,15.39,15.22,15.166,14.254,13.9,13.826,0.278,19.800321,1.9e-06,48.13412,2e-05,-0.021,0.069,-0.038,0.071,0.0007,0.0024,0.0006,0.0034,-0.025,0.07,-0.034,0.07,0.042,0.072,0.002,0.071,-0.027,0.074,0.027,0.074
3,10848459,K00754.01,FALSE POSITIVE,Done,2018-08-16,FALSE POSITIVE,0.0,0,1,0,0,q1_q17_dr25_sup_koi,MOD_ODDEVEN_DV---MOD_ODDEVEN_ALT---DEEP_V_SHAPED,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,2455003.308,0.000115,-0.000115,0.0,1.276,0.115,-0.092,2.40641,0.00537,-0.00537,8080.0,12.8,-12.8,0.387394,0.109,-0.085,0.2208,0.00917,-0.01837,LS+MCMC,33.46,8.5,-2.83,0.0267,67.09,1395.0,891.96,668.95,-230.35,3.278,0.136,-0.136,Claret (2011 A&A 529 75) ATLAS LS,0.0,0.0,0.2865,0.3556,q1_q17_dr25_koi,39.06655,541.8951,505.6,1,621.0,1.0,q1_q17_dr25_tce,11111110111011101000000000000000,0.0,Mandel and Agol (2002 ApJ 580 171),010/010848/010848459/dv/kplr010848459-20160209...,010/010848/010848459/dv/kplr010848459-001-2016...,5805.0,157.0,-174.0,4.564,0.053,-0.168,-0.52,0.3,-0.3,0.791,0.201,-0.067,0.836,0.093,-0.077,q1_q17_dr25_stellar,285.53461,48.28521,15.597,16.1,15.554,15.382,15.266,14.326,13.911,13.809,0.0,19.035638,8.6e-07,48.28521,7e-06,-0.111,0.031,0.002,0.027,0.00302,0.00057,-0.00142,0.00081,-0.249,0.072,0.147,0.078,0.289,0.079,-0.257,0.072,0.099,0.077,0.276,0.076
4,10854555,K00755.01,CONFIRMED,Done,2018-08-16,CANDIDATE,1.0,0,0,0,0,q1_q17_dr25_sup_koi,NO_COMMENT,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,2455004.596,0.00113,-0.00113,0.0,0.701,0.235,-0.478,1.6545,0.042,-0.042,603.0,16.9,-16.9,0.024064,0.00375,-0.00152,1.98635,2.71141,-1.74541,LS+MCMC,2.75,0.88,-0.35,0.0374,85.41,1406.0,926.16,874.33,-314.24,8.75,4.0,-4.0,Claret (2011 A&A 529 75) ATLAS LS,0.0,0.0,0.2844,0.3661,q1_q17_dr25_koi,4.749945,33.1919,40.9,1,515.0,1.0,q1_q17_dr25_tce,01111111111111111000000000000000,0.309,Mandel and Agol (2002 ApJ 580 171),010/010854/010854555/dv/kplr010854555-20160209...,010/010854/010854555/dv/kplr010854555-001-2016...,6031.0,169.0,-211.0,4.438,0.07,-0.21,0.07,0.25,-0.3,1.046,0.334,-0.133,1.095,0.151,-0.136,q1_q17_dr25_stellar,288.75488,48.2262,15.509,16.015,15.468,15.292,15.241,14.366,14.064,13.952,0.733,19.250326,9.7e-06,48.22626,0.0001,-0.01,0.35,0.23,0.37,8e-05,0.0002,-7e-05,0.00022,0.03,0.19,-0.09,0.18,0.1,0.14,0.07,0.18,0.02,0.16,0.07,0.2


(9564, 120)


In [140]:
# Per-column fraction of missing values in the cleaned frame, worst first.
# The ratio is taken over kepler_df_cleaned itself (mean of the NaN mask), so
# it stays correct even if rows are ever removed from the cleaned frame.
print(kepler_df_cleaned.isna().mean().sort_values(ascending=False))

# Number of rows that still contain at least one missing value.
print(kepler_df_cleaned.isnull().any(axis=1).sum())

koi_bin_oedp_sig      0.157884
koi_score             0.157884
koi_comment           0.126412
koi_max_sngle_ev      0.119406
koi_max_mult_ev       0.119406
koi_num_transits      0.119406
koi_quarters          0.119406
koi_fwm_stat_sig      0.112505
koi_fwm_prao_err      0.086784
koi_fwm_prao          0.086784
koi_fwm_pdeco_err     0.085425
koi_fwm_pdeco         0.085425
koi_zmag              0.064095
koi_dicco_mdec_err    0.062631
koi_dicco_mdec        0.062631
koi_dicco_mra_err     0.062631
koi_dicco_msky_err    0.062631
koi_dicco_mra         0.062631
koi_dicco_msky        0.062631
koi_dikco_msky_err    0.059598
koi_dikco_mra         0.059598
koi_dikco_mra_err     0.059598
koi_dikco_mdec        0.059598
koi_dikco_mdec_err    0.059598
koi_dikco_msky        0.059598
koi_fwm_sra           0.052907
koi_fwm_sra_err       0.052907
koi_fwm_sdec          0.052907
koi_fwm_sdec_err      0.052907
koi_steff_err2        0.050502
koi_slogg_err1        0.048934
koi_slogg_err2        0.048934
koi_srad

In [141]:
# Count the missing comments before they are filled in.
print(f"before: {kepler_df_cleaned['koi_comment'].isna().sum()}")

before: 1209


In [142]:
# A missing comment is treated as "no comment": fill only the koi_comment
# column with the 'NO_COMMENT' marker and leave every other column untouched.
kepler_df_cleaned = kepler_df_cleaned.fillna({'koi_comment': 'NO_COMMENT'})

# Verify that no missing comments remain.
print(f"after: {kepler_df_cleaned['koi_comment'].isna().sum()}")

after: 0


In [143]:
# Number of rows per koi_disposition value (class balance of the label).
kepler_df_cleaned['koi_disposition'].value_counts()

koi_disposition
FALSE POSITIVE    4839
CONFIRMED         2743
CANDIDATE         1982
Name: count, dtype: int64

# split the data

In [144]:
# Target vector: the koi_disposition label of each row.
y = kepler_df_cleaned['koi_disposition']


In [145]:
# Columns kept out of the feature matrix: the target itself, columns that
# might cause data leakage, and the remaining identifier/date columns.
# NOTE(review): koi_score, koi_fpflag_* and koi_comment also look like vetting
# outputs -- confirm against the column documentation whether they leak the
# disposition as well.
non_feature_cols = [
    'koi_disposition',
    'koi_pdisposition',
    'kepid',
    'kepoi_name',
    'koi_vet_date',
]
X = kepler_df_cleaned.drop(columns=non_feature_cols)
X.head(3)

Unnamed: 0,koi_vet_stat,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_disp_prov,koi_comment,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_time0,koi_time0_err1,koi_time0_err2,koi_eccen,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,koi_duration_err1,koi_duration_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_ror,koi_ror_err1,koi_ror_err2,koi_srho,koi_srho_err1,koi_srho_err2,koi_fittype,koi_prad,koi_prad_err1,koi_prad_err2,koi_sma,koi_incl,koi_teq,koi_insol,koi_insol_err1,koi_insol_err2,koi_dor,koi_dor_err1,koi_dor_err2,koi_limbdark_mod,koi_ldm_coeff4,koi_ldm_coeff3,koi_ldm_coeff2,koi_ldm_coeff1,koi_parm_prov,koi_max_sngle_ev,koi_max_mult_ev,koi_model_snr,koi_count,koi_num_transits,koi_tce_plnt_num,koi_tce_delivname,koi_quarters,koi_bin_oedp_sig,koi_trans_mod,koi_datalink_dvr,koi_datalink_dvs,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_smet,koi_smet_err1,koi_smet_err2,koi_srad,koi_srad_err1,koi_srad_err2,koi_smass,koi_smass_err1,koi_smass_err2,koi_sparprov,ra,dec,koi_kepmag,koi_gmag,koi_rmag,koi_imag,koi_zmag,koi_jmag,koi_hmag,koi_kmag,koi_fwm_stat_sig,koi_fwm_sra,koi_fwm_sra_err,koi_fwm_sdec,koi_fwm_sdec_err,koi_fwm_srao,koi_fwm_srao_err,koi_fwm_sdeco,koi_fwm_sdeco_err,koi_fwm_prao,koi_fwm_prao_err,koi_fwm_pdeco,koi_fwm_pdeco_err,koi_dicco_mra,koi_dicco_mra_err,koi_dicco_mdec,koi_dicco_mdec_err,koi_dicco_msky,koi_dicco_msky_err,koi_dikco_mra,koi_dikco_mra_err,koi_dikco_mdec,koi_dikco_mdec_err,koi_dikco_msky,koi_dikco_msky_err
0,Done,1.0,0,0,0,0,q1_q17_dr25_sup_koi,NO_COMMENT,9.488036,2.8e-05,-2.8e-05,170.53875,0.00216,-0.00216,2455003.539,0.00216,-0.00216,0.0,0.146,0.318,-0.146,2.9575,0.0819,-0.0819,616.0,19.5,-19.5,0.022344,0.000832,-0.000528,3.20796,0.33173,-1.09986,LS+MCMC,2.26,0.26,-0.15,0.0853,89.66,793.0,93.59,29.45,-16.65,24.81,2.6,-2.6,Claret (2011 A&A 529 75) ATLAS LS,0.0,0.0,0.2291,0.4603,q1_q17_dr25_koi,5.135849,28.47082,35.8,2,142.0,1.0,q1_q17_dr25_tce,11111111111111111000000000000000,0.6864,Mandel and Agol (2002 ApJ 580 171),010/010797/010797460/dv/kplr010797460-20160209...,010/010797/010797460/dv/kplr010797460-001-2016...,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.14,0.15,-0.15,0.927,0.105,-0.061,0.919,0.052,-0.046,q1_q17_dr25_stellar,291.93423,48.141651,15.347,15.89,15.27,15.114,15.006,14.082,13.751,13.648,0.002,19.462294,1.4e-05,48.14191,0.00013,0.43,0.51,0.94,0.48,-0.0002,0.00032,-0.00055,0.00031,-0.01,0.13,0.2,0.16,0.2,0.17,0.08,0.13,0.31,0.17,0.32,0.16
1,Done,0.969,0,0,0,0,q1_q17_dr25_sup_koi,NO_COMMENT,54.418383,0.000248,-0.000248,162.51384,0.00352,-0.00352,2454995.514,0.00352,-0.00352,0.0,0.586,0.059,-0.443,4.507,0.116,-0.116,875.0,35.5,-35.5,0.027954,0.00908,-0.00135,3.02368,2.20489,-2.49638,LS+MCMC,2.83,0.32,-0.19,0.2734,89.57,443.0,9.11,2.87,-1.62,77.9,28.4,-28.4,Claret (2011 A&A 529 75) ATLAS LS,0.0,0.0,0.2291,0.4603,q1_q17_dr25_koi,7.027669,20.109507,25.8,2,25.0,2.0,q1_q17_dr25_tce,11111111111111111000000000000000,0.0023,Mandel and Agol (2002 ApJ 580 171),010/010797/010797460/dv/kplr010797460-20160209...,010/010797/010797460/dv/kplr010797460-002-2016...,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.14,0.15,-0.15,0.927,0.105,-0.061,0.919,0.052,-0.046,q1_q17_dr25_stellar,291.93423,48.141651,15.347,15.89,15.27,15.114,15.006,14.082,13.751,13.648,0.003,19.462265,2e-05,48.14199,0.00019,-0.63,0.72,1.23,0.68,0.00066,0.00065,-0.00105,0.00063,0.39,0.36,0.0,0.48,0.39,0.36,0.49,0.34,0.12,0.73,0.5,0.45
2,Done,0.0,0,0,0,0,q1_q17_dr25_sup_koi,DEEP_V_SHAPED,19.89914,1.5e-05,-1.5e-05,175.850252,0.000581,-0.000581,2455008.85,0.000581,-0.000581,0.0,0.969,5.126,-0.077,1.7822,0.0341,-0.0341,10800.0,171.0,-171.0,0.154046,5.03,-0.0422,7.29555,35.03293,-2.75453,LS+MCMC,14.6,3.92,-1.31,0.1419,88.96,638.0,39.3,31.04,-10.49,53.5,25.7,-25.7,Claret (2011 A&A 529 75) ATLAS LS,0.0,0.0,0.2711,0.3858,q1_q17_dr25_koi,37.159767,187.4491,76.3,1,56.0,1.0,q1_q17_dr25_tce,11111101110111011000000000000000,0.6624,Mandel and Agol (2002 ApJ 580 171),010/010811/010811496/dv/kplr010811496-20160209...,010/010811/010811496/dv/kplr010811496-001-2016...,5853.0,158.0,-176.0,4.544,0.044,-0.176,-0.18,0.3,-0.3,0.868,0.233,-0.078,0.961,0.11,-0.121,q1_q17_dr25_stellar,297.00482,48.134129,15.436,15.943,15.39,15.22,15.166,14.254,13.9,13.826,0.278,19.800321,2e-06,48.13412,2e-05,-0.021,0.069,-0.038,0.071,0.0007,0.0024,0.0006,0.0034,-0.025,0.07,-0.034,0.07,0.042,0.072,0.002,0.071,-0.027,0.074,0.027,0.074


In [146]:
# Frequency of each distinct comment string in the feature matrix.
X['koi_comment'].value_counts()

koi_comment
NO_COMMENT                                                                                                                                                                                                            4318
CENT_KIC_POS                                                                                                                                                                                                           514
CENT_RESOLVED_OFFSET                                                                                                                                                                                                   173
MOD_SEC_DV---MOD_SEC_ALT---HAS_SEC_TCE                                                                                                                                                                                 165
CENT_FEW_DIFFS                                                                                                  

In [147]:
# Turn the koi_comment strings into numeric features with TF-IDF.
# (TfidfVectorizer and pandas are imported in the import cells at the top.)
# NOTE(review): the vectorizer is fitted on every row of X; if a train/test
# split follows, consider fitting on the training rows only -- confirm against
# the later cells.
comments = X['koi_comment']

vectorizer = TfidfVectorizer()
comments_tfidf = vectorizer.fit_transform(comments)

# Densify into a DataFrame with one column per vocabulary term. Reusing X's
# row index (instead of a fresh 0..n-1 index) keeps the TF-IDF rows aligned
# with X if the two are joined after X has been filtered or split.
comments_df = pd.DataFrame(
    comments_tfidf.toarray(),
    index=comments.index,
    columns=vectorizer.get_feature_names_out(),
)
comments_df.head()


Unnamed: 0,01,all_trans_chases,alt_robo_odd_even_test_fail,alt_sec_same_depth_as_pri_could_be_twice_true_period,alt_sig_pri_minus_sig_pos_too_low,alt_sig_pri_minus_sig_ter_too_low,alt_sig_pri_over_fred_too_low,cent_crowded,cent_few_diffs,cent_few_meas,cent_kic_pos,cent_nofits,cent_resolved_offset,cent_saturated,cent_uncertain,cent_unresolved_offset,centroid_signif_uncertain,clear_apo,crowded_diff,cyg,deep_v_shaped,depth_oddeven_alt,depth_oddeven_dv,dv_sig_pri_minus_sig_pos_too_low,dv_sig_pri_over_fred_too_low,ephem_match,eyeball,fit_failed,halo_ghost,has_sec_tce,inconsistent_trans,indiv_trans_chases,indiv_trans_chases_marshall,indiv_trans_chases_marshall_skye,indiv_trans_chases_marshall_zuma,indiv_trans_chases_skye,indiv_trans_marshall,indiv_trans_marshall_skye,indiv_trans_marshall_zuma,indiv_trans_rubble,indiv_trans_rubble_marshall_skye,indiv_trans_rubble_skye,indiv_trans_rubble_skye_zuma,indiv_trans_rubble_skye_zuma_tracker,indiv_trans_skye,indiv_trans_skye_zuma,indiv_trans_skye_zuma_tracker,indiv_trans_zuma,invert_diff,is_sec_tce,kic_offset,lpp_alt,lpp_alt_too_high,lpp_dv,lpp_dv_too_high,lyr,marshall_fail,mod_nonuniq_alt,mod_nonuniq_dv,mod_oddeven_alt,mod_oddeven_dv,mod_pos_alt,mod_pos_dv,mod_sec_alt,mod_sec_dv,mod_ter_alt,mod_ter_dv,no_comment,other_tce_at_same_period_diff_epoch,parent_is_002305372,parent_is_002449084,parent_is_003352751,parent_is_003858884,parent_is_004482641,parent_is_005024292,parent_is_005036538,parent_is_005343976,parent_is_005471619,parent_is_005513861,parent_is_006367628,parent_is_007258889,parent_is_007598128,parent_is_008265951,parent_is_008380743,parent_is_009541127,parent_is_009777062,parent_is_010485137,parent_is_010858720,parent_is_012004679,parent_is_3597,parent_is_3895,parent_is_4673,parent_is_489,parent_is_5335,parent_is_970,parent_is_fl,parent_is_rr,parent_is_uz,parent_is_v2277,parent_is_v380,parent_is_v850,period_alias_alt,period_alias_dv,period_alias_in_alt_data_seen_at_3,period_alias_in_dv_data_seen_at_3,planet_in_star,planet
_occult_alt,planet_occult_dv,planet_period_is_half_alt,planet_period_is_half_dv,pri,resid_of_prev_tce,residual_tce,same_ntl_period,same_p_as_prev_ntl_tce,saturated,seasonal_depth_alt,seasonal_depth_diffs_in_alt,seasonal_depth_dv,sec,sig_sec_in_alt_model_shift,sig_sec_in_dv_model_shift,signif_offset,sweet_eb,sweet_ntl,too_few_centroids,too_few_quarters,trans_gapped,transits_not_consistent
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.504692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.595746,0.624797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [148]:
# Sum the TF-IDF values for each comment to get a single value per comment.
# Any 'aggregated_tfidf' column left over from an earlier run of this cell is
# excluded first, so re-running the cell does not fold the old total into the
# new one (errors='ignore' makes the drop a no-op on the first run).
term_weights = comments_df.drop(columns=['aggregated_tfidf'], errors='ignore')
comments_df['aggregated_tfidf'] = term_weights.sum(axis=1)

# Add aggregated_tfidf back into X as the numeric stand-in for koi_comment.
# comments_df was built row-by-row from X['koi_comment'], so assign by position:
# assigning the Series directly would align on index labels and silently
# produce NaN wherever the two indexes do not match.
assert len(comments_df) == len(X), "comments_df and X must have the same number of rows"
X['koi_comment_vectorized'] = comments_df['aggregated_tfidf'].to_numpy()

# Verify the result
X[['koi_comment_vectorized', 'koi_comment']].head(10)


Unnamed: 0,koi_comment_vectorized,koi_comment
0,1.0,NO_COMMENT
1,1.0,NO_COMMENT
2,1.0,DEEP_V_SHAPED
3,1.725236,MOD_ODDEVEN_DV---MOD_ODDEVEN_ALT---DEEP_V_SHAPED
4,1.0,NO_COMMENT
5,1.0,NO_COMMENT
6,1.0,NO_COMMENT
7,1.0,NO_COMMENT
8,1.998752,MOD_SEC_DV---MOD_SEC_ALT---HAS_SEC_TCE---CENT_...
9,1.0,NO_COMMENT


In [149]:
# Drop koi_comment now that it has been replaced by koi_comment_vectorized.
# errors='ignore' keeps this cell safe to re-run: X is overwritten here, so on a
# second execution the column is already gone and a plain drop would raise KeyError.
X = X.drop(columns=['koi_comment'], errors='ignore')
X.head(3)

Unnamed: 0,koi_vet_stat,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_disp_prov,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_time0,koi_time0_err1,koi_time0_err2,koi_eccen,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,koi_duration_err1,koi_duration_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_ror,koi_ror_err1,koi_ror_err2,koi_srho,koi_srho_err1,koi_srho_err2,koi_fittype,koi_prad,koi_prad_err1,koi_prad_err2,koi_sma,koi_incl,koi_teq,koi_insol,koi_insol_err1,koi_insol_err2,koi_dor,koi_dor_err1,koi_dor_err2,koi_limbdark_mod,koi_ldm_coeff4,koi_ldm_coeff3,koi_ldm_coeff2,koi_ldm_coeff1,koi_parm_prov,koi_max_sngle_ev,koi_max_mult_ev,koi_model_snr,koi_count,koi_num_transits,koi_tce_plnt_num,koi_tce_delivname,koi_quarters,koi_bin_oedp_sig,koi_trans_mod,koi_datalink_dvr,koi_datalink_dvs,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_smet,koi_smet_err1,koi_smet_err2,koi_srad,koi_srad_err1,koi_srad_err2,koi_smass,koi_smass_err1,koi_smass_err2,koi_sparprov,ra,dec,koi_kepmag,koi_gmag,koi_rmag,koi_imag,koi_zmag,koi_jmag,koi_hmag,koi_kmag,koi_fwm_stat_sig,koi_fwm_sra,koi_fwm_sra_err,koi_fwm_sdec,koi_fwm_sdec_err,koi_fwm_srao,koi_fwm_srao_err,koi_fwm_sdeco,koi_fwm_sdeco_err,koi_fwm_prao,koi_fwm_prao_err,koi_fwm_pdeco,koi_fwm_pdeco_err,koi_dicco_mra,koi_dicco_mra_err,koi_dicco_mdec,koi_dicco_mdec_err,koi_dicco_msky,koi_dicco_msky_err,koi_dikco_mra,koi_dikco_mra_err,koi_dikco_mdec,koi_dikco_mdec_err,koi_dikco_msky,koi_dikco_msky_err,koi_comment_vectorized
0,Done,1.0,0,0,0,0,q1_q17_dr25_sup_koi,9.488036,2.8e-05,-2.8e-05,170.53875,0.00216,-0.00216,2455003.539,0.00216,-0.00216,0.0,0.146,0.318,-0.146,2.9575,0.0819,-0.0819,616.0,19.5,-19.5,0.022344,0.000832,-0.000528,3.20796,0.33173,-1.09986,LS+MCMC,2.26,0.26,-0.15,0.0853,89.66,793.0,93.59,29.45,-16.65,24.81,2.6,-2.6,Claret (2011 A&A 529 75) ATLAS LS,0.0,0.0,0.2291,0.4603,q1_q17_dr25_koi,5.135849,28.47082,35.8,2,142.0,1.0,q1_q17_dr25_tce,11111111111111111000000000000000,0.6864,Mandel and Agol (2002 ApJ 580 171),010/010797/010797460/dv/kplr010797460-20160209...,010/010797/010797460/dv/kplr010797460-001-2016...,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.14,0.15,-0.15,0.927,0.105,-0.061,0.919,0.052,-0.046,q1_q17_dr25_stellar,291.93423,48.141651,15.347,15.89,15.27,15.114,15.006,14.082,13.751,13.648,0.002,19.462294,1.4e-05,48.14191,0.00013,0.43,0.51,0.94,0.48,-0.0002,0.00032,-0.00055,0.00031,-0.01,0.13,0.2,0.16,0.2,0.17,0.08,0.13,0.31,0.17,0.32,0.16,1.0
1,Done,0.969,0,0,0,0,q1_q17_dr25_sup_koi,54.418383,0.000248,-0.000248,162.51384,0.00352,-0.00352,2454995.514,0.00352,-0.00352,0.0,0.586,0.059,-0.443,4.507,0.116,-0.116,875.0,35.5,-35.5,0.027954,0.00908,-0.00135,3.02368,2.20489,-2.49638,LS+MCMC,2.83,0.32,-0.19,0.2734,89.57,443.0,9.11,2.87,-1.62,77.9,28.4,-28.4,Claret (2011 A&A 529 75) ATLAS LS,0.0,0.0,0.2291,0.4603,q1_q17_dr25_koi,7.027669,20.109507,25.8,2,25.0,2.0,q1_q17_dr25_tce,11111111111111111000000000000000,0.0023,Mandel and Agol (2002 ApJ 580 171),010/010797/010797460/dv/kplr010797460-20160209...,010/010797/010797460/dv/kplr010797460-002-2016...,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.14,0.15,-0.15,0.927,0.105,-0.061,0.919,0.052,-0.046,q1_q17_dr25_stellar,291.93423,48.141651,15.347,15.89,15.27,15.114,15.006,14.082,13.751,13.648,0.003,19.462265,2e-05,48.14199,0.00019,-0.63,0.72,1.23,0.68,0.00066,0.00065,-0.00105,0.00063,0.39,0.36,0.0,0.48,0.39,0.36,0.49,0.34,0.12,0.73,0.5,0.45,1.0
2,Done,0.0,0,0,0,0,q1_q17_dr25_sup_koi,19.89914,1.5e-05,-1.5e-05,175.850252,0.000581,-0.000581,2455008.85,0.000581,-0.000581,0.0,0.969,5.126,-0.077,1.7822,0.0341,-0.0341,10800.0,171.0,-171.0,0.154046,5.03,-0.0422,7.29555,35.03293,-2.75453,LS+MCMC,14.6,3.92,-1.31,0.1419,88.96,638.0,39.3,31.04,-10.49,53.5,25.7,-25.7,Claret (2011 A&A 529 75) ATLAS LS,0.0,0.0,0.2711,0.3858,q1_q17_dr25_koi,37.159767,187.4491,76.3,1,56.0,1.0,q1_q17_dr25_tce,11111101110111011000000000000000,0.6624,Mandel and Agol (2002 ApJ 580 171),010/010811/010811496/dv/kplr010811496-20160209...,010/010811/010811496/dv/kplr010811496-001-2016...,5853.0,158.0,-176.0,4.544,0.044,-0.176,-0.18,0.3,-0.3,0.868,0.233,-0.078,0.961,0.11,-0.121,q1_q17_dr25_stellar,297.00482,48.134129,15.436,15.943,15.39,15.22,15.166,14.254,13.9,13.826,0.278,19.800321,2e-06,48.13412,2e-05,-0.021,0.069,-0.038,0.071,0.0007,0.0024,0.0006,0.0034,-0.025,0.07,-0.034,0.07,0.042,0.072,0.002,0.071,-0.027,0.074,0.027,0.074,1.0


In [150]:
# Show the dtype of every column currently in X
X.dtypes

koi_vet_stat               object
koi_score                 float64
koi_fpflag_nt               int64
koi_fpflag_ss               int64
koi_fpflag_co               int64
koi_fpflag_ec               int64
koi_disp_prov              object
koi_period                float64
koi_period_err1           float64
koi_period_err2           float64
koi_time0bk               float64
koi_time0bk_err1          float64
koi_time0bk_err2          float64
koi_time0                 float64
koi_time0_err1            float64
koi_time0_err2            float64
koi_eccen                 float64
koi_impact                float64
koi_impact_err1           float64
koi_impact_err2           float64
koi_duration              float64
koi_duration_err1         float64
koi_duration_err2         float64
koi_depth                 float64
koi_depth_err1            float64
koi_depth_err2            float64
koi_ror                   float64
koi_ror_err1              float64
koi_ror_err2              float64
koi_srho      