In [109]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_rows', 500)

k2_df = pd.read_csv('k2.csv')
kep_df = pd.read_csv('kep.csv')

In [110]:
print(k2_df.shape)
print(k2_df['disposition'].value_counts())
k2_df.head(3)

(3839, 95)
disposition
CONFIRMED         2156
CANDIDATE         1372
FALSE POSITIVE     293
REFUTED             18
Name: count, dtype: int64


Unnamed: 0,loc_rowid,pl_name,hostname,default_flag,disposition,disp_refname,sy_snum,sy_pnum,discoverymethod,disc_year,...,sy_vmagerr2,sy_kmag,sy_kmagerr1,sy_kmagerr2,sy_gaiamag,sy_gaiamagerr1,sy_gaiamagerr2,rowupdate,pl_pubdate,releasedate
0,1,BD+20 594 b,BD+20 594,1,CONFIRMED,Espinoza et al. 2016,1,1,Transit,2016,...,-0.012,9.368,0.018,-0.018,10.8644,0.000249,-0.000249,2018-04-25,2017-03,2018-04-26
1,2,BD+20 594 b,BD+20 594,0,CONFIRMED,Espinoza et al. 2016,1,1,Transit,2016,...,-0.012,9.368,0.018,-0.018,10.8644,0.000249,-0.000249,2018-04-25,2016-10,2016-07-28
2,3,BD+20 594 b,BD+20 594,0,CONFIRMED,Espinoza et al. 2016,1,1,Transit,2016,...,-0.012,9.368,0.018,-0.018,10.8644,0.000249,-0.000249,2018-04-25,2018-03,2018-02-15


In [111]:
print(kep_df.shape)
print(kep_df['koi_disposition'].value_counts())
kep_df.head(3)

(8054, 50)
koi_disposition
FALSE POSITIVE    3966
CONFIRMED         2726
CANDIDATE         1362
Name: count, dtype: int64


Unnamed: 0,loc_rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,11446443,K00001.01,Kepler-1 b,CONFIRMED,CANDIDATE,0.811,0,0,0,...,-78.0,4.457,0.024,-0.024,0.964,0.038,-0.038,286.80847,49.316399,11.338
1,2,10666592,K00002.01,Kepler-2 b,CONFIRMED,CANDIDATE,1.0,0,1,0,...,-89.0,4.019,0.033,-0.027,1.952,0.099,-0.11,292.24728,47.969521,10.463
2,3,10748390,K00003.01,Kepler-3 b,CONFIRMED,CANDIDATE,0.913,0,0,0,...,-95.0,4.591,0.015,-0.036,0.763,0.028,-0.028,297.70935,48.080853,9.174


In [112]:
column_mapping = {
    #'pl_name': 'kepler_name', # probably not useful
    'disposition': 'koi_disposition', # target, status of the object
    'pl_orbper': 'koi_period', # orbital period
    'pl_rade': 'koi_prad', # planet radius
    'st_rad': 'koi_srad', # stellar radius
    'pl_insol': 'koi_insol', # insolation flux
    'pl_eqt': 'koi_teq', # equilibrium temperature
    'st_teff': 'koi_steff', # stellar effective temperature
    'st_logg': 'koi_slogg', # stellar surface gravity
}

new_column_names = {
    #'pl_name': 'planet_name',
    'disposition': 'status',
    'pl_orbper': 'orbital_period',
    'pl_rade': 'planet_radius',
    'st_rad': 'stellar_radius',
    'pl_insol': 'insolation_flux',
    'pl_eqt': 'equilibrium_temperature',
    'st_teff': 'stellar_effective_temperature',
    'st_logg': 'stellar_surface_gravity'
}

In [113]:
# columns from k2_df
k2_columns = list(column_mapping.keys())
k2_df_selected = k2_df[k2_columns]
print(k2_df_selected.shape)
print(k2_df_selected.head(3))

# columns from kep_df
kep_columns = list(column_mapping.values())
kep_df_selected = kep_df[kep_columns]
print(kep_df_selected.shape)
print(kep_df_selected.head(3))

kep_df_selected.columns = k2_df_selected.columns

combined_df = pd.concat([k2_df_selected, kep_df_selected], axis=0, ignore_index=True)
combined_df = combined_df.rename(columns=new_column_names)
print(combined_df.shape)
print(combined_df.head(3))

(3839, 8)
  disposition  pl_orbper  pl_rade  st_rad  pl_insol  pl_eqt  st_teff  st_logg
0   CONFIRMED  41.685500    2.578    1.08       NaN     NaN   5766.0     4.50
1   CONFIRMED  41.685500    2.230    0.93       NaN   546.0   5766.0     4.50
2   CONFIRMED  41.688644    2.355    0.96       NaN     NaN   5703.0     4.38
(8054, 8)
  koi_disposition  koi_period  koi_prad  koi_srad  koi_insol  koi_teq  \
0       CONFIRMED    2.470613     13.04     0.964     761.46   1339.0   
1       CONFIRMED    2.204735     16.10     1.952    4148.92   2048.0   
2       CONFIRMED    4.887803      4.82     0.763      96.67    800.0   

   koi_steff  koi_slogg  
0     5820.0      4.457  
1     6440.0      4.019  
2     4778.0      4.591  
(11893, 8)
      status  orbital_period  planet_radius  stellar_radius  insolation_flux  \
0  CONFIRMED       41.685500          2.578            1.08              NaN   
1  CONFIRMED       41.685500          2.230            0.93              NaN   
2  CONFIRMED       4

todo

In [114]:
disposition_counts = combined_df['status'].value_counts()
print(disposition_counts)

status
CONFIRMED         4882
FALSE POSITIVE    4259
CANDIDATE         2734
REFUTED             18
Name: count, dtype: int64


In [115]:
df = combined_df[combined_df['status'] != 'CANDIDATE']
df['status'] = df['status'].replace({'FALSE POSITIVE': 0, 'REFUTED': 0, 'CONFIRMED': 1})

disposition_counts = df['status'].value_counts()
print(disposition_counts)

status
1    4882
0    4277
Name: count, dtype: int64


  df['status'] = df['status'].replace({'FALSE POSITIVE': 0, 'REFUTED': 0, 'CONFIRMED': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['status'] = df['status'].replace({'FALSE POSITIVE': 0, 'REFUTED': 0, 'CONFIRMED': 1})


In [116]:
# count null values
null_counts = df.isnull().sum()
print(null_counts)

status                              0
orbital_period                     19
planet_radius                     596
stellar_radius                    149
insolation_flux                  2055
equilibrium_temperature          1896
stellar_effective_temperature     580
stellar_surface_gravity           927
dtype: int64


In [118]:
# drop rows with null values
df = df.dropna()

disposition_counts = df['status'].value_counts()
print(disposition_counts)

status
0    3916
1    2917
Name: count, dtype: int64
