In [47]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_rows', 500)

k2_df = pd.read_csv('data/K2_2024.06.06_20.23.11.csv', comment='#')
kep_df = pd.read_csv('data/KOI_2024.06.06_20.22.41.csv', comment='#')

In [48]:
print(k2_df.shape)
print(k2_df['disposition'].value_counts())

(3839, 16)
disposition
CONFIRMED         2156
CANDIDATE         1372
FALSE POSITIVE     293
REFUTED             18
Name: count, dtype: int64


In [49]:
k2_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3839 entries, 0 to 3838
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pl_name      3839 non-null   object 
 1   hostname     3839 non-null   object 
 2   disposition  3839 non-null   object 
 3   pl_orbper    3800 non-null   float64
 4   pl_orbsmax   777 non-null    float64
 5   pl_rade      3030 non-null   float64
 6   pl_insol     519 non-null    float64
 7   pl_eqt       709 non-null    float64
 8   pl_trandur   2718 non-null   float64
 9   st_teff      2731 non-null   float64
 10  st_rad       3715 non-null   float64
 11  st_mass      1968 non-null   float64
 12  st_met       1568 non-null   float64
 13  st_logg      2204 non-null   float64
 14  sy_dist      3716 non-null   float64
 15  sy_kepmag    3824 non-null   float64
dtypes: float64(13), object(3)
memory usage: 480.0+ KB


In [50]:
print(kep_df.shape)
print(kep_df['koi_disposition'].value_counts())

(9564, 16)
koi_disposition
FALSE POSITIVE    4839
CONFIRMED         2743
CANDIDATE         1982
Name: count, dtype: int64


In [51]:
kep_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9564 entries, 0 to 9563
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   kepid            9564 non-null   int64  
 1   kepoi_name       9564 non-null   object 
 2   kepler_name      2745 non-null   object 
 3   koi_disposition  9564 non-null   object 
 4   koi_period       9564 non-null   float64
 5   koi_duration     9564 non-null   float64
 6   koi_prad         9201 non-null   float64
 7   koi_sma          9201 non-null   float64
 8   koi_teq          9201 non-null   float64
 9   koi_insol        9243 non-null   float64
 10  koi_steff        9201 non-null   float64
 11  koi_slogg        9201 non-null   float64
 12  koi_smet         9178 non-null   float64
 13  koi_srad         9201 non-null   float64
 14  koi_smass        9201 non-null   float64
 15  koi_kepmag       9563 non-null   float64
dtypes: float64(12), int64(1), object(3)
memory usage: 1.2+ MB


In [52]:
column_mapping = {
    #'pl_name': 'kepler_name', # probably not useful
    'disposition': 'koi_disposition', # target, status of the object
    'pl_orbper': 'koi_period', # orbital period
	'pl_orbsmax': 'koi_sma', # semi-major axis
    'pl_rade': 'koi_prad', # planet radius
    'pl_trandur': 'koi_duration', # transit duration
    'pl_insol': 'koi_insol', # insolation flux
    'pl_eqt': 'koi_teq', # equilibrium temperature
    'sy_kepmag': 'koi_kepmag', # Kepler magnitude
    'st_rad': 'koi_srad', # stellar radius
    'st_teff': 'koi_steff', # stellar effective temperature
    'st_logg': 'koi_slogg', # stellar surface gravity
	'st_mass': 'koi_smass', # stellar mass
	'st_met': 'koi_smet', # stellar metallicity
}

new_column_names = {
    #'pl_name': 'planet_name',
    'disposition': 'status',
    'pl_orbper': 'orbital_period',
    'pl_orbsmax': 'semi_major_axis',
    'pl_rade': 'planet_radius',
    'pl_trandur': 'transit_duration',
    'pl_insol': 'insolation_flux',
    'pl_eqt': 'equilibrium_temperature',
    'sy_kepmag': 'kepler_magnitude',
    'st_rad': 'stellar_radius',
    'st_teff': 'stellar_effective_temperature',
    'st_logg': 'stellar_surface_gravity',
    'st_mass': 'stellar_mass',
    'st_met': 'stellar_metallicity'
}

In [53]:
# columns from k2_df
k2_columns = list(column_mapping.keys())
k2_df_selected = k2_df[k2_columns]
print(k2_df_selected.shape)
print(k2_df_selected.head(3))

# columns from kep_df
kep_columns = list(column_mapping.values())
kep_df_selected = kep_df[kep_columns]
print(kep_df_selected.shape)
print(kep_df_selected.head(3))

kep_df_selected.columns = k2_df_selected.columns

combined_df = pd.concat([k2_df_selected, kep_df_selected], axis=0, ignore_index=True)
combined_df = combined_df.rename(columns=new_column_names)
print(combined_df.shape)
print(combined_df.head(3))

(3839, 13)
  disposition  pl_orbper  pl_orbsmax  pl_rade  pl_trandur  pl_insol  pl_eqt  \
0   CONFIRMED   0.759976         NaN   12.547    0.974735   9668.66  2529.0   
1   CANDIDATE   1.152400      0.0125    0.675         NaN   8270.03     NaN   
2   CONFIRMED   0.719571         NaN    1.611    1.757522   6635.86  2302.0   

   sy_kepmag  st_rad  st_teff  st_logg  st_mass  st_met  
0     12.310    1.66   5690.2     4.00      NaN     NaN  
1     12.302    1.06   5992.0     4.39     0.99  -0.095  
2      9.438    1.26   5910.0     4.27      NaN     NaN  
(9564, 13)
  koi_disposition  koi_period  koi_sma  koi_prad  koi_duration  koi_insol  \
0       CONFIRMED    9.488036   0.0853      2.26        2.9575      93.59   
1       CONFIRMED   54.418383   0.2734      2.83        4.5070       9.11   
2       CANDIDATE   19.899140   0.1419     14.60        1.7822      39.30   

   koi_teq  koi_kepmag  koi_srad  koi_steff  koi_slogg  koi_smass  koi_smet  
0    793.0      15.347     0.927     5455.

todo

In [54]:
disposition_counts = combined_df['status'].value_counts()
print(disposition_counts)

status
FALSE POSITIVE    5132
CONFIRMED         4899
CANDIDATE         3354
REFUTED             18
Name: count, dtype: int64


In [55]:
df = combined_df[combined_df['status'] != 'CANDIDATE']
df['status'] = df['status'].replace({'FALSE POSITIVE': 0, 'REFUTED': 0, 'CONFIRMED': 1})

disposition_counts = df['status'].value_counts()
print(disposition_counts)

status
0    5150
1    4899
Name: count, dtype: int64


  df['status'] = df['status'].replace({'FALSE POSITIVE': 0, 'REFUTED': 0, 'CONFIRMED': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['status'] = df['status'].replace({'FALSE POSITIVE': 0, 'REFUTED': 0, 'CONFIRMED': 1})


In [56]:
# count null values
null_counts = df.isnull().sum()
print(null_counts)

status                              0
orbital_period                     19
semi_major_axis                  1981
planet_radius                     800
transit_duration                  916
insolation_flux                  2219
equilibrium_temperature          2100
kepler_magnitude                    8
stellar_radius                    353
stellar_effective_temperature     784
stellar_surface_gravity          1131
stellar_mass                     1128
stellar_metallicity              1517
dtype: int64


In [57]:
# drop rows with null values
df = df.dropna()

disposition_counts = df['status'].value_counts()
print(disposition_counts)

status
0    4561
1    2825
Name: count, dtype: int64


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7386 entries, 8 to 13402
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   status                         7386 non-null   int64  
 1   orbital_period                 7386 non-null   float64
 2   semi_major_axis                7386 non-null   float64
 3   planet_radius                  7386 non-null   float64
 4   transit_duration               7386 non-null   float64
 5   insolation_flux                7386 non-null   float64
 6   equilibrium_temperature        7386 non-null   float64
 7   kepler_magnitude               7386 non-null   float64
 8   stellar_radius                 7386 non-null   float64
 9   stellar_effective_temperature  7386 non-null   float64
 10  stellar_surface_gravity        7386 non-null   float64
 11  stellar_mass                   7386 non-null   float64
 12  stellar_metallicity            7386 non-null   float