In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer

In [2]:
#read the dataset
df_nasa_data = pd.read_csv("exoplanets.csv", header=0, delimiter=',')
print(df_nasa_data.shape)
df_nasa_data.head()

(9564, 49)


Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,10811496,K00753.01,,CANDIDATE,CANDIDATE,0.0,0,0,0,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [3]:
#make target variable numeric
df_nasa_data['ExoplanetCandidate'] = df_nasa_data['koi_pdisposition'].apply(lambda x: 1 if x == 'CANDIDATE' else 0)
df_nasa_data.head()

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag,ExoplanetCandidate
0,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1
1,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,0,...,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1
2,10811496,K00753.01,,CANDIDATE,CANDIDATE,0.0,0,0,0,0,...,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436,1
3,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597,0
4,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509,1


In [4]:
#drop for prediction useless columns (names, etc) useless columns
df_nasa_data.drop(columns=['kepid','kepoi_name','kepler_name',
                 'koi_disposition','koi_tce_delivname','koi_pdisposition'], inplace=True)

In [5]:
#Number of Null values in the columns
df_nasa_data.isnull().sum()

koi_score             1510
koi_fpflag_nt            0
koi_fpflag_ss            0
koi_fpflag_co            0
koi_fpflag_ec            0
koi_period               0
koi_period_err1        454
koi_period_err2        454
koi_time0bk              0
koi_time0bk_err1       454
koi_time0bk_err2       454
koi_impact             363
koi_impact_err1        454
koi_impact_err2        454
koi_duration             0
koi_duration_err1      454
koi_duration_err2      454
koi_depth              363
koi_depth_err1         454
koi_depth_err2         454
koi_prad               363
koi_prad_err1          363
koi_prad_err2          363
koi_teq                363
koi_teq_err1          9564
koi_teq_err2          9564
koi_insol              321
koi_insol_err1         321
koi_insol_err2         321
koi_model_snr          363
koi_tce_plnt_num       346
koi_steff              363
koi_steff_err1         468
koi_steff_err2         483
koi_slogg              363
koi_slogg_err1         468
koi_slogg_err2         468
k

In [6]:
#koi_teq_err1 and koi_teq_err2 always zero just drop the columns
df_nasa_data.drop(columns=['koi_teq_err1','koi_teq_err2'], inplace=True)

In [7]:
#Check how many null values are in the rows

#find columns with multiple null values
null_counts = df_nasa_data.isnull().sum(axis=1)

# count the number of rows with multiple null values
null_values_row = null_counts.value_counts()
null_values_row

0     7803
1      956
2      248
30     213
31      91
10      89
6       85
26      42
29      17
7       11
8        7
16       2
dtype: int64

In [8]:
#We have rows with up to 31 null values 
#drop all rows with 5 or more null values as otherwise we woul end up with a lot of imputed values which could be missleading
#find columns with multiple null values

# Filter the rows where the null count is less than or equal to 5
rows_to_keep = null_counts[null_counts < 5].index

# Drop the rows where the null count is greater than 5
df_nasa_data_drop = df_nasa_data.loc[rows_to_keep]
print(df_nasa_data_drop.shape)
df_nasa_data_drop.head()

(9007, 42)


Unnamed: 0,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag,ExoplanetCandidate
0,1.0,0,0,0,0,9.488036,2.78e-05,-2.78e-05,170.53875,0.00216,...,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1
1,0.969,0,0,0,0,54.418383,0.000248,-0.000248,162.51384,0.00352,...,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1
2,0.0,0,0,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436,1
3,0.0,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597,0
4,1.0,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509,1


In [9]:
df_nasa_data_drop.isnull().sum()

koi_score             1189
koi_fpflag_nt            0
koi_fpflag_ss            0
koi_fpflag_co            0
koi_fpflag_ec            0
koi_period               0
koi_period_err1          0
koi_period_err2          0
koi_time0bk              0
koi_time0bk_err1         0
koi_time0bk_err2         0
koi_impact               0
koi_impact_err1          0
koi_impact_err2          0
koi_duration             0
koi_duration_err1        0
koi_duration_err2        0
koi_depth                0
koi_depth_err1           0
koi_depth_err2           0
koi_prad                 0
koi_prad_err1            0
koi_prad_err2            0
koi_teq                  0
koi_insol                0
koi_insol_err1           0
koi_insol_err2           0
koi_model_snr            0
koi_tce_plnt_num       248
koi_steff                0
koi_steff_err1           0
koi_steff_err2          15
koi_slogg                0
koi_slogg_err1           0
koi_slogg_err2           0
koi_srad                 0
koi_srad_err1            0
k

In [16]:
#use knn imputation for koi_tce_plnt_num and koi_steff_err2 

df_imputed = df_nasa_data_drop.drop(columns=['koi_score'])
columns_with_missing_values = ['koi_tce_plnt_num', 'koi_steff_err2']
imputer = KNNImputer(n_neighbors=3)

df_imputed[columns_with_missing_values] = imputer.fit_transform(df_imputed[columns_with_missing_values])

Die KOI Score ist verschieden zu interpretieren je nachdem der Exoplanet als canditate oder false positive eingestuft ist
daher knn pro klasse durchführen.

(KOI Score feld beschreibung)
A value between 0 and 1 that indicates the confidence in the KOI disposition. For CANDIDATEs, a higher value indicates 
more confidence in its disposition, while for FALSE POSITIVEs, a higher value indicates less confidence in that disposition. 
The value is calculated from a Monte Carlo technique such that the score's value is equivalent to the frction of iterations 
where the Robovetter yields a disposition of CANDIDATE.

In [15]:
class_0_df = df_nasa_data_drop[df_nasa_data_drop['ExoplanetCandidate'] == 0]
class_1_df = df_nasa_data_drop[df_nasa_data_drop['ExoplanetCandidate'] == 1]

#drop the previously imputed columns
class_0_df.drop(columns=['koi_tce_plnt_num', 'koi_steff_err2'])
class_1_df.drop(columns=['koi_tce_plnt_num', 'koi_steff_err2'])

# Specify the columns with missing values
columns_with_missing_values = ['koi_score']

# Create the KNN imputer object
imputer = KNNImputer(n_neighbors=3)

# Impute the missing values
class_0_df[columns_with_missing_values] = imputer.fit_transform(class_0_df[columns_with_missing_values])

# Impute the missing values
class_1_df[columns_with_missing_values] = imputer.fit_transform(class_1_df[columns_with_missing_values])

df_koi_score = pd.concat([class_0_df, class_1_df])

df_koi_score.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


Unnamed: 0,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag,ExoplanetCandidate
0,1.0,0,0,0,0,9.488036,2.78e-05,-2.78e-05,170.53875,0.00216,...,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1
1,0.969,0,0,0,0,54.418383,0.000248,-0.000248,162.51384,0.00352,...,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1
2,0.0,0,0,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436,1
3,0.0,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597,0
4,1.0,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509,1


In [19]:
df_imputed['koi_score'] = df_koi_score.koi_score
df_imputed.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag,ExoplanetCandidate,koi_score
0,0,0,0,0,9.488036,2.78e-05,-2.78e-05,170.53875,0.00216,-0.00216,...,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1,1.0
1,0,0,0,0,54.418383,0.000248,-0.000248,162.51384,0.00352,-0.00352,...,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1,0.969
2,0,0,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,...,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436,1,0.0
3,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,...,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597,0,0.0
4,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,...,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509,1,1.0


In [20]:
#no missing values left 
df_imputed.isnull().sum()

koi_fpflag_nt         0
koi_fpflag_ss         0
koi_fpflag_co         0
koi_fpflag_ec         0
koi_period            0
koi_period_err1       0
koi_period_err2       0
koi_time0bk           0
koi_time0bk_err1      0
koi_time0bk_err2      0
koi_impact            0
koi_impact_err1       0
koi_impact_err2       0
koi_duration          0
koi_duration_err1     0
koi_duration_err2     0
koi_depth             0
koi_depth_err1        0
koi_depth_err2        0
koi_prad              0
koi_prad_err1         0
koi_prad_err2         0
koi_teq               0
koi_insol             0
koi_insol_err1        0
koi_insol_err2        0
koi_model_snr         0
koi_tce_plnt_num      0
koi_steff             0
koi_steff_err1        0
koi_steff_err2        0
koi_slogg             0
koi_slogg_err1        0
koi_slogg_err2        0
koi_srad              0
koi_srad_err1         0
koi_srad_err2         0
ra                    0
dec                   0
koi_kepmag            0
ExoplanetCandidate    0
koi_score       

In [22]:
df_imputed.reindex()
df_imputed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9007 entries, 0 to 9563
Data columns (total 42 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   koi_fpflag_nt       9007 non-null   int64  
 1   koi_fpflag_ss       9007 non-null   int64  
 2   koi_fpflag_co       9007 non-null   int64  
 3   koi_fpflag_ec       9007 non-null   int64  
 4   koi_period          9007 non-null   float64
 5   koi_period_err1     9007 non-null   float64
 6   koi_period_err2     9007 non-null   float64
 7   koi_time0bk         9007 non-null   float64
 8   koi_time0bk_err1    9007 non-null   float64
 9   koi_time0bk_err2    9007 non-null   float64
 10  koi_impact          9007 non-null   float64
 11  koi_impact_err1     9007 non-null   float64
 12  koi_impact_err2     9007 non-null   float64
 13  koi_duration        9007 non-null   float64
 14  koi_duration_err1   9007 non-null   float64
 15  koi_duration_err2   9007 non-null   float64
 16  koi_de