In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
data = pd.read_csv('exoplanet_data.csv')
df = data.copy(deep=True)

In [4]:
df.sample(10)

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_tce_delivname,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
6574,9363944,K05660.01,,CANDIDATE,CANDIDATE,,0,0,0,0,402.20969,369.5086,0.3779,5.298,472.5,10.03,452.0,9.86,9.5,1.0,q1_q16_tce,4961.0,3.131,4.649,299.03528,45.869289,13.66
9563,10156110,K07989.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,0,1,1,4.856035,135.9933,0.134,3.078,76.7,1.05,1266.0,607.42,8.2,1.0,q1_q17_dr25_tce,6469.0,4.385,1.193,297.00977,47.121021,14.826
2414,9837661,K02715.02,Kepler-1321 c,CONFIRMED,CANDIDATE,1.0,0,0,0,0,2.226496,133.37195,0.048,1.7073,1839.1,2.1,697.0,55.76,36.0,2.0,q1_q17_dr25_tce,3640.0,4.744,0.503,294.52859,46.640968,16.835
2729,11508644,K03101.01,Kepler-1426 b,CONFIRMED,CANDIDATE,0.998,0,0,0,0,14.256327,133.40971,0.894,4.796,206.1,1.73,797.0,95.22,14.3,1.0,q1_q17_dr25_tce,6140.0,4.455,1.027,292.54575,49.416969,15.192
1789,8973129,K02286.01,Kepler-1176 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,24.173824,135.62471,0.251,3.056,532.6,2.21,595.0,29.69,21.5,1.0,q1_q17_dr25_tce,5468.0,4.427,0.968,297.91974,45.274158,15.056
5738,9602562,K03985.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,0,1,1,3.556536,133.5172,0.712,8.7,202.5,1.7,1272.0,619.57,23.1,1.0,q1_q17_dr25_tce,5950.0,4.395,1.114,297.1322,46.256489,14.377
8972,3766353,K06359.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,2.666962,133.719065,1.276,2.4774,7583.4,40.68,1469.0,1104.26,249.5,1.0,q1_q17_dr25_tce,6609.0,4.484,0.974,296.05377,38.89426,13.968
547,6047498,K01013.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,0.518727,133.676312,1.199,1.252,714.1,19.07,1965.0,3537.27,93.3,1.0,q1_q17_dr25_tce,5572.0,4.597,0.762,293.38556,41.349819,15.348
6073,9605552,K03102.01,,CANDIDATE,CANDIDATE,,0,0,0,0,9.321084,138.09946,0.0815,1.81,294.2,0.61,394.0,5.67,6.1,,,3773.0,4.925,0.35,297.96048,46.227921,15.983
1868,6666233,K02306.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,0.512407,132.240449,0.301,1.0579,329.2,1.03,1255.0,583.45,46.1,1.0,q1_q17_dr25_tce,3862.0,4.69,0.563,282.28439,42.15419,14.78


In [5]:
df = df.drop('kepler_name', axis=1).dropna()

In [6]:
df.shape

(7994, 26)

In [7]:
X = df.select_dtypes(include=['float64'])

In [8]:
X.sample(5)

Unnamed: 0,koi_score,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
5312,0.0,4.045063,132.232226,0.529,3.22222,360180.0,67.94,1143.0,402.77,483.9,1.0,5780.0,4.438,1.0,294.58798,40.167068,17.705
2029,1.0,2.668313,131.68823,0.333,2.4141,266.2,1.48,1252.0,581.38,23.3,1.0,5806.0,4.528,0.899,296.07568,45.62706,15.553
5018,0.0,157.322639,222.900363,0.888,21.9232,153930.0,56.66,351.0,3.61,1386.8,1.0,6109.0,4.472,0.989,284.51367,45.1203,15.291
720,0.0,0.782046,134.0214,1.244,1.7945,16185.0,45.05,2070.0,4332.1,272.3,1.0,6113.0,4.463,0.995,290.26767,44.712849,14.912
2876,0.962,27.943024,136.8697,0.052,6.125,308.6,2.21,718.0,62.74,12.3,1.0,6468.0,4.363,1.25,292.80722,38.884232,15.243


In [9]:
labels = df['koi_pdisposition']

In [10]:
y = labels.replace({'FALSE POSITIVE': 0, 'CANDIDATE': 1})

In [11]:
X = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [12]:
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

In [13]:
pred = lr_model.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, pred)*100:0.2f}%')

Accuracy: 95.31%


In [14]:
print('Top 5 most predictive features:')

coefs = lr_model.coef_.flatten()
weights = np.abs(coefs)

weights_df = pd.DataFrame(weights, columns=['weight'])
weights_df.sort_values(by='weight', ascending=False, inplace=True)
weights_df.reset_index(inplace=True)

for index in np.array(weights_df['index'])[:5]:
    print(f'{X.columns[index]} : {coefs[index]:0.4f}')

print('(refer to README for feature descriptions)')

Top 5 most predictive features:
koi_score : 2.9879
koi_depth : -1.9462
koi_prad : -1.9185
koi_teq : -1.1261
koi_impact : -0.7299
(refer to README for feature descriptions)
