In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

# Import Datasets

### 1. Confirmed exoplanets 
### 2. Confirmed and False Positives

In [None]:
# Load the first dataset (listed above as "Confirmed exoplanets") from the
# CSV export sitting next to the notebook.
base = pd.read_csv('planets_2019.11.14_08.09.11.csv')
# Preview only the first rows rather than rendering the whole frame as cell
# output; this mirrors how the cumulative table is previewed.
base.head(10)

In [None]:
# Second dataset: the cumulative table (listed above as
# "Confirmed and False Positives"), read from its CSV export.
baseCumulative = pd.read_csv(
    filepath_or_buffer='cumulative_2019.11.14_08.35.42.csv',
)
# Show the first ten rows as a quick sanity check.
baseCumulative.head(n=10)

A Kepler Object of Interest (KOI) can turn out to be a false positive when:
   - the KOI is in reality an eclipsing binary star,
   - the Kepler light curve is contaminated by a background eclipsing binary,
   - stellar variability is confused for coherent planetary transits, or
   - instrumental artifacts are confused for coherent planetary transits.

In [None]:
# Display the column labels of the cumulative table.
baseCumulative.columns

### Select features for training by dropping identifier columns and columns highly correlated with the label

In [None]:
# Build the modelling frame: remove the columns listed below from the
# cumulative table and keep every other column untouched.
selectC = baseCumulative.drop(
    labels=[
        'loc_rowid',
        'kepler_name',
        'koi_disposition',
        'kepoi_name',
        'koi_tce_delivname',
        'koi_score',
        'koi_fpflag_nt',
    ],
    axis=1,
)

# Preview the first ten rows of the reduced table.
selectC.head(n=10)

### Binary Classification Task

In [None]:
# Distinct values found in the koi_pdisposition column.
selectC['koi_pdisposition'].unique()

### Prepare Training and Test Set

In [None]:
# Boolean mask over the column labels: True only for the label column.
label_mask = selectC.columns == 'koi_pdisposition'

# Features are every column except the label; the label is kept as a
# one-column DataFrame (not a Series), exactly as the mask selection yields it.
X = selectC.loc[:, ~label_mask]
y = selectC.loc[:, label_mask]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Imputing missing values

In [None]:
# One boolean per column: True when that column has at least one missing value.
selectC.isna().any(axis=0)

In [None]:
# Mean-impute missing values. The imputer is fitted on the training split
# ONLY: `fit` returns the estimator itself, so calling `imp.fit(X_test)`
# afterwards would overwrite the training statistics and both splits would be
# filled with test-set means, leaking test information into training.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
train_imp = imp.fit(X_train)
# Alias kept so code calling `test_imp.transform(...)` keeps working; the test
# split is deliberately filled with the means learned from the training split.
test_imp = train_imp

In [None]:
# Fill the missing values of each split with its corresponding fitted imputer
# (training split first, then test split).
X_train_imputed, X_test_imputed = (
    train_imp.transform(X_train),
    test_imp.transform(X_test),
)

## Training a Random Forest Classifier

In [None]:
# Seed the forest so the fitted model, its test accuracy and the feature
# importances are reproducible across kernel restarts (same seed as the split).
clf = RandomForestClassifier(n_estimators=10, random_state=0)
# y_train is a single-column frame; flatten it to 1-D because scikit-learn
# expects y of shape (n_samples,) and otherwise emits a DataConversionWarning.
clf = clf.fit(X_train_imputed, np.ravel(y_train))


## Measuring accuracy on the test set

In [None]:
# Score the fitted classifier on the imputed held-out split.
clf.score(X=X_test_imputed, y=y_test)

## Finding and sorting the feature importance

In [None]:
# Pair each importance score with its feature name and rank them from most to
# least important.
# NOTE(review): this assumes the imputer kept every column of X_train, so the
# scores line up with X_train.columns — TODO confirm no column was dropped.
feature_importances = (
    pd.DataFrame(
        data=clf.feature_importances_,
        index=X_train.columns,
        columns=['importance'],
    )
    .sort_values(by='importance', ascending=False)
)

feature_importances