# Random Forest Model

In [1]:
import pandas as pd

# ML libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

import warnings 
warnings.filterwarnings("ignore")

### Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()

In [3]:
# create a test train split
y = df['koi_disposition']
X = df.drop(columns=['koi_disposition'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

### Pre-Processing

Scale the data using the MinMaxScaler and perform some feature selection

In [4]:
# Scale your data
X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Train the Model

In [5]:
# create & train a random forest classifier
model_rf = RandomForestClassifier()
model_rf.fit(X_train_scaled, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [6]:
print(f"Training Data Score: {model_rf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model_rf.score(X_test_scaled, y_test)}")

Training Data Score: 0.9952317375548351
Testing Data Score: 0.8867276887871853


In [7]:
# test feature importances
importances = model_rf.feature_importances_
sorted(zip(importances, df.columns), reverse=True)

[(0.09727955950849897, 'koi_disposition'),
 (0.08018576775351446, 'koi_fpflag_ss'),
 (0.05982672719280262, 'koi_fpflag_nt'),
 (0.05527514444765769, 'koi_insol_err2'),
 (0.04925683106300732, 'koi_steff'),
 (0.048304853632665935, 'koi_duration'),
 (0.04607426066948789, 'koi_depth_err2'),
 (0.038448951607367624, 'koi_prad'),
 (0.03371498529871063, 'koi_duration_err1'),
 (0.03239214963497441, 'koi_fpflag_co'),
 (0.02991747638325009, 'koi_duration_err2'),
 (0.028061171441525867, 'koi_insol_err1'),
 (0.02651874225621989, 'koi_steff_err1'),
 (0.025468948617972376, 'koi_fpflag_ec'),
 (0.025275302066813506, 'koi_impact_err2'),
 (0.02406944594821836, 'koi_prad_err1'),
 (0.023649127263512267, 'koi_period'),
 (0.020763496160976856, 'koi_insol'),
 (0.01681611133966882, 'koi_time0bk_err2'),
 (0.01575533868202909, 'koi_time0bk'),
 (0.015252549664868576, 'koi_time0bk_err1'),
 (0.01462606325594352, 'koi_srad_err2'),
 (0.014496490002480041, 'koi_period_err2'),
 (0.014471023301300507, 'koi_period_err1'),

### Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [8]:
param_grid_rf = {
    'n_estimators': [50,75,100], 
    'max_depth': [100,125,150]}
grid_rf = GridSearchCV(model_rf, param_grid_rf, verbose=0)

In [9]:
# Train the model with GridSearch
grid_rf.fit(X_train_scaled, y_train)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=10, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             ii

In [10]:
print(grid_rf.best_params_)
print(grid_rf.best_score_)

{'max_depth': 125, 'n_estimators': 100}
0.8889948502765592


In [11]:
# retune original model with best params
model_rf = RandomForestClassifier(max_depth=100, n_estimators=50)
model_rf.fit(X_train_scaled, y_train)

print(f"Training Data Score: {model_rf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model_rf.score(X_test_scaled, y_test)}")

Training Data Score: 0.9998092695021934
Testing Data Score: 0.9038901601830663
