## Random Forest

#### Import the Random Forest algorithms for classification and regression

In [12]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Show the default hyperparameters of both forest estimators, one repr per line.
print(RandomForestClassifier(), RandomForestRegressor(), sep='\n')

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)


In [13]:
import joblib
import pandas as pd
from sklearn.model_selection import GridSearchCV

import warnings
from sklearn.exceptions import DataConversionWarning
# Suppress three warning categories for the rest of the session so that the
# grid-search output below stays readable. No message/module pattern is
# needed, so the simple (match-everything) filter form is sufficient.
warnings.simplefilter('ignore', category=FutureWarning)
warnings.simplefilter('ignore', category=DeprecationWarning)
warnings.simplefilter('ignore', category=DataConversionWarning)

# Feature matrices: the first CSV row is used as the column header.
tr_features = pd.read_csv('../Resources/train_features.csv')
val_features = pd.read_csv('../Resources/val_features.csv')
test_features = pd.read_csv('../Resources/test_features.csv')

# Label files are read with header=None, i.e. every row is treated as data.
tr_labels = pd.read_csv('../Resources/train_labels.csv', header=None)
val_labels = pd.read_csv('../Resources/val_labels.csv', header=None)
test_labels = pd.read_csv('../Resources/test_labels.csv', header=None)

In [14]:
# Scale data
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training features only, then reuse that same fitted
# scaler for every split so validation/test data never influence the scaling.
X_scaler = MinMaxScaler()
X_scaler.fit(tr_features)

tr_features_scaled, test_features_scaled, val_features_scaled = (
    X_scaler.transform(split)
    for split in (tr_features, test_features, val_features)
)

In [15]:
def print_results(results):
    """Print the best parameters and a per-candidate score summary.

    Parameters
    ----------
    results : object
        Anything exposing ``best_params_`` and a ``cv_results_`` mapping with
        the keys ``'mean_test_score'``, ``'std_test_score'`` and ``'params'``
        (for example a fitted ``GridSearchCV``).

    For every candidate, the mean test score and twice its standard deviation
    are printed, both rounded to three decimals.
    """
    print(f'BEST PARAMS: {results.best_params_}\n')

    cv_table = results.cv_results_
    rows = zip(cv_table['mean_test_score'],
               cv_table['std_test_score'],
               cv_table['params'])
    for avg_score, score_std, candidate in rows:
        spread = round(score_std * 2, 3)
        print(f'{round(avg_score, 3)} (+/-{spread}) for {candidate}')

In [16]:
# Seed the forest so that the cross-validation scores and the selected model
# are reproducible across "Restart Kernel -> Run All"; without a seed every
# run grows different trees and may pick a different best parameter set.
rf = RandomForestClassifier(random_state=42)

# Candidate hyperparameters: forest size and maximum tree depth
# (None lets the trees grow without a depth limit).
parameters = {
    'n_estimators': [5, 50, 250],
    'max_depth': [2, 4, 8, 16, 32, None]
}

# Exhaustive search over the 3 x 6 grid with 5-fold cross-validation.
cv = GridSearchCV(rf, parameters, cv=5)
# The labels are loaded as a one-column frame; ravel() flattens them to 1-D.
cv.fit(tr_features_scaled, tr_labels.values.ravel())

print_results(cv)

BEST PARAMS: {'max_depth': None, 'n_estimators': 250}

0.744 (+/-0.059) for {'max_depth': 2, 'n_estimators': 5}
0.726 (+/-0.087) for {'max_depth': 2, 'n_estimators': 50}
0.696 (+/-0.039) for {'max_depth': 2, 'n_estimators': 250}
0.821 (+/-0.036) for {'max_depth': 4, 'n_estimators': 5}
0.853 (+/-0.013) for {'max_depth': 4, 'n_estimators': 50}
0.85 (+/-0.008) for {'max_depth': 4, 'n_estimators': 250}
0.86 (+/-0.03) for {'max_depth': 8, 'n_estimators': 5}
0.877 (+/-0.017) for {'max_depth': 8, 'n_estimators': 50}
0.881 (+/-0.019) for {'max_depth': 8, 'n_estimators': 250}
0.86 (+/-0.008) for {'max_depth': 16, 'n_estimators': 5}
0.881 (+/-0.023) for {'max_depth': 16, 'n_estimators': 50}
0.885 (+/-0.02) for {'max_depth': 16, 'n_estimators': 250}
0.857 (+/-0.043) for {'max_depth': 32, 'n_estimators': 5}
0.881 (+/-0.022) for {'max_depth': 32, 'n_estimators': 50}
0.886 (+/-0.019) for {'max_depth': 32, 'n_estimators': 250}
0.855 (+/-0.014) for {'max_depth': None, 'n_estimators': 5}
0.885 (+/-0.02

#### Write Model

In [17]:
# Persist the best estimator found by the grid search.
joblib.dump(value=cv.best_estimator_, filename='../Saved_model/RF_model.pkl')

['../Saved_model/RF_model.pkl']

#### Prediction on test data

In [18]:
# Reload the estimator that was written to disk by this notebook.
model_predict = joblib.load('../Saved_model/RF_model.pkl')

# Predict on the first 20 scaled test rows and line them up against the
# matching true labels, flattened to a 1-D array.
Predicted = model_predict.predict(test_features_scaled[0:20])
Actual = test_labels[0:20].values.ravel()

In [19]:
# Side-by-side table of predicted vs. true labels for the sampled test rows.
Compare_df = pd.DataFrame(dict(Predicted=Predicted, Actual=Actual))
Compare_df

Unnamed: 0,Predicted,Actual
0,2,2
1,0,0
2,2,2
3,0,0
4,1,1
5,2,2
6,1,1
7,1,0
8,2,2
9,2,2


#### Model validation

In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from time import time

def evaluate_model(name, model, features, labels):
    """Print accuracy, weighted precision/recall and prediction latency.

    Parameters
    ----------
    name : str
        Label printed in front of the metrics.
    model : object
        Fitted estimator exposing ``predict(features)``.
    features : array-like
        Inputs passed to ``model.predict``.
    labels : array-like
        True labels the predictions are compared against.

    Returns
    -------
    None
        The metrics are printed, rounded to three decimals; the latency is
        the duration of the single ``predict`` call in milliseconds, rounded
        to one decimal.
    """
    # perf_counter is a monotonic, high-resolution clock intended for timing
    # short intervals; wall-clock time() can jump and has coarse resolution.
    from time import perf_counter

    start = perf_counter()
    pred = model.predict(features)
    latency_ms = round((perf_counter() - start) * 1000, 1)

    accuracy = round(accuracy_score(labels, pred), 3)
    # 'weighted' averages the per-class metric weighted by class support.
    precision = round(precision_score(labels, pred, average='weighted'), 3)
    recall = round(recall_score(labels, pred, average='weighted'), 3)
    print('{} -- Accuracy: {} / Precision: {} / recall: {} /Latency: {}ms'.format(name,
                                                                            accuracy,
                                                                            precision,
                                                                            recall,
                                                                            latency_ms))

In [21]:
# Score the reloaded model on the held-out validation split.
evaluate_model(
    name='RandomForest Model',
    model=model_predict,
    features=val_features_scaled,
    labels=val_labels,
)

RandomForest Model -- Accuracy: 0.9 / Precision: 0.898 / recall: 0.9 /Latency: 66.8ms


#### Identify important features

In [24]:
# Fit the base forest `rf` on the scaled training data; fit() returns the
# estimator itself, so `model` and `rf` refer to the same object.
# NOTE(review): `rf` is the estimator handed to the grid search, not the tuned
# best estimator -- confirm that its importances are the ones intended here.
model = rf.fit(X=tr_features_scaled, y=tr_labels.values.ravel())

In [31]:
# Mean accuracy of the refitted forest on the validation split.
model.score(X=val_features_scaled, y=val_labels)

0.8862660944206009

In [30]:
# Pair every importance with its column name and list them from most to
# least important.
sorted(
    (
        (importance, column)
        for importance, column in zip(model.feature_importances_, tr_features.columns)
    ),
    reverse=True,
)

[(0.1140377052308948, 'koi_fpflag_co'),
 (0.1138772379000282, 'koi_fpflag_nt'),
 (0.09659282496352316, 'koi_fpflag_ss'),
 (0.05246118900252185, 'koi_prad'),
 (0.05122055567653152, 'koi_model_snr'),
 (0.039929815187422, 'koi_duration_err2'),
 (0.0355265776383385, 'koi_fpflag_ec'),
 (0.032584942404556734, 'koi_duration_err1'),
 (0.02779822528280117, 'koi_period'),
 (0.026666487656592186, 'koi_steff_err1'),
 (0.026013234156497922, 'koi_prad_err1'),
 (0.025410115170111196, 'koi_period_err1'),
 (0.022049492326514618, 'koi_duration'),
 (0.020795331693324044, 'koi_steff_err2'),
 (0.020170442529297276, 'koi_time0bk_err1'),
 (0.01735791893959158, 'koi_period_err2'),
 (0.017086671667242724, 'koi_impact'),
 (0.01694901574148288, 'koi_time0bk_err2'),
 (0.015801695160258273, 'koi_depth'),
 (0.015376275059825223, 'koi_insol'),
 (0.01526481680303845, 'koi_teq'),
 (0.0136673482729212, 'koi_insol_err2'),
 (0.013070354716246688, 'dec'),
 (0.012917627824695285, 'koi_slogg_err2'),
 (0.012362441436910124, 