## Random Forest

#### Import Random Forest Algorithm for classification & Regression

In [1]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

print(RandomForestClassifier())
print(RandomForestRegressor())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)


In [2]:
import joblib
import pandas as pd
from sklearn.model_selection import GridSearchCV

import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=DataConversionWarning)

tr_features = pd.read_csv('../Resources/train_features.csv')
tr_labels = pd.read_csv('../Resources/train_labels.csv', header=None)
test_features = pd.read_csv('../Resources/test_features.csv')
test_labels = pd.read_csv('../Resources/test_labels.csv', header=None)
val_features= pd.read_csv('../Resources/val_features.csv')
val_labels= pd.read_csv('../Resources/val_labels.csv', header=None)

In [3]:
# Scale data
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(tr_features)

tr_features_scaled = X_scaler.transform(tr_features)
test_features_scaled = X_scaler.transform(test_features)
val_features_scaled = X_scaler.transform(val_features)

In [4]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [5]:
rf = RandomForestClassifier()
parameters = {
    'n_estimators': [5, 50, 250],
    'max_depth': [2, 4, 8, 16, 32, None]
}

cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(tr_features_scaled, tr_labels.values.ravel())

print_results(cv)

BEST PARAMS: {'max_depth': 16, 'n_estimators': 50}

0.691 (+/-0.033) for {'max_depth': 2, 'n_estimators': 5}
0.725 (+/-0.095) for {'max_depth': 2, 'n_estimators': 50}
0.695 (+/-0.052) for {'max_depth': 2, 'n_estimators': 250}
0.837 (+/-0.017) for {'max_depth': 4, 'n_estimators': 5}
0.846 (+/-0.011) for {'max_depth': 4, 'n_estimators': 50}
0.847 (+/-0.01) for {'max_depth': 4, 'n_estimators': 250}
0.858 (+/-0.014) for {'max_depth': 8, 'n_estimators': 5}
0.88 (+/-0.018) for {'max_depth': 8, 'n_estimators': 50}
0.879 (+/-0.019) for {'max_depth': 8, 'n_estimators': 250}
0.856 (+/-0.042) for {'max_depth': 16, 'n_estimators': 5}
0.889 (+/-0.019) for {'max_depth': 16, 'n_estimators': 50}
0.888 (+/-0.022) for {'max_depth': 16, 'n_estimators': 250}
0.856 (+/-0.025) for {'max_depth': 32, 'n_estimators': 5}
0.882 (+/-0.018) for {'max_depth': 32, 'n_estimators': 50}
0.884 (+/-0.015) for {'max_depth': 32, 'n_estimators': 250}
0.855 (+/-0.014) for {'max_depth': None, 'n_estimators': 5}
0.887 (+/-0.02

#### Write Model

In [6]:
joblib.dump(cv.best_estimator_,'../Saved_model/RF_model.pkl')

['../Saved_model/RF_model.pkl']

#### Prediction on test data

In [7]:
model_predict = joblib.load('../Saved_model/RF_model.pkl')

Predicted = model_predict.predict(test_features_scaled[:20])
Actual = (test_labels[:20]).values.ravel()

In [8]:
Compare_df = pd.DataFrame({
    "Predicted":Predicted,
    "Actual": Actual
})
Compare_df

Unnamed: 0,Predicted,Actual
0,2,2
1,0,0
2,2,2
3,0,0
4,1,1
5,2,2
6,1,1
7,0,0
8,2,2
9,2,2


#### Model validation

In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from time import time

def evaluate_model(name, model, features, labels):
    start = time()
    pred = model.predict(features)
    end = time()
    accuracy = round(accuracy_score(labels, pred), 3)
    precision = round(precision_score(labels, pred, average='weighted'), 3)
    recall = round(recall_score(labels, pred, average='weighted'), 3)
    print('{} -- Accuracy: {} / Precision: {} / recall: {} /Latency: {}ms'.format(name,
                                                                            accuracy,
                                                                            precision,
                                                                            recall,
                                                                            round((end - start)*1000, 1)))

In [10]:
evaluate_model('RandomForest Model', model_predict, val_features_scaled, val_labels)

RandomForest Model -- Accuracy: 0.894 / Precision: 0.892 / recall: 0.894 /Latency: 18.9ms


#### Identify important features

In [11]:
model = rf.fit(tr_features_scaled, tr_labels.values.ravel())

In [12]:
model.score(val_features_scaled, val_labels)

0.8848354792560801

In [13]:
sorted(zip(model_predict.feature_importances_, tr_features.columns), reverse=True)

[(0.10885209884705363, 'koi_fpflag_co'),
 (0.10164997383787039, 'koi_fpflag_nt'),
 (0.07802953769391892, 'koi_fpflag_ss'),
 (0.05396039188006713, 'koi_model_snr'),
 (0.04102349400398717, 'koi_duration_err1'),
 (0.038524083624572894, 'koi_prad'),
 (0.03842262698773389, 'koi_steff_err1'),
 (0.036718760122182684, 'koi_fpflag_ec'),
 (0.03589464152241931, 'koi_prad_err2'),
 (0.03165683251171419, 'koi_prad_err1'),
 (0.026124045035737665, 'koi_steff_err2'),
 (0.024733907874489574, 'koi_duration_err2'),
 (0.023601615500605962, 'koi_insol_err1'),
 (0.023372521691035413, 'koi_depth'),
 (0.02150565301292849, 'koi_period'),
 (0.021474190425928616, 'koi_time0bk_err1'),
 (0.0213252067302916, 'koi_period_err1'),
 (0.021072732903001445, 'koi_impact'),
 (0.020591660537256834, 'koi_time0bk_err2'),
 (0.0198449298862353, 'koi_duration'),
 (0.016508133866815437, 'koi_period_err2'),
 (0.01460164958829317, 'koi_insol'),
 (0.014084407129838002, 'koi_teq'),
 (0.013565306974938398, 'koi_time0bk'),
 (0.012588086