## Random Forest

#### Import the Random Forest algorithms for classification and regression

In [1]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Show the default hyper-parameters of both forest flavours, one repr per line.
print(RandomForestClassifier(), RandomForestRegressor(), sep='\n')

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)


In [2]:
import joblib
import pandas as pd
from sklearn.model_selection import GridSearchCV

import warnings
from sklearn.exceptions import DataConversionWarning
# Silence the warning categories that would otherwise clutter the cell outputs.
# The filters are registered in the same order as before.
for _ignored_category in (FutureWarning, DeprecationWarning, DataConversionWarning):
    warnings.filterwarnings('ignore', category=_ignored_category)

# Feature files are read with their header row.
tr_features, val_features, test_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Resources/train_features.csv',
        '../Resources/val_features.csv',
        '../Resources/test_features.csv',
    )
)

# Label files are read with header=None, i.e. the first line is treated as data.
tr_labels, val_labels, test_labels = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        '../Resources/train_labels.csv',
        '../Resources/val_labels.csv',
        '../Resources/test_labels.csv',
    )
)

In [3]:
# Scale data
from sklearn.preprocessing import MinMaxScaler
# Learn the per-feature min/max from the training features only, then apply
# that same mapping to every split.
X_scaler = MinMaxScaler()
X_scaler.fit(tr_features)

tr_features_scaled, val_features_scaled, test_features_scaled = (
    X_scaler.transform(split)
    for split in (tr_features, val_features, test_features)
)

In [5]:
def print_results(results):
    """Print the winning parameters and a score line for every candidate.

    ``results`` must expose ``best_params_`` and a ``cv_results_`` mapping with
    the keys ``mean_test_score``, ``std_test_score`` and ``params`` (as a fitted
    ``GridSearchCV`` does). Each candidate is reported as its mean score and
    twice its standard deviation, both rounded to three decimals.
    """
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    summary = results.cv_results_
    candidates = zip(
        summary['mean_test_score'],
        summary['std_test_score'],
        summary['params'],
    )
    for avg_score, score_std, combo in candidates:
        centre = round(avg_score, 3)
        spread = round(score_std * 2, 3)
        print('{} (+/-{}) for {}'.format(centre, spread, combo))

In [6]:
# Seed the forest: without a fixed random_state the bootstrap samples and
# feature subsets differ on every run, so the scores, the selected
# hyper-parameters and the saved model would not be reproducible.
rf = RandomForestClassifier(random_state=42)

# 3 forest sizes x 6 depth limits = 18 candidate configurations.
parameters = {
    'n_estimators': [5, 50, 250],
    'max_depth': [2, 4, 8, 16, 32, None]
}

# Exhaustive search, each candidate scored with 5-fold cross-validation.
cv = GridSearchCV(rf, parameters, cv=5)
# The labels were loaded as a DataFrame; flatten them to a 1-D array for fit().
cv.fit(tr_features_scaled, tr_labels.values.ravel())

print_results(cv)

BEST PARAMS: {'max_depth': 16, 'n_estimators': 250}

0.702 (+/-0.1) for {'max_depth': 2, 'n_estimators': 5}
0.738 (+/-0.103) for {'max_depth': 2, 'n_estimators': 50}
0.709 (+/-0.05) for {'max_depth': 2, 'n_estimators': 250}
0.8 (+/-0.035) for {'max_depth': 4, 'n_estimators': 5}
0.848 (+/-0.011) for {'max_depth': 4, 'n_estimators': 50}
0.85 (+/-0.01) for {'max_depth': 4, 'n_estimators': 250}
0.856 (+/-0.027) for {'max_depth': 8, 'n_estimators': 5}
0.876 (+/-0.014) for {'max_depth': 8, 'n_estimators': 50}
0.88 (+/-0.02) for {'max_depth': 8, 'n_estimators': 250}
0.85 (+/-0.034) for {'max_depth': 16, 'n_estimators': 5}
0.886 (+/-0.02) for {'max_depth': 16, 'n_estimators': 50}
0.887 (+/-0.023) for {'max_depth': 16, 'n_estimators': 250}
0.854 (+/-0.026) for {'max_depth': 32, 'n_estimators': 5}
0.887 (+/-0.017) for {'max_depth': 32, 'n_estimators': 50}
0.886 (+/-0.023) for {'max_depth': 32, 'n_estimators': 250}
0.858 (+/-0.027) for {'max_depth': None, 'n_estimators': 5}
0.887 (+/-0.021) for {

#### Save the best model to disk

In [7]:
# Persist the refitted best estimator from the grid search.
joblib.dump(value=cv.best_estimator_, filename='../Saved_model/RF_model.pkl')

['../Saved_model/RF_model.pkl']

#### Prediction on test data

In [8]:
# Reload the estimator that was written to disk and try it on a small sample:
# the first 20 rows of the test data.
model_predict = joblib.load(filename='../Saved_model/RF_model.pkl')

Actual = test_labels.iloc[:20].values.ravel()
Predicted = model_predict.predict(test_features_scaled[:20])

In [9]:
# Side-by-side table of predicted vs. true labels for the sampled rows.
Compare_df = pd.DataFrame(
    data={"Predicted": Predicted, "Actual": Actual},
    columns=["Predicted", "Actual"],
)
Compare_df

Unnamed: 0,Predicted,Actual
0,2,2
1,0,0
2,2,2
3,0,0
4,0,1
5,2,2
6,1,1
7,0,0
8,2,2
9,2,2


#### Model validation

In [10]:
from time import perf_counter, time

from sklearn.metrics import accuracy_score, precision_score, recall_score

def evaluate_model(name, model, features, labels):
    """Print accuracy, weighted precision/recall and prediction latency.

    Parameters
    ----------
    name
        Label placed at the start of the printed report line.
    model
        Object exposing ``predict(features)``.
    features
        Inputs handed to ``model.predict``.
    labels
        True targets that the predictions are scored against.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for timing
    # short intervals. time() follows the system clock, which can be adjusted
    # mid-measurement and may be too coarse for a millisecond-scale call.
    start = perf_counter()
    pred = model.predict(features)
    end = perf_counter()

    accuracy = round(accuracy_score(labels, pred), 3)
    # average='weighted': per-class scores are averaged, weighted by class support.
    precision = round(precision_score(labels, pred, average='weighted'), 3)
    recall = round(recall_score(labels, pred, average='weighted'), 3)

    latency_ms = round((end - start) * 1000, 1)
    print('{} -- Accuracy: {} / Precision: {} / recall: {} /Latency: {}ms'.format(name,
                                                                            accuracy,
                                                                            precision,
                                                                            recall,
                                                                            latency_ms))

In [11]:
# Score the reloaded model on the scaled validation features.
evaluate_model(
    name='RandomForest Model',
    model=model_predict,
    features=val_features_scaled,
    labels=val_labels,
)

RandomForest Model -- Accuracy: 0.898 / Precision: 0.896 / recall: 0.898 /Latency: 75.8ms
