# Simple Random Forest Exercise

## 1. Load Dataset

In [2]:
from sklearn.datasets import make_moons

# Generate the two-moons toy dataset: 1000 noisy points, with a fixed seed so
# the same samples are produced on every run.
X, y = make_moons(
    n_samples=1000,
    noise=0.4,
    random_state=42,
)

## 2. Split Train/Test Dataset

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [7]:
# Report the shape of each split; sep="" keeps the printed text identical to
# plain string concatenation.
print("training set : ", X_train.shape, sep="")
print("test set : ", X_test.shape, sep="")

training set : (700, 2)
test set : (300, 2)


In [8]:
# Peek at the first three training samples (one row per sample).
X_train[:3]

array([[ 0.28040927,  1.19302256],
       [ 0.3978422 ,  1.04519451],
       [-0.11182221,  0.38957286]])

In [11]:
# Peek at the first ten training labels.
y_train[:10]

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

## 3. Hyperparameter Tuning

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Search space for a single decision tree: 6 * 3 * 5 = 90 combinations.
# NOTE(review): a binary tree with at most 7 leaves cannot be deeper than 6,
# so the max_depth values 10, 15 and 20 should all behave alike here —
# consider trimming the grid.
params = {
    'max_leaf_nodes': list(range(2, 8)),
    'min_samples_split': [2, 3, 4],
    'max_depth': [3, 5, 10, 15, 20],
}

# Exhaustive search with 3-fold cross-validation, run in parallel (n_jobs=-1)
# with progress logging enabled (verbose=1).
grid_search_cv = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [13]:
# Fit every candidate on the training split; the fitted search object is the
# value displayed by this cell.
grid_search_cv.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 90 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed:    2.0s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_leaf_nodes': [2, 3, 4, 5, 6, 7], 'min_samples_split': [2, 3, 4], 'max_depth': [3, 5, 10, 15, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [14]:
# Check the best cross-validation score found by the grid search.
grid_search_cv.best_score_

0.85

In [15]:
# Show the best estimator found by the search, including its hyperparameters.
grid_search_cv.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=4, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=42, splitter='best')

## 4. Predict Using Single DecisionTreeClassifier

In [17]:
from sklearn.metrics import accuracy_score

# Predict on the held-out test split with the tuned tree, then report the
# fraction of correctly classified samples.
y_pred = grid_search_cv.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8566666666666667

## 5. Predict Using RandomForestClassifier

In [33]:
from sklearn.ensemble import RandomForestClassifier

# Build a 500-tree forest whose trees use the leaf budget selected by the
# grid search. Reading it from best_params_ (instead of hard-coding the value
# copied from the search output) keeps the forest in sync if the data or the
# search grid changes.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=grid_search_cv.best_params_['max_leaf_nodes'],
    n_jobs=-1,
    random_state=42,
)
# Fit on the training split; the fitted forest is the value displayed by
# this cell.
rnd_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=4,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [34]:
# Predict on the same held-out test split with the forest, then report the
# fraction of correctly classified samples.
pred_rf_y = rnd_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred_rf_y)

0.86