# 2 - Decision Trees & Random Forests

In [None]:
# Every data variable starts out as None.
# NOTE(review): presumably these are meant to be overwritten with the real
# data splits before the cells below are run - confirm.
x = y = None
x_test = y_test = None
x_train = y_train = None
X_train = X_test = None

## Fit Decision Tree

In [None]:
# Imports, random seed (for reproducibility) and tree settings:
from sklearn import tree
import numpy as np
import pandas as pd

# Seed NumPy's global random number generator so results are repeatable.
np.random.seed(1)

# Tree settings:
tree_settings = {
    'criterion': 'entropy',
    'min_samples_split': 10,
    'min_samples_leaf': 5,
    'min_impurity_decrease': 0.005,
    'random_state': 1,
    'max_leaf_nodes': 10,  # use after pruning
}

# Create and fit the decision tree classifier.
# The fit is required: later cells read feature_importances_, get_depth(),
# get_n_leaves(), score() and predict() from `clf`, all of which need a
# fitted estimator.
clf = tree.DecisionTreeClassifier(**tree_settings)
clf.fit(x_train, y_train)

## Fit Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Number of predictors considered at each split. Rule of thumb used here:
# the rounded square root of the number of predictors. It is defined in this
# cell so the cell does not depend on a variable created further down the
# notebook. Python's round() on a float returns an int, so this is a count
# and not a fraction.
n_features = round(X_train.shape[1] ** 0.5)

rfc_settings = {
    'oob_score': True,            # compute the out-of-bag (OOB) score
    'max_features': n_features,   # number of predictors considered per split
    'random_state': 1,
    'warm_start': True,
    'n_estimators': 100,          # number of trees
}

rfr = RandomForestRegressor(**rfc_settings)
rfr.fit(X_train, y_train)

print("Parameters:\n", rfr.get_params())

## Fit Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Number of predictors considered at each split. Rule of thumb used here:
# the rounded square root of the number of predictors. It is defined in this
# cell so the cell does not depend on a variable created further down the
# notebook. Python's round() on a float returns an int, so this is a count
# and not a fraction.
n_features = round(X_train.shape[1] ** 0.5)

rfc_settings = {
    'oob_score': True,            # compute the out-of-bag (OOB) score
    'max_features': n_features,   # number of predictors considered per split
    'random_state': 1,
    'warm_start': True,
    'n_estimators': 100,          # number of trees
}

# The classifier gets its own name (`rfc`) so that it does not overwrite the
# regressor `rfr`, which the MSE and residual-plot cells call predict() on.
rfc = RandomForestClassifier(**rfc_settings)
rfc.fit(X_train, y_train)

print("Parameters:\n", rfc.get_params())

#### Classification Error

In [None]:
# Misclassification rate on each split: one minus the share of matching labels.
# NOTE(review): y_test_pred / y_train_pred are not created in this cell; they
# must already exist in the kernel when it runs.
test_hits = (y_test == y_test_pred)
err_test = 1 - test_hits.mean()
train_hits = (y_train == y_train_pred)
err_train = 1 - train_hits.mean()

for caption, err in (('Test error:', err_test), ('Train error:', err_train)):
    print(caption, np.round(err, 3))

### MSE

In [None]:
from sklearn.metrics import mean_squared_error

# Predictions of the random forest on the test set
pred = rfr.predict(X_test)

# Mean squared error via scikit-learn ...
MSE = mean_squared_error(y_test, pred)
# ... or computed by hand from the same predictions, so that both variants
# describe the same model.
MSE = np.mean((y_test - pred) ** 2)

### Residual Plot

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Predict on the test set and form the residuals (true minus predicted)
pred = rfr.predict(X_test)
error = y_test - pred

# Plot the residuals against the predicted values, so that the data on the
# x-axis matches its 'predicted ...' label.
plt.figure(figsize=(7, 5))
sns.scatterplot(x=pred, y=error)
plt.xlabel('predicted medv')
plt.ylabel('error')
plt.show()

### Find Optimal Number of Features for Random Forests

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

# Candidate values for max_features: 1 up to the number of columns in X_train
n_features = np.arange(1, X_train.shape[1] + 1, 1)
MSE = []

for n in n_features:
    # A fixed random_state makes the curve reproducible and means that the
    # forests differ only in max_features, not in their random draws.
    rfr = RandomForestRegressor(max_features=int(n), random_state=1)
    rfr.fit(X_train, y_train)
    MSE.append(mean_squared_error(y_test, rfr.predict(X_test)))

# Test-set MSE as a function of the number of features tried per split
plt.plot(n_features, MSE, marker='o')
plt.xlabel('max_features')
plt.ylabel('Test MSE')
plt.title('Test MSE vs. max_features')
plt.show()

## Check Feature Importance

In [None]:
# One row per column of x_train, paired with the tree's importance value
FI = pd.DataFrame(data={
    'Feature': x_train.columns.values,
    'Importance': clf.feature_importances_,
})

# Collect the summary figures first, then emit them in a single print call
depth = clf.get_depth()
leaf_count = clf.get_n_leaves()
train_error = np.round(1 - clf.score(x_train, y_train), 3)  # 1 - score on the training data

report = (
    'Feature importances:\n', FI,
    '\n\nTree depth:\n', depth,
    '\nNumber of leaves:\n', leaf_count,
    '\nTraining error:\n', train_error,
)
print(*report)


#### Plot Decision Tree

In [None]:
import matplotlib.pyplot as plt

# Draw the fitted decision tree on a single large axes
fig, ax = plt.subplots(figsize=(14, 8))
tree.plot_tree(
    clf,
    ax=ax,
    fontsize=8,
    impurity=False,
    # The documented options are 'all', 'root' and 'none' (lower case);
    # 'Root' is not one of them.
    label='root',
    # Passed as a plain list rather than an ndarray
    feature_names=list(x.columns),
    class_names=['No', 'Yes'],
)
plt.show()

#### Plot Data

#### Confusion Matrix for Decision Trees

In [None]:
# Predictions of the decision tree on the training and the test predictors
y_train_pred, y_test_pred = clf.predict(x_train), clf.predict(x_test)
# Create confusion matrix
def confusion(y_true, y_pred):
    """Cross-tabulate predicted labels (rows) against true labels (columns).

    Parameters
    ----------
    y_true, y_pred
        True and predicted labels; both are placed as columns of one
        DataFrame before being cross-tabulated.

    Returns
    -------
    pandas.DataFrame
        Table of counts with an additional margin row and column named "Sum".
    """
    pairs = pd.DataFrame({'predicted': y_pred, 'true': y_true})
    return pd.crosstab(pairs['predicted'], pairs['true'],
                       margins=True, margins_name="Sum")

# Print the confusion table of the test split, then of the training split
for caption, truth, guess in (('Test data:\n', y_test, y_test_pred),
                              ('\n\nTrain data:\n', y_train, y_train_pred)):
    print(caption, confusion(truth.T.to_numpy(), guess))

#### n-Fold Cross Validation
with cost-complexity pruning

In [None]:
n_f = 5  # number of folds

# Results of every pruned tree, collected over all folds and all alphas
node = []
score_train, score_test = [], []

train_index = x_train.index
# Rows per cross-validation fold; rows beyond n_f * cv_size are never held out
cv_size = len(train_index) // n_f

for fold in range(n_f):
    # 1. Split the training data into a fitting part and a held-out fold
    i_cv_fold = train_index[fold * cv_size:(fold + 1) * cv_size]
    X_train_fold = x_train.drop(i_cv_fold)
    X_cv_fold = x_train.loc[i_cv_fold]
    y_train_fold = y_train.drop(i_cv_fold)
    y_cv_fold = y_train.loc[i_cv_fold]

    # 2. Score and size of the pruned trees T(alpha) along the pruning path
    path = clf.cost_complexity_pruning_path(X_train_fold, y_train_fold)
    for alpha in path.ccp_alphas:
        # ccp_alpha must be non-negative; clip in case the path contains a
        # value marginally below zero (floating-point noise).
        clf_cv = tree.DecisionTreeClassifier(ccp_alpha=max(alpha, 0.0),
                                             **tree_settings)
        clf_cv.fit(X_train_fold, y_train_fold)
        # Tree size (number of leaves) and accuracy on both parts
        node.append(clf_cv.get_n_leaves())
        score_train.append(clf_cv.score(X_train_fold, y_train_fold))
        score_test.append(clf_cv.score(X_cv_fold, y_cv_fold))

# 3. Average the scores over all trees that have the same number of leaves
cv_results = pd.DataFrame({'node': node,
                           'train': score_train,
                           'test': score_test})
cv_mean = cv_results.groupby('node').mean()  # groups come out sorted by size
node_sort = cv_mean.index.to_numpy()
score_train_avg = cv_mean['train'].tolist()
score_test_avg = cv_mean['test'].tolist()

# Optimal tree size: best average held-out accuracy (smallest size on ties,
# because np.argmax returns the first maximum and node_sort is ascending)
opt_size = node_sort[np.argmax(score_test_avg)]
print(opt_size)

#### Plot Score vs. Size after n-Fold Cross Validation

In [None]:
# Step curves of the averaged training and cross-validation scores by tree size
fig, ax = plt.subplots(figsize=(8, 5))
for scores, fmt, name in ((score_train_avg, 'r-o', 'train'),
                          (score_test_avg, 'g-o', 'CV')):
    ax.plot(node_sort, scores, fmt, drawstyle="steps-post", label=name)
ax.set(xlabel="Size", ylabel="Accuracy", title="Accuracy vs Tree Size")
ax.legend()
plt.show()

### Loop over n_estimators

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Number of predictors considered per split: rounded square root of the
# number of columns. The int() cast guarantees an integer count, because
# scikit-learn interprets a float max_features as a fraction of the features.
m = int(round(np.sqrt(X_train.shape[1])))

rfc_settings = {
    'oob_score': True,       # compute the out-of-bag (OOB) score
    'max_features': m,       # number of predictors considered per split
    'random_state': 1,
    'warm_start': True,
    'n_estimators': 100,     # number of trees (overridden in the loop below)
}

# NOTE: despite its name, `rfr` is a classifier in this cell.
rfr = RandomForestClassifier(**rfc_settings)

# n_estimators to check:
B = np.arange(15, 500, 2)
# NOTE: despite its name, `MSE` collects the OOB error (1 - OOB score).
MSE = []
for b in B:
    rfr.set_params(n_estimators=int(b))
    rfr.fit(X_train, y_train)
    MSE.append(1 - rfr.oob_score_)

plt.plot(B, MSE, marker='o')
plt.xlabel('n_estimators')
plt.ylabel('OOB error')
plt.title('OOB error vs. n_estimators')
plt.show()