# Decision Trees & Random Forests

Tutorial link: https://nthu-datalab.github.io/ml/labs/03_Decision-Tree_Random-Forest/03_Decision-Tree_Random-Forest.html


In [150]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

%matplotlib inline


## Create unoptimized model


In [151]:
# Load the breast-cancer dataset once. The returned Bunch already carries the
# feature matrix ("data"), the labels ("target") and the column names
# ("feature_names"), so a second load with return_X_y=True is unnecessary.
init_data = load_breast_cancer()
X = pd.DataFrame(data=init_data["data"], columns=init_data["feature_names"])
y = pd.DataFrame(data=init_data["target"], columns=["label"])

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y.values.ravel(), test_size=0.3, random_state=0
)

# Baseline model: a random forest trained on every original column.
forest = RandomForestClassifier(
    criterion="entropy", n_estimators=200, random_state=1, n_jobs=4
)
forest.fit(X_train, y_train)

# Score the held-out rows once and reuse the value for both reports.
y_pred = forest.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f" % accuracy)
# Accuracy divided by the number of input columns the model was given.
print("Accuracy per feature: %.2f" % (accuracy / X.shape[1]))


Accuracy: 0.98
Accuracy per feature: 0.03


## Feature reduction with PCA


In [152]:
# Z-normalize using statistics of the training rows only. Fitting the scaler
# (and the eigen-decomposition below) on every row would let the held-out test
# rows influence the preprocessing, which inflates the reported test accuracy.
# X_train is the training partition produced by the baseline split above.
sc = StandardScaler()
Z_train = sc.fit_transform(X_train)
# Standardize every row with the training mean/variance.
Z = sc.transform(X.values)

# Estimate the correlation matrix from the standardized training rows
R = np.dot(Z_train.T, Z_train) / Z_train.shape[0]

# Calculate the eigen values, eigen vectors (R is symmetric, hence eigh)
eigen_vals, eigen_vecs = np.linalg.eigh(R)

# Make a list of (eigenvalue, eigenvector) tuples
eigen_pairs = [
    (np.abs(eigen_vals[i]), eigen_vecs[:, i]) for i in range(len(eigen_vals))
]

# Sort the tuples from high to low by eigenvalue only. Sorting the raw tuples
# would, whenever two eigenvalues are equal, fall through to comparing the
# eigenvector arrays and raise "truth value of an array is ambiguous".
eigen_pairs.sort(key=lambda pair: pair[0], reverse=True)

# Form the projection matrix: the leading eigenvectors become the columns of W
num_feature = 2
W = np.column_stack([vec for _, vec in eigen_pairs[:num_feature]])

# Calculate z_pca for every row. The train/test split applied to Z_pca later
# uses the same test_size and random_state as the baseline split, so its
# training partition contains the same rows the projection was fitted on.
Z_pca = Z.dot(W)


## Using the selected features to train the model


In [154]:
# Split the projected data with the same test_size and random_state as the
# baseline split, so every row lands in the same partition as before.
labels = y.values.ravel()
Z_pca_train, Z_pca_test, y_train, y_test = train_test_split(
    Z_pca, labels, test_size=0.3, random_state=0
)

# Refit the existing forest (same hyper-parameters) on the reduced features.
forest.fit(Z_pca_train, y_train)

# Evaluate on the held-out rows; compute the score once and report it twice.
y_pred = forest.predict(Z_pca_test)
pca_accuracy = accuracy_score(y_test, y_pred)
n_components = Z_pca.shape[1]
print("Accuracy: %.2f" % pca_accuracy)
print("Accuracy per feature: %.2f" % (pca_accuracy / n_components))


Accuracy: 0.91
Accuracy per feature: 0.46
