# Random forest
- Iris dataset
- 2 features
- Multiclass

In [None]:
import os, sys, pathlib
# Folder (next to this notebook's folder) that is added to the import path;
# presumably where the shared `utils` module lives
UTILS_FOLDER = 'S00 - Utils'

# Resolve the notebook folder, its parent, and the utils folder under the parent
curPath = os.getcwd()
parPath = pathlib.Path(curPath).parent
utilPath = os.path.join(parPath, UTILS_FOLDER)

# Make all three locations importable (same order as they are listed)
sys.path.extend([curPath, str(parPath), utilPath])

In [None]:
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from utils import plot_ds
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

In [None]:
# Switch: set to True to save the generated figure to disk
WRITE_FILE = False

if WRITE_FILE:
    # Ensure the ./images output folder exists in the working directory
    imagePath = os.path.join(os.getcwd(), 'images')
    os.makedirs(imagePath, exist_ok=True)

In [None]:
# Load the Iris dataset
iris = datasets.load_iris()

# Features: columns 2 and 3 only (the last two); labels: the class targets
X, y = iris.data[:, 2:4], iris.target

# Hold out 30% of the samples for testing; the split is seeded and stratified
# on the labels
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

# Standardize both splits with a scaler fitted on the training data only
sc = StandardScaler().fit(X_train)
X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)

In [None]:
# Name of the experiment whose settings are used below
param = "ex1"

# Hyper-parameter presets, written as a table: one column per keyword
# argument of the classifier, one row per experiment
_paramNames = (
    "criterion", "n_estimators", "max_samples", "max_features", "max_depth",
)
_paramRows = {
    "ex1": ("gini", 25, None, "sqrt", None),
    "ex2": ("gini", 100, None, "sqrt", 2),
    "ex3": ("gini", 200, 20, "sqrt", 2),
}
paramSetAll = {
    name: dict(zip(_paramNames, row)) for name, row in _paramRows.items()
}

# Settings of the selected experiment
paramValue = paramSetAll[param]

# Creating model
# A random forest is randomized, so without a seed the scores and the plotted
# trees differ on every run. Seed it (as the train/test split already is) to
# make the results reproducible; a "random_state" entry in paramValue, if one
# is ever added, takes precedence over this default.
forest = RandomForestClassifier(**{"random_state": 1, **paramValue})

In [None]:
# Fit the forest on the standardized training split
forest.fit(X_train_std, y_train)

# Predict the standardized test split
y_pred = forest.predict(X_test_std)

# Evaluate on the test samples: number of wrong predictions and accuracy
sumMiss = (y_pred != y_test).sum()
accuracyScore = accuracy_score(y_test, y_pred)

print(
    f"Misclassified examples: {sumMiss}",
    f"Accuracy score: {accuracyScore}",
    sep="\n",
)

# Plot decision regions (project helper imported from utils)
plot_ds(X_train_std, X_test_std, y_train, y_test, forest)

In [None]:
# Create label names: the forest was fitted on standardized features, so the
# split thresholds shown in the trees are in scaled units, not in cm
fn = [st.replace("(cm)", "(scaled)") for st in iris.feature_names]
print(fn)

# Visualize the first nTrees trees of the forest, one subplot per tree
nTrees = 5
fig, ax = plt.subplots(1, nTrees, figsize=(40, 8))
for axis, estimator in zip(ax, forest.estimators_):
    tree.plot_tree(
        estimator,
        # Only feature columns 2:4 were used for training, so take the
        # matching "(scaled)" labels (previously the "(cm)" names were shown)
        feature_names=fn[2:4],
        class_names=iris.target_names,
        filled=True,
        ax=axis,
    )

# Optionally save the figure as a PDF named after the selected experiment
if WRITE_FILE:
    filenamePDF = "images/rf_" + param + ".pdf"
    fig.savefig(filenamePDF)