In [1]:
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Name of the column holding the label to predict.
target_name = "class"

# Read the raw census table from the local datasets folder.
adult_census = pd.read_csv("./datasets/adult-census.csv")

# Separate the label from the features. "education-num" is removed from the
# feature frame together with the target column.
target = adult_census[target_name]
excluded_columns = [target_name, "education-num"]
data = adult_census.drop(columns=excluded_columns)

In [3]:
# Integer-encode categorical values; a category that was not seen while
# fitting is encoded as -1 instead of raising an error.
categorical_encoder = OrdinalEncoder(
    unknown_value=-1,
    handle_unknown="use_encoded_value",
)

# Pick the columns to encode by dtype (object columns).
categorical_columns = make_column_selector(dtype_include=object)

# Encode the selected columns and forward every other column unchanged.
preprocessor = make_column_transformer(
    (categorical_encoder, categorical_columns),
    remainder="passthrough",
)

In [4]:
# Baseline model: a single decision tree (fixed seed) fed by the shared
# preprocessing step.
tree_classifier = DecisionTreeClassifier(random_state=0)
tree = make_pipeline(preprocessor, tree_classifier)

In [5]:
# Cross-validate the single-tree pipeline with the default splitter and
# scorer (no `cv` / `scoring` argument is passed).
scores_tree = cross_val_score(tree, data, target)

# Report mean ± standard deviation of the per-fold scores.
tree_report = (
    "Decision tree classifier: "
    f"{scores_tree.mean():.3f} ± {scores_tree.std():.3f}"
)
print(tree_report)

Decision tree classifier: 0.820 ± 0.006


In [6]:
# Bagging ensemble: 50 decision trees, fitted with 2 parallel jobs.
# Seeds are fixed on both the base tree and the ensemble.
base_tree = DecisionTreeClassifier(random_state=0)
bagging_classifier = BaggingClassifier(
    estimator=base_tree,
    n_estimators=50,
    n_jobs=2,
    random_state=0,
)

# Same preprocessing as the single-tree pipeline, followed by the ensemble.
bagged_trees = make_pipeline(preprocessor, bagging_classifier)

In [7]:
# Cross-validate the bagging pipeline with the default splitter and scorer.
scores_bagged_trees = cross_val_score(bagged_trees, data, target)

# Report mean ± standard deviation of the per-fold scores.
bagged_trees_report = (
    "Bagged decision tree classifier: "
    f"{scores_bagged_trees.mean():.3f} ± {scores_bagged_trees.std():.3f}"
)
print(bagged_trees_report)

Bagged decision tree classifier: 0.846 ± 0.005


In [8]:
# Random forest with 50 trees, 2 parallel jobs and a fixed seed, placed
# behind the shared preprocessing step.
forest_classifier = RandomForestClassifier(
    n_estimators=50,
    n_jobs=2,
    random_state=0,
)
random_forest = make_pipeline(preprocessor, forest_classifier)

In [9]:
# Cross-validate the random-forest pipeline with the default splitter and
# scorer.
scores_random_forest = cross_val_score(random_forest, data, target)

# Report mean ± standard deviation of the per-fold scores.
random_forest_report = (
    "Random forest classifier: "
    f"{scores_random_forest.mean():.3f} ± "
    f"{scores_random_forest.std():.3f}"
)
print(random_forest_report)

Random forest classifier: 0.851 ± 0.004
