In [1]:
from sklearn.datasets import samples_generator
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier



In [2]:
# Generate data
X, y = samples_generator.make_classification(n_samples=150, n_features=25, n_classes=3, n_informative=6, n_redundant=0, random_state=7)

In [3]:
# Select top K features
k_best_selector = SelectKBest(f_regression, k=9)

In [4]:
# Initialize Extremely Random Forests classifier
classifier = ExtraTreesClassifier(n_estimators=60, max_depth=4)

In [5]:
# Construct the pipeline
processor_pipeline = Pipeline([('selector', k_best_selector), ('erf', classifier)])

In [6]:
# Set the parameters
processor_pipeline.set_params(selector__k=7, erf__n_estimators=30)

Pipeline(memory=None,
         steps=[('selector',
                 SelectKBest(k=7,
                             score_func=<function f_regression at 0x7f6d507539e0>)),
                ('erf',
                 ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                      class_weight=None, criterion='gini',
                                      max_depth=4, max_features='auto',
                                      max_leaf_nodes=None, max_samples=None,
                                      min_impurity_decrease=0.0,
                                      min_impurity_split=None,
                                      min_samples_leaf=1, min_samples_split=2,
                                      min_weight_fraction_leaf=0.0,
                                      n_estimators=30, n_jobs=None,
                                      oob_score=False, random_state=None,
                                      verbose=0, warm_start=False))],
         verbose=False)

In [7]:
# Training the pipeline
processor_pipeline.fit(X, y)

Pipeline(memory=None,
         steps=[('selector',
                 SelectKBest(k=7,
                             score_func=<function f_regression at 0x7f6d507539e0>)),
                ('erf',
                 ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                      class_weight=None, criterion='gini',
                                      max_depth=4, max_features='auto',
                                      max_leaf_nodes=None, max_samples=None,
                                      min_impurity_decrease=0.0,
                                      min_impurity_split=None,
                                      min_samples_leaf=1, min_samples_split=2,
                                      min_weight_fraction_leaf=0.0,
                                      n_estimators=30, n_jobs=None,
                                      oob_score=False, random_state=None,
                                      verbose=0, warm_start=False))],
         verbose=False)

In [8]:
# Predict outputs for the input data
output = processor_pipeline.predict(X)
print("\nPredicted output:\n", output)


Predicted output:
 [1 2 2 0 2 0 2 1 0 1 1 2 1 0 2 2 1 0 1 1 0 2 1 1 2 2 0 0 1 2 1 2 1 0 2 2 1
 1 2 2 2 0 1 2 2 1 2 2 1 0 1 2 2 2 2 0 2 2 0 2 2 0 1 0 2 1 0 1 1 2 1 0 0 2
 0 0 1 2 2 0 0 1 2 2 2 0 0 0 2 2 2 1 2 0 2 1 2 2 0 0 1 1 1 1 2 2 2 2 0 1 1
 0 2 1 0 0 1 1 1 1 0 0 0 1 2 0 0 0 2 1 2 0 0 1 0 1 1 0 1 1 1 2 2 2 0 1 2 0
 2 2]


In [9]:
# Print scores
print("\nScore:", processor_pipeline.score(X, y))


Score: 0.88


In [10]:
# Print the features chosen by the pipeline selector
status = processor_pipeline.named_steps['selector'].get_support()
# Extract and print indices of selected features
selected = [i for i, x in enumerate(status) if x]
print("\nIndices of selected features:", ', '.join([str(x) for x in selected]))


Indices of selected features: 4, 7, 8, 12, 14, 17, 22
