In [1]:
# Preliminaries: import the libraries used throughout the notebook

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

In [2]:
# Load the iris dataset bundled with scikit-learn
iris = datasets.load_iris()

# Readable names for the four feature columns; these are paired positionally
# with the columns of X wherever they are used
feature_labels = [
    'Sepal Length',
    'Sepal Width',
    'Petal Length',
    'Petal Width',
]

# Feature matrix and class labels
X, y = iris.data, iris.target

In [3]:
# Inspect the container type returned by load_iris()
type(iris)

sklearn.utils.Bunch

In [4]:
# Inspect the type of the feature matrix
type(X)

numpy.ndarray

In [5]:
# Dimensions of the feature matrix as (rows, columns)
X.shape

(150, 4)

In [6]:
# Inspect the type of the target vector
type(y)

numpy.ndarray

In [7]:
# Length of the target vector
y.shape

(150,)

In [8]:
# Peek at the first five rows of the feature matrix
X[:5]

array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2]])

In [9]:
# Display the full target vector (integer class labels from iris.target)
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [10]:
# Hold out 40% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)

In [11]:
# Fit a 10000-tree random forest on the training split (n_jobs=-1: all cores)
clf = RandomForestClassifier(
    n_estimators=10000,
    random_state=0,
    n_jobs=-1,
)
clf.fit(X_train, y_train)

# Show each feature label next to its gini importance from the fitted forest
for label, importance in zip(feature_labels, clf.feature_importances_):
    print((label, importance))

('Sepal Length', 0.11024282328064565)
('Sepal Width', 0.016255033655398394)
('Petal Length', 0.45028123999239533)
('Petal Width', 0.42322090307156124)


In [12]:
# Identifying and selecting the most important features

# Create a selector that uses the random forest's feature importances and keeps
# the features whose importance is greater than or equal to the 0.15 threshold.
sfm = SelectFromModel(clf, threshold = 0.15)

# NOTE(review): prefit is left at its default (False), so this fit() trains a
# clone of clf instead of reusing the already-fitted forest. That repeats the
# 10000-tree training; confirm this is intended.
sfm.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
        norm_order=1, prefit=False, threshold=0.15)

In [13]:
# Print the label of every feature the selector kept, in column order
for label, is_selected in zip(feature_labels, sfm.get_support()):
    if is_selected:
        print(label)

Petal Length
Petal Width


In [14]:
# Reduce both splits to the columns retained by the selector
X_important_train, X_important_test = [
    sfm.transform(split) for split in (X_train, X_test)
]

In [15]:
# Fit a second forest, with the same hyperparameters, on the selected features only
clf_important = RandomForestClassifier(
    n_estimators=10000,
    random_state=0,
    n_jobs=-1,
)
clf_important.fit(X_important_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [16]:
# Test-set accuracy of the classifier trained on all four features

y_pred = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.93333333333333335

In [17]:
# Test-set accuracy of the classifier trained on the selected features only

y_important_pred = clf_important.predict(X_important_test)
accuracy_score(y_true=y_test, y_pred=y_important_pred)

0.8833333333333333