In [6]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets,tree
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
import pandas as pd
from io import StringIO  
from IPython.display import Image  
import pydotplus
import graphviz

In [7]:
# Load the bundled Iris dataset and pull out the pieces used below.
iris = datasets.load_iris()
X, Y = iris.data, iris.target   # feature matrix and class labels
features = iris.feature_names   # column names, used later to label the importances

In [8]:
# Hold out 40% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.4,
    random_state=14,
)

In [9]:
# Baseline forest that will be trained on every feature:
# 10,000 trees, built in parallel (n_jobs=-1), with a fixed seed.
clf = RandomForestClassifier(
    n_estimators=10000,
    random_state=14,
    n_jobs=-1,
)

In [10]:
# Fit the baseline forest on the training split (all features).
clf.fit(X=X_train, y=Y_train)

In [11]:
# Accuracy of the baseline forest on the held-out test split.
accuracy_score(Y_test, clf.predict(X_test))

0.95

In [12]:
# Tabulate the importance the fitted forest assigns to each feature,
# labelled by feature name and ordered from most to least important.
feature_importances = pd.DataFrame(
    {'importance': clf.feature_importances_},
    index=features,
).sort_values(by='importance', ascending=False)
feature_importances

Unnamed: 0,importance
petal width (cm),0.459124
petal length (cm),0.400622
sepal length (cm),0.11811
sepal width (cm),0.022145


This shows that <b>petal width</b> and <b>petal length</b> are far more important than the other two features, <b>sepal length</b> and <b>sepal width</b>: together they account for roughly 86% of the total importance.

In [13]:
# Feature selector driven by the forest's importances: a feature is kept
# only if its importance is at least 0.15 (the threshold is inclusive).
sfm = SelectFromModel(estimator=clf, threshold=0.15)

In [14]:
# Fit the selector on the training split. Because the selector was built
# without prefit=True, this trains its own clone of the forest to obtain
# the importances it thresholds on.
sfm.fit(X=X_train, y=Y_train)

In [15]:
# Reduce both splits to the columns the selector kept, so the training and
# test matrices contain exactly the same (important) features.
X_important_train, X_important_test = [
    sfm.transform(split) for split in (X_train, X_test)
]

In [16]:
# Sanity check: shape of the reduced training matrix produced by sfm.transform.
X_important_train.shape

(90, 2)

In [17]:
# Sanity check: shape of the reduced test matrix produced by sfm.transform.
X_important_test.shape

(60, 2)

In [18]:
# Second forest, to be trained on the selected features only. It uses the same
# settings as the baseline (10,000 trees, parallel build, seed 14) so that the
# two scores are directly comparable.
clf_important = RandomForestClassifier(
    n_estimators=10000,
    random_state=14,
    n_jobs=-1,
)

In [19]:
# Train the second forest on the reduced training matrix.
clf_important.fit(X=X_important_train, y=Y_train)

In [20]:
# Accuracy of the reduced-feature forest on the held-out test split.
accuracy_score(Y_test, clf_important.predict(X_important_test))

0.9666666666666667

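As you can see, even after removing the two least important features from our dataset, we are able to predict the answers with a **slightly increased score**: test accuracy goes from 0.95 to about 0.967, i.e. one more correct prediction out of the 60 test samples.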

Thus, the feature importances of a Random Forest give us an easy way to find out which features to focus on.

In [21]:
# Count the individual fitted estimators that make up the
# reduced-feature forest.
len(clf_important.estimators_)

10000