In [1]:
# Loading the library with the iris dataset.
from sklearn.datasets import load_iris

# Loading scikit-learn's random forest classifier
from sklearn.ensemble import RandomForestClassifier

# Loading Pandas
import pandas as pd

# Loading numpy
import numpy as np

# Seed NumPy's global random generator so random draws are repeatable
SEED = 0
np.random.seed(SEED)

In [2]:
# Load the iris dataset bundle (feature matrix, targets and their names)
iris = load_iris()

# Build a dataframe holding the four feature variables, one column per feature
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Show the first five rows
print(df.head(5))

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2


In [3]:
# Attach each row's species label as a categorical column, built from the
# integer targets and the matching list of species names
df["species"] = pd.Categorical.from_codes(
    codes=iris.target, categories=iris.target_names
)

# Show the first five rows
print(df.head(5))

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

  species  
0  setosa  
1  setosa  
2  setosa  
3  setosa  
4  setosa  


In [4]:
# Creating Test and Train data.
# Each row is independently flagged as training data when its uniform draw is
# at most TRAIN_FRACTION. A dedicated RandomState seeded with 0 yields the same
# draws as the first draw from the global generator after np.random.seed(0),
# but keeps the split identical when this cell is re-run on its own.
TRAIN_FRACTION = 0.75
split_rng = np.random.RandomState(0)
df["is_train"] = split_rng.uniform(0, 1, len(df)) <= TRAIN_FRACTION

# Viewing the top 5 rows
print(df.head())

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

  species  is_train  
0  setosa      True  
1  setosa      True  
2  setosa      True  
3  setosa      True  
4  setosa      True  


In [5]:
# Creating dataframes with test rows and training rows.
# "is_train" is already a boolean column, so it is used directly as the row
# mask (and negated with ~) instead of being compared with True / False.
train_mask = df["is_train"]
train, test = df[train_mask], df[~train_mask]

# Every row must land in exactly one of the two frames
assert len(train) + len(test) == len(df)

# Show the number of observations for the test and training dataframes
print("Number of observations in the training data:", len(train))
print("Number of observations in the test data:", len(test))

Number of observations in the training data: 118
Number of observations in the test data: 32


In [6]:
# Create the list of feature column names.
# Taken from the dataset's own feature names (the columns df was built from)
# rather than from the first four positions of df.columns, so the selection
# does not depend on column order.
features = pd.Index(iris.feature_names)

# View features
print(features)

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')


In [7]:
# Converting each species name into digits.
# The categorical codes are used instead of pd.factorize: the codes are the
# positions in iris.target_names (the column was built from iris.target and
# iris.target_names), whereas factorize numbers labels by order of first
# appearance, which only matches when the training rows happen to contain
# every species in the original order. The later iris.target_names[...] lookup
# on the predictions relies on this alignment.
y = train["species"].cat.codes.astype("int64").values

# Viewing target
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2]


In [8]:
# Build the random forest classifier: 10 estimators, n_jobs=2, fixed random_state
forest_params = {"n_estimators": 10, "n_jobs": 2, "random_state": 0}
clf = RandomForestClassifier(**forest_params)

# Fit on the training feature columns against the encoded species labels
clf.fit(train[features], y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [9]:
# Predict the encoded species for every row of the test set
test_features = test[features]
clf.predict(test_features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [10]:
# Predicted class probabilities for the test rows; show only the first 10
test_probabilities = clf.predict_proba(test[features])
test_probabilities[:10]

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [17]:
# Translate each predicted class code into its species name
predicted_codes = clf.predict(test[features])
preds = iris.target_names[predicted_codes]

# View the PREDICTED species for the first five observations
print(preds[0:5])

['setosa' 'setosa' 'setosa' 'setosa' 'setosa']


In [18]:
# Show the ACTUAL species of the first five test observations
actual_species = test["species"]
print(actual_species.head(5))

7     setosa
8     setosa
10    setosa
13    setosa
17    setosa
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]


In [19]:
# Confusion matrix: actual species (rows) against predicted species (columns)
pd.crosstab(
    index=test["species"],
    columns=preds,
    rownames=["Actual Species"],
    colnames=["Predicted Species"],
)

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,13,0,0
versicolor,0,5,2
virginica,0,0,12
