In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris # imported data set available in sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# Seed NumPy's global random generator so the np.random.uniform draw that builds
# the train/test mask later in the notebook is repeatable across fresh runs.
np.random.seed(0)

In [2]:
# Load the iris dataset that ships with scikit-learn
iris = load_iris()
# Wrap the measurements in a DataFrame, one column per feature name
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head()  # preview the first 5 rows

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [16]:
# Add the label column as a categorical: each integer in iris.target is mapped
# to the class name at that position in iris.target_names
df['Species'] = pd.Categorical.from_codes(codes=iris.target, categories=iris.target_names)
df.head()
    

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [4]:
# Draw one uniform number in [0, 1) per row and flag the row for training when
# the draw is <= 0.75, so roughly 75% of rows train and roughly 25% test
# (the split is random, not an exact 75/25 cut)
df['is_train'] = np.random.uniform(low=0, high=1, size=len(df)) <= 0.75
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [5]:
# Create the train and test DataFrames from the boolean is_train column.
# The column is already a boolean mask, so it is used directly (no `== True`):
# rows flagged True go to train, and the negated mask (~) selects the test rows.
train, test = df[df['is_train']], df[~df['is_train']]

In [6]:
# Report how many rows landed in each split (row count = first entry of .shape)
print('Number of observation in the training data:', train.shape[0])
print('Number of observation in the test data:', test.shape[0])

Number of observation in the training data: 118
Number of observation in the test data: 32


In [7]:
# Model inputs: the measurement columns, named explicitly from the dataset
# metadata rather than sliced by position. A positional slice (first 4 columns)
# would silently pick up the wrong columns if df's column order ever changed.
features = pd.Index(iris.feature_names)
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [8]:
# Encode the training labels as integers using the categorical's own codes, so
# code i always means iris.target_names[i]. pd.factorize would instead number
# the classes by order of first appearance in `train`, which only matches
# iris.target_names when the rows happen to be sorted by species; decoding
# predictions through iris.target_names relies on this alignment.
y = np.asarray(train['Species'].cat.codes, dtype=np.intp)
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [9]:
# Build the random forest classifier (fixed random_state, 2 parallel jobs)
# and train it on the feature columns of the training rows
clf = RandomForestClassifier(random_state=0, n_jobs=2)
clf.fit(X=train[features], y=y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [10]:
# Predicted integer class code for every row of the test split
clf.predict(X=test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [11]:
# Per-class predicted probabilities, shown for the first 10 test rows only
clf.predict_proba(X=test[features])[:10]

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [20]:
# Translate each predicted class code into its species name by indexing
# iris.target_names with the prediction array
# (assumes the training labels were encoded in iris.target order)
pred = iris.target_names[clf.predict(test[features])]
pred[:25]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'versicolor', 'versicolor', 'versicolor', 'virginica',
       'virginica', 'versicolor', 'versicolor', 'virginica', 'virginica',
       'virginica', 'virginica', 'virginica'], dtype='<U10')

In [21]:
# Actual species of the first five test observations, for comparison with pred
test['Species'].head(n=5)

7     setosa
8     setosa
10    setosa
13    setosa
17    setosa
Name: Species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]

In [22]:
# Confusion matrix: cross-tabulate the actual species of the test rows (rows)
# against the predicted species names in `pred` (columns).
# The column label was misspelled 'Predictd species' and is corrected here.
pd.crosstab(test['Species'], pred, rownames=['Actual species'], colnames=['Predicted species'])

Predictd species,setosa,versicolor,virginica
Actual species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,13,0,0
versicolor,0,5,2
virginica,0,0,12
