FInd Iris flowers using Rnadom Forest.

Problem Statement: Predict Species of flowers using machine learning in Python.

In [110]:
#@title import libraries
import pandas as pd
import numpy as np


from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot as plt

%matplotlib inline

In [111]:
#@title Setting random seed

np.random.seed(0)

In [116]:
#@title Load Data

iris = load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names)
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [117]:
#@title Length & Shape of DataSet

print('Dataset Length:: ', len(data))
print('Dataset Shape:: ', data.shape)

Dataset Length::  150
Dataset Shape::  (150, 4)


Data Preperation

In [120]:
#@title Adding new column for the species name

data['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [123]:
#@title Creating test and train data

data['is_train'] = np.random.uniform(0, 1, len(data)) <= .75
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,False
1,4.9,3.0,1.4,0.2,setosa,False
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [125]:
#@title creating ddataframes with test rows and training rows

train, test = data[data['is_train'] == True], data[data['is_train'] == False]
print('Training Data length: ', len(train))
print('Testing Data length: ', len(test))
print('Training Data: \n', train.head())
print('Testing Data: \n', test.head())

Training Data length:  114
Testing Data length:  36
Training Data: 
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   
5                5.4               3.9                1.7               0.4   
6                4.6               3.4                1.4               0.3   

  species  is_train  
2  setosa      True  
3  setosa      True  
4  setosa      True  
5  setosa      True  
6  setosa      True  
Testing Data: 
     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                 5.1               3.5                1.4               0.2   
1                 4.9               3.0                1.4               0.2   
10                5.4               3.7                1.5          

In [126]:
features = data.columns[:4]
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [128]:
#@title converting species name into digits

y = pd.factorize(train['species'])[0]
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2])

Implementing Model

In [132]:
#@title Creating RandomForest Classifier

model = RandomForestClassifier(n_jobs=2, random_state=0)

In [133]:
#@title Training Model

model.fit(train[features], y)

In [134]:
#@title Making predictions

y_pred = model.predict(test[features])
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [141]:
#@title Viewing predicted probabilities of the first 10 observations

model.predict_proba(test[features])[0:10]

array([[1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.99, 0.01, 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.01, 0.92, 0.07]])

In [146]:
#@title Mapping names for the plants
preds = iris.target_names[y_pred]

In [148]:
#@title VIew PREDICTED Species for first five observations

preds[0:5]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa'], dtype='<U10')

In [149]:
#@title View ACTUAL Species for first five observations

test['species'].head()

0     setosa
1     setosa
10    setosa
18    setosa
31    setosa
Name: species, dtype: category
Categories (3, object): ['setosa', 'versicolor', 'virginica']

 Model Evaluation

In [150]:
#@title Creating a Confusion Matrix

pd.crosstab(test['species'], preds, rownames = ['Actual Species'], colnames=['Predicted Species'])

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,9,0,0
versicolor,0,12,2
virginica,0,0,13


In [154]:
#@title Calculating Accuracy Score

score = accuracy_score(test['species'], preds)
print(f'Accuracy is {score*100:.2f}%')

Accuracy is 94.44%


In [159]:
#@title Make Predictions

random_prediction = iris.target_names[model.predict([[5.0, 3.6, 1.4, 2.0],[1.0, 3.6, 1.4, 2.0]])]



In [160]:
random_prediction

array(['setosa', 'setosa'], dtype='<U10')