## Classification of iris flower using supervised learning algorthm

In [1]:
# lib data manipulations
import numpy as np
import pandas as pd

# lib data visualizaton
import seaborn as sns
import matplotlib.pyplot as plt

# lib data preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# lib supervised learning
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# library evaluation model
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

1. Data Acquisition

In [2]:
# load dataset
dataset = pd.read_csv("../dataset/iris.csv")

In [3]:
# show metadata
print(np.round(dataset.describe(),2))

       sepal_length  sepal_width  petal_length  petal_width
count        150.00       150.00        150.00       150.00
mean           5.84         3.05          3.76         1.20
std            0.83         0.43          1.76         0.76
min            4.30         2.00          1.00         0.10
25%            5.10         2.80          1.60         0.30
50%            5.80         3.00          4.35         1.30
75%            6.40         3.30          5.10         1.80
max            7.90         4.40          6.90         2.50


In [4]:
# show dataset
print(dataset.head())

   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa


2. Data Visualization

3. Data Preprocessing

In [5]:
# Set features and Labels
x = dataset[["sepal_length","sepal_width","petal_length","petal_width"]].values
y = dataset["species"].values

In [6]:
# normalize features
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(x)

In [7]:
# split validation
trainX, testX, trainY, testY = train_test_split(scaled, y, train_size=0.7, test_size=0.3, random_state=7, shuffle=True)

In [8]:
# show dimension of data train
print(trainX.shape, trainY.shape)

(105, 4) (105,)


In [9]:
# show dimension of data test
print(testX.shape, testY.shape)

(45, 4) (45,)


4. Modeling Supervised Learning

- Decision Tree C45

In [10]:
# supervised learning algorithm
result_C45 = DecisionTreeClassifier(criterion="gini", random_state=None).fit(trainX, trainY).predict(testX)

In [11]:
# evaluation models - confusion_matrix
conf_C45 = confusion_matrix(testY, result_C45)
conf_C45

array([[12,  0,  0],
       [ 0, 12,  4],
       [ 0,  2, 15]], dtype=int64)

In [12]:
# evaluation models - classification_report
print(classification_report(y_true=testY, y_pred=result_C45))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        12
  versicolor       0.86      0.75      0.80        16
   virginica       0.79      0.88      0.83        17

    accuracy                           0.87        45
   macro avg       0.88      0.88      0.88        45
weighted avg       0.87      0.87      0.87        45



- Naive Bayes - Gaussian

In [13]:
# supervised learning algorithm
result_gnb = GaussianNB().fit(trainX, trainY).predict(testX)

In [14]:
# evaluation models - confusion_matrix
conf_gnb = confusion_matrix(testY, result_gnb)
conf_gnb

array([[12,  0,  0],
       [ 0, 13,  3],
       [ 0,  2, 15]], dtype=int64)

In [15]:
# evaluation models - classification_report
print(classification_report(y_true=testY, y_pred=result_gnb))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        12
  versicolor       0.87      0.81      0.84        16
   virginica       0.83      0.88      0.86        17

    accuracy                           0.89        45
   macro avg       0.90      0.90      0.90        45
weighted avg       0.89      0.89      0.89        45



- K Nearest-Neighbor

In [16]:
# supervised learning algorithm
result_knn = KNeighborsClassifier(n_neighbors=3).fit(trainX, trainY).predict(testX)

In [17]:
# evaluation models - confusion_matrix
conf_knn = confusion_matrix(testY, result_knn)
conf_knn

array([[12,  0,  0],
       [ 0, 16,  0],
       [ 0,  2, 15]], dtype=int64)

In [18]:
# evaluation models - classification_report
print(classification_report(y_true=testY, y_pred=result_knn))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        12
  versicolor       0.89      1.00      0.94        16
   virginica       1.00      0.88      0.94        17

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.96        45
weighted avg       0.96      0.96      0.96        45



- Logistic Regression

In [19]:
# supervised learning algorithm
result_lr = LogisticRegression(max_iter=1000).fit(trainX, trainY).predict(testX)

In [20]:
# evaluation models - confusion_matrix
conf_lr = confusion_matrix(testY, result_lr)
conf_lr

array([[12,  0,  0],
       [ 0, 11,  5],
       [ 0,  2, 15]], dtype=int64)

In [21]:
# evaluation models - classification_report
print(classification_report(y_true=testY, y_pred=result_lr))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        12
  versicolor       0.85      0.69      0.76        16
   virginica       0.75      0.88      0.81        17

    accuracy                           0.84        45
   macro avg       0.87      0.86      0.86        45
weighted avg       0.85      0.84      0.84        45



- Support Vector Classifier

In [22]:
# supervised learning algorithm
result_svc = SVC(kernel='rbf').fit(trainX, trainY).predict(testX)

In [23]:
# evaluation models - confusion_matrix
conf_svc = confusion_matrix(testY, result_svc)
conf_svc

array([[12,  0,  0],
       [ 0, 14,  2],
       [ 0,  1, 16]], dtype=int64)

In [24]:
# evaluation models - classification_report
print(classification_report(y_true=testY, y_pred=result_svc))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        12
  versicolor       0.93      0.88      0.90        16
   virginica       0.89      0.94      0.91        17

    accuracy                           0.93        45
   macro avg       0.94      0.94      0.94        45
weighted avg       0.93      0.93      0.93        45

