## Classification of iris flowers using supervised learning algorithms

In [1]:
# Libraries for data manipulation
import numpy as np
import pandas as pd

# Libraries for statistical analysis
import scipy.stats as sc
import statsmodels.api as sm

# Libraries for data visualization
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Libraries for dataset preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Libraries for model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

### 1. Data Acquisition

In [2]:
# Read the iris measurements from the CSV file (the path is relative to the
# notebook's working directory) into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer="../dataset/iris.csv")

In [3]:
# Show dataset metadata (column names, non-null counts, dtypes, memory usage).
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; wrapping it in print() emits a stray "None" line.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None


In [4]:
# Preview the dataset. Leaving the DataFrame as the cell's last expression lets
# Jupyter render it as a rich table instead of the plain-text dump produced by
# print(); pandas still truncates the view to the first and last rows.
dataset

     sepal_length  sepal_width  petal_length  petal_width    species
0             5.1          3.5           1.4          0.2     setosa
1             4.9          3.0           1.4          0.2     setosa
2             4.7          3.2           1.3          0.2     setosa
3             4.6          3.1           1.5          0.2     setosa
4             5.0          3.6           1.4          0.2     setosa
..            ...          ...           ...          ...        ...
145           6.7          3.0           5.2          2.3  virginica
146           6.3          2.5           5.0          1.9  virginica
147           6.5          3.0           5.2          2.0  virginica
148           6.2          3.4           5.4          2.3  virginica
149           5.9          3.0           5.1          1.8  virginica

[150 rows x 5 columns]


### 2. Exploratory Data Analysis

- Correlation plot

In [5]:
# Pearson correlation between the numeric columns, rounded to 4 decimals.
# numeric_only=True leaves the non-numeric `species` column out of the matrix.
pearson_corr = dataset.corr(method="pearson", numeric_only=True)
df = pearson_corr.round(4)

In [6]:
# Plot the correlation matrix `df` as an annotated heatmap.
# The tick labels follow the column order of `df`
# (sepal_length, sepal_width, petal_length, petal_width).
feature_labels = ['Sepal length', 'Sepal width', 'Petal length', 'Petal width']

fig = px.imshow(
  img=df,
  x=feature_labels,
  y=feature_labels,
  color_continuous_scale='viridis',
  zmin=-1, zmax=1,  # fix the colour range to the bounds of Pearson's r
  # Documented values for `aspect` are 'equal', 'auto' or None; the previous
  # value (True) is not one of them. 'auto' lets the cells fill the figure.
  aspect="auto",
  text_auto=True,   # print each coefficient inside its cell
)

# leave a small gap between cells so they read as separate tiles
fig.update_traces(
  xgap=5, ygap=5,
)

# Title and canvas size. The earlier title ("TimeSeries of daily positive,
# recovery, and dead") described a different chart, not this heatmap.
fig.update_layout(
  title="Heatmap correlation on iris flowers dataset",
  width=1000, height=500,
)

# show heatmap corr
fig.show()

In [7]:
# Build the correlation heatmap with the lower-level graph_objects API,
# configuring the trace completely when it is constructed.
axis_labels = ['Sepal length', 'Sepal width', 'Petal length', 'Petal width']
corr_values = df.values.tolist()

heatmap_trace = go.Heatmap(
  x=axis_labels,
  y=axis_labels,
  z=corr_values,
  colorscale='viridis',
  zmin=-1, zmax=1,
  xgap=5, ygap=5,
  # show each coefficient inside its cell and in the hover label
  text=corr_values,
  texttemplate="%{text}",
  textfont={"size": 14},
  hoverinfo='text',
)

# wrap the single trace in a figure
fig = go.Figure(data=[heatmap_trace])

# title and canvas size
layout_options = {
  "title": "Heatmap correlation on iris flowers dataset",
  "width": 1000,
  "height": 500,
}
fig.update_layout(**layout_options)

# render the heatmap
fig.show()

- Scatter plot

In [8]:
# Petal length against petal width, one colour per species
fig = px.scatter(
  data_frame=dataset,
  x="petal_length",
  y="petal_width",
  color='species',
)

# marker appearance applied to every species trace
marker_style = dict(
  size=12,
  opacity=0.75,
  line=dict(width=0.5, color="black"),
  symbol="circle",
)
fig.update_traces(marker=marker_style)

# title, human-readable axis labels and canvas size
layout_options = dict(
  title="Scatterplot on iris flowers dataset",
  xaxis_title="Petal length",
  yaxis_title="Petal width",
  width=1000,
  height=500,
)
fig.update_layout(**layout_options)

# render the scatterplot
fig.show()

In [9]:
# Sepal length against sepal width, one colour per species
fig = px.scatter(
  dataset, x="sepal_length", y="sepal_width", color='species',
)

# marker appearance applied to every species trace
fig.update_traces(
  marker={
    "size" : 12,
    "opacity": 0.75,
    "line": {"width": 0.75, "color": "black"},
    "symbol": "circle"
  }
)

# Title, axis labels and canvas size. The axes are labelled for the sepal
# measurements plotted here; previously the labels were commented-out petal
# titles, which left the raw column names on the axes.
fig.update_layout(
  title="Scatterplot on iris flowers dataset",
  xaxis_title="Sepal length",
  yaxis_title="Sepal width",
  width=1000, height=500,
)

# show scatterplot
fig.show()

### 3. Data Preprocessing

- set feature and label

In [10]:
# Independent variables: the four numeric measurements, in this column order
feature_columns = ["sepal_width", "sepal_length", "petal_width", "petal_length"]
x = dataset.loc[:, feature_columns]

# Dependent variable: the species names, shaped as a single column (n_rows, 1)
species_values = dataset["species"].values
y = species_values.reshape((-1, 1))

In [11]:
# print the dimensions of the feature matrix and of the label column
print(*(part.shape for part in (x, y)))

(150, 4) (150, 1)


- Scaling dataset

In [12]:
# Min-max normalisation: rescale every feature column to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows of `x`; if `scaled` is what
# gets split into train/test afterwards, the test rows influence the scaling
# parameters — confirm this is intended.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(x)
scaled = scaler.transform(x)

- split validation

In [13]:
# Split validation: hold out 30% of the rows for testing. The split is made on
# the raw (unscaled) features, shuffled with a fixed seed for reproducibility.
rawTrainX, rawTestX, trainY, testY = train_test_split(x, y, train_size=0.7, test_size=0.3, random_state=42, shuffle=True)

# Re-fit the min-max scaler on the training rows only and apply that same
# mapping to the test rows. Splitting an array that was scaled using all rows
# would let the test set's min/max leak into preprocessing (data leakage).
# As a consequence, test values may fall slightly outside [0, 1] when a test
# row exceeds the range seen during training.
trainX = scaler.fit_transform(rawTrainX)
testX = scaler.transform(rawTestX)

In [14]:
# flatten the single-column label arrays into 1-D vectors
trainY, testY = trainY.ravel(), testY.ravel()

In [15]:
# print the dimensions of the training features and training labels
print(*(part.shape for part in (trainX, trainY)))

(105, 4) (105,)


In [16]:
# print the dimensions of the test features and test labels
print(*(part.shape for part in (testX, testY)))

(45, 4) (45,)


### 4. Modeling Supervised Learning

In [17]:
# Classifier families compared in this notebook.
# NOTE(review): the decision tree is referred to as "ID3"/"C45" in this
# notebook, but scikit-learn documents its trees as an optimized CART
# implementation; the criterion argument only selects the split measure —
# confirm the intended naming.
from sklearn.tree import DecisionTreeClassifier  # used with criterion="entropy" and criterion="gini"
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [18]:
def _fit_predict(model):
    """Fit `model` on the training split and return its predictions for the test split."""
    fitted = model.fit(trainX, trainY)
    return fitted.predict(testX)


# decision trees: entropy criterion ("ID3") and gini criterion ("C45"), both seeded
result_ID3 = _fit_predict(DecisionTreeClassifier(criterion="entropy", random_state=1234))
result_C45 = _fit_predict(DecisionTreeClassifier(criterion="gini", random_state=1234))

# naive Bayes variants
result_GNB = _fit_predict(GaussianNB())
result_MNB = _fit_predict(MultinomialNB())

# k-nearest neighbours with k = 3
result_KNN = _fit_predict(KNeighborsClassifier(n_neighbors=3))

# support vector classifier with a linear kernel
result_SVC = _fit_predict(SVC(kernel='linear'))

# logistic regression, allowing up to 1000 solver iterations
result_LR = _fit_predict(LogisticRegression(max_iter=1000))

In [19]:
# Collect every model's test-set predictions side by side, with the true
# labels as the final column.
prediction_columns = [
  ("DT - ID3", result_ID3),
  ("DT - C45", result_C45),
  ("NB - Gaussian", result_GNB),
  ("NB - Multinomial", result_MNB),
  ("KNN", result_KNN),
  ("SVC", result_SVC),
  ("LR", result_LR),
  ("Labels", testY),
]
data = dict(prediction_columns)

# one row per test sample, one column per model plus the ground truth
result = pd.DataFrame(data=data)

# display the last five rows
result.tail(5)

Unnamed: 0,DT - ID3,DT - C45,NB - Gaussian,NB - Multinomial,KNN,SVC,LR,Labels
40,virginica,virginica,virginica,virginica,virginica,virginica,virginica,virginica
41,versicolor,versicolor,versicolor,versicolor,versicolor,versicolor,virginica,versicolor
42,versicolor,versicolor,versicolor,versicolor,versicolor,versicolor,virginica,versicolor
43,setosa,setosa,setosa,setosa,setosa,setosa,setosa,setosa
44,setosa,setosa,setosa,setosa,setosa,setosa,setosa,setosa


### 5. Model Evaluation

- Confusion matrix

In [20]:
# entropy-criterion decision tree: rows are true species, columns are predicted species
conf_ID3 = confusion_matrix(y_true=testY, y_pred=result_ID3)
conf_ID3

array([[19,  0,  0],
       [ 0, 13,  0],
       [ 0,  2, 11]])

In [21]:
# gini-criterion decision tree: rows are true species, columns are predicted species
conf_C45 = confusion_matrix(y_true=testY, y_pred=result_C45)
conf_C45

array([[19,  0,  0],
       [ 0, 13,  0],
       [ 0,  0, 13]])

In [22]:
# Gaussian naive Bayes: rows are true species, columns are predicted species
conf_GNB = confusion_matrix(y_true=testY, y_pred=result_GNB)
conf_GNB

array([[19,  0,  0],
       [ 0, 12,  1],
       [ 0,  0, 13]])

In [23]:
# k-nearest neighbours: rows are true species, columns are predicted species
conf_KNN = confusion_matrix(y_true=testY, y_pred=result_KNN)
conf_KNN

array([[19,  0,  0],
       [ 0, 13,  0],
       [ 0,  0, 13]])

In [24]:
# linear-kernel SVC: rows are true species, columns are predicted species
conf_SVC = confusion_matrix(y_true=testY, y_pred=result_SVC)
conf_SVC

array([[19,  0,  0],
       [ 0, 13,  0],
       [ 0,  0, 13]])

In [25]:
# logistic regression: rows are true species, columns are predicted species
conf_LR = confusion_matrix(y_true=testY, y_pred=result_LR)
conf_LR

array([[19,  0,  0],
       [ 0,  9,  4],
       [ 0,  0, 13]])

- Accuracy, Precision, Recall, and F1-score

In [26]:
print("Supervised Learning")
print("------------------- \n")

# ground-truth labels shared by every comparison
y_true = result["Labels"]

# Score each model column by name. Dropping "Labels" explicitly means the loop
# no longer relies on the ground truth being the last column of `result`.
for model_name in result.columns.drop("Labels"):
    # 1-D Series of predictions (rather than a one-column DataFrame slice)
    y_pred = result[model_name]

    print(model_name)
    print("Accuracy \t:", np.round(accuracy_score(y_true, y_pred), 4))
    # macro average: unweighted mean of the per-class scores
    print("Precision \t:", np.round(precision_score(y_true, y_pred, average="macro"), 4))
    print("Recall \t\t:", np.round(recall_score(y_true, y_pred, average="macro"), 4))
    print("F1 \t\t:", np.round(f1_score(y_true, y_pred, average="macro"), 4))
    print("------------------------- \n")

Supervised Learning
------------------- 

DT - ID3
Accuracy 	: 0.9556
Precision 	: 0.9556
Recal 		: 0.9487
F1 		: 0.9484
------------------------- 

DT - C45
Accuracy 	: 1.0
Precision 	: 1.0
Recal 		: 1.0
F1 		: 1.0
------------------------- 

NB - Gaussian
Accuracy 	: 0.9778
Precision 	: 0.9762
Recal 		: 0.9744
F1 		: 0.9743
------------------------- 

NB - Multinomial
Accuracy 	: 0.9333
Precision 	: 0.9246
Recal 		: 0.9231
F1 		: 0.923
------------------------- 

KNN
Accuracy 	: 1.0
Precision 	: 1.0
Recal 		: 1.0
F1 		: 1.0
------------------------- 

SVC
Accuracy 	: 1.0
Precision 	: 1.0
Recal 		: 1.0
F1 		: 1.0
------------------------- 

LR
Accuracy 	: 0.9111
Precision 	: 0.9216
Recal 		: 0.8974
F1 		: 0.8949
------------------------- 

