In [1]:
# Import Library
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Show first 50 data
df = pd.read_csv('Iris_unclean.csv')
df.head(50)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,,3.5,1.4,0.2,Iris-setosa
1,4.9,2000.0,1.4,0.2,Iris-setosa
2,4.7,3.2,-1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,-1.5,0.2,Iris-setosa
8,4.4,1500.0,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [3]:
# Dataset Info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SepalLengthCm  148 non-null    float64
 1   SepalWidthCm   150 non-null    float64
 2   PetalLengthCm  150 non-null    float64
 3   PetalWidthCm   150 non-null    float64
 4   Species        150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [4]:
# Remove missing value
df = df.fillna(df.mean()) # (replace missing value using mean)
df.head(50)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.856757,3.5,1.4,0.2,Iris-setosa
1,4.9,2000.0,1.4,0.2,Iris-setosa
2,4.7,3.2,-1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,5.856757,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,-1.5,0.2,Iris-setosa
8,4.4,1500.0,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [5]:
# Label Encoder
X = df.iloc[:, :-1].values
y = df.iloc[:, -1]
le = LabelEncoder()
y = le.fit_transform(y)
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [6]:
# Split data to test and train
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [7]:
# Show first 5 data of X_train
X_train[:5]

array([[6.1, 3. , 4.6, 1.4],
       [7.7, 3. , 6.1, 2.3],
       [5.6, 2.5, 3.9, 1.1],
       [6.4, 2.8, 5.6, 2.1],
       [5.8, 2.8, 5.1, 2.4]])

In [8]:
# Show first 5 data of X_test
X_test[:5]

array([[5.8, 4. , 1.2, 0.2],
       [5.1, 2.5, 3. , 1.1],
       [6.6, 3. , 4.4, 1.4],
       [5.4, 3.9, 1.3, 0.4],
       [7.9, 3.8, 6.4, 2. ]])

In [9]:
# Show first 5 data of Y_train
Y_train[:5]

array([1, 2, 1, 2, 2])

In [10]:
# Show first 5 data of Y_test
Y_test[:5]

array([0, 1, 1, 0, 2])

In [11]:
# Standard Scaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [12]:
# Show first 5 data of X_train
X_train[:5]

array([[ 0.29915532, -0.12892512,  0.44911109,  0.2345312 ],
       [ 2.25372112, -0.12892512,  1.25509309,  1.39642889],
       [-0.31164649, -0.13113786,  0.07298615, -0.15276803],
       [ 0.66563641, -0.12981022,  0.98643242,  1.13822941],
       [-0.06732577, -0.12981022,  0.71777176,  1.52552864]])

In [13]:
# Show first 5 data of X_test
X_test[:5]

array([[-0.07829545,  1.70946556, -1.3642592 , -1.32113381],
       [-0.94366618, -1.53635513, -0.31929471, -0.0287203 ],
       [ 0.91069968, -0.4544149 ,  0.49345546,  0.4020842 ],
       [-0.57279301,  1.49307752, -1.30620562, -1.03393081],
       [ 2.51781676,  1.27668947,  1.65452712,  1.26369321]])