<a href="https://colab.research.google.com/github/isnanmulia/colab-machinelearning/blob/main/ML_Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
# Importing libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

In [13]:
# Load the Iris dataset, supplying the column names explicitly
names = ['sepal-length','sepal-width','petal-length','petal-width','class']
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv'
dataset = pd.read_csv(url, header=None, names=names)
# Alternative summary over the whole frame: print(dataset.describe())
# The attributes are every named column except the trailing 'class' label
dataset_attr = dataset[names[:-1]]
print(dataset_attr.describe())

       sepal-length  sepal-width  petal-length  petal-width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.054000      3.758667     1.198667
std        0.828066     0.433594      1.764420     0.763161
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000


In [36]:
# Load a 10-row Iris excerpt that contains missing values
names = ['sepal-length','sepal-width','petal-length','petal-width','class']
url = 'https://raw.githubusercontent.com/isnanmulia/lecture-datasets/main/iris10_missing.csv'
data_missing = pd.read_csv(url, header=None, names=names)
print(data_missing)
# Per-column count of missing entries (shown as the cell's result)
data_missing.isnull().sum()

   sepal-length  sepal-width  petal-length  petal-width        class
0           NaN          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           NaN          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          NaN           1.4          0.2  Iris-setosa
5           5.4          3.9           1.7          0.4  Iris-setosa
6           4.6          NaN           1.4          0.3  Iris-setosa
7           5.0          3.4           1.5          0.2  Iris-setosa
8           4.4          NaN           1.4          0.2  Iris-setosa
9           4.9          3.1           NaN          0.1  Iris-setosa


sepal-length    1
sepal-width     3
petal-length    2
petal-width     0
class           0
dtype: int64

In [47]:
# Handling missing values
# Replace each NaN with the mean of its column. `missing_values` names the
# placeholder to look for; `fill_value` is only used by strategy='constant',
# so it has no effect on mean imputation.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
data_missing_attr = data_missing[['sepal-length','sepal-width','petal-length','petal-width']]
imputed_values = imputer.fit_transform(data_missing_attr)
# Wrap the imputed values in a DataFrame that carries the original column
# labels and row index
X = pd.DataFrame(
    imputed_values,
    columns=data_missing_attr.columns,
    index=data_missing_attr.index,
)
print(X)

   sepal-length  sepal-width  petal-length  petal-width
0      4.833333     3.500000          1.40          0.2
1      4.900000     3.000000          1.45          0.2
2      4.700000     3.200000          1.30          0.2
3      4.600000     3.100000          1.50          0.2
4      5.000000     3.314286          1.40          0.2
5      5.400000     3.900000          1.70          0.4
6      4.600000     3.314286          1.40          0.3
7      5.000000     3.400000          1.50          0.2
8      4.400000     3.314286          1.40          0.2
9      4.900000     3.100000          1.45          0.1


In [86]:
# Sampling
# Simple Random Sampling
# replace=True draws with replacement, so the same row may be selected more
# than once; random_state fixes the draw so re-running the cell picks the
# same rows.
data_sampling = dataset.sample(n = 10, replace=True, random_state=42)
# data_sampling = dataset.sample(frac = 0.3, random_state=42)
print(data_sampling.head(10))

# Stratified sampling
# data_sampling_stratified = dataset.groupby('class', group_keys=False).apply(lambda x: x.sample(5))
# data_sampling_stratified = dataset.groupby('class', group_keys=False).apply(lambda x: x.sample(frac=0.1))
# print(data_sampling_stratified.head(20))

     sepal-length  sepal-width  petal-length  petal-width            class
78            6.0          2.9           4.5          1.5  Iris-versicolor
140           6.7          3.1           5.6          2.4   Iris-virginica
76            6.8          2.8           4.8          1.4  Iris-versicolor
78            6.0          2.9           4.5          1.5  Iris-versicolor
59            5.2          2.7           3.9          1.4  Iris-versicolor
140           6.7          3.1           5.6          2.4   Iris-virginica
39            5.1          3.4           1.5          0.2      Iris-setosa
125           7.2          3.2           6.0          1.8   Iris-virginica
144           6.7          3.3           5.7          2.5   Iris-virginica
130           7.4          2.8           6.1          1.9   Iris-virginica


In [26]:
# Min-max scaling: map every attribute onto the range given by feature_range
scaler = MinMaxScaler(feature_range=(0, 1))
# Learn the per-column minimum and maximum, then apply the rescaling
scaler.fit(dataset_attr)
scaled_dataset = scaler.transform(dataset_attr)
# Original values first, then the first five scaled rows as the cell result
print(dataset_attr.head(5))
scaled_dataset[:5]

   sepal-length  sepal-width  petal-length  petal-width
0           5.1          3.5           1.4          0.2
1           4.9          3.0           1.4          0.2
2           4.7          3.2           1.3          0.2
3           4.6          3.1           1.5          0.2
4           5.0          3.6           1.4          0.2


array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667]])

In [73]:
# Binning/discretization
dataset_bin = dataset_attr[['sepal-width']]
sepal_width = dataset_bin['sepal-width']
# Four evenly spaced edges from the column minimum to its maximum delimit
# three equal-width bins
bins = np.linspace(sepal_width.min(), sepal_width.max(), 4)
labels = ['A', 'B', 'C']

# Equal-width binning
# pd.cut builds right-closed intervals by default, so the first edge (the
# column minimum) would fall outside every bin and be returned as NaN.
# include_lowest=True closes the first interval on the left as well.
X1 = pd.cut(sepal_width, bins, include_lowest=True)
X2 = pd.cut(sepal_width, bins, labels=labels, include_lowest=True)
# Equal-frequency binning
Y1 = pd.qcut(sepal_width, 3)
Y2 = pd.qcut(sepal_width, 3, labels=labels)

# Attach all four binned variants next to the raw values in one step
dataset_bin = dataset_bin.assign(
    EqWidthNoLabel=X1,
    EqWidthLabel=X2,
    EqFreqNoLabel=Y1,
    EqFreqLabel=Y2,
)
print(dataset_bin.head(20))

    sepal-width EqWidthNoLabel EqWidthLabel EqFreqNoLabel EqFreqLabel
0           3.5     (2.8, 3.6]            B    (3.2, 4.4]           C
1           3.0     (2.8, 3.6]            B    (2.9, 3.2]           B
2           3.2     (2.8, 3.6]            B    (2.9, 3.2]           B
3           3.1     (2.8, 3.6]            B    (2.9, 3.2]           B
4           3.6     (2.8, 3.6]            B    (3.2, 4.4]           C
5           3.9     (3.6, 4.4]            C    (3.2, 4.4]           C
6           3.4     (2.8, 3.6]            B    (3.2, 4.4]           C
7           3.4     (2.8, 3.6]            B    (3.2, 4.4]           C
8           2.9     (2.8, 3.6]            B  (1.999, 2.9]           A
9           3.1     (2.8, 3.6]            B    (2.9, 3.2]           B
10          3.7     (3.6, 4.4]            C    (3.2, 4.4]           C
11          3.4     (2.8, 3.6]            B    (3.2, 4.4]           C
12          3.0     (2.8, 3.6]            B    (2.9, 3.2]           B
13          3.0     