# Experiment 7: Missing values Imputation using Horse Colic dataset
To apply the following imputation methods for horse-colic dataset.
- Statistical Imputation
- KNN Imputation
- Iterative Imputation

## Import Data

In [20]:
import numpy as np,  pandas as pd
# load dataset using github url
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv'
dataframe = pd.read_csv(url, header=None, na_values='?')
# header = None: column names are inferred from the first line of the file.
# na_values: Additional strings to recognize as NA/NaN.

# load data as numpy array
data = dataframe.values


In [21]:
dataframe.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 28 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       299 non-null    float64
 1   1       300 non-null    int64  
 2   2       300 non-null    int64  
 3   3       240 non-null    float64
 4   4       276 non-null    float64
 5   5       242 non-null    float64
 6   6       244 non-null    float64
 7   7       231 non-null    float64
 8   8       253 non-null    float64
 9   9       268 non-null    float64
 10  10      245 non-null    float64
 11  11      256 non-null    float64
 12  12      244 non-null    float64
 13  13      196 non-null    float64
 14  14      194 non-null    float64
 15  15      53 non-null     float64
 16  16      198 non-null    float64
 17  17      182 non-null    float64
 18  18      271 non-null    float64
 19  19      267 non-null    float64
 20  20      135 non-null    float64
 21  21      102 non-null    float64
 22  22

## Missing Data Info

In [22]:
def missing_data_info(dataframe):
    # count number of rows with missing values in each column:
    for i in range(dataframe.shape[1]): # range of no.of col
        # no. of missing in each col:
        n_miss = dataframe.iloc[:,i].isnull().sum()
        # percentage of miss in each col:
        percentage_miss = n_miss / dataframe.shape[0] * 100
        print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, percentage_miss))
    return n_miss

# call function
missing_data_info(dataframe)

> 0, Missing: 1 (0.3%)
> 1, Missing: 0 (0.0%)
> 2, Missing: 0 (0.0%)
> 3, Missing: 60 (20.0%)
> 4, Missing: 24 (8.0%)
> 5, Missing: 58 (19.3%)
> 6, Missing: 56 (18.7%)
> 7, Missing: 69 (23.0%)
> 8, Missing: 47 (15.7%)
> 9, Missing: 32 (10.7%)
> 10, Missing: 55 (18.3%)
> 11, Missing: 44 (14.7%)
> 12, Missing: 56 (18.7%)
> 13, Missing: 104 (34.7%)
> 14, Missing: 106 (35.3%)
> 15, Missing: 247 (82.3%)
> 16, Missing: 102 (34.0%)
> 17, Missing: 118 (39.3%)
> 18, Missing: 29 (9.7%)
> 19, Missing: 33 (11.0%)
> 20, Missing: 165 (55.0%)
> 21, Missing: 198 (66.0%)
> 22, Missing: 1 (0.3%)
> 23, Missing: 0 (0.0%)
> 24, Missing: 0 (0.0%)
> 25, Missing: 0 (0.0%)
> 26, Missing: 0 (0.0%)
> 27, Missing: 0 (0.0%)


0

In [23]:
# replication
def missing_val_cnt(dataframe):
    df = dataframe
    for i in range(df.shape[1]): # i is col
        n_miss = df.iloc[:,i].isnull().sum()
        per_miss = (n_miss / df.shape[0]) * 100
        print("> in col %d, n_miss: %d, percentage: %.1f%%" %  (i,n_miss,per_miss))

missing_val_cnt(dataframe)


> in col 0, n_miss: 1, percentage: 0.3%
> in col 1, n_miss: 0, percentage: 0.0%
> in col 2, n_miss: 0, percentage: 0.0%
> in col 3, n_miss: 60, percentage: 20.0%
> in col 4, n_miss: 24, percentage: 8.0%
> in col 5, n_miss: 58, percentage: 19.3%
> in col 6, n_miss: 56, percentage: 18.7%
> in col 7, n_miss: 69, percentage: 23.0%
> in col 8, n_miss: 47, percentage: 15.7%
> in col 9, n_miss: 32, percentage: 10.7%
> in col 10, n_miss: 55, percentage: 18.3%
> in col 11, n_miss: 44, percentage: 14.7%
> in col 12, n_miss: 56, percentage: 18.7%
> in col 13, n_miss: 104, percentage: 34.7%
> in col 14, n_miss: 106, percentage: 35.3%
> in col 15, n_miss: 247, percentage: 82.3%
> in col 16, n_miss: 102, percentage: 34.0%
> in col 17, n_miss: 118, percentage: 39.3%
> in col 18, n_miss: 29, percentage: 9.7%
> in col 19, n_miss: 33, percentage: 11.0%
> in col 20, n_miss: 165, percentage: 55.0%
> in col 21, n_miss: 198, percentage: 66.0%
> in col 22, n_miss: 1, percentage: 0.3%
> in col 23, n_miss: 0, 

## 1. Statistical Imputation

In [24]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
# fit on the dataset
imputer.fit(data)
# transform the dataset
Xtrans1 = imputer.transform(data)
# print total missing
# Flatten(): Return a copy of the array collapsed into one dimension.
print('Missing: ', sum(np.isnan(Xtrans1).flatten()))


Missing:  0


In [25]:
# replication
print("Missing", sum(np.isnan(Xtrans1).flatten()))

Missing 0


## 2. Iterative Imputer

In [26]:
missing_data_info(dataframe)

> 0, Missing: 1 (0.3%)
> 1, Missing: 0 (0.0%)
> 2, Missing: 0 (0.0%)
> 3, Missing: 60 (20.0%)
> 4, Missing: 24 (8.0%)
> 5, Missing: 58 (19.3%)
> 6, Missing: 56 (18.7%)
> 7, Missing: 69 (23.0%)
> 8, Missing: 47 (15.7%)
> 9, Missing: 32 (10.7%)
> 10, Missing: 55 (18.3%)
> 11, Missing: 44 (14.7%)
> 12, Missing: 56 (18.7%)
> 13, Missing: 104 (34.7%)
> 14, Missing: 106 (35.3%)
> 15, Missing: 247 (82.3%)
> 16, Missing: 102 (34.0%)
> 17, Missing: 118 (39.3%)
> 18, Missing: 29 (9.7%)
> 19, Missing: 33 (11.0%)
> 20, Missing: 165 (55.0%)
> 21, Missing: 198 (66.0%)
> 22, Missing: 1 (0.3%)
> 23, Missing: 0 (0.0%)
> 24, Missing: 0 (0.0%)
> 25, Missing: 0 (0.0%)
> 26, Missing: 0 (0.0%)
> 27, Missing: 0 (0.0%)


0

In [27]:
# IterativeImputer is experimental and the API might change without any deprecation cycle.
# To use it, you need to explicitly import enable_iterative_imputer:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# collect data values in numpy array
data = dataframe.values
# define imputer
imputer = IterativeImputer()
# fit on the dataset
imputer.fit(data)
# transform the dataset
Xtrans2 = imputer.transform(data)
# print total missing
print('Missing: ',sum(np.isnan(Xtrans2).flatten()))

Missing:  0


## 3. KNN Imputer

In [28]:
missing_data_info(dataframe)

> 0, Missing: 1 (0.3%)
> 1, Missing: 0 (0.0%)
> 2, Missing: 0 (0.0%)
> 3, Missing: 60 (20.0%)
> 4, Missing: 24 (8.0%)
> 5, Missing: 58 (19.3%)
> 6, Missing: 56 (18.7%)
> 7, Missing: 69 (23.0%)
> 8, Missing: 47 (15.7%)
> 9, Missing: 32 (10.7%)
> 10, Missing: 55 (18.3%)
> 11, Missing: 44 (14.7%)
> 12, Missing: 56 (18.7%)
> 13, Missing: 104 (34.7%)
> 14, Missing: 106 (35.3%)
> 15, Missing: 247 (82.3%)
> 16, Missing: 102 (34.0%)
> 17, Missing: 118 (39.3%)
> 18, Missing: 29 (9.7%)
> 19, Missing: 33 (11.0%)
> 20, Missing: 165 (55.0%)
> 21, Missing: 198 (66.0%)
> 22, Missing: 1 (0.3%)
> 23, Missing: 0 (0.0%)
> 24, Missing: 0 (0.0%)
> 25, Missing: 0 (0.0%)
> 26, Missing: 0 (0.0%)
> 27, Missing: 0 (0.0%)


0

In [29]:
from sklearn.impute import KNNImputer
# collect data values in numpy array
data = dataframe.values
imputer = KNNImputer()
# fit on the dataset
imputer.fit(data)
# transform the dataset
Xtrans = imputer.transform(data)
# print total missing
print('Missing: %d' % sum(np.isnan(Xtrans).flatten()))


Missing: 0


The End