In [1]:
# Refer: https://machinelearningmastery.com/handle-missing-data-python/,
# http://pandas.pydata.org/pandas-docs/stable/missing_data.html,
# http://scikit-learn.org/stable/modules/preprocessing.html#imputation-of-missing-values
# pima-indian-diabetes dataset
# 0. Number of times pregnant.
# 1. Plasma glucose concentration at 2 hours in an oral glucose tolerance test.
# 2. Diastolic blood pressure (mm Hg).
# 3. Triceps skinfold thickness (mm).
# 4. 2-Hour serum insulin (mu U/ml).
# 5. Body mass index (weight in kg/(height in m)^2).
# 6. Diabetes pedigree function.
# 7. Age (years).
# 8. Class variable (0 or 1).

In [4]:
# Numpy 및 Pandas import
import numpy as np
import pandas as pd

In [5]:
# Read the CSV file into a DataFrame, then preview its first 20 rows
dataset = pd.read_csv('pima-indians-diabetes_edited.csv')
print(dataset.iloc[:20])

     0      1     2     3      4     5      6   7  8
0    6  148.0  72.0  35.0    NaN  33.6  0.627  50  1
1    1   85.0  66.0  29.0    NaN  26.6  0.351  31  0
2    8  183.0  64.0   NaN    NaN  23.3  0.672  32  1
3    1   89.0  66.0  23.0   94.0  28.1  0.167  21  0
4    0  137.0  40.0  35.0  168.0  43.1  2.288  33  1
5    5  116.0  74.0   NaN    NaN  25.6  0.201  30  0
6    3   78.0  50.0  32.0   88.0  31.0  0.248  26  1
7   10  115.0   NaN   NaN    NaN  35.3  0.134  29  0
8    2  197.0  70.0  45.0  543.0  30.5  0.158  53  1
9    8  125.0  96.0   NaN    NaN   NaN  0.232  54  1
10   4  110.0  92.0   NaN    NaN  37.6  0.191  30  0
11  10  168.0  74.0   NaN    NaN  38.0  0.537  34  1
12  10  139.0  80.0   NaN    NaN  27.1  1.441  57  0
13   1  189.0  60.0  23.0  846.0  30.1  0.398  59  1
14   5  166.0  72.0  19.0  175.0  25.8  0.587  51  1
15   7  100.0   NaN   NaN    NaN  30.0  0.484  32  1
16   0  118.0  84.0  47.0  230.0  45.8  0.551  31  1
17   7  107.0  74.0   NaN    NaN  29.6  0.254 

In [6]:
# Inspect simple summary statistics (count, mean, std, min, quartiles, max) of each column
print(dataset.describe())

                0           1           2           3           4           5  \
count  768.000000  763.000000  735.000000  541.000000  394.000000  757.000000   
mean     3.845052  121.686763   72.208163   29.153420  155.548223   32.457464   
std      3.369578   30.535641   12.928495   10.476982  118.775855    6.924988   
min      0.000000   44.000000    0.000000    7.000000   14.000000   18.200000   
25%      1.000000   99.000000   64.000000   22.000000   76.250000   27.500000   
50%      3.000000  117.000000   72.000000   29.000000  125.000000   32.300000   
75%      6.000000  141.000000   80.000000   36.000000  190.000000   36.600000   
max     17.000000  199.000000  122.000000   99.000000  846.000000   67.100000   

                6           7           8  
count  768.000000  768.000000  768.000000  
mean     0.471876   33.240885    0.348958  
std      0.331329   11.760232    0.476951  
min      0.078000   21.000000    0.000000  
25%      0.243750   24.000000    0.000000  
50%   

In [7]:
# Step 1. Check for missing values: boolean mask that is True where a cell is missing
dataset.isnull()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,False,False,False,False,True,False,False,False,False
1,False,False,False,False,True,False,False,False,False
2,False,False,False,True,True,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
5,False,False,False,True,True,False,False,False,False
6,False,False,False,False,False,False,False,False,False
7,False,False,True,True,True,False,False,False,False
8,False,False,False,False,False,False,False,False,False
9,False,False,False,True,True,True,False,False,False


In [8]:
# Count the missing values in each variable (column)
dataset.isnull().sum(axis='index')

0      0
1      5
2     33
3    227
4    374
5     11
6      0
7      0
8      0
dtype: int64

In [9]:
# Step 2-1. Remove every observation (row) that contains at least one missing value
dataset_dropna = dataset.dropna(axis=0, how='any')
print(dataset_dropna.iloc[:20])

     0      1      2     3      4     5      6   7  8
3    1   89.0   66.0  23.0   94.0  28.1  0.167  21  0
4    0  137.0   40.0  35.0  168.0  43.1  2.288  33  1
6    3   78.0   50.0  32.0   88.0  31.0  0.248  26  1
8    2  197.0   70.0  45.0  543.0  30.5  0.158  53  1
13   1  189.0   60.0  23.0  846.0  30.1  0.398  59  1
14   5  166.0   72.0  19.0  175.0  25.8  0.587  51  1
16   0  118.0   84.0  47.0  230.0  45.8  0.551  31  1
18   1  103.0   30.0  38.0   83.0  43.3  0.183  33  0
19   1  115.0   70.0  30.0   96.0  34.6  0.529  32  1
20   3  126.0   88.0  41.0  235.0  39.3  0.704  27  0
24  11  143.0   94.0  33.0  146.0  36.6  0.254  51  1
25  10  125.0   70.0  26.0  115.0  31.1  0.205  41  1
27   1   97.0   66.0  15.0  140.0  23.2  0.487  22  0
28  13  145.0   82.0  19.0  110.0  22.2  0.245  57  0
31   3  158.0   76.0  36.0  245.0  31.6  0.851  28  1
32   3   88.0   58.0  11.0   54.0  24.8  0.267  22  0
35   4  103.0   60.0  33.0  192.0  24.0  0.966  33  0
39   4  111.0   72.0  47.0  

In [10]:
# Step 2-2. Replace every missing value with one fixed constant
dataset_fillna100 = dataset.fillna(value=100)
dataset_fillna100

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148.0,72.0,35.0,100.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,100.0,26.6,0.351,31,0
2,8,183.0,64.0,100.0,100.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,100.0,100.0,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,100.0,100.0,100.0,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,100.0,100.0,100.0,0.232,54,1


In [11]:
# Replace missing values with a different fixed constant per variable;
# the dict maps column label -> fill value for that column
values = {
    '2': 100,
    '3': 1000,
    '4': 10000,
    '5': 20,
}
dataset_fillna_bycolumn = dataset.fillna(value=values)
dataset_fillna_bycolumn

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148.0,72.0,35.0,10000.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,10000.0,26.6,0.351,31,0
2,8,183.0,64.0,1000.0,10000.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,1000.0,10000.0,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,100.0,1000.0,10000.0,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,1000.0,10000.0,20.0,0.232,54,1


In [12]:
# Step 2-3. Impute missing values with the mean / median / most frequent value
from sklearn.impute import SimpleImputer

# Mean imputation: fit() returns the estimator itself, so build and fit in one expression
imputer_mean = SimpleImputer(missing_values=np.nan, strategy='mean').fit(dataset)
print(imputer_mean.transform(dataset)[:20, :5])

[[  6.         148.          72.          35.         155.54822335]
 [  1.          85.          66.          29.         155.54822335]
 [  8.         183.          64.          29.15341959 155.54822335]
 [  1.          89.          66.          23.          94.        ]
 [  0.         137.          40.          35.         168.        ]
 [  5.         116.          74.          29.15341959 155.54822335]
 [  3.          78.          50.          32.          88.        ]
 [ 10.         115.          72.20816327  29.15341959 155.54822335]
 [  2.         197.          70.          45.         543.        ]
 [  8.         125.          96.          29.15341959 155.54822335]
 [  4.         110.          92.          29.15341959 155.54822335]
 [ 10.         168.          74.          29.15341959 155.54822335]
 [ 10.         139.          80.          29.15341959 155.54822335]
 [  1.         189.          60.          23.         846.        ]
 [  5.         166.          72.          19.   

In [22]:
# IPython help lookup on the fitted mean imputer (interactive development aid)
?imputer_mean

In [13]:
# Fill value the mean imputer learned for each column during fit
imputer_mean.statistics_

array([  3.84505208, 121.68676278,  72.20816327,  29.15341959,
       155.54822335,  32.45746367,   0.4718763 ,  33.24088542,
         0.34895833])

In [14]:
# Median imputation: fill each column's missing entries with that column's median
imputer_median = SimpleImputer(strategy='median').fit(dataset)
print(imputer_median.transform(dataset)[:20, :5])

[[  6. 148.  72.  35. 125.]
 [  1.  85.  66.  29. 125.]
 [  8. 183.  64.  29. 125.]
 [  1.  89.  66.  23.  94.]
 [  0. 137.  40.  35. 168.]
 [  5. 116.  74.  29. 125.]
 [  3.  78.  50.  32.  88.]
 [ 10. 115.  72.  29. 125.]
 [  2. 197.  70.  45. 543.]
 [  8. 125.  96.  29. 125.]
 [  4. 110.  92.  29. 125.]
 [ 10. 168.  74.  29. 125.]
 [ 10. 139.  80.  29. 125.]
 [  1. 189.  60.  23. 846.]
 [  5. 166.  72.  19. 175.]
 [  7. 100.  72.  29. 125.]
 [  0. 118.  84.  47. 230.]
 [  7. 107.  74.  29. 125.]
 [  1. 103.  30.  38.  83.]
 [  1. 115.  70.  30.  96.]]


In [17]:
# Mode imputation: fill each column's missing entries with its most frequent value
imputer_mode = SimpleImputer(strategy='most_frequent').fit(dataset)
print(imputer_mode.transform(dataset)[:20, :5])

[[  6. 148.  72.  35. 105.]
 [  1.  85.  66.  29. 105.]
 [  8. 183.  64.  32. 105.]
 [  1.  89.  66.  23.  94.]
 [  0. 137.  40.  35. 168.]
 [  5. 116.  74.  32. 105.]
 [  3.  78.  50.  32.  88.]
 [ 10. 115.  70.  32. 105.]
 [  2. 197.  70.  45. 543.]
 [  8. 125.  96.  32. 105.]
 [  4. 110.  92.  32. 105.]
 [ 10. 168.  74.  32. 105.]
 [ 10. 139.  80.  32. 105.]
 [  1. 189.  60.  23. 846.]
 [  5. 166.  72.  19. 175.]
 [  7. 100.  70.  32. 105.]
 [  0. 118.  84.  47. 230.]
 [  7. 107.  74.  32. 105.]
 [  1. 103.  30.  38.  83.]
 [  1. 115.  70.  30.  96.]]


In [18]:
# 단계2-4. 다양한 기법들
# k-최근접 객체법(kNN)
from sklearn.impute import KNNImputer

In [19]:
# kNN imputation with scikit-learn's KNNImputer, using 5 neighbours
imputer_knn5 = KNNImputer(n_neighbors=5).fit(dataset)
print(imputer_knn5.transform(dataset)[:20, :5])

[[  6.  148.   72.   35.  169. ]
 [  1.   85.   66.   29.   58.6]
 [  8.  183.   64.   25.8 164.6]
 [  1.   89.   66.   23.   94. ]
 [  0.  137.   40.   35.  168. ]
 [  5.  116.   74.   20.6 102.8]
 [  3.   78.   50.   32.   88. ]
 [ 10.  115.   77.6  34.4 132.6]
 [  2.  197.   70.   45.  543. ]
 [  8.  125.   96.   26.4 165.8]
 [  4.  110.   92.   28.4 145.6]
 [ 10.  168.   74.   32.4 142.6]
 [ 10.  139.   80.   22.6 204. ]
 [  1.  189.   60.   23.  846. ]
 [  5.  166.   72.   19.  175. ]
 [  7.  100.   71.6  27.   75.2]
 [  0.  118.   84.   47.  230. ]
 [  7.  107.   74.   32.2 113.2]
 [  1.  103.   30.   38.   83. ]
 [  1.  115.   70.   30.   96. ]]


In [20]:
# Step 2-4. Other techniques
# fancyimpute implementations: KNN, SoftImpute, BiScaler
from fancyimpute import KNN, SoftImpute, BiScaler

Using TensorFlow backend.


In [11]:
# IPython help lookup for fancyimpute's KNN (interactive development aid)
?KNN

In [21]:
# KNN imputation with fancyimpute (k=3) on the raw dataset;
# preview the first 20 rows and first 5 columns of the filled result
dataset_knn = KNN(k=3).fit_transform(dataset)
print(dataset_knn[:20,:5])

Imputing row 1/768 with 1 missing, elapsed time: 0.099
Imputing row 101/768 with 2 missing, elapsed time: 0.102
Imputing row 201/768 with 1 missing, elapsed time: 0.104
Imputing row 301/768 with 3 missing, elapsed time: 0.106
Imputing row 401/768 with 2 missing, elapsed time: 0.107
Imputing row 501/768 with 0 missing, elapsed time: 0.110
Imputing row 601/768 with 1 missing, elapsed time: 0.112
Imputing row 701/768 with 0 missing, elapsed time: 0.113
[[  6.         148.          72.          35.         125.36558184]
 [  1.          85.          66.          29.          70.05168329]
 [  8.         183.          64.          30.07607134 213.87662594]
 [  1.          89.          66.          23.          94.        ]
 [  0.         137.          40.          35.         168.        ]
 [  5.         116.          74.          18.29069438 109.53902649]
 [  3.          78.          50.          32.          88.        ]
 [ 10.         115.          70.02099219  37.66165695 145.903932  ]
 [

In [22]:
# SoftImpute: low-rank approach by iterative soft thresholding of SVD decompositions.
# Here it is fit directly on the raw (un-normalized) dataset.
softImpute = SoftImpute()
dataset_soft = softImpute.fit_transform(dataset)

[SoftImpute] Max Singular Value of X_init = 5142.039102
[SoftImpute] Iter 1: observed MAE=2.155825 rank=6
[SoftImpute] Iter 2: observed MAE=2.181673 rank=6
[SoftImpute] Iter 3: observed MAE=2.198736 rank=6
[SoftImpute] Iter 4: observed MAE=2.211575 rank=6
[SoftImpute] Iter 5: observed MAE=2.222632 rank=6
[SoftImpute] Iter 6: observed MAE=2.231912 rank=6
[SoftImpute] Iter 7: observed MAE=2.239317 rank=6
[SoftImpute] Iter 8: observed MAE=2.244972 rank=6
[SoftImpute] Iter 9: observed MAE=2.248919 rank=6
[SoftImpute] Iter 10: observed MAE=2.251886 rank=6
[SoftImpute] Iter 11: observed MAE=2.254005 rank=6
[SoftImpute] Iter 12: observed MAE=2.255384 rank=6
[SoftImpute] Iter 13: observed MAE=2.256201 rank=6
[SoftImpute] Iter 14: observed MAE=2.256717 rank=6
[SoftImpute] Iter 15: observed MAE=2.257071 rank=6
[SoftImpute] Iter 16: observed MAE=2.257278 rank=6
[SoftImpute] Iter 17: observed MAE=2.257382 rank=6
[SoftImpute] Iter 18: observed MAE=2.257386 rank=6
[SoftImpute] Iter 19: observed MAE=

In [23]:
# Preview the SoftImpute result on the raw data (first 20 rows, first 5 columns)
print(dataset_soft[:20,:5])

[[  6.         148.          72.          35.          83.40405291]
 [  1.          85.          66.          29.          49.92790767]
 [  8.         183.          64.          26.05686003  99.3303461 ]
 [  1.          89.          66.          23.          94.        ]
 [  0.         137.          40.          35.         168.        ]
 [  5.         116.          74.          23.59244348  64.90772331]
 [  3.          78.          50.          32.          88.        ]
 [ 10.         115.          62.08919538  26.7730348   65.96366803]
 [  2.         197.          70.          45.         543.        ]
 [  8.         125.          96.          30.91112659  70.95231048]
 [  4.         110.          92.          29.33412665  64.12396438]
 [ 10.         168.          74.          31.99928197  94.14876792]
 [ 10.         139.          80.          27.30300539  76.47704145]
 [  1.         189.          60.          23.         846.        ]
 [  5.         166.          72.          19.   

In [15]:
# Rescale both rows and columns to have zero mean and unit variance;
# sometimes useful for low-rank imputation methods.
# The fitted scaler is kept in `biscaler` for later reuse.
biscaler = BiScaler()
dataset_normalized = biscaler.fit_transform(np.array(dataset))

[BiScaler] Initial log residual value = 14.347805
[BiScaler] Iter 1: log residual = 4.141561, log improvement ratio=10.206244
[BiScaler] Iter 2: log residual = 3.289745, log improvement ratio=0.851816
[BiScaler] Iter 3: log residual = 2.831525, log improvement ratio=0.458219
[BiScaler] Iter 4: log residual = 2.354119, log improvement ratio=0.477407
[BiScaler] Iter 5: log residual = 1.807898, log improvement ratio=0.546221
[BiScaler] Iter 6: log residual = 1.187851, log improvement ratio=0.620047
[BiScaler] Iter 7: log residual = 0.502532, log improvement ratio=0.685319
[BiScaler] Iter 8: log residual = -0.232677, log improvement ratio=0.735209
[BiScaler] Iter 9: log residual = -1.002123, log improvement ratio=0.769446
[BiScaler] Iter 10: log residual = -1.793301, log improvement ratio=0.791178
[BiScaler] Iter 11: log residual = -2.597555, log improvement ratio=0.804254
[BiScaler] Iter 12: log residual = -3.409410, log improvement ratio=0.811854
[BiScaler] Iter 13: log residual = -4.225

In [16]:
# Preview the bi-scaled data (first 20 rows, first 5 columns); missing entries are still NaN here
print(dataset_normalized[:20,:5])

[[-0.24252441  1.04559851 -0.26551817  0.41522011         nan]
 [-0.24935283 -1.72655987 -0.48699831  0.41661519         nan]
 [ 0.9278903   1.82826769 -0.67864308         nan         nan]
 [ 0.28963329 -1.21183387 -0.27501441 -0.3778124  -0.65264043]
 [-1.05930674  0.46787972 -2.24538545  0.4958034   0.23263697]
 [ 1.38828952 -0.16089429  0.58085641         nan         nan]
 [ 0.65159225 -1.30787596 -1.73703444  0.63017188 -0.56430784]
 [ 2.02550989 -0.21157214         nan         nan         nan]
 [-1.11003132  1.03377572 -0.28984593  0.47104999  1.86932058]
 [-0.03964488  0.03703792  1.22585831         nan         nan]
 [-0.43083778 -0.41613245  2.07185159         nan         nan]
 [ 0.94583167  1.53349621 -0.09904386         nan         nan]
 [ 0.7259307   0.42237999  0.33706135         nan         nan]
 [-0.90833795  0.66125287 -0.48657891 -0.39770544  2.37495646]
 [-0.06395159  1.49260735 -0.09416066 -1.20669629  0.35215839]
 [ 1.94053539 -1.22286737         nan         nan      

In [17]:
# SoftImpute on the bi-scaled data, then map the filled matrix back to the original scale.
# The imputer must actually be run (fit_transform) on the normalized matrix; passing the
# SoftImpute object itself to inverse_transform would fail because it is not an array.
dataset_soft2_norm = softImpute.fit_transform(dataset_normalized)
dataset_soft2 = biscaler.inverse_transform(dataset_soft2_norm)

[SoftImpute] Max Singular Value of X_init = 46.973022
[SoftImpute] Iter 1: observed MAE=0.026762 rank=8
[SoftImpute] Iter 2: observed MAE=0.026780 rank=8
[SoftImpute] Iter 3: observed MAE=0.026806 rank=8
[SoftImpute] Iter 4: observed MAE=0.026838 rank=8
[SoftImpute] Iter 5: observed MAE=0.026872 rank=8
[SoftImpute] Iter 6: observed MAE=0.026907 rank=8
[SoftImpute] Iter 7: observed MAE=0.026941 rank=8
[SoftImpute] Iter 8: observed MAE=0.026973 rank=8
[SoftImpute] Iter 9: observed MAE=0.027004 rank=8
[SoftImpute] Iter 10: observed MAE=0.027033 rank=8
[SoftImpute] Iter 11: observed MAE=0.027061 rank=8
[SoftImpute] Iter 12: observed MAE=0.027086 rank=8
[SoftImpute] Iter 13: observed MAE=0.027110 rank=8
[SoftImpute] Iter 14: observed MAE=0.027132 rank=8
[SoftImpute] Iter 15: observed MAE=0.027152 rank=8
[SoftImpute] Iter 16: observed MAE=0.027170 rank=8
[SoftImpute] Iter 17: observed MAE=0.027186 rank=8
[SoftImpute] Iter 18: observed MAE=0.027201 rank=8
[SoftImpute] Iter 19: observed MAE=0.

In [24]:
# Preview the SoftImpute result after undoing the bi-scaling (first 20 rows, first 5 columns)
print(dataset_soft2[:20,:5])

[[ 6.00000000e+00  1.48000000e+02  7.20000000e+01  3.50000000e+01
   1.56835814e+02]
 [ 1.00000000e+00  8.50000000e+01  6.60000000e+01  2.90000000e+01
   1.27023365e+02]
 [ 8.00000000e+00  1.83000000e+02  6.40000000e+01  2.67557674e+01
   1.75136517e+02]
 [ 1.00000000e+00  8.90000000e+01  6.60000000e+01  2.30000000e+01
   9.40000000e+01]
 [ 4.44089210e-16  1.37000000e+02  4.00000000e+01  3.50000000e+01
   1.68000000e+02]
 [ 5.00000000e+00  1.16000000e+02  7.40000000e+01  2.71743260e+01
   1.42157973e+02]
 [ 3.00000000e+00  7.80000000e+01  5.00000000e+01  3.20000000e+01
   8.80000000e+01]
 [ 1.00000000e+01  1.15000000e+02  7.23590007e+01  3.11451736e+01
   1.36394451e+02]
 [ 2.00000000e+00  1.97000000e+02  7.00000000e+01  4.50000000e+01
   5.43000000e+02]
 [ 8.00000000e+00  1.25000000e+02  9.60000000e+01  3.40246298e+01
   1.54691519e+02]
 [ 4.00000000e+00  1.10000000e+02  9.20000000e+01  3.14995320e+01
   1.34179263e+02]
 [ 1.00000000e+01  1.68000000e+02  7.40000000e+01  3.21714756e+01

In [25]:
# KNN imputation on the bi-scaled data, then map the filled matrix back to the original scale.
# k=3 matches the earlier KNN run on the raw data so the two results are comparable.
# NOTE(review): the k used for the recorded output is not visible in this cell — confirm.
dataset_knn2_norm = KNN(k=3).fit_transform(dataset_normalized)
dataset_knn2 = biscaler.inverse_transform(dataset_knn2_norm)

Imputing row 1/768 with 1 missing, elapsed time: 0.121
Imputing row 101/768 with 2 missing, elapsed time: 0.124
Imputing row 201/768 with 1 missing, elapsed time: 0.126
Imputing row 301/768 with 3 missing, elapsed time: 0.128
Imputing row 401/768 with 2 missing, elapsed time: 0.130
Imputing row 501/768 with 0 missing, elapsed time: 0.132
Imputing row 601/768 with 1 missing, elapsed time: 0.135
Imputing row 701/768 with 0 missing, elapsed time: 0.137


In [26]:
# Compare the two KNN results side by side (first 20 rows, first 5 columns):
# first dataset_knn (fit on the raw data), then dataset_knn2 (the bi-scaled variant)
print(dataset_knn[:20,:5])
print(dataset_knn2[:20,:5])

[[  6.         148.          72.          35.         125.36558184]
 [  1.          85.          66.          29.          70.05168329]
 [  8.         183.          64.          30.07607134 213.87662594]
 [  1.          89.          66.          23.          94.        ]
 [  0.         137.          40.          35.         168.        ]
 [  5.         116.          74.          18.29069438 109.53902649]
 [  3.          78.          50.          32.          88.        ]
 [ 10.         115.          70.02099219  37.66165695 145.903932  ]
 [  2.         197.          70.          45.         543.        ]
 [  8.         125.          96.          22.72225287 203.1200001 ]
 [  4.         110.          92.          33.71030344 138.17720456]
 [ 10.         168.          74.          36.87399147 159.47526636]
 [ 10.         139.          80.          23.43560973 176.35001733]
 [  1.         189.          60.          23.         846.        ]
 [  5.         166.          72.          19.   