In [4]:
import numpy as np
import pandas as pd
from ultra_impute import MissFiller
from util import calculate_metrics

`MissFiller` can be initialized with a dictionary, NumPy ndarray, or pandas DataFrame. Dictionaries and ndarrays will be converted to DataFrames upon initialization of the `MissFiller` class.

In [2]:
# d1 is a dictionary: each key is a column label, each value is that column.
d1 = {
    'A': [0, 5, 10, np.nan, 20],
    'B': [1, 6, np.nan, 16, 21],
    'C': [np.nan, 7, 12, 17, 22],
    'D': [3, 8, 13, 18, 23],
    'E': [4, 9, 14, 19, np.nan],
}

# d2 is a NumPy ndarray holding the same table, one inner list per row.
d2 = np.array([
    [0., 1., np.nan, 3., 4],
    [5., 6., 7., 8., 9],
    [10., np.nan, 12., 13., 14],
    [np.nan, 16., 17., 18., 19],
    [20., 21., 22., 23., np.nan],
])

# d3 and d4 are DataFrames created from d1 and d2, respectively.
# An ndarray carries no column labels, so d4 is given them explicitly; they
# must match d1's keys exactly (no stray whitespace) for d3 and d4 to agree.
d3 = pd.DataFrame(d1)
d4 = pd.DataFrame(d2, columns=['A', 'B', 'C', 'D', 'E'])

# Wrap each input (dict, ndarray, and the two DataFrames) in its own
# MissFiller, in order; mf1..mf4 are meant to hold the same data.
mf1, mf2, mf3, mf4 = map(MissFiller, (d1, d2, d3, d4))

# Display the DataFrame held by the first MissFiller.
mf1.df

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,,3,4.0
1,5.0,6.0,7.0,8,9.0
2,10.0,,12.0,13,14.0
3,,16.0,17.0,18,19.0
4,20.0,21.0,22.0,23,


In [None]:
# Get the row and column indices of the missing values in mf1,
# then display them.
na_locations = mf1.get_na_indices()
na_locations

In [24]:
# Remove rows that contain any missing values (axis=0 is the default).
mf1.remove_na()

Unnamed: 0,A,B,C,D,E
1,5.0,6.0,7.0,8,9.0


In [25]:
# Remove columns that contain any missing values.
mf1.remove_na(axis=1)

Unnamed: 0,D
0,3
1,8
2,13
3,18
4,23


In [11]:
# Fill each missing value with the mean of the observed values in its row.
mf1.fill_trend(axis=1, method='mean')

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,12.25,12.0,13.0,14.0
3,17.5,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,21.5


In [12]:
# Fill each missing value with the mean of the observed values in its column.
mf1.fill_trend(axis=0, method='mean')

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,14.5,3,4.0
1,5.0,6.0,7.0,8,9.0
2,10.0,11.0,12.0,13,14.0
3,8.75,16.0,17.0,18,19.0
4,20.0,21.0,22.0,23,11.5


In [4]:
# Back fill missing values along each row.
# When axis=1, "back fill" means a missing value is filled with the
# value to its right; a missing value in the last column has nothing
# to its right and stays missing.
mf1.fill_trend(axis=1, method='bfill')

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,3.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,12.0,12.0,13.0,14.0
3,16.0,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,


In [5]:
# Back fill missing values along each column.
# When axis=0, "back fill" means a missing value is filled with the
# value below it; a missing value in the last row stays missing.
mf1.fill_trend(axis=0, method='bfill')

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,7.0,3,4.0
1,5.0,6.0,7.0,8,9.0
2,10.0,16.0,12.0,13,14.0
3,20.0,16.0,17.0,18,19.0
4,20.0,21.0,22.0,23,


In [15]:
# Fill each missing value with a random value chosen from the same column.
# NOTE(review): no seed is set here, so the output presumably varies
# between runs -- confirm whether fill_rand can be seeded.
mf1.fill_rand(axis=0)

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,12.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,21.0,12.0,13.0,14.0
3,5.0,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,4.0


In [16]:
# Fill each missing value with a random value chosen from the same row.
# NOTE(review): no seed is set here, so the output presumably varies
# between runs -- confirm whether fill_rand can be seeded.
mf1.fill_rand(axis=1)

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,4.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,10.0,12.0,13.0,14.0
3,16.0,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,23.0


In [6]:
# Fill missing values with the mean calculated from a sliding window.
# The default, axis=0, means the sliding window moves along the columns.
mf1.fill_mw()

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,9.5,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,11.666667,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,16.5


In [6]:
# Fill missing values using the 'fKNN' algorithm (fast k-nearest neighbour).
# When axis = 1 (default), search columns for the nearest neighbours.
mf1.fill_fKNN()

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,1.673789,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.585308,12.0,13.0,14.0
3,16.627234,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,22.525584


In [7]:
# Same 'fKNN' fill, but with axis = 0: search rows for the nearest neighbours.
mf1.fill_fKNN(axis=0)

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,9.505107,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,10.24221,12.0,13.0,14.0
3,11.733663,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,16.302562


In [7]:
# Fill missing values using sklearn's 'KNNImputer'.
# When axis = 1 (default), search columns for the nearest neighbours.
# The results of KNN (which uses the mean) might differ slightly
# from those of fKNN (which uses the weighted mean).
mf1.fill_KNN()

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,12.25,12.0,13.0,14.0
3,17.5,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,21.5


In [9]:
# Impute missing data using the Expectation-Maximization (EM) algorithm.
mf1.fill_EM()

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,-1.765171,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,12.520981,12.0,13.0,14.0
3,17.245379,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,20.940237


In [10]:
# Impute missing data using Buck's method.
mf1.fill_Buck()

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,24.0


In [12]:
# Impute missing values using nuclear norm minimization
# (NuclearNormMinimization).
# This method can be very slow for large datasets, and the call prints
# the CVXPY solver log before returning the filled table.
mf1.fill_NNM()

                                     CVXPY                                     
                                     v1.5.3                                    
(CVXPY) Oct 25 10:36:08 PM: Your problem has 25 variables, 25 constraints, and 0 parameters.
(CVXPY) Oct 25 10:36:08 PM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Oct 25 10:36:08 PM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Oct 25 10:36:08 PM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
(CVXPY) Oct 25 10:36:08 PM: Your problem is compiled with the CPP canonicalization backend.
-------------------------------------------------------------------------------
                                  Compilation                                  
-------------------------------------------------------------------------------
(CVXPY) Oct 25 10:36:08 PM: Compiling problem (target solver=CVXOPT).
(

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,1.999666,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,10.999334,12.0,13.0,14.0
3,14.998666,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,23.998331


In [17]:
# Matrix completion by iterative soft thresholding of SVD decompositions.
# Similar to the R softImpute package. Prints the observed MAE and rank
# for each iteration.
mf1.fill_SoftImpute()

[SoftImpute] Max Singular Value of X_init = 58.112124
[SoftImpute] Iter 1: observed MAE=0.439340 rank=5
[SoftImpute] Iter 2: observed MAE=0.443483 rank=5
[SoftImpute] Iter 3: observed MAE=0.438852 rank=4
[SoftImpute] Iter 4: observed MAE=0.434964 rank=4
[SoftImpute] Iter 5: observed MAE=0.434751 rank=4
[SoftImpute] Iter 6: observed MAE=0.436287 rank=4
[SoftImpute] Iter 7: observed MAE=0.438504 rank=4
[SoftImpute] Iter 8: observed MAE=0.440790 rank=4
[SoftImpute] Iter 9: observed MAE=0.442632 rank=4
[SoftImpute] Iter 10: observed MAE=0.443982 rank=4
[SoftImpute] Iter 11: observed MAE=0.431866 rank=3
[SoftImpute] Iter 12: observed MAE=0.423467 rank=3
[SoftImpute] Iter 13: observed MAE=0.417864 rank=3
[SoftImpute] Iter 14: observed MAE=0.412525 rank=3
[SoftImpute] Iter 15: observed MAE=0.407492 rank=3
[SoftImpute] Iter 16: observed MAE=0.403333 rank=3
[SoftImpute] Iter 17: observed MAE=0.405281 rank=3
[SoftImpute] Iter 18: observed MAE=0.405432 rank=4
[SoftImpute] Iter 19: observed MAE=0.

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,1.937622,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,10.966751,12.0,13.0,14.0
3,14.146055,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,22.73716


In [15]:
# Matrix completion by iterative low-rank SVD decomposition.
# The input DataFrame must have at least 5 columns.
mf1.fill_IterativeSVD()

[IterativeSVD] Iter 1: observed MAE=2.198553
[IterativeSVD] Iter 2: observed MAE=0.342949
[IterativeSVD] Iter 3: observed MAE=0.000559


Unnamed: 0,A,B,C,D,E
0,0.0,1.0,1.8377,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,10.709216,12.0,13.0,14.0
3,11.556256,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,12.784147


In [16]:
# A strategy for imputing missing values by modeling each feature with
# missing values as a function of other features in a round-robin fashion.
# Same as MICE (Multiple Imputation by Chained Equations) in R.
mf1.fill_IterativeImputer()

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,24.0


In [18]:
# Direct factorization of the incomplete matrix into low-rank U and V,
# with per-row and per-column biases, as well as a global bias.
# Prints the observed MAE as the iterations progress.
mf1.fill_MatrixFactorization()

[MatrixFactorization] Iter 10: observed MAE=4.061846 rank=40
[MatrixFactorization] Iter 20: observed MAE=1.204612 rank=40
[MatrixFactorization] Iter 30: observed MAE=0.481113 rank=40
[MatrixFactorization] Iter 40: observed MAE=0.331092 rank=40
[MatrixFactorization] Iter 50: observed MAE=0.225590 rank=40
[MatrixFactorization] Iter 60: observed MAE=0.153135 rank=40
[MatrixFactorization] Iter 70: observed MAE=0.104026 rank=40
[MatrixFactorization] Iter 80: observed MAE=0.071123 rank=40
[MatrixFactorization] Iter 90: observed MAE=0.050189 rank=40
[MatrixFactorization] Iter 100: observed MAE=0.035853 rank=40
[MatrixFactorization] Iter 110: observed MAE=0.028006 rank=40
[MatrixFactorization] Iter 120: observed MAE=0.023827 rank=40
[MatrixFactorization] Iter 130: observed MAE=0.020789 rank=40
[MatrixFactorization] Iter 140: observed MAE=0.018600 rank=40
[MatrixFactorization] Iter 150: observed MAE=0.017009 rank=40
[MatrixFactorization] Iter 160: observed MAE=0.015774 rank=40
[MatrixFactorizat

Unnamed: 0,A,B,C,D,E
0,0.0,1.0,2.377297,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.004485,12.0,13.0,14.0
3,15.029141,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,22.978613


In [19]:
# Impute missing values using Random Forests.
mf1.fill_RF()

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4


Unnamed: 0,A,B,C,D,E
0,0.0,1.0,8.5,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,9.9,12.0,13.0,14.0
3,14.8,16.0,17.0,18.0,19.0
4,20.0,21.0,22.0,23.0,17.15


In [5]:
# Original data: the reference values the imputation is scored against.
original_df = pd.read_csv(
    "http://publicepidata.s3.amazonaws.com/GSE105018_N30_R10K.original.tsv",
    sep="\t",
    header=0,
    index_col=0,
)

# The same table with 10% of the values manually marked as missing
# (non-random, block-wise missingness).
d5 = pd.read_csv(
    "http://publicepidata.s3.amazonaws.com/GSE105018_N30_R10K.10per.tsv",
    sep="\t",
    header=0,
    index_col=0,
)
mf5 = MissFiller(d5)

# Predict the missing values, then display the original table.
filled_df = mf5.fill_more()
original_df

Binerize sample IDs into two groups using K-means ...
Group "0" contains 15 samples
	3442
	3311
	3312
	3341
	3342
	3411
	3412
	3461
	3462
	3471
	3472
	3501
	3502
	3451
	3452
Group "1" contains 15 samples
	2621
	2622
	2691
	2692
	3391
	3392
	2791
	2792
	3161
	3162
	3211
	3212
	3431
	3432
	3441
493 rows in group "0" are complete missing.
507 rows in group "1" are complete missing.
Predict missing values in group "0"
Iteration 0
The prediction score (coefficient of determination) is 99.10%
Iteration 1
The prediction score (coefficient of determination) is 99.15%
Iteration 2
The prediction score (coefficient of determination) is 99.00%
Iteration 3
The prediction score (coefficient of determination) is 98.89%
Iteration 4
The prediction score (coefficient of determination) is 98.93%
Iteration 5
The prediction score (coefficient of determination) is 99.04%
Iteration 6
The prediction score (coefficient of determination) is 98.96%
Iteration 7
The prediction score (coefficient of determination) 

In [6]:
# The locations of the 10% of values that were marked as missing (NA).
na_locations = mf5.get_na_indices()

# Calculate the imputation performance by passing the original table, the
# filled table, and the NA locations to calculate_metrics.
MAE, RAE, RMSE, MAPE, R2 = calculate_metrics(original_df, filled_df, na_locations)

In [7]:
# Display the MAE value returned by calculate_metrics.
MAE


0.020684513333333338