In [2]:
import numpy as np
import pandas as pd
from ultra_impute import MissFiller

In [11]:
# Sample data: the same 5x4 table in several container types. Each version
# has two missing values (row 0 / column C and row 2 / column B).

# d1: plain dict mapping column label -> list of values.
d1 = {
    'A': [0, 5, 10, 15, 20],
    'B': [1, 6, np.nan, 16, 21],
    'C': [np.nan, 7, 12, 17, 22],
    'D': [3, 8, 13, 18, 23],
}

# d2: the same values as a 2-D float ndarray (one inner list per row).
d2 = np.array([
    [0., 1., np.nan, 3.],
    [5., 6., 7., 8.],
    [10., np.nan, 12., 13.],
    [15., 16., 17., 18.],
    [20., 21., 22., 23.],
])

# d3 and d4: DataFrames built from d1 and d2, respectively.
d3 = pd.DataFrame(d1)
# Labels match d1's keys exactly (no stray whitespace), so d3 and d4 share
# the same column names.
d4 = pd.DataFrame(d2, columns=['A', 'B', 'C', 'D'])

# Create a MissFiller from each supported input type in turn: the input can
# be a dictionary, an np.ndarray or a pd.DataFrame.
# Only the last instance (built from d4) stays bound to `mf`.
for source in (d1, d2, d3, d4):
    mf = MissFiller(source)

# Display the wrapped data so the missing cells are visible.
mf.df

Unnamed: 0,A,B,C,D
0,0.0,1.0,,3.0
1,5.0,6.0,7.0,8.0
2,10.0,,12.0,13.0
3,15.0,16.0,17.0,18.0
4,20.0,21.0,22.0,23.0


In [5]:
# Remove every row that contains at least one missing value.
# With the sample data, rows 0 and 2 are dropped.
mf.remove_na()

Unnamed: 0,A,B,C,D
1,5,6.0,7.0,8
3,15,16.0,17.0,18
4,20,21.0,22.0,23


In [10]:
# Remove every column that contains at least one missing value.
# With the sample data, columns B and C are dropped.
mf.remove_na(axis=1)

Unnamed: 0,A,D
0,0.0,3.0
1,5.0,8.0
2,10.0,13.0
3,15.0,18.0
4,20.0,23.0


In [16]:
# Fill each missing value with the mean of the observed values in its row
# (axis=1), e.g. row 0: (0 + 1 + 3) / 3 = 1.333.
mf.fill_trend(axis=1, method='mean')

Unnamed: 0,A,B,C,D
0,0.0,1.0,1.333333,3.0
1,5.0,6.0,7.0,8.0
2,10.0,11.666667,12.0,13.0
3,15.0,16.0,17.0,18.0
4,20.0,21.0,22.0,23.0


In [17]:
# Fill each missing value with the mean of the observed values in its column
# (axis=0), e.g. column B: (1 + 6 + 16 + 21) / 4 = 11.
mf.fill_trend(axis=0, method='mean')

Unnamed: 0,A,B,C,D
0,0.0,1.0,14.5,3.0
1,5.0,6.0,7.0,8.0
2,10.0,11.0,12.0,13.0
3,15.0,16.0,17.0,18.0
4,20.0,21.0,22.0,23.0


In [19]:
# Back-fill missing values by row.
# When axis=1, "back fill" means a missing value is filled with the value on its right side.
mf.fill_trend(axis=1, method='bfill')

Unnamed: 0,A,B,C,D
0,0.0,1.0,3.0,3.0
1,5.0,6.0,7.0,8.0
2,10.0,12.0,12.0,13.0
3,15.0,16.0,17.0,18.0
4,20.0,21.0,22.0,23.0


In [20]:
# Back-fill missing values by column.
# When axis=0, "back fill" means a missing value is filled with the value below it.
mf.fill_trend(axis=0, method='bfill')

Unnamed: 0,A,B,C,D
0,0.0,1.0,7.0,3.0
1,5.0,6.0,7.0,8.0
2,10.0,16.0,12.0,13.0
3,15.0,16.0,17.0,18.0
4,20.0,21.0,22.0,23.0


In [21]:
# Fill each missing value with a random value chosen from the same column (axis=0).
# NOTE(review): the fill is random and no seed is set in this notebook, so
# re-running the cell may show different values - confirm whether MissFiller
# offers a way to seed it.
mf.fill_rand(axis=0)

Unnamed: 0,A,B,C,D
0,0.0,1.0,7.0,3.0
1,5.0,6.0,7.0,8.0
2,10.0,6.0,12.0,13.0
3,15.0,16.0,17.0,18.0
4,20.0,21.0,22.0,23.0


In [22]:
# Fill each missing value with a random value chosen from the same row (axis=1).
# NOTE(review): the fill is random and no seed is set in this notebook, so
# re-running the cell may show different values - confirm whether MissFiller
# offers a way to seed it.
mf.fill_rand(axis=1)

Unnamed: 0,A,B,C,D
0,0.0,1.0,0.0,3.0
1,5.0,6.0,7.0,8.0
2,10.0,12.0,12.0,13.0
3,15.0,16.0,17.0,18.0
4,20.0,21.0,22.0,23.0


In [24]:
# Fill missing values with the mean calculated from a sliding window.
# The default axis=0 means the sliding window moves along the columns.
mf.fill_mw()

Unnamed: 0,A,B,C,D
0,0.0,1.0,9.5,3.0
1,5.0,6.0,7.0,8.0
2,10.0,7.666667,12.0,13.0
3,15.0,16.0,17.0,18.0
4,20.0,21.0,22.0,23.0


In [25]:
# Fill missing values using the 'fKNN' algorithm (fast k-nearest neighbour).
# When axis=1 (the default), the columns are searched for the nearest neighbours.
mf.fill_fKNN()

Unnamed: 0,A,B,C,D
0,0.0,1.0,1.473573,3.0
1,5.0,6.0,7.0,8.0
2,10.0,11.535419,12.0,13.0
3,15.0,16.0,17.0,18.0
4,20.0,21.0,22.0,23.0


In [26]:
# Fill missing values using the 'fKNN' algorithm (fast k-nearest neighbour).
# When axis=0, the rows are searched for the nearest neighbours.
mf.fill_fKNN(axis=0)

Unnamed: 0,A,B,C,D
0,0.0,1.0,9.510232,3.0
1,5.0,6.0,7.0,8.0
2,10.0,9.596187,12.0,13.0
3,15.0,16.0,17.0,18.0
4,20.0,21.0,22.0,23.0


In [28]:
# Fill missing values using scikit-learn's 'KNNImputer'.
# When axis=1 (the default), the columns are searched for the nearest neighbours.
mf.fill_KNN()

Unnamed: 0,A,B,C,D
0,0.0,1.0,1.333333,3.0
1,5.0,6.0,7.0,8.0
2,10.0,11.666667,12.0,13.0
3,15.0,16.0,17.0,18.0
4,20.0,21.0,22.0,23.0
