In [2]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
from lib import plot_decision_regions

In [3]:
import pandas as pd

In the example below, we simulate reading a CSV dataset that contains missing values.

In [4]:
from io import StringIO

# Inline CSV that stands in for a file on disk. One field is left empty in
# the second data row and the last data row has only three fields, so the
# parsed frame ends up with NaN in columns C and D.
csv_data = (
    "A,B,C,D\n"
    "1.0,2.0,3.0,4.0\n"
    "5.0,6.0,,8.0\n"
    "0.0,11.0,12.0"
)
df = pd.read_csv(StringIO(csv_data))

# the parsed frame: empty fields show up as NaN
print("Data set with empty values", df, sep="\n", end="\n\n")

# per-column count of missing entries
print("No. of empty values in each column", df.isnull().sum(), sep="\n", end="\n\n")

# the raw numpy array behind the DataFrame
print("Underlying numpy array", df.values, sep="\n")

Data set with empty values
     A     B     C    D
0  1.0   2.0   3.0  4.0
1  5.0   6.0   NaN  8.0
2  0.0  11.0  12.0  NaN

No. of empty values in each column
A    0
B    0
C    1
D    1
dtype: int64

Underlying numpy array
[[  1.   2.   3.   4.]
 [  5.   6.  nan   8.]
 [  0.  11.  12.  nan]]


## eliminating samples or features with missing values

In [5]:
# drop every row that contains at least one missing value (NaN)
df.dropna(axis='index', how='any')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [6]:
# drop every column that contains at least one NaN in any of its rows
df.dropna(axis='columns', how='any')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,0.0,11.0


In [7]:
# drop a row only when every one of its values is NaN
# (no such row exists in this frame, so nothing is removed)
df.dropna(axis='index', how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,0.0,11.0,12.0,


In [8]:
# keep only the rows that have at least 4 non-NaN values;
# rows with fewer are dropped
df.dropna(axis='index', thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [9]:
# drop only those rows where NaN appears in specific columns (here: 'C');
# NaN values in other columns are ignored
df.dropna(axis='index', subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,0.0,11.0,12.0,


## imputing missing values

One of the most common interpolation techniques is
**mean imputation**, where we simply replace each missing value
with the mean of the non-missing values in the same feature column.

In [15]:
# Mean imputation with scikit-learn: each NaN is replaced by the mean of the
# non-missing values in its column.
try:
    # scikit-learn >= 0.20 provides SimpleImputer; the legacy Imputer class
    # was deprecated in 0.20 and removed in 0.22.
    from sklearn.impute import SimpleImputer
except ImportError:
    # Fallback for older scikit-learn releases that only ship Imputer.
    from sklearn.preprocessing import Imputer
    # axis=0 for columns (axis=1 for rows)
    imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
else:
    # SimpleImputer has no axis argument: it always imputes column-wise.
    imr = SimpleImputer(missing_values=np.nan, strategy='mean')
# other strategies: most_frequent, median

# The mean is calculated separately for each column. Fit and transform use the
# same ndarray so the imputer sees identical input types in both calls.
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data

array([[  1. ,   2. ,   3. ,   4. ],
       [  5. ,   6. ,   7.5,   8. ],
       [  0. ,  11. ,  12. ,   6. ]])