In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Data Cleaning: handling of missing values, outliers, and duplicate values
### 1.1 Missing values

In [7]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

try:
    # Imputer was removed from scikit-learn (0.22+); the import is kept only
    # so that very old environments still expose the name.
    from sklearn.preprocessing import Imputer
except ImportError:
    Imputer = None

In [3]:
# Build a 6x4 frame of standard-normal samples and plant two missing values.
# A seeded generator keeps the data identical on every Restart & Run All.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.standard_normal((6, 4)), columns=['col1', 'col2', 'col3', 'col4'])
df.iloc[1, 1] = np.nan   # row 1, col2
df.iloc[4, 3] = np.nan   # row 4, col4
print(df)

       col1      col2      col3      col4
0  1.308397 -0.347358 -0.950502  0.518894
1 -0.942474       NaN -1.488788 -0.436018
2  0.105190  2.187603 -0.084410  0.015061
3 -1.611349 -0.961022 -0.440345 -0.477986
4  1.578641  0.804782  0.316604       NaN
5 -0.596343  0.527066 -0.028920 -1.894049


In [10]:
# Locate missing values: an element-wise mask first, then per-column summaries.
nan_all = df.isna()
nan_col1 = nan_all.any()   # True for columns holding at least one NaN
nan_col2 = nan_all.all()   # True for columns made up entirely of NaN
for report in (nan_all, nan_col1, nan_col2):
    print(report)

# Drop every row that contains at least one NaN.
df_without_nan = df.dropna(axis=0, how='any')
print(df_without_nan)

# Impute with scikit-learn: each NaN is replaced by the mean of its column.
# SimpleImputer is used because the older Imputer class is deprecated.
nan_model = SimpleImputer(missing_values=np.nan, strategy='mean')
nan_model.fit(df)
nan_result = nan_model.transform(df)
print(nan_result)

    col1   col2   col3   col4
0  False  False  False  False
1  False   True  False  False
2  False  False  False  False
3  False  False  False  False
4  False  False  False   True
5  False  False  False  False
col1    False
col2     True
col3    False
col4     True
dtype: bool
col1    False
col2    False
col3    False
col4    False
dtype: bool
       col1      col2      col3      col4
0  1.308397 -0.347358 -0.950502  0.518894
2  0.105190  2.187603 -0.084410  0.015061
3 -1.611349 -0.961022 -0.440345 -0.477986
5 -0.596343  0.527066 -0.028920 -1.894049
[[ 1.30839668 -0.34735768 -0.95050151  0.51889445]
 [-0.94247355  0.44221421 -1.48878795 -0.4360184 ]
 [ 0.10519015  2.18760263 -0.08441017  0.01506075]
 [-1.61134911 -0.9610216  -0.44034505 -0.47798575]
 [ 1.57864123  0.80478201  0.31660423 -0.45481949]
 [-0.59634302  0.52706567 -0.02891958 -1.89404852]]


In [12]:
# use pandas to deal with missing values
nan_result_pd1 = df.fillna(method='backfill')    # use back value to fill in NaN
nan_result_pd2 = df.fillna(method='bfill', limit=1)    # use back value,limit one value in every column
nan_result_pd3 = df.fillna(method='pad')     # use front value
nan_result_pd4 = df.fillna(0)    # use 0 to fill in nan
nan_result_pd5 = df.fillna({ 'col2': 1.1, 'col4': 1.2})   # use specified value to fill in nan
nan_result_pd6 = df.fillna(df.mean()['col2':'col4'])   # use mean value of column to fill in nan

print(nan_result_pd1)
print(nan_result_pd2)
print(nan_result_pd3)
print(nan_result_pd4)
print(nan_result_pd5)
print(nan_result_pd6)

       col1      col2      col3      col4
0  1.308397 -0.347358 -0.950502  0.518894
1 -0.942474  2.187603 -1.488788 -0.436018
2  0.105190  2.187603 -0.084410  0.015061
3 -1.611349 -0.961022 -0.440345 -0.477986
4  1.578641  0.804782  0.316604 -1.894049
5 -0.596343  0.527066 -0.028920 -1.894049
       col1      col2      col3      col4
0  1.308397 -0.347358 -0.950502  0.518894
1 -0.942474  2.187603 -1.488788 -0.436018
2  0.105190  2.187603 -0.084410  0.015061
3 -1.611349 -0.961022 -0.440345 -0.477986
4  1.578641  0.804782  0.316604 -1.894049
5 -0.596343  0.527066 -0.028920 -1.894049
       col1      col2      col3      col4
0  1.308397 -0.347358 -0.950502  0.518894
1 -0.942474 -0.347358 -1.488788 -0.436018
2  0.105190  2.187603 -0.084410  0.015061
3 -1.611349 -0.961022 -0.440345 -0.477986
4  1.578641  0.804782  0.316604 -0.477986
5 -0.596343  0.527066 -0.028920 -1.894049
       col1      col2      col3      col4
0  1.308397 -0.347358 -0.950502  0.518894
1 -0.942474  0.000000 -1.488788 -0

### 1.2 Outliers
##### Using the standardized Z-score as the outlier criterion

In [13]:
# Small two-column sample; col1 holds one value (120) far away from the rest.
df_outlier = pd.DataFrame(
    [[1, 12], [120, 17], [3, 31], [5, 53], [2, 22], [12, 32], [13, 43]],
    columns=['col1', 'col2'],
)
print(df_outlier)

   col1  col2
0     1    12
1   120    17
2     3    31
3     5    53
4     2    22
5    12    32
6    13    43


In [15]:
# Z-score outlier flags: standardize each column with its own mean and
# standard deviation, then mark every value whose |z| is greater than 2.2.
z_scores = (df_outlier - df_outlier.mean()) / df_outlier.std()
df_zscore = z_scores.abs() > 2.2

print(df_zscore)

    col1   col2
0  False  False
1   True  False
2  False  False
3  False  False
4  False  False
5  False  False
6  False  False


### 2.1 Convert categorical and ordinal data to dummy (indicator) variables

In [16]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [17]:
# Sample records: an identifier, two string-valued categorical columns and a
# numeric score.
df_convert = pd.DataFrame(
    [
        (3566841, 'male', 'high', 1),
        (6541227, 'Female', 'low', 2),
        (3512441, 'Female', 'middle', 3),
    ],
    columns=['id', 'sex', 'level', 'score'],
)
print(df_convert)

        id     sex   level  score
0  3566841    male    high      1
1  6541227  Female     low      2
2  3512441  Female  middle      3


In [21]:
# One-hot encode with scikit-learn. The id column is set aside first and
# re-attached to the encoded matrix afterwards.
id_data = df_convert[['id']]
raw_convert_data = df_convert.iloc[:, 1:]

# Dummy (one-hot) coding: every distinct value in each remaining column,
# the numeric score included, becomes its own 0/1 column.
model_enc = OneHotEncoder()
model_enc.fit(raw_convert_data)
df_convert_new = model_enc.transform(raw_convert_data).toarray()
encoded_frame = pd.DataFrame(df_convert_new)
df_all = pd.concat([id_data, encoded_frame], axis=1)
print(df_all)

        id    0    1    2    3    4    5    6    7
0  3566841  0.0  1.0  1.0  0.0  0.0  1.0  0.0  0.0
1  6541227  1.0  0.0  0.0  1.0  0.0  0.0  1.0  0.0
2  3512441  1.0  0.0  0.0  0.0  1.0  0.0  0.0  1.0


In [22]:
# Dummy-encode the same feature columns with pandas, then put the id column
# back in front of the encoded frame.
# NOTE(review): the dtype of the generated indicator columns depends on the
# pandas version — confirm it matches what downstream code expects.
df_new = pd.get_dummies(raw_convert_data)
df_all2 = pd.concat([id_data, df_new], axis=1)
print(df_all2)

        id  score  sex_Female  sex_male  level_high  level_low  level_middle
0  3566841      1           0         1           1          0             0
1  6541227      2           1         0           0          1             0
2  3512441      3           1         0           0          0             1
