In [14]:
import numpy as np
import pandas as pd

try:
    # scikit-learn >= 0.20 ships SimpleImputer as the replacement for Imputer.
    from sklearn.impute import SimpleImputer
except ImportError:
    SimpleImputer = None
try:
    # Imputer was deprecated in scikit-learn 0.20 and removed in 0.22.
    from sklearn.preprocessing import Imputer
except ImportError:
    Imputer = None

In [2]:
# Missing-value handling: detect, drop, and fill NaN entries in a DataFrame

In [3]:
# Build a 6x4 frame of standard-normal samples to experiment on.
# A dedicated, seeded RandomState keeps the values identical on every
# "Restart & Run All" without touching NumPy's global random state.
SEED = 42
df = pd.DataFrame(
    np.random.RandomState(SEED).randn(6, 4),
    columns=['col1', 'col2', 'col3', 'col4'],
)
print(df)

       col1      col2      col3      col4
0  0.920660 -0.339486 -2.105358 -1.108579
1 -1.081427 -0.738493  0.642121 -0.418305
2  1.077503 -0.192174 -0.596865 -0.733768
3  1.036110  0.077610 -0.429882  0.884916
4 -0.801923 -0.428444  0.595591  0.324868
5  2.397195  1.957078  0.615738 -0.146799


In [4]:
# Inject missing values: rows 1 and 5 of the second column,
# and row 4 of the fourth column (all positions are zero-based).
df.iloc[[1, 5], 1] = np.nan
df.iloc[4, 3] = np.nan
print(df)

       col1      col2      col3      col4
0  0.920660 -0.339486 -2.105358 -1.108579
1 -1.081427       NaN  0.642121 -0.418305
2  1.077503 -0.192174 -0.596865 -0.733768
3  1.036110  0.077610 -0.429882  0.884916
4 -0.801923 -0.428444  0.595591       NaN
5  2.397195       NaN  0.615738 -0.146799


In [5]:
# Element-wise boolean mask: True wherever df holds a missing value.
nan_all = pd.isnull(df)
print(nan_all)

    col1   col2   col3   col4
0  False  False  False  False
1  False   True  False  False
2  False  False  False  False
3  False  False  False  False
4  False  False  False   True
5  False   True  False  False


In [6]:
# Per-column summaries of the missing-value mask.
nan_col1 = df.isnull().any(axis=0)  # columns that contain at least one NA
nan_col2 = df.isnull().all(axis=0)  # columns that consist entirely of NA
print(nan_col1, '=======================', nan_col2, sep='\n')

col1    False
col2     True
col3    False
col4     True
dtype: bool
col1    False
col2    False
col3    False
col4    False
dtype: bool


In [7]:
# Drop every row that contains at least one missing value.
df2 = df.dropna(axis=0, how='any')
print(df2)

       col1      col2      col3      col4
0  0.920660 -0.339486 -2.105358 -1.108579
2  1.077503 -0.192174 -0.596865 -0.733768
3  1.036110  0.077610 -0.429882  0.884916


In [8]:
# Replacement rule: fill each NaN with the mean of its column.
# Prefer SimpleImputer (scikit-learn >= 0.20); it always imputes column-wise,
# so the legacy `axis=0` argument has no counterpart. Fall back to the legacy
# Imputer only on old scikit-learn releases where SimpleImputer is missing.
if SimpleImputer is not None:
    nan_model = SimpleImputer(missing_values=np.nan, strategy='mean')
else:
    nan_model = Imputer(missing_values='NaN', strategy='mean', axis=0)
# fit_transform returns a NumPy array, not a DataFrame.
nan_result = nan_model.fit_transform(df)
print(nan_result)

[[ 0.92066048 -0.33948615 -2.10535763 -1.10857872]
 [-1.08142684 -0.22062365  0.64212069 -0.41830474]
 [ 1.07750316 -0.19217392 -0.59686545 -0.73376822]
 [ 1.03611002  0.07760978 -0.42988158  0.88491647]
 [-0.80192276 -0.42844431  0.59559068 -0.3045069 ]
 [ 2.39719516 -0.22062365  0.61573834 -0.14679927]]


In [9]:
# Backward fill: replace each missing value with the next valid value below it
# in the same column. A NaN with no valid value after it (e.g. in the last row)
# stays NaN. DataFrame.bfill() is used instead of fillna(method='backfill')
# because the `method` argument of fillna is deprecated in pandas >= 2.1.
nan_result_pd1 = df.bfill()
print(nan_result_pd1)

       col1      col2      col3      col4
0  0.920660 -0.339486 -2.105358 -1.108579
1 -1.081427 -0.192174  0.642121 -0.418305
2  1.077503 -0.192174 -0.596865 -0.733768
3  1.036110  0.077610 -0.429882  0.884916
4 -0.801923 -0.428444  0.595591 -0.146799
5  2.397195       NaN  0.615738 -0.146799


In [10]:
# Backward fill with limit=1: within each run of consecutive NaNs in a column,
# at most one value is filled from the next valid value below it; `limit` is
# not a cap on the total number of fills per column.
# DataFrame.bfill() is used instead of fillna(method='bfill') because the
# `method` argument of fillna is deprecated in pandas >= 2.1.
nan_result_pd2 = df.bfill(limit=1)
print(nan_result_pd2)

       col1      col2      col3      col4
0  0.920660 -0.339486 -2.105358 -1.108579
1 -1.081427 -0.192174  0.642121 -0.418305
2  1.077503 -0.192174 -0.596865 -0.733768
3  1.036110  0.077610 -0.429882  0.884916
4 -0.801923 -0.428444  0.595591 -0.146799
5  2.397195       NaN  0.615738 -0.146799


In [11]:
# Forward fill: replace each missing value with the last valid value above it
# in the same column. DataFrame.ffill() is used instead of fillna(method='pad')
# because the `method` argument of fillna is deprecated in pandas >= 2.1.
nan_result_pd3 = df.ffill()
print(nan_result_pd3)

       col1      col2      col3      col4
0  0.920660 -0.339486 -2.105358 -1.108579
1 -1.081427 -0.339486  0.642121 -0.418305
2  1.077503 -0.192174 -0.596865 -0.733768
3  1.036110  0.077610 -0.429882  0.884916
4 -0.801923 -0.428444  0.595591  0.884916
5  2.397195 -0.428444  0.615738 -0.146799


In [12]:
# Replace every missing value with the constant 0.
nan_result_pd4 = df.fillna(value=0)
print(nan_result_pd4)

       col1      col2      col3      col4
0  0.920660 -0.339486 -2.105358 -1.108579
1 -1.081427  0.000000  0.642121 -0.418305
2  1.077503 -0.192174 -0.596865 -0.733768
3  1.036110  0.077610 -0.429882  0.884916
4 -0.801923 -0.428444  0.595591  0.000000
5  2.397195  0.000000  0.615738 -0.146799


In [13]:
# Fill missing values with a different constant per column;
# columns not named in the mapping are left untouched.
nan_result_pd5 = df.fillna(
    value={
        'col2': 1.1,
        'col4': 1.2,
    }
)
print(nan_result_pd5)

       col1      col2      col3      col4
0  0.920660 -0.339486 -2.105358 -1.108579
1 -1.081427  1.100000  0.642121 -0.418305
2  1.077503 -0.192174 -0.596865 -0.733768
3  1.036110  0.077610 -0.429882  0.884916
4 -0.801923 -0.428444  0.595591  1.200000
5  2.397195  1.100000  0.615738 -0.146799


In [18]:
# Fill missing values with the mean of their own column.
# The label slice 'col2':'col4' is inclusive on both ends, so the means of
# col2, col3 and col4 are used; col1 is not filled.
nan_result_pd6 = df.fillna(value=df.mean().loc['col2':'col4'])
print(nan_result_pd6)

       col1      col2      col3      col4
0  0.920660 -0.339486 -2.105358 -1.108579
1 -1.081427 -0.220624  0.642121 -0.418305
2  1.077503 -0.192174 -0.596865 -0.733768
3  1.036110  0.077610 -0.429882  0.884916
4 -0.801923 -0.428444  0.595591 -0.304507
5  2.397195 -0.220624  0.615738 -0.146799
