In [1]:
import numpy as np

In [2]:
import pandas as pd

In [7]:
# 2x3 float array with exactly one missing value (NaN) in each row.
arr = np.array(
    [
        [1, np.nan, 3],
        [4, 5, np.nan],
    ]
)

# 1 isna：判断每个元素是否为 NaN

In [5]:
# pd.isna does not exist in the pandas 0.20.3 used by this notebook and raises
# AttributeError there. Catch the error and print it, so the failure is still
# shown but "Restart & Run All" does not stop at this cell.
try:
    isna_mask = pd.isna(arr)
except AttributeError as exc:
    isna_mask = None
    print("AttributeError:", exc)
# Displays the boolean mask when pd.isna is available; nothing when it is not.
isna_mask

AttributeError: module 'pandas' has no attribute 'isna'

In [6]:
pd.__version__

'0.20.3'

### 注意：pandas 0.20.3 中没有 isna() 函数（该函数自 pandas 0.21.0 起才提供）

# 2 使用相同功能的函数：isnull()

In [9]:
pd.isnull(arr)

array([[False,  True, False],
       [False, False,  True]], dtype=bool)

In [10]:
# One entry is None, so the resulting index contains a NaT (missing timestamp).
# The pasted "..." continuation prompt was removed: it only ran because IPython
# strips prompts, and is a SyntaxError in plain Python.
index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None,
                          "2017-07-08"])

In [11]:
index

DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'], dtype='datetime64[ns]', freq=None)

In [12]:
pd.isnull(index)

array([False, False,  True, False], dtype=bool)

In [13]:
# Two rows of strings; None marks the single missing cell (row 1, column 1).
df = pd.DataFrame(
    [
        ['ant', 'bee', 'cat'],
        ['dog', None, 'fly'],
    ]
)

In [14]:
df

Unnamed: 0,0,1,2
0,ant,bee,cat
1,dog,,fly


In [15]:
pd.isnull(df)

Unnamed: 0,0,1,2
0,False,False,False
1,False,True,False


In [17]:
# Same frame as before, but the missing cell is written as np.nan instead of None.
df = pd.DataFrame(
    [
        ['ant', 'bee', 'cat'],
        ['dog', np.nan, 'fly'],
    ]
)

In [18]:
pd.isnull(df)

Unnamed: 0,0,1,2
0,False,False,False
1,False,True,False


# 3 功能相反的函数：notnull()

In [21]:
pd.notnull(df)

Unnamed: 0,0,1,2
0,True,True,True
1,True,False,True


# 4 缺失数据（missing data）进阶

In [23]:
# 5x3 frame of standard-normal draws with non-contiguous row labels.
# No random seed is set, so the numbers differ on every run.
# The pasted "   ...:" IPython continuation prompts were removed so that the
# cell is valid Python outside IPython as well.
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f', 'h'],
                  columns=['one', 'two', 'three'])

In [24]:
df

Unnamed: 0,one,two,three
a,0.581697,1.238685,-0.726714
c,1.430349,-1.511313,-0.735181
e,-0.688284,-0.41915,0.07177
f,-1.377813,0.644592,-0.218164
h,0.61274,-1.549453,-0.828304


In [27]:
# Reindex onto the labels a..h: labels that df did not have before (b, d, g)
# become new rows whose values all default to NaN.
df = df.reindex(['a','b','c','d','e','f','g','h'])

In [28]:
df

Unnamed: 0,one,two,three
a,0.581697,1.238685,-0.726714
b,,,
c,1.430349,-1.511313,-0.735181
d,,,
e,-0.688284,-0.41915,0.07177
f,-1.377813,0.644592,-0.218164
g,,,
h,0.61274,-1.549453,-0.828304


In [29]:
# Anti-pattern: NaN never compares equal to anything (itself included), so this
# mask is False in every row, even for the all-NaN rows b, d and g.
df['one']==np.nan

a    False
b    False
c    False
d    False
e    False
f    False
g    False
h    False
Name: one, dtype: bool

In [32]:
# Anti-pattern: comparing with None via == is also False in every row, so it
# cannot be used to find the missing entries either.
df['one']==None

a    False
b    False
c    False
d    False
e    False
f    False
g    False
h    False
Name: one, dtype: bool

### 判断某列或某行是否含有缺失数据（missing data）时，以上两种写法都不可取：NaN 与任何值（包括它自身）比较都不相等，所以 == 的结果全为 False。

### 正确的做法是使用 isnull() 或 notnull()

In [34]:
# isnull() is the reliable check: it flags the missing rows b, d and g as True.
pd.isnull(df['one'])

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool

# 5 填充空值fillna

In [51]:
df

Unnamed: 0,one,two,three
a,0.581697,1.238685,-0.726714
b,,,
c,1.430349,-1.511313,-0.735181
d,,,
e,-0.688284,-0.41915,0.07177
f,-1.377813,0.644592,-0.218164
g,,,
h,0.61274,-1.549453,-0.828304


In [52]:
# Returns a new frame in which every NaN has been replaced by 0.0.
df.fillna(0.0)

Unnamed: 0,one,two,three
a,0.581697,1.238685,-0.726714
b,0.0,0.0,0.0
c,1.430349,-1.511313,-0.735181
d,0.0,0.0,0.0
e,-0.688284,-0.41915,0.07177
f,-1.377813,0.644592,-0.218164
g,0.0,0.0,0.0
h,0.61274,-1.549453,-0.828304


In [54]:
# The result of df.fillna(0.0) was never assigned back to df, so df itself
# still contains the NaN rows.
df

Unnamed: 0,one,two,three
a,0.581697,1.238685,-0.726714
b,,,
c,1.430349,-1.511313,-0.735181
d,,,
e,-0.688284,-0.41915,0.07177
f,-1.377813,0.644592,-0.218164
g,,,
h,0.61274,-1.549453,-0.828304


In [53]:
# Forward fill: propagate the last valid observation down to the next valid one.
# df.ffill() is the same operation as df.fillna(method='pad'); the method=
# keyword of fillna is deprecated in current pandas releases, while ffill()
# is available in old and new versions alike.
df.ffill()

Unnamed: 0,one,two,three
a,0.581697,1.238685,-0.726714
b,0.581697,1.238685,-0.726714
c,1.430349,-1.511313,-0.735181
d,1.430349,-1.511313,-0.735181
e,-0.688284,-0.41915,0.07177
f,-1.377813,0.644592,-0.218164
g,-1.377813,0.644592,-0.218164
h,0.61274,-1.549453,-0.828304


In [56]:
# Double-check against the docstring: 'pad' / 'ffill' propagate the last valid
# observation forward.
help(df.fillna)

Help on method fillna in module pandas.core.frame:

fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None, **kwargs) method of pandas.core.frame.DataFrame instance
    Fill NA/NaN values using the specified method
    
    Parameters
    ----------
    value : scalar, dict, Series, or DataFrame
        Value to use to fill holes (e.g. 0), alternately a
        dict/Series/DataFrame of values specifying which value to use for
        each index (for a Series) or column (for a DataFrame). (values not
        in the dict/Series/DataFrame will not be filled). This value cannot
        be a list.
    method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
        Method to use for filling holes in reindexed Series
        pad / ffill: propagate last valid observation forward to next valid
        backfill / bfill: use NEXT valid observation to fill gap
    axis : {0 or 'index', 1 or 'columns'}
    inplace : boolean, default False
        If True, fil

In [58]:
# Insert an extra label 'a1' right after 'a', so that two consecutive all-NaN
# rows (a1 and b) follow a valid row.
df2 = df.reindex(['a','a1','b','c','d','e','f','g','h'])

In [59]:
df2

Unnamed: 0,one,two,three
a,0.581697,1.238685,-0.726714
a1,,,
b,,,
c,1.430349,-1.511313,-0.735181
d,,,
e,-0.688284,-0.41915,0.07177
f,-1.377813,0.644592,-0.218164
g,,,
h,0.61274,-1.549453,-0.828304


In [60]:
# Forward fill without a limit: both a1 and b take the values of row a.
# df2.ffill() is equivalent to df2.fillna(method='pad'), whose method= keyword
# is deprecated in current pandas releases.
df2.ffill()

Unnamed: 0,one,two,three
a,0.581697,1.238685,-0.726714
a1,0.581697,1.238685,-0.726714
b,0.581697,1.238685,-0.726714
c,1.430349,-1.511313,-0.735181
d,1.430349,-1.511313,-0.735181
e,-0.688284,-0.41915,0.07177
f,-1.377813,0.644592,-0.218164
g,-1.377813,0.644592,-0.218164
h,0.61274,-1.549453,-0.828304


In [61]:
# Blank out row 'a'. The leading rows (a, a1, b) then have no earlier valid
# observation to propagate, so a forward fill leaves them as NaN.
df2.loc['a',:]=np.nan

In [63]:
# Forward fill again: rows before the first valid observation stay NaN because
# there is nothing above them to propagate.
# df2.ffill() is equivalent to df2.fillna(method='pad'), whose method= keyword
# is deprecated in current pandas releases.
df2.ffill()

Unnamed: 0,one,two,three
a,,,
a1,,,
b,,,
c,1.430349,-1.511313,-0.735181
d,1.430349,-1.511313,-0.735181
e,-0.688284,-0.41915,0.07177
f,-1.377813,0.644592,-0.218164
g,-1.377813,0.644592,-0.218164
h,0.61274,-1.549453,-0.828304


In [64]:
df

Unnamed: 0,one,two,three
a,0.581697,1.238685,-0.726714
b,,,
c,1.430349,-1.511313,-0.735181
d,,,
e,-0.688284,-0.41915,0.07177
f,-1.377813,0.644592,-0.218164
g,,,
h,0.61274,-1.549453,-0.828304


In [66]:
# Make row 'a' valid again (all 1.0) so that there is a value to propagate
# into the NaN rows a1 and b below it.
df2.loc['a',:] = 1.0

In [68]:
# limit=1 caps the forward fill at one consecutive NaN per gap: a1 is filled
# from row a, but b (the second NaN in the same gap) stays NaN.
# df2.ffill(limit=1) is equivalent to df2.fillna(method='pad', limit=1), whose
# method= keyword is deprecated in current pandas releases.
df2.ffill(limit=1)

Unnamed: 0,one,two,three
a,1.0,1.0,1.0
a1,1.0,1.0,1.0
b,,,
c,1.430349,-1.511313,-0.735181
d,1.430349,-1.511313,-0.735181
e,-0.688284,-0.41915,0.07177
f,-1.377813,0.644592,-0.218164
g,-1.377813,0.644592,-0.218164
h,0.61274,-1.549453,-0.828304


In [72]:
df2

Unnamed: 0,one,two,three
a,1.0,1.0,1.0
a1,,,
b,,,
c,1.430349,-1.511313,-0.735181
d,,,
e,-0.688284,-0.41915,0.07177
f,-1.377813,0.644592,-0.218164
g,,,
h,0.61274,-1.549453,-0.828304


# 6 丢弃空数据 dropna

In [73]:
# Drop rows (axis=0) that contain NaN; with the default how='any', a single
# NaN in a row is enough for that row to be dropped.
df2.dropna(axis=0)

Unnamed: 0,one,two,three
a,1.0,1.0,1.0
c,1.430349,-1.511313,-0.735181
e,-0.688284,-0.41915,0.07177
f,-1.377813,0.644592,-0.218164
h,0.61274,-1.549453,-0.828304


In [74]:
help(df2.dropna)

Help on method dropna in module pandas.core.frame:

dropna(axis=0, how='any', thresh=None, subset=None, inplace=False) method of pandas.core.frame.DataFrame instance
    Return object with labels on given axis omitted where alternately any
    or all of the data are missing
    
    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, or tuple/list thereof
        Pass tuple or list to drop on multiple axes
    how : {'any', 'all'}
        * any : if any NA values are present, drop that label
        * all : if all values are NA, drop that label
    thresh : int, default None
        int value : require that many non-NA values
    subset : array-like
        Labels along other axis to consider, e.g. if you are dropping rows
        these would be a list of columns to include
    inplace : boolean, default False
        If True, do operation inplace and return None.
    
    Returns
    -------
    dropped : DataFrame
    
    Examples
    --------
    >>> df = pd.DataFr

In [80]:
# Give row a1 a single non-NaN cell so that how='any' and how='all' treat it
# differently in the following cells.
df2.loc['a1','one'] = 2.0

In [82]:
# how='any': a row is dropped if any of its cells is NaN, so the partially
# filled row a1 is dropped along with the all-NaN rows.
df2.dropna(axis=0, how='any')

Unnamed: 0,one,two,three
a,1.0,1.0,1.0
c,1.430349,-1.511313,-0.735181
e,-0.688284,-0.41915,0.07177
f,-1.377813,0.644592,-0.218164
h,0.61274,-1.549453,-0.828304


In [83]:
# how='all': a row is dropped only if all of its cells are NaN, so row a1
# (which has one valid cell) is kept.
df2.dropna(axis=0, how='all')

Unnamed: 0,one,two,three
a,1.0,1.0,1.0
a1,2.0,,
c,1.430349,-1.511313,-0.735181
e,-0.688284,-0.41915,0.07177
f,-1.377813,0.644592,-0.218164
h,0.61274,-1.549453,-0.828304
