## 处理 NaN 

In [1]:
import pandas as pd

In [2]:
# Inventory per store; an item missing from a store's record shows up as NaN in the DataFrame.
items = [
    dict(bikes=20, pants=30, watches=35, shirts=15, shoes=8, suits=45),
    dict(watches=10, glasses=50, bikes=15, pants=5, shirts=2, shoes=5, suits=7),
    dict(bikes=20, pants=30, watches=35, glasses=4, shoes=10),
]
store_items = pd.DataFrame(data=items, index=['store1', 'store2', 'store3'])
store_items

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20,,30,15.0,8,45.0,35
store2,15,50.0,5,2.0,5,7.0,10
store3,20,4.0,30,,10,,35


In [3]:
# isnull() builds a boolean mask that is True wherever the DataFrame holds NaN.
# When such a mask is summed, True counts as 1 and False counts as 0.
x = pd.isnull(store_items)
print(x)

        bikes  glasses  pants  shirts  shoes  suits  watches
store1  False     True  False   False  False  False    False
store2  False    False  False   False  False  False    False
store3  False    False  False    True  False   True    False


In [4]:
# Summing the isnull() mask once gives the number of NaN values in each column.
x = store_items.isnull().sum()
print(x, end='\n\n')

# Summing those per-column counts a second time gives the NaN total for the whole DataFrame.
x = x.sum()
print('The number of total NaN:', x)

bikes      0
glasses    1
pants      0
shirts     1
shoes      0
suits      1
watches    0
dtype: int64

The number of total NaN: 3


In [5]:
# count() reports how many non-NaN values each column holds.
y = store_items.count()
print(y)
# Adding up the per-column counts gives the non-NaN total for the whole DataFrame.
y = y.sum()
print('The number of total Not-NaN:', y)

bikes      3
glasses    2
pants      3
shirts     2
shoes      3
suits      2
watches    3
dtype: int64
The number of total Not-NaN: 18


In [6]:
# dropna() removes missing values (NaN).
# It returns a new DataFrame and leaves the original alone unless inplace=True is passed.
# axis='index' (the same as axis=0) drops every row that contains a NaN.
store_items.dropna(axis='index')

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store2,15,50.0,5,2.0,5,7.0,10


In [7]:
# axis='columns' (the same as axis=1) drops every column that contains a NaN.
store_items.dropna(axis='columns')

Unnamed: 0,bikes,pants,shoes,watches
store1,20,30,8,35
store2,15,5,5,10
store3,20,30,10,35


In [8]:
# store_items before it is modified in place.
store_items

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20,,30,15.0,8,45.0,35
store2,15,50.0,5,2.0,5,7.0,10
store3,20,4.0,30,,10,,35


In [9]:
# Drop every column that contains a NaN (axis='columns' is the same as axis=1).
# inplace=True applies the removal to store_items itself instead of returning a copy.
store_items.dropna(axis='columns', inplace=True)
store_items

Unnamed: 0,bikes,pants,shoes,watches
store1,20,30,8,35
store2,15,5,5,10
store3,20,30,10,35


In [10]:
# store_items after it was modified in place.
store_items

Unnamed: 0,bikes,pants,shoes,watches
store1,20,30,8,35
store2,15,5,5,10
store3,20,30,10,35


In [11]:
# fillna() replaces every NaN with another value.
# It returns a new DataFrame by default; only inplace=True modifies the original.
# Rebuild store_items first so the NaN values are present again.
items = [
    dict(bikes=20, pants=30, watches=35, shirts=15, shoes=8, suits=45),
    dict(watches=10, glasses=50, bikes=15, pants=5, shirts=2, shoes=5, suits=7),
    dict(bikes=20, pants=30, watches=35, glasses=4, shoes=10),
]
store_items = pd.DataFrame(data=items, index=['store1', 'store2', 'store3'])

# fillna(0) replaces every NaN with 0.
store_items.fillna(value=0)

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20,0.0,30,15.0,8,45.0,35
store2,15,50.0,5,2.0,5,7.0,10
store3,20,4.0,30,0.0,10,0.0,35


In [12]:
# Display store_items again: its NaN entries are still there, so the previous cell did not modify it.
store_items

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20,,30,15.0,8,45.0,35
store2,15,50.0,5,2.0,5,7.0,10
store3,20,4.0,30,,10,,35


In [13]:
# Forward fill: replace each NaN with the previous valid value along the chosen axis.
# axis=0 walks down each column, so a NaN takes the value from the row above it.
# DataFrame.ffill() is used because fillna(method='ffill') is deprecated since pandas 2.1.
store_items.ffill(axis=0)

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20,,30,15.0,8,45.0,35
store2,15,50.0,5,2.0,5,7.0,10
store3,20,4.0,30,2.0,10,7.0,35


In [14]:
# axis=1 walks across each row, so a NaN takes the value from the column before it.
# DataFrame.ffill() is used because fillna(method='ffill') is deprecated since pandas 2.1.
store_items.ffill(axis=1)

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20.0,20.0,30.0,15.0,8.0,45.0,35.0
store2,15.0,50.0,5.0,2.0,5.0,7.0,10.0
store3,20.0,4.0,30.0,30.0,10.0,10.0,35.0


In [15]:
# Display store_items again: its NaN entries are still there, so forward filling did not modify it.
store_items

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20,,30,15.0,8,45.0,35
store2,15,50.0,5,2.0,5,7.0,10
store3,20,4.0,30,,10,,35


In [16]:
# Backward fill: replace each NaN with the next valid value along the chosen axis.
# axis=0 walks up each column, so a NaN takes the value from the row below it.
# DataFrame.bfill() is used because fillna(method='backfill') is deprecated since pandas 2.1.
store_items.bfill(axis=0)

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20,50.0,30,15.0,8,45.0,35
store2,15,50.0,5,2.0,5,7.0,10
store3,20,4.0,30,,10,,35


In [17]:
# axis=1 walks backwards across each row, so a NaN takes the value from the column after it.
# DataFrame.bfill() is used because fillna(method='backfill') is deprecated since pandas 2.1.
store_items.bfill(axis=1)

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20.0,30.0,30.0,15.0,8.0,45.0,35.0
store2,15.0,50.0,5.0,2.0,5.0,7.0,10.0
store3,20.0,4.0,30.0,10.0,10.0,35.0,35.0


In [18]:
# Display store_items again: its NaN entries are still there, so backward filling did not modify it.
store_items

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20,,30,15.0,8,45.0,35
store2,15,50.0,5,2.0,5,7.0,10
store3,20,4.0,30,,10,,35


In [19]:
# NaN values can also be replaced by interpolation; the original DataFrame is not modified.
# interpolate(method='linear', axis=...) fills NaN by linear interpolation along the given axis.
# axis='index' (the same as axis=0) interpolates down each column.
store_items.interpolate(method='linear', axis='index')

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20,,30,15.0,8,45.0,35
store2,15,50.0,5,2.0,5,7.0,10
store3,20,4.0,30,2.0,10,7.0,35


In [20]:
# Display store_items again: its NaN entries are still there, so interpolation did not modify it.
store_items

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20,,30,15.0,8,45.0,35
store2,15,50.0,5,2.0,5,7.0,10
store3,20,4.0,30,,10,,35


In [21]:
# axis='columns' (the same as axis=1) interpolates linearly across each row.
store_items.interpolate(method='linear', axis='columns')

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20.0,25.0,30.0,15.0,8.0,45.0,35.0
store2,15.0,50.0,5.0,2.0,5.0,7.0,10.0
store3,20.0,4.0,30.0,20.0,10.0,22.5,35.0


In [22]:
# Display store_items again: its NaN entries are still there, so interpolation did not modify it.
store_items

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20,,30,15.0,8,45.0,35
store2,15,50.0,5,2.0,5,7.0,10
store3,20,4.0,30,,10,,35


In [24]:
# Fill the NaN entries of each column with the mean of that column's non-NaN values.
store_items.fillna(value=store_items.mean())

Unnamed: 0,bikes,glasses,pants,shirts,shoes,suits,watches
store1,20,27.0,30,15.0,8,45.0,35
store2,15,50.0,5,2.0,5,7.0,10
store3,20,4.0,30,8.5,10,26.0,35
