# 处理丢失数据

有两种丢失数据：
- None
- np.nan(NaN)

## 1. None

None是Python自带的，其类型为NoneType（属于Python对象）。因此，None不能参与到任何计算中。

In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [2]:
#查看None的数据类型
type(None)

NoneType

In [3]:
None + 1

TypeError: unsupported operand type(s) for +: 'NoneType' and 'int'

## 2. np.nan（NaN）

np.nan是浮点类型，能参与到计算中。但计算的结果总是NaN。

In [5]:
#查看np.nan的数据类型
type(np.nan)

float

In [6]:
np.nan + 1

nan

## 3. pandas中的None与NaN

### 1) pandas中None与np.nan都视作np.nan

创建DataFrame

In [12]:
df = DataFrame(data=np.random.randint(0,100,size=(10,12)))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,33,65,2,91,72,25,18,58,85,99,16,9
1,39,51,76,28,5,9,56,88,63,9,96,82
2,47,53,82,26,42,46,36,0,18,50,86,3
3,44,25,66,57,56,39,36,96,64,4,88,21
4,9,99,52,77,42,8,36,66,13,84,49,92
5,41,59,39,99,27,56,8,69,50,39,3,93
6,88,68,62,4,22,84,55,41,94,14,9,25
7,33,67,83,55,23,46,83,12,41,72,30,71
8,98,36,47,30,54,13,33,12,37,42,16,39
9,20,2,25,92,9,40,28,66,90,48,54,53


In [13]:
#将某些数组元素赋值为nan
df.iloc[1,1] = None
df.iloc[5,6] = None
df.iloc[3,5] = None
df.iloc[4,4] = None
df.iloc[2,8] = np.nan
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,33,65.0,2,91,72.0,25.0,18.0,58,85.0,99,16,9
1,39,,76,28,5.0,9.0,56.0,88,63.0,9,96,82
2,47,53.0,82,26,42.0,46.0,36.0,0,,50,86,3
3,44,25.0,66,57,56.0,,36.0,96,64.0,4,88,21
4,9,99.0,52,77,,8.0,36.0,66,13.0,84,49,92
5,41,59.0,39,99,27.0,56.0,,69,50.0,39,3,93
6,88,68.0,62,4,22.0,84.0,55.0,41,94.0,14,9,25
7,33,67.0,83,55,23.0,46.0,83.0,12,41.0,72,30,71
8,98,36.0,47,30,54.0,13.0,33.0,12,37.0,42,16,39
9,20,2.0,25,92,9.0,40.0,28.0,66,90.0,48,54,53


In [None]:
#将空对应的行删除

In [14]:
#1.空值检测
df.isnull()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,True,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,True,False,False,False,False,False,False
4,False,False,False,False,True,False,False,False,False,False,False,False
5,False,False,False,False,False,False,True,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,False,False


In [15]:
df.isnull().any(axis=1)

0    False
1     True
2     True
3     True
4     True
5     True
6    False
7    False
8    False
9    False
dtype: bool

In [23]:
drop_index = df.loc[df.isnull().any(axis=1)].index

In [24]:
df.drop(labels=drop_index,axis=0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,33,65.0,2,91,72.0,25.0,18.0,58,85.0,99,16,9
6,88,68.0,62,4,22.0,84.0,55.0,41,94.0,14,9,25
7,33,67.0,83,55,23.0,46.0,83.0,12,41.0,72,30,71
8,98,36.0,47,30,54.0,13.0,33.0,12,37.0,42,16,39
9,20,2.0,25,92,9.0,40.0,28.0,66,90.0,48,54,53


In [27]:
df.notnull().all(axis=1)

0     True
1    False
2    False
3    False
4    False
5    False
6     True
7     True
8     True
9     True
dtype: bool

In [28]:
df.loc[df.notnull().all(axis=1)]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,33,65.0,2,91,72.0,25.0,18.0,58,85.0,99,16,9
6,88,68.0,62,4,22.0,84.0,55.0,41,94.0,14,9,25
7,33,67.0,83,55,23.0,46.0,83.0,12,41.0,72,30,71
8,98,36.0,47,30,54.0,13.0,33.0,12,37.0,42,16,39
9,20,2.0,25,92,9.0,40.0,28.0,66,90.0,48,54,53


- 总结：
    - isnull():True空值  False非空
    - notnull(): True非空  False空
    
    - 如何检测df中哪些行中存在空值？
        - df.isnull().any(axis=1):  True行中存在空 ,  False行中不存在空
        - df.notnull().all(axis=1):  False行中存在空 ,  True行中不存在空

### 2) pandas处理空值操作

- ``isnull()``
- ``notnull()``
- ``dropna()``: 过滤丢失数据
- ``fillna()``: 填充丢失数据

In [3]:
#创建DataFrame，给其中某些元素赋值为nan



(1)判断函数
- ``isnull()``
- ``notnull()``

- df.notnull/isnull().any()/all()

In [4]:
#过滤df中的空值（只保留没有空值的行）


(2) 过滤函数
- ``dropna()``：可以选择过滤的是行还是列（默认为行）；axis中0表示行，1表示列

In [30]:
df.dropna(axis=0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,33,65.0,2,91,72.0,25.0,18.0,58,85.0,99,16,9
6,88,68.0,62,4,22.0,84.0,55.0,41,94.0,14,9,25
7,33,67.0,83,55,23.0,46.0,83.0,12,41.0,72,30,71
8,98,36.0,47,30,54.0,13.0,33.0,12,37.0,42,16,39
9,20,2.0,25,92,9.0,40.0,28.0,66,90.0,48,54,53


(3) 填充函数 Series/DataFrame
- ``fillna()``:value和method参数

In [36]:
df.iloc[0,1] = None
df.iloc[2,1] = None
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,33,,2,91,72.0,25.0,18.0,58,85.0,99,16,9
1,39,,76,28,5.0,9.0,56.0,88,63.0,9,96,82
2,47,,82,26,42.0,46.0,36.0,0,,50,86,3
3,44,25.0,66,57,56.0,,36.0,96,64.0,4,88,21
4,9,99.0,52,77,,8.0,36.0,66,13.0,84,49,92
5,41,59.0,39,99,27.0,56.0,,69,50.0,39,3,93
6,88,68.0,62,4,22.0,84.0,55.0,41,94.0,14,9,25
7,33,67.0,83,55,23.0,46.0,83.0,12,41.0,72,30,71
8,98,36.0,47,30,54.0,13.0,33.0,12,37.0,42,16,39
9,20,2.0,25,92,9.0,40.0,28.0,66,90.0,48,54,53


In [38]:
df.fillna(method='bfill',axis=0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,33,25.0,2,91,72.0,25.0,18.0,58,85.0,99,16,9
1,39,25.0,76,28,5.0,9.0,56.0,88,63.0,9,96,82
2,47,25.0,82,26,42.0,46.0,36.0,0,64.0,50,86,3
3,44,25.0,66,57,56.0,8.0,36.0,96,64.0,4,88,21
4,9,99.0,52,77,27.0,8.0,36.0,66,13.0,84,49,92
5,41,59.0,39,99,27.0,56.0,55.0,69,50.0,39,3,93
6,88,68.0,62,4,22.0,84.0,55.0,41,94.0,14,9,25
7,33,67.0,83,55,23.0,46.0,83.0,12,41.0,72,30,71
8,98,36.0,47,30,54.0,13.0,33.0,12,37.0,42,16,39
9,20,2.0,25,92,9.0,40.0,28.0,66,90.0,48,54,53


可以选择前向填充还是后向填充

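method 控制填充的方式：bfill 为后向填充（用后面的有效值填充空值），ffill 为前向填充（用前面的有效值填充空值）；axis 控制填充的方向。注意：较新版本的pandas（2.1起）已弃用 fillna 的 method 参数，可改用 df.bfill() / df.ffill()。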

============================================

练习7：

1. 简述None与NaN的区别

2. 假设张三李四参加模拟考试，但张三因为突然想明白人生放弃了英语考试，因此记为None，请据此创建一个DataFrame,命名为ddd3

3. 老师决定用数学的分数填充张三的英语成绩，如何实现？
    用李四的英语成绩填充张三的英语成绩？

============================================