### 处理丢失数据
- 有两种丢失数据：
    - None
    - np.nan(NaN)

- 两种丢失数据的区别

In [27]:
import numpy as np
import pandas as pd
from pandas import DataFrame

- 为什么在数据分析中需要用到的是浮点类型的空而不是对象类型？
    - 数据分析中会常常使用某些形式的运算来处理原始数据，如果原始数据中的空值为NaN的形式，则不会干扰或者中断运算。
    - NaN可以参与运算（运算结果仍为NaN）
    - None不可以参与运算（会抛出TypeError）

In [2]:
# None is a plain Python object (NoneType), whereas np.nan is a float.
type(None),type(np.nan)

(NoneType, float)

- 在pandas中，如果数值型的列中遇到了None形式的空值，则pandas会将其强转成NaN的形式。

In [6]:
# Build a 6x7 frame of random integers in [0, 100).
df = DataFrame(data=np.random.randint(0,100,size=(6,7)))
# NaN is a float, so the columns that are about to receive missing values are
# cast to float explicitly. Writing a missing value straight into an integer
# column relies on silent upcasting, which pandas >= 2.1 deprecates
# (FutureWarning: "Setting an item of incompatible dtype").
df = df.astype({1: float, 2: float, 3: float})
df.iloc[2,3] = None    # None is stored as NaN in a float column
df.iloc[4,2] = np.nan
df.iloc[5,1] = None    # None is stored as NaN in a float column
df

Unnamed: 0,0,1,2,3,4,5,6
0,23,32.0,97.0,88.0,2,50,53
1,99,3.0,31.0,27.0,74,50,36
2,65,86.0,14.0,,0,6,30
3,91,51.0,94.0,87.0,29,29,23
4,29,78.0,,46.0,50,33,27
5,80,,40.0,63.0,28,81,58


### pandas处理空值操作
- isnull
- notnull
- any
- all
- dropna
- fillna

- 方式1：对空值进行过滤（删除空值所在的行数据）
    - 技术：isnull，notnull，any，all

In [10]:
# Element-wise mask: True wherever a cell holds a missing value.
df.isnull()

Unnamed: 0,0,1,2,3,4,5,6
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,True,False,False,False
3,False,False,False,False,False,False,False
4,False,False,True,False,False,False,False
5,False,True,False,False,False,False,False


In [9]:
# any() inspects the True/False values of the mask: with axis=1 it yields True
# for a row as soon as that row contains at least one True.
df.isnull().any(axis=1)

0    False
1    False
2     True
3    False
4     True
5     True
dtype: bool

In [13]:
# Invert the "row has a missing value" mask to keep only the complete rows.
df.loc[~df.isnull().any(axis=1)]

Unnamed: 0,0,1,2,3,4,5,6
0,23,32.0,97.0,88.0,2,50,53
1,99,3.0,31.0,27.0,74,50,36
3,91,51.0,94.0,87.0,29,29,23


In [14]:
# notnull() is the complement of isnull(): True wherever a value is present.
df.notnull()

Unnamed: 0,0,1,2,3,4,5,6
0,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True
2,True,True,True,False,True,True,True
3,True,True,True,True,True,True,True
4,True,True,False,True,True,True,True
5,True,False,True,True,True,True,True


In [16]:
# all() inspects the True/False values of the mask: with axis=1 it yields True
# for a row only when every value in that row is True, otherwise False.
df.notnull().all(axis=1)

0     True
1     True
2    False
3     True
4    False
5    False
dtype: bool

In [17]:
# Select the rows in which every column holds a value.
df.loc[df.notnull().all(axis=1)]

Unnamed: 0,0,1,2,3,4,5,6
0,23,32.0,97.0,88.0,2,50,53
1,99,3.0,31.0,27.0,74,50,36
3,91,51.0,94.0,87.0,29,29,23


- 方式2：
    - dropna：可以直接将缺失的行或者列进行删除

In [19]:
# For the drop* family of functions axis=0 refers to rows and axis=1 to columns,
# so this removes every row that contains a missing value.
df.dropna(axis=0)

Unnamed: 0,0,1,2,3,4,5,6
0,23,32.0,97.0,88.0,2,50,53
1,99,3.0,31.0,27.0,74,50,36
3,91,51.0,94.0,87.0,29,29,23


- 对缺失值进行覆盖
    - fillna

In [22]:
# Replace every missing value with the constant 666.
df.fillna(value=666)

Unnamed: 0,0,1,2,3,4,5,6
0,23,32.0,97.0,88.0,2,50,53
1,99,3.0,31.0,27.0,74,50,36
2,65,86.0,14.0,666.0,0,6,30
3,91,51.0,94.0,87.0,29,29,23
4,29,78.0,666.0,46.0,50,33,27
5,80,666.0,40.0,63.0,28,81,58


In [26]:
# Back-fill along each row: a missing cell takes the value of the next column
# to its right. DataFrame.bfill replaces fillna(method='bfill'), whose `method`
# argument is deprecated since pandas 2.1.
df.bfill(axis=1)

Unnamed: 0,0,1,2,3,4,5,6
0,23.0,32.0,97.0,88.0,2.0,50.0,53.0
1,99.0,3.0,31.0,27.0,74.0,50.0,36.0
2,65.0,86.0,14.0,0.0,0.0,6.0,30.0
3,91.0,51.0,94.0,87.0,29.0,29.0,23.0
4,29.0,78.0,46.0,46.0,50.0,33.0,27.0
5,80.0,40.0,40.0,63.0,28.0,81.0,58.0


### 面试题
- 数据说明： 
    - 数据是1个冷库的温度数据，1-7对应7个温度采集设备，1分钟采集一次。

- 数据处理目标：
    - 用1-4对应的4个必须设备，通过建立冷库的温度场关系模型，预估出5-7对应的数据。
    - 最后每个冷库中仅需放置4个设备，取代放置7个设备。
    - f(1-4) --> y(5-7)

- 数据处理过程：
    - 1、原始数据中有丢帧现象，需要做预处理；
    - 2、matplotlib 绘图；
    - 3、建立回归模型（原题写作“逻辑回归”；由于预测目标5-7是连续的温度值，更适合使用线性回归等回归模型）。

- 无标准答案，按个人理解操作即可，请把自己的操作过程以文字形式简单描述一下，谢谢配合。

- 测试数据为testData.xlsx


In [29]:
# Load the cold-storage temperature readings from the Excel workbook.
df = pd.read_excel('data/testData.xlsx')
df

Unnamed: 0,time,none,1,2,3,4,none1,5,6,7
0,2019-01-27 17:00:00,,-24.8,-18.2,-20.8,-18.8,,,,
1,2019-01-27 17:01:00,,-23.5,-18.8,-20.5,-19.8,,-15.2,-14.5,-16.0
2,2019-01-27 17:02:00,,-23.2,-19.2,,,,-13.0,,-14.0
3,2019-01-27 17:03:00,,-22.8,-19.2,-20.0,-20.5,,,-12.2,-9.8
4,2019-01-27 17:04:00,,-23.2,-18.5,-20.0,-18.8,,-10.2,-10.8,-8.8
...,...,...,...,...,...,...,...,...,...,...
1055,2019-01-28 10:35:00,,-26.2,-27.2,-28.8,-27.5,,-2.0,,-5.0
1056,2019-01-28 10:36:00,,-26.8,-27.5,-29.0,-27.8,,-2.2,,-5.0
1057,2019-01-28 10:37:00,,-27.2,-27.8,-29.0,-28.0,,-2.2,,-5.0
1058,2019-01-28 10:38:00,,-27.5,-27.0,-29.0,-28.0,,-3.5,-3.2,-5.8


In [31]:
# Remove the two all-empty spacer columns. The result is rebound instead of
# mutating in place, and errors='ignore' makes the cell safe to re-run (the
# in-place version raises KeyError once the columns are already gone).
df = df.drop(columns=['none','none1'], errors='ignore')

In [35]:
# Number of rows that contain at least one missing value.
df.isnull().any(axis=1).sum()

133

In [36]:
# Preview the frame with incomplete rows removed; df itself is left unchanged
# because the result is not assigned.
df.dropna(axis=0)

Unnamed: 0,time,1,2,3,4,5,6,7
1,2019-01-27 17:01:00,-23.5,-18.8,-20.5,-19.8,-15.2,-14.5,-16.0
4,2019-01-27 17:04:00,-23.2,-18.5,-20.0,-18.8,-10.2,-10.8,-8.8
7,2019-01-27 17:07:00,-24.8,-18.0,-17.5,-17.2,-14.2,-14.0,-12.5
10,2019-01-27 17:10:00,-24.5,-18.5,-16.0,-18.5,-17.5,-16.5,-17.2
15,2019-01-27 17:15:00,-23.5,-17.8,-15.0,-18.0,10.5,10.5,10.8
...,...,...,...,...,...,...,...,...
1051,2019-01-28 10:31:00,-24.0,-24.8,-27.8,-25.5,-2.0,-2.0,-5.8
1052,2019-01-28 10:32:00,-24.2,-25.5,-28.0,-26.0,-2.0,-2.0,-5.5
1053,2019-01-28 10:33:00,-25.0,-26.2,-28.2,-26.8,-2.0,-2.0,-5.2
1054,2019-01-28 10:34:00,-25.8,-26.8,-28.5,-27.0,-2.0,-2.2,-5.2


In [39]:
# Fill gaps down each column from the previous reading, then back-fill whatever
# is still missing at the top (e.g. the first row, which has no predecessor).
# ffill()/bfill() replace fillna(method=...), deprecated since pandas 2.1.
df.ffill(axis=0).bfill(axis=0)

Unnamed: 0,time,1,2,3,4,5,6,7
0,2019-01-27 17:00:00,-24.8,-18.2,-20.8,-18.8,-15.2,-14.5,-16.0
1,2019-01-27 17:01:00,-23.5,-18.8,-20.5,-19.8,-15.2,-14.5,-16.0
2,2019-01-27 17:02:00,-23.2,-19.2,-20.5,-19.8,-13.0,-14.5,-14.0
3,2019-01-27 17:03:00,-22.8,-19.2,-20.0,-20.5,-13.0,-12.2,-9.8
4,2019-01-27 17:04:00,-23.2,-18.5,-20.0,-18.8,-10.2,-10.8,-8.8
...,...,...,...,...,...,...,...,...
1055,2019-01-28 10:35:00,-26.2,-27.2,-28.8,-27.5,-2.0,-2.2,-5.0
1056,2019-01-28 10:36:00,-26.8,-27.5,-29.0,-27.8,-2.2,-2.2,-5.0
1057,2019-01-28 10:37:00,-27.2,-27.8,-29.0,-28.0,-2.2,-2.2,-5.0
1058,2019-01-28 10:38:00,-27.5,-27.0,-29.0,-28.0,-3.5,-3.2,-5.8


- 使用列的均值填充缺失值

In [40]:
# Rebuild a 6x7 frame of random integers in [0, 100) for the mean-fill example.
df = DataFrame(data=np.random.randint(0,100,size=(6,7)))
# NaN is a float, so the columns that are about to receive missing values are
# cast to float explicitly. Writing a missing value straight into an integer
# column relies on silent upcasting, which pandas >= 2.1 deprecates
# (FutureWarning: "Setting an item of incompatible dtype").
df = df.astype({1: float, 2: float, 3: float})
df.iloc[2,3] = None    # None is stored as NaN in a float column
df.iloc[4,2] = np.nan
df.iloc[5,1] = None    # None is stored as NaN in a float column
df

Unnamed: 0,0,1,2,3,4,5,6
0,12,82.0,26.0,42.0,55,36,9
1,88,96.0,37.0,68.0,13,45,94
2,14,3.0,68.0,,18,48,19
3,29,93.0,5.0,8.0,93,80,18
4,43,20.0,,35.0,15,87,13
5,21,,66.0,6.0,38,21,52


In [41]:
# Fill every column that contains missing values with that column's mean
# (Series.mean() skips NaN by default).
for col in df.columns:
    if df[col].isnull().any():
        # Assign the filled Series back to the frame. Calling
        # fillna(inplace=True) on df[col] is chained assignment: it acts on an
        # intermediate Series, raises a FutureWarning in pandas 2.2 and does
        # not update df once copy-on-write is enabled.
        df[col] = df[col].fillna(df[col].mean())

In [42]:
# Display the frame after the missing values were replaced by column means.
df

Unnamed: 0,0,1,2,3,4,5,6
0,12,82.0,26.0,42.0,55,36,9
1,88,96.0,37.0,68.0,13,45,94
2,14,3.0,68.0,31.8,18,48,19
3,29,93.0,5.0,8.0,93,80,18
4,43,20.0,40.4,35.0,15,87,13
5,21,58.8,66.0,6.0,38,21,52


### 处理重复数据

### 处理异常数据
- 自定义一个1000行3列（A，B，C）取值范围为0-1的数据源，然后将C列中的值大于其两倍标准差的异常值进行清洗