In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame,Series

# 源数据中存在缺失值（空值）
- 重复值
- 异常值
### 处理丢失数据
- 有两种丢失数据
    - None
    - np.nan(NaN)
- 两种丢失数据的区别
    - nan可以参与运算
    - None是不可以参与运算
- 为什么在数据分析中需要用到的是浮点型的空而不是对象类型？
    - 数据分析中通常使用某些形式的运算来处理原始数据，如果原始数据中的空值为nan的形式，则不会干扰或者中断运算。
    - nan可以参与运算
    - None是不可以参与运算
- 在pandas中如果遇到了None形式空值，则pandas会将其强转为NaN形式
# 面试题
- 数据说明：
    - 数据是1个冷库的温度数据，1-7对应7个温度采集设备，1分钟采集一次
- 数据处理目标：
    - 用1-4对应的4个必须设备，通过建立冷库的温度关系模型，预估出5-7对应的数据
    - 最后每个冷库中仅需放置4个设备，取代放置7个设备。
    - f(1-4) -->f(5-7)
- 数据处理过程：
    - 1.源数据中有丢帧现象，需要做预处理
    - 2.matplotlib绘图
    - 3.建立逻辑回归模型

In [2]:
type(None) # None is a plain Python object; its type is NoneType

NoneType

In [4]:
# Bare expression: not displayed, because only the last expression of a cell is shown.
np.nan
type(np.nan) # np.nan is a float, which is why it can take part in arithmetic

float

# pandas处理空值操作
- isnull  通常搭配 any
- notnull  通常搭配 all
- any  用来检测行或者列中是否存在True
- all  用来检测行或者列中是否全部为True
- dropna  可以直接将缺失的行或者列进行删除
- fillna  对缺失值进行覆盖

### 过滤
- 方式1:对空值进行过滤（删除空值所在的行数据）
    - 技术：isnull ->any ,notnull -> all
- 方式2:dropna 可以直接将缺失的行或者列进行删除

### 处理重复数据
- drop_duplicates()
### 处理异常数据
- 自定义一个1000行3列的（A,B,C）取值范围为0-1的数据源，然后将C列中的值大于其两倍标准差的异常值进行清洗

In [7]:
# Build an 8x6 frame of random integers in [0, 100) and inject three missing values.
df=DataFrame(data=np.random.randint(0,100,size=(8,6)))
# Integer columns cannot hold NaN. Cast the columns that will receive a missing
# value to float64 up front, instead of relying on silent upcasting during
# assignment (deprecated in pandas 2.1+, rejected by later versions).
df = df.astype({2: 'float64', 3: 'float64', 4: 'float64'})
df.iloc[2,3] = None    # None is stored as NaN in a float column
df.iloc[4,4] = np.nan
df.iloc[5,2] = None
df

Unnamed: 0,0,1,2,3,4,5
0,38,82,62.0,84.0,0.0,4
1,92,37,48.0,62.0,35.0,28
2,33,54,48.0,,63.0,67
3,21,24,85.0,11.0,55.0,98
4,89,6,69.0,2.0,,57
5,49,38,,34.0,65.0,44
6,74,32,84.0,87.0,62.0,33
7,96,77,72.0,6.0,81.0,40


In [11]:
# Element-wise mask: True where a value is missing.
# (Bare expression: not displayed, because only the last expression of a cell is shown.)
df.isnull()
# Element-wise mask: True where a value is present.
df.notnull()

0     True
1     True
2     True
3     True
4    False
5     True
dtype: bool

In [16]:
# Row-wise mask: True for every row that holds at least one missing value.
rows_with_null = df.isnull().any(axis=1)
rows_with_null
# Use the boolean mask as a row indexer: the rows marked True are exactly
# the rows that contain missing values.
incomplete_rows = df.loc[rows_with_null]
incomplete_rows
# Collect the index labels of those rows so they can be dropped by label.
drop_index = incomplete_rows.index
type(drop_index)
# Display the frame without the incomplete rows; df itself is not reassigned.
df.drop(labels=drop_index,axis=0)

Unnamed: 0,0,1,2,3,4,5
0,38,82,62.0,84.0,0.0,4
1,92,37,48.0,62.0,35.0,28
3,21,24,85.0,11.0,55.0,98
6,74,32,84.0,87.0,62.0,33
7,96,77,72.0,6.0,81.0,40


In [21]:
# Row-wise mask: True only for rows in which every value is present.
complete_rows = df.notnull().all(axis=1)
complete_rows
# Select the fully populated rows with that mask.
df.loc[complete_rows]

Unnamed: 0,0,1,2,3,4,5
0,38,82,62.0,84.0,0.0,4
1,92,37,48.0,62.0,35.0,28
3,21,24,85.0,11.0,55.0,98
6,74,32,84.0,87.0,62.0,33
7,96,77,72.0,6.0,81.0,40


In [23]:
# dropna: with axis=0, drop every row that contains at least one missing value.
# The result is only displayed; df is not reassigned.
df.dropna(axis=0)

Unnamed: 0,0,1,2,3,4,5
0,38,82,62.0,84.0,0.0,4
1,92,37,48.0,62.0,35.0,28
3,21,24,85.0,11.0,55.0,98
6,74,32,84.0,87.0,62.0,33
7,96,77,72.0,6.0,81.0,40


In [25]:
# Overwrite missing values instead of dropping them.
# A constant fill is also possible, e.g. df.fillna(value=888).
# Forward-fill along each row (axis=1): every missing cell takes the value of the
# nearest non-null cell to its left. `fillna(method='ffill')` is deprecated in
# pandas 2.1+, so the dedicated `ffill` method is used instead.
df.ffill(axis=1)

Unnamed: 0,0,1,2,3,4,5
0,38.0,82.0,62.0,84.0,0.0,4.0
1,92.0,37.0,48.0,62.0,35.0,28.0
2,33.0,54.0,48.0,48.0,63.0,67.0
3,21.0,24.0,85.0,11.0,55.0,98.0
4,89.0,6.0,69.0,2.0,2.0,57.0
5,49.0,38.0,38.0,34.0,65.0,44.0
6,74.0,32.0,84.0,87.0,62.0,33.0
7,96.0,77.0,72.0,6.0,81.0,40.0


In [26]:
# Handle duplicate data: overwrite rows 1, 3 and 4 with all zeros so the frame
# contains identical rows to deduplicate.
zero_row = [0] * 6
for row_pos in (1, 3, 4):
    df.iloc[row_pos] = zero_row
df

Unnamed: 0,0,1,2,3,4,5
0,38,82,62.0,84.0,0.0,4
1,0,0,0.0,0.0,0.0,0
2,33,54,48.0,,63.0,67
3,0,0,0.0,0.0,0.0,0
4,0,0,0.0,0.0,0.0,0
5,49,38,,34.0,65.0,44
6,74,32,84.0,87.0,62.0,33
7,96,77,72.0,6.0,81.0,40


In [27]:
# First discard rows that contain missing values, then collapse identical rows,
# keeping the first occurrence of each.
rows_without_nulls = df.dropna(axis=0)
rows_without_nulls.drop_duplicates(keep='first')

Unnamed: 0,0,1,2,3,4,5
0,38,82,62.0,84.0,0.0,4
1,0,0,0.0,0.0,0.0,0
6,74,32,84.0,87.0,62.0,33
7,96,77,72.0,6.0,81.0,40


In [29]:
# Exercise: build a 1000-row, 3-column (A, B, C) data source of random floats
# drawn from [0, 1), then clean the outliers in column C whose value is greater
# than twice the column's standard deviation.
random_values = np.random.random(size=(1000, 3))
df = DataFrame(data=random_values, columns=['A', 'B', 'C'])
df

Unnamed: 0,A,B,C
0,0.921792,0.255321,0.450967
1,0.625739,0.887474,0.540765
2,0.330947,0.678183,0.218128
3,0.810170,0.549757,0.223335
4,0.583065,0.168550,0.180885
...,...,...,...
995,0.710978,0.043555,0.741188
996,0.806544,0.036332,0.668985
997,0.052685,0.435571,0.840105
998,0.954485,0.553167,0.237140


In [36]:
# Threshold: two times the standard deviation of column C.
# NOTE(review): as the exercise is worded, raw values are compared against
# 2*std without centering on the mean; a conventional outlier rule would test
# abs(x - mean) > 2*std. Confirm the intent.
col_c = df['C']
twice_std = 2 * col_c.std()
# Boolean mask marking the values treated as outliers.
is_outlier = col_c > twice_std
is_outlier
# The outlier rows themselves ...
df.loc[is_outlier]
# ... and the cleaned frame with those rows removed (displayed, not reassigned).
df.loc[~is_outlier]

Unnamed: 0,A,B,C
0,0.921792,0.255321,0.450967
1,0.625739,0.887474,0.540765
2,0.330947,0.678183,0.218128
3,0.810170,0.549757,0.223335
4,0.583065,0.168550,0.180885
...,...,...,...
985,0.299665,0.313493,0.481953
988,0.654010,0.694061,0.558106
994,0.477444,0.055980,0.159227
998,0.954485,0.553167,0.237140
