In [1]:
import pandas as pd
import numpy as np


In [2]:
# Build a Series of strings with one missing entry (NaN), then flag which
# positions are missing. Only the last expression of a cell is displayed, so
# the intermediate bare `string` expression was a no-op and has been removed.
string = pd.Series(["aardsd", "sdsd", np.nan, "pop"])
string.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [3]:
# Python's built-in None is also treated as NA in an object array
string[0]=None
string.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [4]:
# Drop the NA entries; the result is only displayed here, not assigned back to `string`
string.dropna()


1    sdsd
3     pop
dtype: object

In [5]:
# Replace every NA entry with the value stored at index 1
string.fillna(value=string[1])

0    sdsd
1    sdsd
2    sdsd
3     pop
dtype: object

In [9]:
# Select only the entries that are NA, using a boolean mask
string.loc[string.isna()]

0    None
2     NaN
dtype: object

对于DataFrame对象，事情就有点复杂了。你可能希望丢弃全为NA或含有NA的行或列。
dropna默认丢弃任何含有缺失值的行；传入how='all'将只丢弃全为NA的那些行。
要用同样的方式丢弃列，只需传入axis=1即可。

用thresh参数可以只保留至少含有指定数量非NA值的行（例如thresh=2会丢弃非NA值少于2个的行）。

In [17]:
# Four rows with progressively more missing values: none, one, two, and all four NA.
data = pd.DataFrame(
    [
        [1, 2, 3, 4],
        [1, np.nan, 3, 4],
        [4, np.nan, np.nan, 4],
        [np.nan, np.nan, np.nan, np.nan],
    ]
)
data

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,4.0
1,1.0,,3.0,4.0
2,4.0,,,4.0
3,,,,


In [19]:
# With no arguments, dropna removes every row that contains at least one NA value
data.dropna()


Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,4.0


In [21]:
# how='all' removes only the rows whose values are all NA
data.dropna(how='all')

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,4.0
1,1.0,,3.0,4.0
2,4.0,,,4.0


# 填充缺失数据

In [30]:
# Use a seeded generator so the random values (and everything derived from
# them further down) are identical on every Restart & Run All.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.standard_normal((5, 5)))
# Knock out some values: rows 0-1 of column 2 and rows 0-3 of column 1.
df.iloc[:2, 2] = np.nan
df.iloc[:4, 1] = np.nan
df

Unnamed: 0,0,1,2,3,4
0,-0.524352,,,1.571328,0.22544
1,-0.048178,,,-0.073556,-0.085095
2,-1.442627,,-0.333193,-1.250007,-0.841395
3,0.166565,,1.672187,0.051875,-0.464927
4,-0.754842,0.327957,-1.366444,0.574511,0.537545


In [32]:
# Passing a dict to fillna fills each column with its own value
# (the dict keys are column labels).
df.fillna(value={1: 0.5, 2: 0})

Unnamed: 0,0,1,2,3,4
0,-0.524352,0.5,0.0,1.571328,0.22544
1,-0.048178,0.5,0.0,-0.073556,-0.085095
2,-1.442627,0.5,-0.333193,-1.250007,-0.841395
3,0.166565,0.5,1.672187,0.051875,-0.464927
4,-0.754842,0.327957,-1.366444,0.574511,0.537545


In [49]:
# fillna returns a new object by default, but it can also modify the existing
# object in place. The call's return value is not needed here, and binding it
# to `_` would shadow IPython's "last output" variable, so nothing is captured.
df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2,3,4
0,-0.524352,0.0,0.0,1.571328,0.22544
1,-0.048178,0.0,0.0,-0.073556,-0.085095
2,-1.442627,0.0,-0.333193,-1.250007,-0.841395
3,0.166565,0.0,1.672187,0.051875,-0.464927
4,-0.754842,0.327957,-1.366444,0.574511,0.537545


In [51]:
# Re-introduce missing values: row 1 of column 2, and rows 1-3 of column 1
df.iloc[1, 2] = np.nan
df.iloc[1:4, 1] = np.nan
df

Unnamed: 0,0,1,2,3,4
0,-0.524352,0.0,0.0,1.571328,0.22544
1,-0.048178,,,-0.073556,-0.085095
2,-1.442627,,-0.333193,-1.250007,-0.841395
3,0.166565,,1.672187,0.051875,-0.464927
4,-0.754842,0.327957,-1.366444,0.574511,0.537545


In [54]:
# Forward-fill missing values down each column. `limit` caps how many
# consecutive NA values are filled after a valid observation (here at most 2).
# DataFrame.ffill replaces fillna(method='ffill'), whose `method` argument is deprecated.
df.ffill(limit=2)

Unnamed: 0,0,1,2,3,4
0,-0.524352,0.0,0.0,1.571328,0.22544
1,-0.048178,0.0,0.0,-0.073556,-0.085095
2,-1.442627,0.0,-0.333193,-1.250007,-0.841395
3,0.166565,,1.672187,0.051875,-0.464927
4,-0.754842,0.327957,-1.366444,0.574511,0.537545


# 数据转换

In [59]:
# Two key columns; the last two rows (index 5 and 6) carry the same ('two', 4) pair.
data = pd.DataFrame(
    {
        'k1': ['one', 'two'] * 3 + ['two'],
        'k2': [1, 1, 2, 3, 3, 4, 4],
    }
)
data


Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [61]:
# DataFrame.duplicated returns a boolean Series indicating whether each row is a duplicate.
# duplicated and drop_duplicates keep the first occurrence of each value combination by default; pass keep='last' to keep the last one instead.
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [63]:
data.drop_duplicates()
# Remove duplicate rows (the result is displayed; `data` is not reassigned)

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [66]:
# Add a column 'v1' holding the integers 0 through 6
data['v1'] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [68]:
# Consider only column 'k1' when deciding which rows are duplicates
data.drop_duplicates(subset='k1')


Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


### 利用函数或映射进行数据转换