In [3]:
from pandas import Series, DataFrame, Index, MultiIndex
import pandas as pd
import numpy as np

## 合并数据集

* pandas.merge 可以根据一个或者多个键将不同的DataFrame中的行链接起来
* pandas.concat 可以沿着一条轴将多个对象堆叠在一起
* 实例方法combin_first可以将重复数据编结在一起，用一个对象的值填充另一个对象的值

### 数据库风格的DataFrame合并

In [4]:
df1 = DataFrame({"key": ["b", "b", "a", "c", "a", "a", "b"], "data1": range(7)})

In [5]:
df2 = DataFrame({"key": ["a", "b", "d"], "data2": range(3)})

In [6]:
df1

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,a
6,6,b


In [7]:
df2

Unnamed: 0,data2,key
0,0,a
1,1,b
2,2,d


这是一种多对一的合并

In [8]:
pd.merge(df1, df2)

Unnamed: 0,data1,key,data2
0,0,b,1
1,1,b,1
2,6,b,1
3,2,a,0
4,4,a,0
5,5,a,0


我们没有指明要用哪个列进行连接。如果没有指定，merge就会将重叠列的列明当做键。

不过最好显示的指定一下：

In [9]:
pd.merge(df1, df2, on="key")

Unnamed: 0,data1,key,data2
0,0,b,1
1,1,b,1
2,6,b,1
3,2,a,0
4,4,a,0
5,5,a,0


如果两个对象的列名不同，也可以进行指定

In [10]:
df3 = DataFrame({"1key": ["b", "b", "a", "c", "a", "a", "b"], "data1": range(7)})

In [11]:
df4 = DataFrame({"rkey": ["a", "b", "d"], "data2": range(3)})

In [12]:
pd.merge(df3, df4, left_on="1key", right_on="rkey")

Unnamed: 0,1key,data1,data2,rkey
0,b,0,1,b
1,b,1,1,b
2,b,6,1,b
3,a,2,0,a
4,a,4,0,a
5,a,5,0,a


默认情况下，merge做的是 inner 连接，结果中的键是交集。

其他方式还有"left"、"right"以及"outer"

外连接取的是键的并集，组合了左连接和右连接的结果

In [13]:
pd.merge(df1, df2, how="outer")

Unnamed: 0,data1,key,data2
0,0.0,b,1.0
1,1.0,b,1.0
2,6.0,b,1.0
3,2.0,a,0.0
4,4.0,a,0.0
5,5.0,a,0.0
6,3.0,c,
7,,d,2.0


多对多的合并一样很简单，无线额外工作。

In [14]:
df1 = DataFrame({"key": ['b', 'b', 'a', 'c', 'a', 'b'], "data1": range(6)})

In [15]:
df2 = DataFrame({"key": ["a", "b", "a", "b", "d"], "data2": range(5)})

In [16]:
df1

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,b


In [17]:
df2

Unnamed: 0,data2,key
0,0,a
1,1,b
2,2,a
3,3,b
4,4,d


In [18]:
pd.merge(df1, df2, on="key", how="left")

Unnamed: 0,data1,key,data2
0,0,b,1.0
1,0,b,3.0
2,1,b,1.0
3,1,b,3.0
4,2,a,0.0
5,2,a,2.0
6,3,c,
7,4,a,0.0
8,4,a,2.0
9,5,b,1.0


多对多连接产生的是行的笛卡尔积

df1 有3个b行，右边df2有两个b，所以最终结果有6个b

连接方式只影响出现在结果中的键

In [19]:
pd.merge(df1, df2, how="inner")

Unnamed: 0,data1,key,data2
0,0,b,1
1,0,b,3
2,1,b,1
3,1,b,3
4,5,b,1
5,5,b,3
6,2,a,0
7,2,a,2
8,4,a,0
9,4,a,2


要根据多个键进行合并，传入一个由列名组成的列表即可

In [20]:
left = DataFrame({
    "key1": ["foo", "foo", "bar"],
    "key2": ["one", "two", "one"],
    "lval": [1,2,3]
})

In [21]:
right = DataFrame({
    "key1": ["foo", "foo", "bar", "bar"],
    "key2": ["one", "one", "one", "two"],
    "rval": [4,5,6,7,]
})

In [22]:
pd.merge(left, right, on=["key1", "key2"], how="outer")

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,bar,two,,7.0


结果中出现哪些键组合取决于合并方式。

可以这么理解：多个键形成一系列元组，并将其当做单个连接键。

对于重复列的处理，merge会有个suffixes选项，用于指定附加到左右两个DataFrame对象的重叠列名的字符串

In [23]:
pd.merge(left, right, on="key1")

Unnamed: 0,key1,key2_x,lval,key2_y,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [24]:
pd.merge(left, right, on="key1", suffixes=("_left", "_right"))

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


### 索引上的合并

有时候，DataFrame的链接键位于索引上。

可以传入left_index=True或者right_index=True 或者两个都传

In [25]:
left1 = DataFrame({
    "key": ["a", "b", "a", "a", "b", "c"],
    "value": range(6)
})

In [26]:
right1 = DataFrame({"group_val": [3.5, 7]}, index=["a", "b"])

In [27]:
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [28]:
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [29]:
pd.merge(left1, right1, left_on="key", right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


对于层次索引，就有点复杂了

In [30]:
lefth = DataFrame({
    "key1": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada"],
    "key2": [2000, 2001, 2002, 2001, 2002],
    "data": np.arange(5.)
})

In [31]:
righth = DataFrame(np.arange(12).reshape((6,2)),
                  index=[["Nevada", "Nevada", "Ohio", "Ohio","Ohio", "Ohio"],
                        [2001, 2000, 2000, 2000, 20001, 20002]],
                  columns=["event1", "event2"])

In [32]:
lefth

Unnamed: 0,data,key1,key2
0,0.0,Ohio,2000
1,1.0,Ohio,2001
2,2.0,Ohio,2002
3,3.0,Nevada,2001
4,4.0,Nevada,2002


In [33]:
righth

Unnamed: 0,Unnamed: 1,event1,event2
Nevada,2001,0,1
Nevada,2000,2,3
Ohio,2000,4,5
Ohio,2000,6,7
Ohio,20001,8,9
Ohio,20002,10,11


这种情况下，必须以列表的形式指明用做合并键的多个列

In [34]:
pd.merge(lefth, righth, left_on=["key1", "key2"], right_index=True)

Unnamed: 0,data,key1,key2,event1,event2
0,0.0,Ohio,2000,4,5
0,0.0,Ohio,2000,6,7
3,3.0,Nevada,2001,0,1


In [35]:
pd.merge(lefth, righth, left_on=["key1", "key2"], right_index=True, how="outer")

Unnamed: 0,data,key1,key2,event1,event2
0,0.0,Ohio,2000,4.0,5.0
0,0.0,Ohio,2000,6.0,7.0
1,1.0,Ohio,2001,,
2,2.0,Ohio,2002,,
3,3.0,Nevada,2001,0.0,1.0
4,4.0,Nevada,2002,,
4,,Nevada,2000,2.0,3.0
4,,Ohio,20001,8.0,9.0
4,,Ohio,20002,10.0,11.0


也可以同时使用双方的索引

In [36]:
left2 = DataFrame(np.arange(1,7).reshape((3,2)),
                 index=["a", "c", "e"],
                 columns=["Ohio", "Nevada"])

In [37]:
right2 = DataFrame(np.arange(7,15).reshape((4,2)),
                 index=["b", "c", "d", "e"],
                 columns=["Missouri", "Alabama"])

In [38]:
left2

Unnamed: 0,Ohio,Nevada
a,1,2
c,3,4
e,5,6


In [39]:
right2

Unnamed: 0,Missouri,Alabama
b,7,8
c,9,10
d,11,12
e,13,14


In [40]:
pd.merge(left2, right2, how="outer", left_index=True, right_index=True)

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


DataFrame还有个join实例方法，能够方便的实现按索引合并。

In [41]:
left2.join(right2, how="outer")

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


join方法默认是左连接

还支持索引 跟调动者 的某个列进行连接

对于简单的索引合并，还可以向join传入一组DataFrame

In [42]:
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [43]:
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [44]:
left1.join(right1, on="key")

Unnamed: 0,key,value,group_val
0,a,0,3.5
1,b,1,7.0
2,a,2,3.5
3,a,3,3.5
4,b,4,7.0
5,c,5,


In [45]:
another = DataFrame([[7,8],[9,10],[11,12],[16,17]], index=["a", "c", "e", "f"], columns=["New York", "Oregon"])

In [46]:
left2.join([right2, another])

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1,2,,,7,8
c,3,4,9.0,10.0,9,10
e,5,6,13.0,14.0,11,12


### 轴向连接

还有一种合并运算称为连接（concatenation）、绑定(binding)、堆叠(stacking)

In [47]:
arr = np.arange(12).reshape((3,4))

In [48]:
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [49]:
np.concatenate([arr, arr], axis=1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

pandas的concat函数提供连接方式

三个没有重叠索引的Series

In [50]:
s1 = Series([0,1], index=["a", "b"])

In [51]:
s2 = Series([2,3,4], index=["c", "d", "e"])

In [52]:
s3 = Series([5,6], index=["f", "g"])

In [53]:
pd.concat([s1,s2,s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

默认情况下，concat是在axis=0上工作的，最终产生一个新的Series

如果传入axis=1 则结果变为一个DataFrame

In [54]:
pd.concat([s1,s2,s3], axis=1)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


这种情况下，另一轴没有重叠。传入join="inner"即可得到他们的并集

In [55]:
s4 = pd.concat([s1*5, s3]);s4

a    0
b    5
f    5
g    6
dtype: int64

In [56]:
pd.concat([s1, s4], axis=1)

Unnamed: 0,0,1
a,0.0,0
b,1.0,5
f,,5
g,,6


In [57]:
pd.concat([s1, s4], axis=1, join="inner")

Unnamed: 0,0,1
a,0,0
b,1,5


你可以通过join_axes指定要在其他轴上使用的索引

In [58]:
pd.concat([s1, s4], axis=1, join_axes=[["a", "c", "b", "e"]])

Unnamed: 0,0,1
a,0.0,0.0
c,,
b,1.0,5.0
e,,


想要不参与连接的片段在结果中区分开，可以在连接轴上创建一个层次索引

In [59]:
result = pd.concat([s1,s2,s3], keys=["one", "two", "three"]);result

one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: int64

如果沿着axis=1对series合并，则keys就会变为DataFrame的列头

In [60]:
pd.concat([s1,s2,s3], axis=1, keys=["one", "two", "three"])

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


同样的逻辑对DataFrame对象也一样

In [61]:
df1 = DataFrame(np.arange(6).reshape((3,2)), index=["a", "b", "c"], columns=["one", "two"])

In [62]:
df2 = DataFrame(5+np.arange(4).reshape((2,2)), index=["a", "c"], columns=["three", "four"])

In [63]:
pd.concat([df1, df2], axis=1, keys=["leve1", "leve2"])

Unnamed: 0_level_0,leve1,leve1,leve2,leve2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [64]:
pd.concat([df1, df2])

Unnamed: 0,four,one,three,two
a,,0.0,,1.0
b,,2.0,,3.0
c,,4.0,,5.0
a,6.0,,5.0,
c,8.0,,7.0,


如果传入的是一个字典，则字典的键会被当做keys选项值

In [65]:
pd.concat({"level1": df1, "level2":df2}, axis=1)

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


还有个问题，就是跟当前分析工作无关的DataFrame行索引。

只有传递ignore_index=True即可

In [66]:
df1 = DataFrame(np.random.randn(3,4), columns=list("abcd"))

In [67]:
df2 = DataFrame(np.random.randn(2,3), columns=list("bda"))

In [68]:
df1  

Unnamed: 0,a,b,c,d
0,-0.381166,0.854895,-1.916495,-0.33034
1,-0.249483,-0.528534,1.672201,-1.307664
2,2.504183,0.659567,-1.050165,-1.206636


In [69]:
df2

Unnamed: 0,b,d,a
0,0.030773,-1.063719,2.713183
1,0.403601,-1.242394,0.885463


In [70]:
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,a,b,c,d
0,-0.381166,0.854895,-1.916495,-0.33034
1,-0.249483,-0.528534,1.672201,-1.307664
2,2.504183,0.659567,-1.050165,-1.206636
3,2.713183,0.030773,,-1.063719
4,0.885463,0.403601,,-1.242394


In [71]:
pd.concat([df1, df2])

Unnamed: 0,a,b,c,d
0,-0.381166,0.854895,-1.916495,-0.33034
1,-0.249483,-0.528534,1.672201,-1.307664
2,2.504183,0.659567,-1.050165,-1.206636
0,2.713183,0.030773,,-1.063719
1,0.885463,0.403601,,-1.242394


![concat.png](files/concat.png)

### 合并重叠数据

我们可能需要索引全部或者部分重叠的两个数据集

In [72]:
a = Series([np.nan, 2., np.nan, 3.5, 4.5, np.nan], index=list("fedcba"))

In [73]:
b = Series(np.arange(len(a), dtype=np.float64), index=list("fedcba"))

In [74]:
b[-1] = np.nan

In [75]:
a

f    NaN
e    2.0
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

In [76]:
b

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64

In [77]:
np.where(pd.isnull(a), b, a)

array([ 0. ,  2. ,  2. ,  3.5,  4.5,  nan])

Series有一个combine_first的方法，会进行数据对齐

In [78]:
b[:-2].combine_first(a[2:])

a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64

In [79]:
b[:-2]

f    0.0
e    1.0
d    2.0
c    3.0
dtype: float64

In [80]:
a[2:]

d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

对于DataFrame，combine_first也会做同样的事情

In [81]:
df1 = DataFrame({
    "a": [1, np.nan, 5., np.nan],
    "b": [np.nan, 2., np.nan, 6.],
    "c": range(2, 18, 4)
})

In [82]:
df2 = DataFrame({
    "a": [5, 4, np.nan, 3, 7],
    "b": [np.nan, 3, 4, 6, 8]
})

In [83]:
df1.combine_first(df2)

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,


## 重塑和轴向旋转

许多用于重新排列表格型数据的基础运算。

这些函数也称作重塑（reshape）或者轴向旋转（pivot）运算

### 重塑层次化索引

层次化索引为DataFrame数据的重排提供了良好的一致性。

* stack 将数据列“旋转”为行
* unstack 将数据行“旋转”为列

In [85]:
data = DataFrame(np.arange(6).reshape((2,3)),
                index=pd.Index(["Ohio", "Colorado"], name="state"),
                columns=pd.Index(["one", "two", "three"], name="number"))

In [86]:
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [87]:
result = data.stack()

In [88]:
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

In [89]:
result.unstack()

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


默认情况下，stack和unstack操作的最内层。

传递分层的编号或者名称可以对其他级别进行操作

In [90]:
result.unstack(0)

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [91]:
result.unstack("state")

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


如果不是所有的级别值都能在各个分组找到的话，则unstack操作会引入缺失数据

In [92]:
s1 = Series([0,1,2,3], index=list("abcd"))

In [93]:
s2 = Series([4,5,6], index=list("cde"))

In [94]:
data2 = pd.concat([s1, s2], keys=["one", "two"])

In [95]:
data2.unstack()

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2.0,3.0,
two,,,4.0,5.0,6.0


stack默认会滤除缺失数据，因此该运算是可逆的

In [96]:
data2.unstack().stack()

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64

In [97]:
data2.unstack().stack(dropna=False)

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64

### 将“长格式”转换为“宽格式”

In [101]:
import datetime
import random

dates = [datetime.datetime(2017, 4, i) for i in range(1, 11)]
items = ["infl", "realgdp", "unemp"]
N = 30

data = DataFrame({
    "date": dates*3,
    "item": items*10,
    "value": [random.randint(1, 1000) for _ in xrange(N)],
})

In [99]:
data

Unnamed: 0,date,item,value
0,2017-04-02,unemp,284
1,2017-04-06,realgdp,676
2,2017-04-02,realgdp,706
3,2017-04-04,unemp,476
4,2017-04-10,unemp,889
5,2017-04-02,realgdp,194
6,2017-04-10,realgdp,934
7,2017-04-03,unemp,857
8,2017-04-08,realgdp,634
9,2017-04-03,infl,971


这里的数据，可能是从mysql等获取的，date和item构成唯一约束

我们想把不同item值分别形成一列，date列的时间值作为索引

In [102]:
pivoted = data.pivot("date", "item", "value")

In [103]:
pivoted.head()

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-04-01,479,236,24
2017-04-02,751,394,709
2017-04-03,534,560,806
2017-04-04,775,167,551
2017-04-05,622,147,703


前两个参数值分别作为行和列索引的名称，最后一个参数填充数据的列名。

这里 (date, item) 要唯一

如果两个参与重塑的数据列

忽略最后一个参数，得到DataFrame带有层次化的列

In [104]:
data["value2"] = np.random.randn(len(data))

In [105]:
data.head()

Unnamed: 0,date,item,value,value2
0,2017-04-01,infl,479,-2.138123
1,2017-04-02,realgdp,394,0.199665
2,2017-04-03,unemp,806,1.12398
3,2017-04-04,infl,775,1.444247
4,2017-04-05,realgdp,147,-0.414044


In [106]:
pivoted = data.pivot("date", "item")

In [107]:
pivoted.head()

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2017-04-01,479,236,24,-2.138123,-0.750376,0.25374
2017-04-02,751,394,709,0.175661,0.199665,0.783841
2017-04-03,534,560,806,0.793723,0.688024,1.12398
2017-04-04,775,167,551,1.444247,0.121607,-0.806402
2017-04-05,622,147,703,0.853979,-0.414044,0.301106


其实pivot只是个快捷方式，用set_index创建层次索引，再用unstack重塑

In [109]:
unstacked = data.set_index(["date", "item"]).unstack("item")

In [110]:
unstacked.head()

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2017-04-01,479,236,24,-2.138123,-0.750376,0.25374
2017-04-02,751,394,709,0.175661,0.199665,0.783841
2017-04-03,534,560,806,0.793723,0.688024,1.12398
2017-04-04,775,167,551,1.444247,0.121607,-0.806402
2017-04-05,622,147,703,0.853979,-0.414044,0.301106


## 数据转换

### 异常重复数据

In [111]:
data = DataFrame({
    "k1": ["one"]*3 + ["two"]*4,
    "k2": [1,1,2,3,3,4,4]
})

In [112]:
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


DataFrame的duplicate方法返回一个布尔型Series，表示各行是否是重复行

In [113]:
data.duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

drop_duplicates方法，返回一个移除重复行的DataFrame

In [114]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


这两个方法默认会判断全部列，也可以指定部分列进行重复项判断

比如执行根据k1列过滤重复

In [115]:
data["v1"] = range(7)

In [116]:
data.drop_duplicates(["k1"])

Unnamed: 0,k1,k2,v1
0,one,1,0
3,two,3,3


默认保留的是第一个出现的值组合，可以传入keep="last"保留最后一个

In [118]:
data.drop_duplicates(["k1", "k2"], keep="last")

Unnamed: 0,k1,k2,v1
1,one,1,1
2,one,2,2
4,two,3,4
6,two,4,6


### 利用函数或者映射进行数据转换

In [120]:
data = DataFrame({
    "name": ["xiaoqing", "xiaoming", "xiaohong"],
    "age": [21, 22, 23],
})

In [121]:
data

Unnamed: 0,age,name
0,21,xiaoqing
1,22,xiaoming
2,23,xiaohong


我们想添加一列表示他们来自哪个班级

In [122]:
name_to_class = {"xiaoqing": "class1", "xiaoming": "class1", "xiaohong": "class2"}

Series的map方法接受一个函数或者含有映射关系的字典对象

In [125]:
data["class"] = data["name"].map(name_to_class)

In [126]:
data

Unnamed: 0,age,name,class
0,21,xiaoqing,class1
1,22,xiaoming,class1
2,23,xiaohong,class2


### 替换值

replace可以进行值的替换

In [127]:
data = Series([1, -999,2,-999,-1000,3])

In [128]:
data

0       1
1    -999
2       2
3    -999
4   -1000
5       3
dtype: int64

替换-999为缺失值

In [129]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

也可以一次性替换多个值，传入一个由待替换值组成的列表或者衣蛾替换值

In [130]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [131]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

也可以传递字典

In [132]:
data.replace({-999: np.nan, -1000:0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

### 重命名轴索引

跟Series中的值一样，轴标签也可以通过函数或者映射进行转换，从而得到一个新对象。

In [133]:
data = DataFrame(np.arange(12).reshape((3,4)), index=["Ohio", "Colorado", "New York"], columns=["one", "two", "three", "four"])

跟Series一样，轴标签也有一个map方法：

In [134]:
data.index.map(str.upper)

array(['OHIO', 'COLORADO', 'NEW YORK'], dtype=object)

In [135]:
data.index = data.index.map(str.upper)

In [136]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


想创建数据集的转换版，而不是修改原始数据，实用的方法是rename

In [137]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


rename可以结合字典型对象实现对部分轴标签的更新

In [139]:
data.rename(index={"OHIO": "INDIANA"}, columns={"three": "peekaboo"})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


如果想修改数据集，传入inplace=True

In [140]:
_ = data.rename(index={"OHIO": "INDIANA"});data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


### 离散化和面元划分

假设有一组人员数据，你希望划分为不同的年龄组，可以使用pandas的cut函数

In [141]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [142]:
bins = [18, 25, 35, 60, 100]

In [143]:
cats = pd.cut(ages, bins)

In [144]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [146]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [149]:
pd.value_counts(cats)  # 统计年龄组数量

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

默认是左边是开区间，右边是闭区间；可以通过right=False来修改为右边是开区间，左边是闭区间

In [152]:
pd.cut(ages, [18,26,36, 61,100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, object): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

还可以设置自己的面元名称，将labels选项设置为一个列表即可

In [153]:
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]

In [154]:
pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

如果cut传入的是面元数量，则根据最大值和最小值计算等长面元

In [155]:
data = np.random.rand(20)

In [157]:
pd.cut(data, 4, precision=2)

[(0.032, 0.27], (0.032, 0.27], (0.032, 0.27], (0.032, 0.27], (0.27, 0.5], ..., (0.5, 0.74], (0.74, 0.97], (0.5, 0.74], (0.032, 0.27], (0.74, 0.97]]
Length: 20
Categories (4, object): [(0.032, 0.27] < (0.27, 0.5] < (0.5, 0.74] < (0.74, 0.97]]

qcut是类似于cut的函数，根据样本分位数对数据进行面元划分。

In [158]:
data = np.random.randn(1000)  # 正态分布

In [159]:
cats = pd.qcut(data, 4)  # 按四分位数进行切割

In [160]:
cats

[(-0.627, 0.0275], [-2.952, -0.627], (-0.627, 0.0275], [-2.952, -0.627], [-2.952, -0.627], ..., (0.661, 2.824], [-2.952, -0.627], (0.661, 2.824], (0.661, 2.824], (0.661, 2.824]]
Length: 1000
Categories (4, object): [[-2.952, -0.627] < (-0.627, 0.0275] < (0.0275, 0.661] < (0.661, 2.824]]

也可以自定义分位数（0到1之间的数值，包含端点）

In [161]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1])

[(-1.19, 0.0275], (-1.19, 0.0275], (-1.19, 0.0275], (-1.19, 0.0275], (-1.19, 0.0275], ..., (0.0275, 1.299], (-1.19, 0.0275], (1.299, 2.824], (0.0275, 1.299], (1.299, 2.824]]
Length: 1000
Categories (4, object): [[-2.952, -1.19] < (-1.19, 0.0275] < (0.0275, 1.299] < (1.299, 2.824]]

### 排列和随机采样

numpy.random.permutation函数可以完成对Series或者DataFrame的列排列工作

In [162]:
df = DataFrame(np.arange(5*4).reshape(5, 4))

In [163]:
sampler = np.random.permutation(5)

In [164]:
sampler

array([1, 3, 0, 2, 4])

In [165]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [166]:
df.take(sampler)

Unnamed: 0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
0,0,1,2,3
2,8,9,10,11
4,16,17,18,19


想通过替换的方式产生样本，最快的方式是通过np.random.ranint来得到一组随机整数

In [167]:
bag = np.array([5,7,-1,6,4])

In [169]:
sampler = np.random.randint(0, len(bag), size=10)

In [170]:
sampler

array([0, 3, 1, 4, 4, 0, 2, 4, 1, 3])

In [171]:
draws = bag.take(sampler)

In [172]:
draws

array([ 5,  6,  7,  4,  4,  5, -1,  4,  7,  6])

### 计算指标/哑变量

将分类变量，转换为指标矩阵。

pandas 有个get_dummies函数可以实现

In [173]:
df = DataFrame({
    "key": ["b", "b", "a", "c", "a", "b"],
    "data1": range(6)
})

In [174]:
pd.get_dummies(df["key"])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


get_dumies的perfix可以给指标DataFrame的列加上前缀

In [175]:
dummies = pd.get_dummies(df["key"], prefix="key")

In [176]:
df_with_dummy = df[["data1"]].join(dummies)

In [177]:
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


对于统计应用有用的秘诀是：结合get_dummies和cut之类的离散化函数

In [178]:
values = np.random.rand(10)

In [179]:
values

array([ 0.27332283,  0.33512335,  0.77108225,  0.57949188,  0.44184113,
        0.36938862,  0.83875624,  0.98923497,  0.1063087 ,  0.44016772])

In [181]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

In [182]:
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1]"
0,0,1,0,0,0
1,0,1,0,0,0
2,0,0,0,1,0
3,0,0,1,0,0
4,0,0,1,0,0
5,0,1,0,0,0
6,0,0,0,0,1
7,0,0,0,0,1
8,1,0,0,0,0
9,0,0,1,0,0


## 字符串操作

### pandas中矢量化的字符串函数

In [185]:
data = {
    "Dava": "dava@google.com",
    "Steve": "steve@gmail.com",
    "Rob": "rob@gmail.com",
    "Wes": np.nan
}

In [186]:
data = Series(data);data

Dava     dava@google.com
Rob        rob@gmail.com
Steve    steve@gmail.com
Wes                  NaN
dtype: object

通过data.map 所有的字符串和正则表达式方法都会被应用于各个值，但是存在NA就会报错

Series有个跳过NA值的字符串操作方法，通过Series的str属性访问这些方法。

In [187]:
data.str.contains("gmail")

Dava     False
Rob       True
Steve     True
Wes        NaN
dtype: object

In [196]:
import re
pattern = "([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})"

In [197]:
data.str.findall(pattern, flags=re.IGNORECASE)

Dava     [(dava, google, com)]
Rob        [(rob, gmail, com)]
Steve    [(steve, gmail, com)]
Wes                        NaN
dtype: object

In [203]:
res = data.str.match(pattern, flags=re.IGNORECASE)

  """Entry point for launching an IPython kernel.


In [204]:
res.str.get(1)

Dava     google
Rob       gmail
Steve     gmail
Wes         NaN
dtype: object

In [205]:
res.str[0]

Dava      dava
Rob        rob
Steve    steve
Wes        NaN
dtype: object

In [206]:
data.str[:5]

Dava     dava@
Rob      rob@g
Steve    steve
Wes        NaN
dtype: object