In [1]:
import pandas as pd
import numpy as np

## 重新索引

### Series

In [2]:
# Integer Series whose labels skip letters, leaving gaps for the reindexing
# cells that follow.
s = pd.Series(
    data=[1, 3, 5, 6, 8],
    index=['a', 'c', 'e', 'f', 'h'],
)
s

a    1
c    3
e    5
f    6
h    8
dtype: int64

In [3]:
s.reindex(list('abcdefgh'))

a     1
b   NaN
c     3
d   NaN
e     5
f     6
g   NaN
h     8
dtype: float64

In [4]:
s.reindex(list('abcdefgh'), fill_value=0)

a    1
b    0
c    3
d    0
e    5
f    6
g    0
h    8
dtype: int64

In [5]:
# Forward-fill: every newly introduced label takes the value of the closest
# preceding existing label. Pass method='bfill' to fill backwards instead.
s.reindex(list('abcdefgh'), method='ffill')

a    1
b    1
c    3
d    3
e    5
f    6
g    6
h    8
dtype: int64

### DataFrame

In [6]:
# 4x6 frame of standard-normal noise. The row labels skip letters, leaving
# gaps for the reindexing cells below. No seed is set, so values change per run.
df = pd.DataFrame(
    data=np.random.randn(4, 6),
    index=['A', 'D', 'F', 'H'],
    columns='one two three four five six'.split(),
)
df

Unnamed: 0,one,two,three,four,five,six
A,0.842797,1.386367,0.561551,-0.318504,-0.010161,-0.474195
D,0.50461,-1.89457,-0.651173,0.774544,-0.114761,0.536593
F,-0.427405,0.535483,-0.48127,0.554401,-0.369239,-0.757361
H,0.394752,0.894697,0.5743,-0.878805,-1.443706,0.358006


In [7]:
df2 = df.reindex(index=list('ABCDEFGH'))
df2

Unnamed: 0,one,two,three,four,five,six
A,0.842797,1.386367,0.561551,-0.318504,-0.010161,-0.474195
B,,,,,,
C,,,,,,
D,0.50461,-1.89457,-0.651173,0.774544,-0.114761,0.536593
E,,,,,,
F,-0.427405,0.535483,-0.48127,0.554401,-0.369239,-0.757361
G,,,,,,
H,0.394752,0.894697,0.5743,-0.878805,-1.443706,0.358006


In [8]:
# Overwrite a single cell with one .loc[row, column] call.
# The chained form df.loc['A']['one'] = 100 assigns into an intermediate
# object, and pandas does not guarantee that the write reaches df
# (SettingWithCopyWarning; it is a no-op under Copy-on-Write).
df.loc['A', 'one'] = 100
df

Unnamed: 0,one,two,three,four,five,six
A,100.0,1.386367,0.561551,-0.318504,-0.010161,-0.474195
D,0.50461,-1.89457,-0.651173,0.774544,-0.114761,0.536593
F,-0.427405,0.535483,-0.48127,0.554401,-0.369239,-0.757361
H,0.394752,0.894697,0.5743,-0.878805,-1.443706,0.358006


In [9]:
df2

Unnamed: 0,one,two,three,four,five,six
A,0.842797,1.386367,0.561551,-0.318504,-0.010161,-0.474195
B,,,,,,
C,,,,,,
D,0.50461,-1.89457,-0.651173,0.774544,-0.114761,0.536593
E,,,,,,
F,-0.427405,0.535483,-0.48127,0.554401,-0.369239,-0.757361
G,,,,,,
H,0.394752,0.894697,0.5743,-0.878805,-1.443706,0.358006


In [10]:
df.reindex(columns=['one', 'three', 'five', 'seven'])

Unnamed: 0,one,three,five,seven
A,100.0,0.561551,-0.010161,
D,0.50461,-0.651173,-0.114761,
F,-0.427405,-0.48127,-0.369239,
H,0.394752,0.5743,-1.443706,


In [11]:
df.reindex(columns=['one', 'three', 'five', 'seven'], fill_value=0)

Unnamed: 0,one,three,five,seven
A,100.0,0.561551,-0.010161,0
D,0.50461,-0.651173,-0.114761,0
F,-0.427405,-0.48127,-0.369239,0
H,0.394752,0.5743,-1.443706,0


In [12]:
# The fill method only takes effect along the rows: the new 'seven' column in
# the output below is still all NaN despite method='ffill'.
# NOTE(review): recent pandas versions may instead raise here because the
# existing column labels are not monotonic — confirm against the installed version.
df.reindex(columns=['one', 'three', 'five', 'seven'], method='ffill')

Unnamed: 0,one,three,five,seven
A,100.0,0.561551,-0.010161,
D,0.50461,-0.651173,-0.114761,
F,-0.427405,-0.48127,-0.369239,
H,0.394752,0.5743,-1.443706,


In [13]:
df.reindex(index=list('ABCDEFGH'), method='ffill')

Unnamed: 0,one,two,three,four,five,six
A,100.0,1.386367,0.561551,-0.318504,-0.010161,-0.474195
B,100.0,1.386367,0.561551,-0.318504,-0.010161,-0.474195
C,100.0,1.386367,0.561551,-0.318504,-0.010161,-0.474195
D,0.50461,-1.89457,-0.651173,0.774544,-0.114761,0.536593
E,0.50461,-1.89457,-0.651173,0.774544,-0.114761,0.536593
F,-0.427405,0.535483,-0.48127,0.554401,-0.369239,-0.757361
G,-0.427405,0.535483,-0.48127,0.554401,-0.369239,-0.757361
H,0.394752,0.894697,0.5743,-0.878805,-1.443706,0.358006


## 丢弃部分数据

In [14]:
# Fresh 4x6 frame of standard-normal noise with rows A-D, used to demonstrate
# drop(). No seed is set, so values change per run.
df = pd.DataFrame(
    data=np.random.randn(4, 6),
    index=['A', 'B', 'C', 'D'],
    columns='one two three four five six'.split(),
)
df

Unnamed: 0,one,two,three,four,five,six
A,-1.407741,-1.282737,0.822116,-0.868931,-0.094459,-2.189066
B,0.87231,-1.0002,0.065498,1.901136,-1.043616,1.201832
C,0.311036,0.050273,0.048505,-0.814031,0.364601,0.075633
D,0.980987,0.644028,0.6401,-1.102679,-0.799969,-0.022269


In [15]:
df.drop('A')

Unnamed: 0,one,two,three,four,five,six
B,0.87231,-1.0002,0.065498,1.901136,-1.043616,1.201832
C,0.311036,0.050273,0.048505,-0.814031,0.364601,0.075633
D,0.980987,0.644028,0.6401,-1.102679,-0.799969,-0.022269


In [16]:
df2 = df.drop(['two', 'four'], axis=1)
df2

Unnamed: 0,one,three,five,six
A,-1.407741,0.822116,-0.094459,-2.189066
B,0.87231,0.065498,-1.043616,1.201832
C,0.311036,0.048505,0.364601,0.075633
D,0.980987,0.6401,-0.799969,-0.022269


In [17]:
# Data copy: df2 was produced by df.drop(...). Overwrite one cell of df here,
# then inspect df2 in the next cell to see whether it was affected.
df.iloc[0, 0] = 100
df

Unnamed: 0,one,two,three,four,five,six
A,100.0,-1.282737,0.822116,-0.868931,-0.094459,-2.189066
B,0.87231,-1.0002,0.065498,1.901136,-1.043616,1.201832
C,0.311036,0.050273,0.048505,-0.814031,0.364601,0.075633
D,0.980987,0.644028,0.6401,-1.102679,-0.799969,-0.022269


In [18]:
df2

Unnamed: 0,one,three,five,six
A,-1.407741,0.822116,-0.094459,-2.189066
B,0.87231,0.065498,-1.043616,1.201832
C,0.311036,0.048505,0.364601,0.075633
D,0.980987,0.6401,-0.799969,-0.022269


### 广播运算

In [19]:
# 4x3 frame holding the integers 0..11 in row-major order.
df = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=['one', 'two', 'three', 'four'],
    columns=['A', 'B', 'C'],
)
df

Unnamed: 0,A,B,C
one,0,1,2
two,3,4,5
three,6,7,8
four,9,10,11


In [20]:
df.loc['one']

A    0
B    1
C    2
Name: one, dtype: int32

In [21]:
df - df.loc['one']

Unnamed: 0,A,B,C
one,0,0,0
two,3,3,3
three,6,6,6
four,9,9,9


### 函数应用

* apply: 将数据按行或列进行计算
* applymap: 将数据按元素进行计算

In [22]:
# Start this section from a fresh 4x3 frame of the integers 0..11
# (rows 'one'..'four', columns A..C).
df = pd.DataFrame(np.arange(12).reshape(4, 3), index=['one', 'two', 'three', 'four'], columns=list('ABC'))
df

Unnamed: 0,A,B,C
one,0,1,2
two,3,4,5
three,6,7,8
four,9,10,11


In [23]:
# Each column is handed to the lambda as a Series; the lambda returns that
# column's spread (max - min), giving one value per column.
df.apply(lambda x: x.max() - x.min())

A    9
B    9
C    9
dtype: int64

In [24]:
# With axis=1 each row is handed to the lambda as a Series instead, giving
# one value per row.
df.apply(lambda x: x.max() - x.min(), axis=1)

one      2
two      2
three    2
four     2
dtype: int64

In [25]:
# Return several values at once by packing them into a Series
def min_max(x):
    """Summarise ``x`` by its two extremes.

    Parameters
    ----------
    x : object exposing ``.min()`` and ``.max()``
        For example the row or column Series handed over by ``DataFrame.apply``.

    Returns
    -------
    pandas.Series
        Two entries, labelled ``'min'`` and ``'max'``.
    """
    smallest = x.min()
    largest = x.max()
    return pd.Series(data=[smallest, largest], index=['min', 'max'])
df.apply(min_max, axis=1)

Unnamed: 0,min,max
one,0,2
two,3,5
three,6,8
four,9,11


In [26]:
# applymap: element-wise operation. Build a 4x3 frame of standard-normal
# noise to format in the next cell.
df = pd.DataFrame(
    data=np.random.randn(4, 3),
    index=['one', 'two', 'three', 'four'],
    columns=['A', 'B', 'C'],
)
df

Unnamed: 0,A,B,C
one,-1.283572,-0.088667,0.64425
two,0.57784,-0.327568,0.608721
three,-1.689572,-1.364415,0.103483
four,0.060262,-1.896395,0.165161


In [27]:
# Bound str.format that renders a number with two decimal places.
formater = '{0:.02f}'.format
# Equivalent %-style alternative:
# formater = lambda x: '%.02f' % x
# Apply the formatter to every element; the result holds strings, not floats.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases in
# favour of DataFrame.map — confirm against the installed version.
df.applymap(formater)

Unnamed: 0,A,B,C
one,-1.28,-0.09,0.64
two,0.58,-0.33,0.61
three,-1.69,-1.36,0.1
four,0.06,-1.9,0.17


### 排序和排名

In [28]:
# 4x3 table of random integers drawn from 1..9 (the upper bound 10 is exclusive).
df = pd.DataFrame(
    data=np.random.randint(low=1, high=10, size=(4, 3)),
    index=['A', 'B', 'C', 'D'],
    columns=['one', 'two', 'three'],
)
df

Unnamed: 0,one,two,three
A,7,2,3
B,4,9,1
C,3,4,4
D,6,2,4


In [29]:
df.sort_values(by='one')

Unnamed: 0,one,two,three
C,3,4,4
B,4,9,1
D,6,2,4
A,7,2,3


In [30]:
s = pd.Series([3, 6, 2, 6, 4])

In [31]:
s.rank()

0    2.0
1    4.5
2    1.0
3    4.5
4    3.0
dtype: float64

In [32]:
s.rank(method='first', ascending=False)

0    4
1    1
2    5
3    2
4    3
dtype: float64

### 数据唯一性及成员资格

适用于 Series

In [33]:
s = pd.Series(list('abbcdabacad'))
s

0     a
1     b
2     b
3     c
4     d
5     a
6     b
7     a
8     c
9     a
10    d
dtype: object

In [34]:
s.unique()

array(['a', 'b', 'c', 'd'], dtype=object)

In [35]:
s.value_counts()

a    4
b    3
d    2
c    2
dtype: int64

In [36]:
s.isin(['a', 'b', 'c'])

0      True
1      True
2      True
3      True
4     False
5      True
6      True
7      True
8      True
9      True
10    False
dtype: bool

### 层次化索引

层次化索引可以使数据在一个轴上拥有多个索引级别，也就是说，可以用二维的数据结构表达更高维度的数据，使数据的组织方式更清晰。它使用 `pd.MultiIndex` 类来表示。

#### 层次化索引有什么作用？

比如在分析股票数据时，一级行索引可以是日期，二级行索引可以是股票代码，列索引可以是股票的交易量、开盘价、收盘价等等。这样我们就可以把多只股票放在同一个时间维度下进行考察和分析。

#### Series 多层索引

In [37]:
# Two parallel lists: first-level labels and second-level labels.
a = [['a', 'a', 'a', 'b', 'b', 'c', 'c'], [1, 2, 3, 1, 2, 2, 3]]
# Pair the labels up position by position to get one (first, second) tuple
# per entry.
tuples = [(outer, inner) for outer, inner in zip(a[0], a[1])]
tuples

[('a', 1), ('a', 2), ('a', 3), ('b', 1), ('b', 2), ('c', 2), ('c', 3)]

In [38]:
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index

MultiIndex(levels=[[u'a', u'b', u'c'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 2, 2], [0, 1, 2, 0, 1, 1, 2]],
           names=[u'first', u'second'])

In [39]:
s = pd.Series(np.random.randn(7), index=index)
s

first  second
a      1        -0.825823
       2         0.878052
       3        -0.737150
b      1        -0.052367
       2         1.161518
c      2        -1.882330
       3        -1.170602
dtype: float64

In [40]:
s.index

MultiIndex(levels=[[u'a', u'b', u'c'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 2, 2], [0, 1, 2, 0, 1, 1, 2]],
           names=[u'first', u'second'])

In [41]:
s.index.levels[0]

Index([u'a', u'b', u'c'], dtype='object', name=u'first')

In [42]:
s.index.levels[1]

Int64Index([1, 2, 3], dtype='int64', name=u'second')

In [43]:
s['b']

second
1   -0.052367
2    1.161518
dtype: float64

In [44]:
s['b':'c']

first  second
b      1        -0.052367
       2         1.161518
c      2        -1.882330
       3        -1.170602
dtype: float64

In [45]:
s[['b', 'a']]

first  second
b      1        -0.052367
       2         1.161518
a      1        -0.825823
       2         0.878052
       3        -0.737150
dtype: float64

In [46]:
s['b', 1]

-0.052366974254005647

In [47]:
s[:, 2]

first
a    0.878052
b    1.161518
c   -1.882330
dtype: float64

#### DataFrame 多层索引

In [48]:
# Frame with two index levels on both axes. Passing a list of label lists as
# index/columns makes pandas build a MultiIndex from them.
df = pd.DataFrame(
    data=np.random.randint(low=1, high=10, size=(4, 3)),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[['one', 'one', 'two'], ['blue', 'red', 'blue']],
)
# Name the levels so they can be referred to by name later on.
df.columns.names = ['col-1', 'col-2']
df.index.names = ['row-1', 'row-2']
df

Unnamed: 0_level_0,col-1,one,one,two
Unnamed: 0_level_1,col-2,blue,red,blue
row-1,row-2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,5,2,6
a,2,4,7,4
b,1,8,1,4
b,2,8,6,7


In [49]:
df.loc['a']

col-1,one,one,two
col-2,blue,red,blue
row-2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,5,2,6
2,4,7,4


In [50]:
type(df.loc['a'])

pandas.core.frame.DataFrame

In [51]:
df.loc['a', 1]

col-1  col-2
one    blue     5
       red      2
two    blue     6
Name: (a, 1), dtype: int32

In [52]:
df.loc['a', 1].index

MultiIndex(levels=[[u'one', u'two'], [u'blue', u'red']],
           labels=[[0, 0, 1], [0, 1, 0]],
           names=[u'col-1', u'col-2'])

#### 索引交换及排序

In [53]:
df

Unnamed: 0_level_0,col-1,one,one,two
Unnamed: 0_level_1,col-2,blue,red,blue
row-1,row-2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,5,2,6
a,2,4,7,4
b,1,8,1,4
b,2,8,6,7


In [54]:
df2 = df.swaplevel('row-1', 'row-2')
df2

Unnamed: 0_level_0,col-1,one,one,two
Unnamed: 0_level_1,col-2,blue,red,blue
row-2,row-1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,5,2,6
2,a,4,7,4
1,b,8,1,4
2,b,8,6,7


In [55]:
# Sort the rows by the outermost index level (remaining levels break ties).
# DataFrame.sortlevel has been removed from pandas; sort_index(level=...) is
# the replacement and returns a new, sorted frame.
df2.sort_index(level=0)

Unnamed: 0_level_0,col-1,one,one,two
Unnamed: 0_level_1,col-2,blue,red,blue
row-2,row-1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,5,2,6
1,b,8,1,4
2,a,4,7,4
2,b,8,6,7


#### 按照索引级别进行统计

In [56]:
df

Unnamed: 0_level_0,col-1,one,one,two
Unnamed: 0_level_1,col-2,blue,red,blue
row-1,row-2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,5,2,6
a,2,4,7,4
b,1,8,1,4
b,2,8,6,7


In [57]:
# Sum the rows that share the same label in index level 0.
# The `level=` argument of DataFrame.sum was removed in pandas 2.0; grouping
# by the level and summing gives the same result.
df.groupby(level=0).sum()

col-1,one,one,two
col-2,blue,red,blue
row-1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,9,9,10
b,16,7,11


In [58]:
# Sum the rows that share the same label in index level 1.
# The `level=` argument of DataFrame.sum was removed in pandas 2.0; grouping
# by the level and summing gives the same result.
df.groupby(level=1).sum()

col-1,one,one,two
col-2,blue,red,blue
row-2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,13,3,10
2,12,13,11


In [59]:
# Same aggregation on the level-swapped frame: level 0 of df2 is what used to
# be level 1 of df, so this matches the previous result.
# The `level=` argument of DataFrame.sum was removed in pandas 2.0; grouping
# by the level and summing gives the same result.
df2.groupby(level=0).sum()

col-1,one,one,two
col-2,blue,red,blue
row-2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,13,3,10
2,12,13,11


#### 索引与列的转换

In [60]:
# Flat frame whose 'c' and 'd' columns together identify each row; the next
# cells move them into the index and back.
df = pd.DataFrame(dict(
    a=range(7),
    b=range(7, 0, -1),
    c=['one'] * 3 + ['two'] * 4,
    d=list(range(3)) + list(range(4)),
))
df

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [61]:
df.set_index('c')

Unnamed: 0_level_0,a,b,d
c,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,7,0
one,1,6,1
one,2,5,2
two,3,4,0
two,4,3,1
two,5,2,2
two,6,1,3


In [62]:
df2 = df.set_index(['c', 'd'])
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [63]:
# Move the index levels back into ordinary columns, then order the columns by
# label so the layout matches the original frame.
# `axis` is passed by keyword: DataFrame.sort_index takes keyword-only
# arguments in pandas 2.0+, so the positional form sort_index('columns') fails.
df3 = df2.reset_index().sort_index(axis='columns')
df3

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [64]:
(df3 == df)

Unnamed: 0,a,b,c,d
0,True,True,True,True
1,True,True,True,True
2,True,True,True,True
3,True,True,True,True
4,True,True,True,True
5,True,True,True,True
6,True,True,True,True
