In [1]:
import pandas as pd
import numpy as np

## 重新索引

### Series

In [2]:
# Integer Series labelled with a sparse run of letters (b, d and g are absent).
s = pd.Series(
    data=[1, 3, 5, 6, 8],
    index=['a', 'c', 'e', 'f', 'h'],
)
s

a    1
c    3
e    5
f    6
h    8
dtype: int64

In [3]:
# Reindex onto a denser label set. Labels that s does not have (b, d, g)
# come back as NaN, which is why the result dtype is float64, not int64.
s.reindex(list('abcdefgh'))

a     1
b   NaN
c     3
d   NaN
e     5
f     6
g   NaN
h     8
dtype: float64

In [4]:
# Same reindex, but missing labels are filled with 0 instead of NaN,
# so the result keeps the int64 dtype.
s.reindex(list('abcdefgh'), fill_value=0)

a    1
b    0
c    3
d    0
e    5
f    6
g    0
h    8
dtype: int64

In [5]:
# Forward fill: each missing label takes the value of the preceding existing
# label (b <- a, d <- c, g <- f). method='bfill' is the backward alternative.
s.reindex(list('abcdefgh'), method='ffill')

a    1
b    1
c    3
d    3
e    5
f    6
g    6
h    8
dtype: int64

### DataFrame

In [6]:
# 4x6 frame of standard-normal samples with sparse row labels (A, D, F, H).
df = pd.DataFrame(
    data=np.random.randn(4, 6),
    index=['A', 'D', 'F', 'H'],
    columns=['one', 'two', 'three', 'four', 'five', 'six'],
)
df

Unnamed: 0,one,two,three,four,five,six
A,0.63758,-0.581579,-3.52165,1.142838,-0.582373,-1.288547
D,0.439974,0.239836,-0.585681,-1.884398,-0.82609,0.793912
F,1.527146,-1.207567,1.549037,-1.041112,-0.358562,0.445003
H,0.863899,0.745047,1.265161,-0.687255,-1.685287,-0.865689


In [7]:
# Reindex the rows onto A..H; rows that df does not have (B, C, E, G)
# appear as all-NaN rows in the result.
df2 = df.reindex(index=list('ABCDEFGH'))
df2

Unnamed: 0,one,two,three,four,five,six
A,0.63758,-0.581579,-3.52165,1.142838,-0.582373,-1.288547
B,,,,,,
C,,,,,,
D,0.439974,0.239836,-0.585681,-1.884398,-0.82609,0.793912
E,,,,,,
F,1.527146,-1.207567,1.549037,-1.041112,-0.358562,0.445003
G,,,,,,
H,0.863899,0.745047,1.265161,-0.687255,-1.685287,-0.865689


In [8]:
# Write a single cell with one .loc[row, column] call. The chained form
# df.loc['A']['one'] = 100 assigns into an intermediate object and is not
# guaranteed to write back into df.
df.loc['A', 'one'] = 100
df

Unnamed: 0,one,two,three,four,five,six
A,100.0,-0.581579,-3.52165,1.142838,-0.582373,-1.288547
D,0.439974,0.239836,-0.585681,-1.884398,-0.82609,0.793912
F,1.527146,-1.207567,1.549037,-1.041112,-0.358562,0.445003
H,0.863899,0.745047,1.265161,-0.687255,-1.685287,-0.865689


In [9]:
# df2 still holds the original value at (A, one): reindex produced new data,
# so the assignment made to df did not propagate to it.
df2

Unnamed: 0,one,two,three,four,five,six
A,0.63758,-0.581579,-3.52165,1.142838,-0.582373,-1.288547
B,,,,,,
C,,,,,,
D,0.439974,0.239836,-0.585681,-1.884398,-0.82609,0.793912
E,,,,,,
F,1.527146,-1.207567,1.549037,-1.041112,-0.358562,0.445003
G,,,,,,
H,0.863899,0.745047,1.265161,-0.687255,-1.685287,-0.865689


In [10]:
# Reindexing the columns keeps the requested existing columns in the given
# order; the unknown column 'seven' is added and filled with NaN.
df.reindex(columns=['one', 'three', 'five', 'seven'])

Unnamed: 0,one,three,five,seven
A,100.0,-3.52165,-0.582373,
D,0.439974,-0.585681,-0.82609,
F,1.527146,1.549037,-0.358562,
H,0.863899,1.265161,-1.685287,


In [11]:
# fill_value=0 fills the new 'seven' column with 0 instead of NaN.
df.reindex(columns=['one', 'three', 'five', 'seven'], fill_value=0)

Unnamed: 0,one,three,five,seven
A,100.0,-3.52165,-0.582373,0
D,0.439974,-0.585681,-0.82609,0
F,1.527146,1.549037,-0.358562,0
H,0.863899,1.265161,-1.685287,0


In [12]:
# The fill method only takes effect along the rows: the new 'seven' column
# is still NaN in the output below.
# NOTE(review): this is the behaviour of the pandas version that produced the
# recorded output; newer versions may apply `method` to the columns as well
# (and reject non-monotonic column labels) -- verify against the target version.
df.reindex(columns=['one', 'three', 'five', 'seven'], method='ffill')

Unnamed: 0,one,three,five,seven
A,100.0,-3.52165,-0.582373,
D,0.439974,-0.585681,-0.82609,
F,1.527146,1.549037,-0.358562,
H,0.863899,1.265161,-1.685287,


In [13]:
# Along the index, ffill copies the previous existing row into each new row
# (B and C repeat A, E repeats D, G repeats F).
df.reindex(index=list('ABCDEFGH'), method='ffill')

Unnamed: 0,one,two,three,four,five,six
A,100.0,-0.581579,-3.52165,1.142838,-0.582373,-1.288547
B,100.0,-0.581579,-3.52165,1.142838,-0.582373,-1.288547
C,100.0,-0.581579,-3.52165,1.142838,-0.582373,-1.288547
D,0.439974,0.239836,-0.585681,-1.884398,-0.82609,0.793912
E,0.439974,0.239836,-0.585681,-1.884398,-0.82609,0.793912
F,1.527146,-1.207567,1.549037,-1.041112,-0.358562,0.445003
G,1.527146,-1.207567,1.549037,-1.041112,-0.358562,0.445003
H,0.863899,0.745047,1.265161,-0.687255,-1.685287,-0.865689


## 丢弃部分数据

In [14]:
# Fresh 4x6 frame of standard-normal samples with contiguous row labels A..D.
df = pd.DataFrame(
    data=np.random.randn(4, 6),
    index=['A', 'B', 'C', 'D'],
    columns=['one', 'two', 'three', 'four', 'five', 'six'],
)
df

Unnamed: 0,one,two,three,four,five,six
A,0.205215,-1.509349,0.966297,0.470838,-0.290046,0.789102
B,2.179511,-0.392384,0.667607,0.554482,0.170943,0.432522
C,0.205824,0.18827,-0.15825,-0.797468,-0.25289,0.389529
D,1.112086,-0.398583,-0.90056,0.597893,0.701035,-1.215749


In [15]:
# drop() removes row labels by default and returns the result; it is not
# assigned here, so df keeps row A (it is still present in the later cells).
df.drop('A')

Unnamed: 0,one,two,three,four,five,six
B,2.179511,-0.392384,0.667607,0.554482,0.170943,0.432522
C,0.205824,0.18827,-0.15825,-0.797468,-0.25289,0.389529
D,1.112086,-0.398583,-0.90056,0.597893,0.701035,-1.215749


In [16]:
# axis=1 makes drop() remove columns instead of rows.
df2 = df.drop(['two', 'four'], axis=1)
df2

Unnamed: 0,one,three,five,six
A,0.205215,0.966297,-0.290046,0.789102
B,2.179511,0.667607,0.170943,0.432522
C,0.205824,-0.15825,-0.25289,0.389529
D,1.112086,-0.90056,0.701035,-1.215749


In [17]:
# Data-copy check: overwrite one cell of df, then inspect df2 (built with
# drop) to see whether the two frames share data.
df.iloc[0, 0] = 100
df

Unnamed: 0,one,two,three,four,five,six
A,100.0,-1.509349,0.966297,0.470838,-0.290046,0.789102
B,2.179511,-0.392384,0.667607,0.554482,0.170943,0.432522
C,0.205824,0.18827,-0.15825,-0.797468,-0.25289,0.389529
D,1.112086,-0.398583,-0.90056,0.597893,0.701035,-1.215749


In [18]:
# df2 is unchanged (A/one is still the original value): drop() returned a copy.
df2

Unnamed: 0,one,three,five,six
A,0.205215,0.966297,-0.290046,0.789102
B,2.179511,0.667607,0.170943,0.432522
C,0.205824,-0.15825,-0.25289,0.389529
D,1.112086,-0.90056,0.701035,-1.215749


### 广播运算

In [19]:
# 4x3 frame holding 0..11 in row-major order, used for the broadcasting demo.
df = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=['one', 'two', 'three', 'four'],
    columns=['A', 'B', 'C'],
)
df

Unnamed: 0,A,B,C
one,0,1,2
two,3,4,5
three,6,7,8
four,9,10,11


In [20]:
# Select the row labelled 'one'; it comes back as a Series indexed by column name.
df.loc['one']

A    0
B    1
C    2
Name: one, dtype: int32

In [21]:
# Subtracting a row Series aligns it on the column labels and applies it to
# every row, so each row shows its offset from row 'one'.
df - df.loc['one']

Unnamed: 0,A,B,C
one,0,0,0
two,3,3,3
three,6,6,6
four,9,9,9


### 函数应用

* apply: 将数据按行或列进行计算
* applymap: 将数据按元素进行计算

In [22]:
# Rebuild the 4x3 frame of 0..11 as the input for the apply examples.
df = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=['one', 'two', 'three', 'four'],
    columns=['A', 'B', 'C'],
)
df

Unnamed: 0,A,B,C
one,0,1,2
two,3,4,5
three,6,7,8
four,9,10,11


In [23]:
# With the default axis, each column is passed to the lambda as a Series;
# the result has one value (the column's range) per column.
df.apply(lambda x: x.max() - x.min())

A    9
B    9
C    9
dtype: int64

In [24]:
# With axis=1, each row is passed to the lambda as a Series;
# the result has one value (the row's range) per row.
df.apply(lambda x: x.max() - x.min(), axis=1)

one      2
two      2
three    2
four     2
dtype: int64

In [25]:
# A function passed to apply may return a Series holding several values.
def min_max(x):
    """Return the smallest and largest value of ``x`` as a two-item Series.

    Parameters
    ----------
    x : object exposing ``min()`` and ``max()`` (a row or column Series when
        used with ``DataFrame.apply``).

    Returns
    -------
    pandas.Series
        Labelled ``'min'`` and ``'max'``, in that order.
    """
    lowest, highest = x.min(), x.max()
    return pd.Series(data=[lowest, highest], index=['min', 'max'])
# Because min_max returns a Series, the row-wise apply produces a DataFrame
# whose columns are that Series' labels ('min', 'max').
df.apply(min_max, axis=1)

Unnamed: 0,min,max
one,0,2
two,3,5
three,6,8
four,9,11


In [26]:
# applymap: element-wise operation. Build a 4x3 float frame to format.
df = pd.DataFrame(
    data=np.random.randn(4, 3),
    index=['one', 'two', 'three', 'four'],
    columns=['A', 'B', 'C'],
)
df

Unnamed: 0,A,B,C
one,1.650032,-0.510633,0.048831
two,-2.48777,-0.395993,0.381618
three,-0.898265,0.358054,0.600669
four,1.231078,0.730939,0.311994


In [27]:
# Bound str.format of a two-decimal template: calling it with one number
# returns that number formatted as a string.
formater = '{0:.02f}'.format
# Alternative using %-formatting:
# formater = lambda x: '%.02f' % x
# Apply the formatter to every element of the frame.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas (2.1+) in
# favour of DataFrame.map -- confirm the pandas version this notebook targets.
df.applymap(formater)

Unnamed: 0,A,B,C
one,1.65,-0.51,0.05
two,-2.49,-0.4,0.38
three,-0.9,0.36,0.6
four,1.23,0.73,0.31


### 排序和排名

In [28]:
# 4x3 frame of random integers drawn from 1..9, used for the sorting demo.
df = pd.DataFrame(
    data=np.random.randint(1, 10, (4, 3)),
    index=['A', 'B', 'C', 'D'],
    columns=['one', 'two', 'three'],
)
df

Unnamed: 0,one,two,three
A,6,5,7
B,4,6,1
C,4,5,8
D,5,7,2


In [29]:
# Sort the rows by the values in column 'one' (ascending in the output below).
df.sort_values(by='one')

Unnamed: 0,one,two,three
B,4,6,1
C,4,5,8
D,5,7,2
A,6,5,7


In [30]:
# Small Series with a tie (two 6s) to show how rank() treats equal values.
s = pd.Series(data=[3, 6, 2, 6, 4])

In [31]:
# Default ranking is ascending; tied values (the two 6s) share the average
# of the ranks they occupy (4.5).
s.rank()

0    2.0
1    4.5
2    1.0
3    4.5
4    3.0
dtype: float64

In [32]:
# Descending ranks; method='first' breaks ties by order of appearance
# (the first 6 gets rank 1, the second 6 gets rank 2).
s.rank(method='first', ascending=False)

0    4
1    1
2    5
3    2
4    3
dtype: float64

### 数据唯一性及成员资格

适用于 Series

In [47]:
# Series of single characters with repeats, for the uniqueness/membership demos.
s = pd.Series(data=list('abbcdabacad'))
s

0     a
1     b
2     b
3     c
4     d
5     a
6     b
7     a
8     c
9     a
10    d
dtype: object

In [48]:
# Distinct values of the Series, returned as an array.
s.unique()

array(['a', 'b', 'c', 'd'], dtype=object)

In [49]:
# Number of occurrences of each distinct value, most frequent first.
s.value_counts()

a    4
b    3
d    2
c    2
dtype: int64

In [51]:
# Element-wise membership test: True where the value is one of 'a', 'b', 'c'.
s.isin(['a', 'b', 'c'])

0      True
1      True
2      True
3      True
4     False
5      True
6      True
7      True
8      True
9      True
10    False
dtype: bool

### 层次化索引

可以使数据在一个轴上有多个索引级别。即可以用二维的数据表达更高维度的数据，使数据组织方式更清晰。它使用 pd.MultiIndex 类来表示。

#### Series 多层索引

In [63]:
# Passing two parallel label lists as the index builds a two-level index:
# outer level a/b/c, inner level 1..3.
s = pd.Series(
    data=np.random.randn(7),
    index=[
        ['a', 'a', 'a', 'b', 'b', 'c', 'c'],
        [1, 2, 3, 1, 2, 2, 3],
    ],
)
s

a  1   -1.409351
   2    1.050019
   3    0.267683
b  1    0.846219
   2   -0.136538
c  2    0.235091
   3    1.678844
dtype: float64

In [64]:
# The two label lists were combined into a single MultiIndex.
s.index

MultiIndex(levels=[[u'a', u'b', u'c'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 2, 2], [0, 1, 2, 0, 1, 1, 2]])

In [69]:
# Distinct labels of the outer (first) level.
s.index.levels[0]

Index([u'a', u'b', u'c'], dtype='object')

In [70]:
# Distinct labels of the inner (second) level.
s.index.levels[1]

Int64Index([1, 2, 3], dtype='int64')

In [71]:
# Selecting an outer label returns the matching entries indexed by the inner level only.
s['b']

1    0.846219
2   -0.136538
dtype: float64

In [72]:
# Label slice on the outer level; both endpoints ('b' and 'c') are included.
s['b':'c']

b  1    0.846219
   2   -0.136538
c  2    0.235091
   3    1.678844
dtype: float64

In [73]:
# A list of outer labels selects those groups in the order given ('b' before 'a').
s[['b', 'a']]

b  1    0.846219
   2   -0.136538
a  1   -1.409351
   2    1.050019
   3    0.267683
dtype: float64

In [74]:
# Giving both the outer and the inner label returns a single scalar.
s['b', 1]

0.84621865759428661

In [75]:
# Take every outer label and pick inner label 2; the result is indexed by the outer level.
s[:, 2]

a    1.050019
b   -0.136538
c    0.235091
dtype: float64

#### DataFrame 多层索引

In [87]:
# Frame with two-level labels on both axes: rows (a|b, 1|2), columns (one|two, blue|red).
df = pd.DataFrame(
    data=np.random.randint(1, 10, (4, 3)),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[['one', 'one', 'two'], ['blue', 'red', 'blue']],
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,one,one,two
Unnamed: 0_level_1,Unnamed: 1_level_1,blue,red,blue
a,1,6,3,9
a,2,2,4,9
b,1,2,7,6
b,2,2,8,1


In [88]:
# Selecting an outer row label returns the matching rows, indexed by the inner level only.
df.loc['a']

Unnamed: 0_level_0,one,one,two
Unnamed: 0_level_1,blue,red,blue
1,6,3,9
2,2,4,9


In [89]:
# The partial (outer-level) selection is still a DataFrame.
type(df.loc['a'])

pandas.core.frame.DataFrame

In [90]:
# Select the single row ('a', 1). The row key is written as a tuple and the
# column axis is given explicitly, because the shorthand df.loc['a', 1] can
# also be read as "row 'a', column 1" on a frame with multi-level labels.
df.loc[('a', 1), :]

one  blue    6
     red     3
two  blue    9
Name: (a, 1), dtype: int32

In [91]:
# The selected row is a Series whose index is the frame's two-level column
# index. The row key is a tuple with an explicit column axis, which avoids the
# "row 'a', column 1" reading of the shorthand df.loc['a', 1].
df.loc[('a', 1), :].index

MultiIndex(levels=[[u'one', u'two'], [u'blue', u'red']],
           labels=[[0, 0, 1], [0, 1, 0]])