# Getting Started with pandas

In [1]:
import numpy as np

import pandas as pd
from pandas import Series, DataFrame

## Introduction to pandas Data Structures
### Series

In [2]:
obj = Series([1,2,3])
obj

0    1
1    2
2    3
dtype: int64

In [3]:
obj.values

array([1, 2, 3])

In [4]:
obj.index

RangeIndex(start=0, stop=3, step=1)

In [5]:
obj2 = Series([1,2,3], index=['a','b','c'])
obj2

a    1
b    2
c    3
dtype: int64

In [6]:
obj2['b'] = 6
obj2

a    1
b    6
c    3
dtype: int64

In [7]:
obj2[obj2 > 3]

b    6
dtype: int64

In [8]:
obj2 * 2

a     2
b    12
c     6
dtype: int64

In [9]:
'b' in obj2

True

In [10]:
'd' in obj2

False

In [11]:
# 透过字典建立 Series（Series的索引是字典的键）
dict = {'apple': 1, 'banana': 2, 'cherry': 3}
obj3 = Series(dict)
obj3

apple     1
banana    2
cherry    3
dtype: int64

In [12]:
# 与fruits索引匹配的值会被找出放到相对应的位置
fruits = ['cherry', 'banana', 'apple', 'date']
obj4 = Series(dict, fruits)
obj4

cherry    3.0
banana    2.0
apple     1.0
date      NaN
dtype: float64

In [13]:
pd.isnull(obj4)

cherry    False
banana    False
apple     False
date       True
dtype: bool

In [14]:
pd.notnull(obj4)

cherry     True
banana     True
apple      True
date      False
dtype: bool

In [15]:
obj4.isnull()

cherry    False
banana    False
apple     False
date       True
dtype: bool

In [16]:
obj4.index.name = 'fruit'
obj4.name = "myName"
obj4

fruit
cherry    3.0
banana    2.0
apple     1.0
date      NaN
Name: myName, dtype: float64

In [17]:
# 对index赋值
obj.index = ['A','B','C']
obj

A    1
B    2
C    3
dtype: int64

### DataFrame

In [18]:
data = {
    'value1': ['A','B','C','D','E'],
    'value2': [1,2,3,4,5],
    'value3': [1.1,2.2,3.3,4.4,5.5]
}

frame = DataFrame(data)
frame

Unnamed: 0,value1,value2,value3
0,A,1,1.1
1,B,2,2.2
2,C,3,3.3
3,D,4,4.4
4,E,5,5.5


In [19]:
# 指定顺序
DataFrame(data, columns=['value2','value3','value1'])

Unnamed: 0,value2,value3,value1
0,1,1.1,A
1,2,2.2,B
2,3,3.3,C
3,4,4.4,D
4,5,5.5,E


In [20]:
# 若传入colomn在数据中找不到，会产生NA
frame2 = DataFrame(data,
                   columns=['value3','value2','value1','value0'],
                   index=['one','two','three','four','five'])
frame2

Unnamed: 0,value3,value2,value1,value0
one,1.1,1,A,
two,2.2,2,B,
three,3.3,3,C,
four,4.4,4,D,
five,5.5,5,E,


In [21]:
# 透过字典标记方式存取
# 返回的Series跟DataFrame有相同的索引
frame2['value2']

one      1
two      2
three    3
four     4
five     5
Name: value2, dtype: int64

In [22]:
# 透过属性方式存取
frame2.value2

one      1
two      2
three    3
four     4
five     5
Name: value2, dtype: int64

In [23]:
# row也可以通过位置或名称的方式获取
frame2.ix['three']

value3    3.3
value2      3
value1      C
value0    NaN
Name: three, dtype: object

In [24]:
# column可以通过赋值的方式进行修改
frame2['value0'] = np.arange(5,0,-1)
frame2

Unnamed: 0,value3,value2,value1,value0
one,1.1,1,A,5
two,2.2,2,B,4
three,3.3,3,C,3
four,4.4,4,D,2
five,5.5,5,E,1


In [25]:
# 将列表或数组赋值给某column，长度要跟DataFrame的长度匹配
frame2['value0'] = [1,2,3,4,5]
frame2

Unnamed: 0,value3,value2,value1,value0
one,1.1,1,A,1
two,2.2,2,B,2
three,3.3,3,C,3
four,4.4,4,D,4
five,5.5,5,E,5


In [26]:
frame2['value0'] = np.array(['a','b','c','d','e'])
frame2

Unnamed: 0,value3,value2,value1,value0
one,1.1,1,A,a
two,2.2,2,B,b
three,3.3,3,C,c
four,4.4,4,D,d
five,5.5,5,E,e


In [27]:
# 如果赋值的是Series，会精确匹配DataFrame的索引，空位会被填上缺失值
frame2['value0'] = Series([1,3,5], index=["one",'three','five'])
frame2

Unnamed: 0,value3,value2,value1,value0
one,1.1,1,A,1.0
two,2.2,2,B,
three,3.3,3,C,3.0
four,4.4,4,D,
five,5.5,5,E,5.0


In [28]:
# 为不存在的column赋值，会创建出新的column
frame2['value4'] = frame2.value2 > 3
frame2

Unnamed: 0,value3,value2,value1,value0,value4
one,1.1,1,A,1.0,False
two,2.2,2,B,,False
three,3.3,3,C,3.0,False
four,4.4,4,D,,True
five,5.5,5,E,5.0,True


In [29]:
# del 用于删除 column
del frame2['value4']
frame2

Unnamed: 0,value3,value2,value1,value0
one,1.1,1,A,1.0
two,2.2,2,B,
three,3.3,3,C,3.0
four,4.4,4,D,
five,5.5,5,E,5.0


In [30]:
# 将字典传给DataFrame: 外层字典的键为 column 索引, 内层的健位 row 索引
data = {
    'c1': {'r1': 1, 'r2': 2, 'r3': 3},
    'c2': {'r1': 'A', 'r2': 'B', 'r3': 'C'}
}

DataFrame(data)

Unnamed: 0,c1,c2
r1,1,A
r2,2,B
r3,3,C


In [31]:
DataFrame(data, index=['r0','r1','r2'])

Unnamed: 0,c1,c2
r0,,
r1,1.0,A
r2,2.0,B


In [32]:
# 由Series组成的字典
data = {
    'c1': Series([1,2,3], index=['r1','r2','r3']),
    'c2': Series(['A','B','C'], index=['r1','r2','r3'])
}

DataFrame(data)

Unnamed: 0,c1,c2
r1,1,A
r2,2,B
r3,3,C


可以传给DataFrame构造器的数据

类型 | 说明
---|---
2D ndarray | A matrix of data, passing optional row and column labels
dict of arrays, lists, or tuples | Each sequence becomes a column in the DataFrame. All sequences must be the same length.
NumPy structured/record array | Treated as the “dict of arrays” case
dict of Series | Each value becomes a column. Indexes from each Series are unioned together to form the result’s row index if no explicit index is passed.
dict of dicts | Each inner dict becomes a column. Keys are unioned to form the row index as in the “dict of Series” case.
list of dicts or Series | Each item becomes a row in the DataFrame. Union of dict keys or Series indexes become the DataFrame’s column labels
List of lists or tuples | Treated as the “2D ndarray” case
Another DataFrame | The DataFrame’s indexes are used unless different ones are passed
NumPy MaskedArray | Like the “2D ndarray” case except masked values become NA/missing in the DataFrame result

In [33]:
# 2D ndarray
data = [[1,4],[2,5],[3,6]]
DataFrame(data, columns=['c1','c2'], index=['r1','r2','r3'])

Unnamed: 0,c1,c2
r1,1,4
r2,2,5
r3,3,6


In [34]:
# dict of lists
data = {
    'c1': [1,2,3],
    'c2': [4,5,6]}
DataFrame(data, columns=['c1','c2'], index=['r1','r2','r3'])

Unnamed: 0,c1,c2
r1,1,4
r2,2,5
r3,3,6


In [35]:
# dict of arrays
data = {
    'c1': np.array([1,2,3]),
    'c2': np.array([4,5,6])}
DataFrame(data, columns=['c1','c2'], index=['r1','r2','r3'])

Unnamed: 0,c1,c2
r1,1,4
r2,2,5
r3,3,6


In [36]:
# dict of tuples
data = {
    'c1': (1,2,3),
    'c2': (4,5,6)}
DataFrame(data, columns=['c1','c2'], index=['r1','r2','r3'])

Unnamed: 0,c1,c2
r1,1,4
r2,2,5
r3,3,6


### Index Objects

In [37]:
obj = Series(range(3), index=['a','b','c'])
index = obj.index
index

Index([u'a', u'b', u'c'], dtype='object')

In [38]:
# index 对象是不可修改的
# index[1] = 1

Pandas 中主要的 index 对象

类 | 说明
---|---
`Index` | 最泛化的Index对象，将轴标签表示为一个由Python对象组成的NumPy数组
`Int64Index` | 针对整数的特殊Index
`MultiIndex` | 层次化索引对象，表示单个轴上的多层索引
`DatetimeIndex` | 储存纳秒级时间戳
`PeriodIndex` | 针对Period数据的特殊Index

In [39]:
# 轴0层次化索引
index = pd.MultiIndex(levels=[['Up','Down'], ['X','Y']],
                      labels=[[0,0,1,1], [0,1,0,1]])

# 轴1层次化索引
columns = pd.MultiIndex(levels=[['Left','Right'], ['A','B']],
                        labels=[[0,0,1,1], [0,1,0,1]])

DataFrame(np.arange(4*4).reshape((4,4)), columns=columns, index=index)

Unnamed: 0_level_0,Unnamed: 1_level_0,Left,Left,Right,Right
Unnamed: 0_level_1,Unnamed: 1_level_1,A,B,A,B
Up,X,0,1,2,3
Up,Y,4,5,6,7
Down,X,8,9,10,11
Down,Y,12,13,14,15


In [40]:
data = [[1,4],[2,5],[3,6]]
df = DataFrame(data, columns=['c1','c2'], index=['r1','r2','r3'])

'c1' in df.columns, 'r1' in df.index

(True, True)

Index 的方法与属性

方法 | 说明
---|---
`append` | 连接另一个Index对象，产生一个新的Index
`difference` | 计算差集
`intersection` | 计算交集
`union` | 计算并集
`isin` | 计算一个指标各值是否都把含在参数集合中的布尔型数组
`delete` | 删除索引i处的元素，得到新的Index
`drop` | 删除传入的值，得到新的Index
`insert` | 将元素插入到索引i处，得到新的Index
`is_monotonic` | 当个元素均大于等于前一个元素时，返回True
`is_unique` | 当Index没有重复值，返回True
`unique` | 计算Index中唯一的数组

In [41]:
index1 = pd.Index(['a','b','c'])
index2 = pd.Index(['b','c','d'])

In [42]:
index1.append(index2)

Index([u'a', u'b', u'c', u'b', u'c', u'd'], dtype='object')

In [43]:
index1.difference(index2)

Index([u'a'], dtype='object')

In [44]:
index1.intersection(index2)

Index([u'b', u'c'], dtype='object')

In [45]:
index1.union(index2)

Index([u'a', u'b', u'c', u'd'], dtype='object')

In [46]:
index1.isin(['a','c'])

array([ True, False,  True], dtype=bool)

In [47]:
index1.delete(1)

Index([u'a', u'c'], dtype='object')

In [48]:
index1.drop('b')

Index([u'a', u'c'], dtype='object')

## Essential Functionality
### Reindexing
#### Series

In [49]:
obj = Series([1,2,3,4], index=['a','c','b','d'])
obj

a    1
c    2
b    3
d    4
dtype: int64

In [50]:
# 重新排序，缺失值填0
obj.reindex(['a','b','c','d','e'], fill_value=0)

a    1
b    3
c    2
d    4
e    0
dtype: int64

In [51]:
obj3 = Series(['A','B','C'], index=[1,3,5])
obj3

1    A
3    B
5    C
dtype: object

In [52]:
# ffill 插值处理
obj3.reindex(range(7), method='ffill', fill_value='?')

0    ?
1    A
2    A
3    B
4    B
5    C
6    C
dtype: object

In [53]:
# bfill 插值处理
obj3.reindex(range(7), method='bfill', fill_value='?')

0    A
1    A
2    B
3    B
4    C
5    C
6    ?
dtype: object

#### DataFrame

In [54]:
frame = DataFrame(np.arange(9).reshape((3,3)),
                  columns=['c1','c3','c5'],
                  index=['r1','r3','r5'])
frame

Unnamed: 0,c1,c3,c5
r1,0,1,2
r3,3,4,5
r5,6,7,8


In [55]:
# DataFrame 差值处理
frame.reindex(['r1','r2','r3','r4','r5'], method='ffill')

Unnamed: 0,c1,c3,c5
r1,0,1,2
r2,0,1,2
r3,3,4,5
r4,3,4,5
r5,6,7,8


In [56]:
frame.reindex(['r1','r2','r3','r4','r5']).ffill()

Unnamed: 0,c1,c3,c5
r1,0.0,1.0,2.0
r2,0.0,1.0,2.0
r3,3.0,4.0,5.0
r4,3.0,4.0,5.0
r5,6.0,7.0,8.0


In [57]:
# 利用 ix 做到相同的事情
frame.ix[['r1','r2','r3','r4','r5']].ffill()

Unnamed: 0,c1,c3,c5
r1,0.0,1.0,2.0
r2,0.0,1.0,2.0
r3,3.0,4.0,5.0
r4,3.0,4.0,5.0
r5,6.0,7.0,8.0


In [58]:
# reindex 插值只能在轴0
frame.reindex(columns=['c1','c2','c3','c4','c5'], method='ffill')

Unnamed: 0,c1,c2,c3,c4,c5
r1,0,,1,,2
r3,3,,4,,5
r5,6,,7,,8


In [59]:
# 使用 ix 标签索引配合 ffill(1) 在轴0插值
frame.ix[:, ['c1','c2','c3','c4','c5']].ffill(1)

Unnamed: 0,c1,c2,c3,c4,c5
r1,0.0,0.0,1.0,1.0,2.0
r3,3.0,3.0,4.0,4.0,5.0
r5,6.0,6.0,7.0,7.0,8.0


In [60]:
# 利用ix的标签索引功能，简化索引任务
frame.ix[['r1','r2','r3','r4','r5'],
         ['c1','c2','c3','c4','c5']].ffill(0).ffill(1)

Unnamed: 0,c1,c2,c3,c4,c5
r1,0.0,0.0,1.0,1.0,2.0
r2,0.0,0.0,1.0,1.0,2.0
r3,3.0,3.0,4.0,4.0,5.0
r4,3.0,3.0,4.0,4.0,5.0
r5,6.0,6.0,7.0,7.0,8.0


reindex 函数的参数

参数 | 说明
---|---
`index` | 用作索引的新序列
`method` | 插值方式
`fill_value` | 重新索引过程中，引入缺失值的替代值
`limit` | 向前或向后填充时的最大填充量
`level` | 在MultiIndex的指定级别上匹配简单索引，否则选取其子集
`copy` | 默认为True，无论如何都要复制；如果为False，则新旧相等就不复制

### Dropping entries from an axis

drop 方法返回一个在指定轴上除了指定值的“新对象”

#### Series

In [61]:
obj = Series([1,2,3,4,5], index=['a','b','c','d','e'])
obj

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [62]:
# 丢弃单个项
obj.drop('c')

a    1
b    2
d    4
e    5
dtype: int64

In [63]:
# 丢弃多个项
obj.drop(['b','d'])

a    1
c    3
e    5
dtype: int64

In [64]:
obj

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [65]:
# in-place 删除
del obj['c']
obj

a    1
b    2
d    4
e    5
dtype: int64

#### DataFrame

In [66]:
data = DataFrame(np.arange(16).reshape((4,4)),
                 index=['one','two','three','four'],
                 columns=['a','b','c','d'])

In [67]:
# 沿轴0（预设），丢弃多个项
data.drop(['two','four'])

Unnamed: 0,a,b,c,d
one,0,1,2,3
three,8,9,10,11


In [68]:
# 沿轴1，丢弃多个项
data.drop(['b','d'], axis=1)

Unnamed: 0,a,c
one,0,2
two,4,6
three,8,10
four,12,14


### Indexing, selection, and filtering

#### Series

In [69]:
obj = Series([1,2,3,4,5], index=['a','b','c','d','e'])
obj

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [70]:
obj['b'] # 得到值，等同于 obj[1]

2

In [71]:
obj[['b']] # 得到 Series

b    2
dtype: int64

In [72]:
obj[['b','c','d']]

b    2
c    3
d    4
dtype: int64

In [73]:
obj[2:4] # 不包含 4

c    3
d    4
dtype: int64

In [74]:
# 利用标签的切片运算与普通的Python切片不同
obj['b':'d'] # 包含 'd'

b    2
c    3
d    4
dtype: int64

#### DataFrame

In [75]:
data = DataFrame(np.arange(16).reshape((4,4)),
                 index=['one','two','three','four'],
                 columns=['a','b','c','d'])
data

Unnamed: 0,a,b,c,d
one,0,1,2,3
two,4,5,6,7
three,8,9,10,11
four,12,13,14,15


In [76]:
data['b'] # 得到 Series

one       1
two       5
three     9
four     13
Name: b, dtype: int64

In [77]:
data[['b','d']] # 得到 DataFrame

Unnamed: 0,b,d
one,1,3
two,5,7
three,9,11
four,13,15


In [78]:
# 通过切片选取 row
data[:2]

Unnamed: 0,a,b,c,d
one,0,1,2,3
two,4,5,6,7


In [79]:
# 通过布尔型数组选取 row
data[data['c'] > 6]

Unnamed: 0,a,b,c,d
three,8,9,10,11
four,12,13,14,15


In [80]:
# 通过索引字段ix
data.ix[['two','three'],['b','c']]

Unnamed: 0,b,c
two,5,6
three,9,10


In [81]:
data.ix[2] # 选取row，得到一维的 Series

a     8
b     9
c    10
d    11
Name: three, dtype: int64

In [82]:
data.ix[:, 2] # 选取column，得到一维的 Series

one       2
two       6
three    10
four     14
Name: c, dtype: int64

In [83]:
data.ix[data.b > 7, :2] # 布尔型数组选取 row， 切片选取 column

Unnamed: 0,a,b
three,8,9
four,12,13


DataFrame 的索引选项

类型 | 说明
---|---
`obj[val]` | 选取DataFrame的单个列或一组列。布尔型数组（过滤行），切片（行切片）
`obj.ix[val]` | 选取DataFrame的单个行或一组行
`obj.ix[:, val]` | 选取DataFrame的单个列或一组列
`obj.ix[val1, val2]` | 同时选取行和列
`reindex` | 将一个或多个轴匹配到新索引
`xs` | 根据标签选取单行或单列，返回一个Series
~~`icol`,`irow`~~ | ~~根据整数位置选取单列或单行，返回一个Series~~
`get_value`,`set_value` | 根据行标签和列标签选取单个值

In [84]:
# 根据标签选取
data.xs('two') # 等同 data.ix['two']

a    4
b    5
c    6
d    7
Name: two, dtype: int64

In [85]:
data.xs('b', axis=1) # 等同 data.ix[:, 'b']

one       1
two       5
three     9
four     13
Name: b, dtype: int64

In [86]:
# 根据整数位置选取
data.iloc[1] # 等同 data.ix[1]

a    4
b    5
c    6
d    7
Name: two, dtype: int64

In [87]:
data.iloc[:,1] # 等同 data.ix[:, 1]

one       1
two       5
three     9
four     13
Name: b, dtype: int64

In [88]:
# 根据行标签和列标签选取单个值
data.get_value('two','b') # 等同 data.ix['two','b']

5

结论：记忆 ix 的用法就好
- 根据标签选取: ix 可以取代 xs
- 根据整数位置选取: ix 可以取代 iloc
- 根据行标签和列标签选: ix 可以取代 get_value

### Arithmetic and data alignment

In [89]:
s1 = Series([1,2,3,4], index=['a','c','d','e'])
s2 = Series([5,4,3,2,1], index=['a','c','e','f','g'])

s1 + s2

a    6.0
c    6.0
d    NaN
e    7.0
f    NaN
g    NaN
dtype: float64

In [90]:
# 使用 add，配合 fill_value 处理缺失值
s1.add(s2, fill_value=0)

a    6.0
c    6.0
d    3.0
e    7.0
f    2.0
g    1.0
dtype: float64

In [91]:
df1 = DataFrame(np.arange(9).reshape((3,3)), columns=list('bcd'),index=['O','T','C'])
df2 = DataFrame(np.arange(12).reshape((4,3)), columns=list('bde'), index=['U','O','T','G'])

df1 + df2

Unnamed: 0,b,c,d,e
C,,,,
G,,,,
O,3.0,,6.0,
T,9.0,,12.0,
U,,,,


In [92]:
# 使用 add，配合 fill_value 参数处理缺失值
df1.add(df2, fill_value=0)

Unnamed: 0,b,c,d,e
C,6.0,7.0,8.0,
G,9.0,,10.0,11.0
O,3.0,1.0,6.0,5.0
T,9.0,4.0,12.0,8.0
U,0.0,,1.0,2.0


In [93]:
df1.add(df2, fill_value=0).fillna(0)

Unnamed: 0,b,c,d,e
C,6.0,7.0,8.0,0.0
G,9.0,0.0,10.0,11.0
O,3.0,1.0,6.0,5.0
T,9.0,4.0,12.0,8.0
U,0.0,0.0,1.0,2.0


#### Arithmetic methods with fill values

In [94]:
df1 = DataFrame(np.arange(12).reshape((3,4)), columns=list('abcd'))
df2 = DataFrame(np.arange(20).reshape((4,5)), columns=list('abcde'))
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [95]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [96]:
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,0
1,4,5,6,7,0
2,8,9,10,11,0


#### Operations between DataFrame and Series

In [97]:
arr = np.arange(12).reshape((3,4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [98]:
arr[0]

array([0, 1, 2, 3])

In [99]:
# 广播（broadcasting）
arr - arr[0]

array([[0, 0, 0, 0],
       [4, 4, 4, 4],
       [8, 8, 8, 8]])

In [100]:
frame = DataFrame(np.arange(12).reshape((4,3)), 
                  columns=['a','b','c'],
                  index=['one','two','three','four'])
frame

Unnamed: 0,a,b,c
one,0,1,2
two,3,4,5
three,6,7,8
four,9,10,11


In [101]:
series = frame.ix[0]
series

a    0
b    1
c    2
Name: one, dtype: int64

In [102]:
# DataFrame 和 Series 之间的算术运算会将Series的索引匹配到DataFrame的列，让后沿着行一直向下广播
frame - series

Unnamed: 0,a,b,c
one,0,0,0
two,3,3,3
three,6,6,6
four,9,9,9


In [103]:
series3 = frame['b']
series3

one       1
two       4
three     7
four     10
Name: b, dtype: int64

In [104]:
# 匹配 row 且在 column 上广播
# For Series input, axis to match Series index on
frame.sub(series3, axis='index')

Unnamed: 0,a,b,c
one,-1,0,1
two,-1,0,1
three,-1,0,1
four,-1,0,1


### Function application and mapping

In [105]:
frame = DataFrame(np.random.randn(4,3),
                 columns=list('abc'),
                 index=list('ABCD'))
frame

Unnamed: 0,a,b,c
A,-0.602269,0.761001,0.379882
B,-0.046003,0.784621,-0.78875
C,0.909484,0.421516,-0.510604
D,-0.048027,0.464389,-2.831831


In [106]:
frame.apply(np.abs) # 等同 np.abs(frame)

Unnamed: 0,a,b,c
A,0.602269,0.761001,0.379882
B,0.046003,0.784621,0.78875
C,0.909484,0.421516,0.510604
D,0.048027,0.464389,2.831831


In [107]:
# 将函数应用到由各row或column所形成的一维数组上
def f(arr):
    print arr
    return arr.max() - arr.min()

frame.apply(f)

A   -0.602269
B   -0.046003
C    0.909484
D   -0.048027
Name: a, dtype: float64
A    0.761001
B    0.784621
C    0.421516
D    0.464389
Name: b, dtype: float64
A    0.379882
B   -0.788750
C   -0.510604
D   -2.831831
Name: c, dtype: float64


a    1.511753
b    0.363104
c    3.211713
dtype: float64

In [108]:
# 使用匿名函数
frame.apply(lambda arr: arr.max() - arr.min())

a    1.511753
b    0.363104
c    3.211713
dtype: float64

In [109]:
# 传递给apply的函数还可以返回有多个值组成的Series
frame.apply(lambda x: Series([x.max(), x.min()], index=['max','min']))

Unnamed: 0,a,b,c
max,0.909484,0.784621,0.379882
min,-0.602269,0.421516,-2.831831


In [110]:
# DataFrame 使用 applymap 转换每个元素
frame.applymap(lambda x: '%.2f' % x)

Unnamed: 0,a,b,c
A,-0.6,0.76,0.38
B,-0.05,0.78,-0.79
C,0.91,0.42,-0.51
D,-0.05,0.46,-2.83


In [111]:
# Series使用 map 转换每个元素
frame['b'].map(lambda x: '%.2f' % x)

A    0.76
B    0.78
C    0.42
D    0.46
Name: b, dtype: object

In [112]:
frame['b'].apply(lambda x: '%.2f' % x) # 等同 map

A    0.76
B    0.78
C    0.42
D    0.46
Name: b, dtype: object

应用 | 转换一维数组 | 转换每个元素
---|----|---
Series    | - | `map`
DataFrame | `apply` | `applymap`

### Sorting and ranking

In [113]:
obj = Series([4,2,3,1], index=list('dabc'))
obj

d    4
a    2
b    3
c    1
dtype: int64

In [114]:
# 对Series的Index排序
obj.sort_index()

a    2
b    3
c    1
d    4
dtype: int64

In [115]:
# 对Series的values排序
obj.sort_values()

c    1
a    2
b    3
d    4
dtype: int64

In [116]:
frame = DataFrame(np.random.randn(2,4), index=['B','A'], columns=list('dabc'))
frame

Unnamed: 0,d,a,b,c
B,1.611463,0.277091,-0.243687,0.270537
A,-0.491892,1.192095,-0.224605,-0.011758


In [117]:
# 对DataFrame沿轴1排序 index
frame.sort_index()

Unnamed: 0,d,a,b,c
A,-0.491892,1.192095,-0.224605,-0.011758
B,1.611463,0.277091,-0.243687,0.270537


In [118]:
# 对DataFrame的values排序
frame.sort_values(by='a')

Unnamed: 0,d,a,b,c
B,1.611463,0.277091,-0.243687,0.270537
A,-0.491892,1.192095,-0.224605,-0.011758


In [119]:
# 对DataFrame沿轴1排序 columns
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
B,0.277091,-0.243687,0.270537,1.611463
A,1.192095,-0.224605,-0.011758,-0.491892


In [120]:
# 对DataFrame沿轴1排序 columns（降幂）
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
B,1.611463,0.270537,-0.243687,0.277091
A,-0.491892,-0.011758,-0.224605,1.192095


In [121]:
obj = Series([1,3,4,2])

# 对Series的值排序
obj.sort_values()

0    1
3    2
1    3
2    4
dtype: int64

In [122]:
frame = DataFrame({'b': [3,2,2,1], 'a': [0,1,0,1]})

# 对DataFrame的一个列中的值进行排序
frame.sort_values(by='b')

Unnamed: 0,a,b
3,1,1
1,1,2
2,0,2
0,0,3


In [123]:
# 对DataFrame的多个列中的值进行排序
frame.sort_values(by=['b','a'])

Unnamed: 0,a,b
3,1,1
2,0,2
1,1,2
0,0,3


In [124]:
obj = Series([7,-5,7,4,2,0,4])

# rank 是通过“为各组分配一个平均排名”的方式破坏平级关系 ??
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

### Axis indexes with duplicate values

In [125]:
obj = Series(range(5), index=list('aabbc'))
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [126]:
obj.index.is_unique

False

In [127]:
obj['a']

a    0
a    1
dtype: int64

In [128]:
obj['c']

4

In [129]:
df = DataFrame(np.random.randn(4,3), index=list('aabb'))
df.ix['b']

Unnamed: 0,0,1,2
b,0.840099,-1.043301,0.533516
b,-0.565532,-0.958874,0.024497


## Summarizing and Computing Descriptive Statistics

In [130]:
df = DataFrame(np.random.randn(4,5))
df.ix[1:2,1:2] = np.nan
df

Unnamed: 0,0,1,2,3,4
0,-0.13708,0.231455,-1.313,0.420746,-0.518844
1,-2.210741,,,0.004617,-0.516801
2,1.459757,,,-0.406594,-0.28649
3,0.194204,-0.619402,-1.900504,-0.92385,0.296863


In [131]:
df.sum()

0   -0.693860
1   -0.387947
2   -3.213504
3   -0.905081
4   -1.025272
dtype: float64

In [132]:
# NA只会自动排除，除非整个切片都是NA
df.sum(axis=1)

0   -1.316724
1   -2.722924
2    0.766673
3   -2.952689
dtype: float64

In [133]:
# 通过skipna选项可以禁用该功能
df.sum(axis=1, skipna=False)

0   -1.316724
1         NaN
2         NaN
3   -2.952689
dtype: float64

In [134]:
# 获得最大值的索引值
df.idxmax()

0    2
1    0
2    0
3    0
4    3
dtype: int64

In [135]:
# 样本值的累计和
df.cumsum()

Unnamed: 0,0,1,2,3,4
0,-0.13708,0.231455,-1.313,0.420746,-0.518844
1,-2.347821,,,0.425363,-1.035645
2,-0.888064,,,0.018769,-1.322135
3,-0.69386,-0.387947,-3.213504,-0.905081,-1.025272


In [136]:
# 针对Series或各DataFrame列计算汇总统计
df.describe()

Unnamed: 0,0,1,2,3,4
count,4.0,2.0,2.0,4.0,4.0
mean,-0.173465,-0.193974,-1.606752,-0.22627,-0.256318
std,1.522543,0.601646,0.415428,0.574768,0.384574
min,-2.210741,-0.619402,-1.900504,-0.92385,-0.518844
25%,-0.655495,-0.406688,-1.753628,-0.535908,-0.517311
50%,0.028562,-0.193974,-1.606752,-0.200988,-0.401645
75%,0.510592,0.01874,-1.459876,0.108649,-0.140652
max,1.459757,0.231455,-1.313,0.420746,0.296863


### Correlation and Covariance

#### [标准差 (Standard Deviation)](http://wiki.mbalib.com/wiki/%E6%A0%87%E5%87%86%E5%B7%AE)

标准差是一种表示分散程度的统计观念。

标准差是一组数值自平均值分散开来的程度的一种测量观念。一个较大的标准差，代表大部分的数值和其平均值之间差异较大；一个较小的标准差，代表这些数值较接近平均值。

![](http://wiki.mbalib.com/w/images/math/6/3/3/6336e4c48fd253b7a6f552fa2579525b.png)

#### [协方差 (Covariance,COV)](http://wiki.mbalib.com/wiki/%E5%8D%8F%E6%96%B9%E5%B7%AE)

在概率论和统计学中，协方差用于衡量两个变量的总体误差。

如果两个变量的变化趋势一致，也就是说如果其中一个大于自身的期望值，另外一个也大于自身的期望值，那么两个变量之间的协方差就是正值。

如果两个变量的变化趋势相反，即其中一个大于自身的期望值，另外一个却小于自身的期望值，那么两个变量之间的协方差就是负值。

![](http://wiki.mbalib.com/w/images/math/5/5/d/55dab8cc069454a52205d8f171828296.png)

#### [相关系数(Correlation coefficient)](http://wiki.mbalib.com/wiki/%E7%9B%B8%E5%85%B3%E7%B3%BB%E6%95%B0)

相关系数是用以反映变量之间相关关系密切程度的统计指标。

![](http://wiki.mbalib.com/w/images/math/5/c/1/5c1a85509fe16d26ce3bbf472e3f37e0.png)

相关系数ρXY取值在-1到1之间，
- ρXY = 0时，称X,Y不相关； 
- | ρXY | = 1时，称X,Y完全相关，此时，X,Y之间具有线性函数关系；
- | ρXY | < 1时，X的变动引起Y的部分变动
- 0.8 < | ρXY | < 1 时称为高度相关
- 0 < | ρXY | < 0.3时，称为低度相关
- 0.3 ≤ | ρXY | ≤ 0.8 为中度相关

In [137]:
import pandas_datareader.data as web

all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
    
price = DataFrame({tic: data['Adj Close'] for tic, data, in all_data.iteritems()})
volume = DataFrame({tic: data['Volume'] for tic, data in all_data.iteritems()})

In [138]:
returns = price.pct_change()
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-12-24,0.034339,0.011117,0.004385,0.002587
2009-12-28,0.012294,0.007098,0.013326,0.005484
2009-12-29,-0.011861,-0.005571,-0.003477,0.007058
2009-12-30,0.012147,0.005376,0.005461,-0.013699
2009-12-31,-0.0043,-0.004416,-0.012597,-0.015504


In [139]:
returns.MSFT.corr(returns.IBM)

0.49597963862836758

In [140]:
# corr 将以DataFrame形式返回完整的相关系数
returns.corr()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.470676,0.410011,0.424305
GOOG,0.470676,1.0,0.390689,0.443587
IBM,0.410011,0.390689,1.0,0.49598
MSFT,0.424305,0.443587,0.49598,1.0


In [141]:
# cov 将以DataFrame形式返回完整的协方差矩阵
# 协方差：一种用来度量两个随机变量关系的统计量
returns.cov()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.001027,0.000303,0.000252,0.000309
GOOG,0.000303,0.00058,0.000142,0.000205
IBM,0.000252,0.000142,0.000367,0.000216
MSFT,0.000309,0.000205,0.000216,0.000516


In [142]:
# 利用corrwith，可以计算其列或行跟另一个Series或DataFrame之间的相关系数
# 传入一个Series会返回一个相关系数值Series(针对各列进行计算)
returns.corrwith(returns.IBM)

AAPL    0.410011
GOOG    0.390689
IBM     1.000000
MSFT    0.495980
dtype: float64

In [143]:
# 传入一个DataFrame会计算按列名配对的相关系数
# 范例：计算百分比变化与成交量的相关系数
returns.corrwith(volume)

AAPL   -0.057549
GOOG    0.062647
IBM    -0.007892
MSFT   -0.014245
dtype: float64

### Unique Values, Value Counts, and Membership

In [144]:
obj = Series(['c','a','d','a','a','b','b','c','c'])

In [145]:
# 取得Series中唯一值数组
unique = obj.unique()
unique

array(['c', 'a', 'd', 'b'], dtype=object)

In [146]:
# 排序
unique.sort()
unique

array(['a', 'b', 'c', 'd'], dtype=object)

In [147]:
# 计算个值出现的频率
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [148]:
# 排序(生幂)
pd.value_counts(obj.values, sort=True, ascending=True)

d    1
b    2
a    3
c    3
dtype: int64

In [149]:
# isin 判断矢量化集合的成员资格
mask = obj.isin(['b','c'])
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [150]:
data = DataFrame({'Q1': [1,2,3,3,4],
                  'Q2': [2,3,1,2,3],
                  'Q3': [1,5,2,4,4]})
data

Unnamed: 0,Q1,Q2,Q3
0,1,2,1
1,2,3,5
2,3,1,2
3,3,2,4
4,4,3,4


In [151]:
# 将 pandas.value_counts 传给 DataFrame apply 函数，计算每个 column 各值出现的频率
data.apply(pd.value_counts).fillna(0)

Unnamed: 0,Q1,Q2,Q3
1,1.0,1.0,1.0
2,1.0,2.0,1.0
3,2.0,2.0,0.0
4,1.0,0.0,2.0
5,0.0,0.0,1.0


## Handling Missing Data

NA 处理方式

方法 | 说明
---|---
`isnull` | 返回一个含有布尔值的对象，表示那些值是缺失值
`notnull` | isnull的否定式
`dropna` | 根据各标签的值是否存在缺失数据对轴标签进行过滤
`fillna` | 用指定值或插值方法（ffill, bfill)填充缺失数据

In [152]:
string_data = Series(['aaa','bbb',np.nan,'ddd'])
string_data

0    aaa
1    bbb
2    NaN
3    ddd
dtype: object

In [153]:
# pandas 使用浮点值 NAN 表示浮点和非浮点数组中的缺失值
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [154]:
# Python 内置的 None 也会被当做NAN处理
string_data[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### Filtering Out Missing Data

In [155]:
from numpy import nan as NA

In [156]:
data = Series([1, NA, 3.5, NA, 7])

# 返回一个仅含非空数据和索引值的Series
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [157]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [158]:
data = DataFrame([[1, 6.5, 3], [1, NA, NA], [NA, NA, NA], [NA, 6.5, 3]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [159]:
# dropna 默认丢弃任何含有缺失值的 row
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [160]:
# 只丢弃全为NA的 row
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [161]:
# 只丢弃两个NA以上（含）的 row
data.dropna(thresh=2)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
3,,6.5,3.0


In [162]:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [163]:
# 丢弃为NA的 column
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


### Filling in Missing Data

In [164]:
# 使用 fillna 替换缺失值
data.fillna(0)

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,6.5,3.0,0.0


In [165]:
# 通过字典调用 fillna，对不同 column 填充不同的值
data.fillna({1:1, 2:2})

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,1.0,2.0,
2,,1.0,2.0,
3,,6.5,3.0,


In [166]:
# 使用 inplace=True 对对象进行修改
data.fillna(0, inplace=True)
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,6.5,3.0,0.0


In [167]:
df = DataFrame(np.random.randn(6,3))
df.ix[2:, 1] = NA; df.ix[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,-0.810258,0.99069,-0.628712
1,-1.065162,2.036433,-0.045309
2,-0.814353,,0.874937
3,0.274365,,-2.095261
4,-0.982951,,
5,-0.36402,,


In [168]:
# 使用 ffill 插值
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,-0.810258,0.99069,-0.628712
1,-1.065162,2.036433,-0.045309
2,-0.814353,2.036433,0.874937
3,0.274365,2.036433,-2.095261
4,-0.982951,2.036433,-2.095261
5,-0.36402,2.036433,-2.095261


In [169]:
# 使用平均数做插值
data = Series([1, NA, 3.5, NA, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

fillna 函数的参数

参数 | 说明
---|---
`value` | 用于填充缺失值的标量值或字典对象
`method` | 插值方式，如果函数调用时未指定其他参数的话，默认为 ffill
`axis` | 带填充的轴，默认 axis=0
`inplace` | 修改调用者对象而不产生副本
`limit` | 可连续填充的最大数量

## Hierarchical Indexing

In [170]:
data = Series(np.random.randn(10),
             index=[list('aaabbbccdd'),[1,2,3,1,2,3,1,2,2,3]])
data

a  1    1.329913
   2   -1.278869
   3   -0.716493
b  1   -0.454294
   2   -1.263972
   3    0.227562
c  1   -0.450813
   2   -0.849565
d  2    0.539701
   3   -2.510157
dtype: float64

In [171]:
# MultiIndex
data.index

MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [172]:
data['b'] # 等同 data.ix['b']

1   -0.454294
2   -1.263972
3    0.227562
dtype: float64

In [173]:
data['b':'c'] # 等同 data.ix[['b','c']]

b  1   -0.454294
   2   -1.263972
   3    0.227562
c  1   -0.450813
   2   -0.849565
dtype: float64

In [174]:
# 在内层选取
data[:, 2] # 等同 data.ix[:, 2]

a   -1.278869
b   -1.263972
c   -0.849565
d    0.539701
dtype: float64

In [175]:
# 通过 unstack 重新安排，变成新的 DataFrame
data.unstack()

Unnamed: 0,1,2,3
a,1.329913,-1.278869,-0.716493
b,-0.454294,-1.263972,0.227562
c,-0.450813,-0.849565,
d,,0.539701,-2.510157


In [176]:
# DataFrame 分层索引
frame = DataFrame(np.arange(12).reshape((4,3)),
                 index=[['a','a','b','b'], [1,2,1,2]],
                 columns=[['Ohio','Ohio','Colorado'], ['Green','Red','Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [177]:
# 索引命名
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [178]:
frame['Ohio'] # 等同 frame.ix[:, 'Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [179]:
# 找出 column 内层，再选取分组
frame.swaplevel(axis=1).ix[:,'Green']

Unnamed: 0_level_0,state,Ohio,Colorado
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,2
a,2,3,5
b,1,6,8
b,2,9,11


In [180]:
frame.ix['a']

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,0,1,2
2,3,4,5


### Reordering and Sorting Levels

In [181]:
# swaplevel 接受两个级别编号或名称，返回一个互换级别的新对象
frame.swaplevel('key1','key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [182]:
# 排序
frame.swaplevel(0,1).sortlevel(0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [183]:
# 互换 column's level，并排序
frame.swaplevel(0,1,axis=1).sortlevel(0,axis=1)

Unnamed: 0_level_0,color,Green,Green,Red
Unnamed: 0_level_1,state,Colorado,Ohio,Ohio
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,2,0,1
a,2,5,3,4
b,1,8,6,7
b,2,11,9,10


### Summary Statistics by Level

In [184]:
frame.sum(level='key2')

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [185]:
frame.sum(level='color', axis=1)

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


### Using a DataFrame’s Columns

将 DataFrame 的一个或多个 column 当做 row 索引来来用，或希望将 row 索引变成 column

ps. 思考跟第七章的 stack, unstack 有什么不同

In [186]:
frame = DataFrame({
    'a': range(7),
    'b': range(7,0,-1),
    'c': ['one','one','one','two','two','two','two'],
    'd': [0,1,2,0,1,2,3]
})

frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [187]:
# set_index 将一个或多个 column 转换成 row index
frame2 = frame.set_index(['c','d'])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [188]:
# 保留 column
frame.set_index(['c','d'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [189]:
# reset_index 功能和 set_index 相反，层次化索引的级别会被转移到 column 里面
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


In [190]:
frame2.reset_index(level=1)

Unnamed: 0_level_0,d,a,b
c,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


## Other pandas Topics
### Integer Indexing

In [191]:
ser = Series(np.arange(3))
ser

0    0
1    1
2    2
dtype: int64

In [192]:
# 虽然一个含有 0,1,2 的索引，的是很难推断用户想要什么
# ser[-1]
# ser.ix[-1]

In [193]:
ser2 = Series(np.arange(3), index=['a','b','c'])
ser

0    0
1    1
2    2
dtype: int64

In [194]:
# 对于一个非整数索引，就没有歧义
ser2[-1]

2

In [195]:
ser2.ix[:1]

a    0
dtype: int64

In [196]:
# 使用 iloc 定位取值
ser3 = Series(range(3), index=[-5, 1, 3])
ser3.iloc[2]

2

In [197]:
# 使用 iloc 定位取值
frame = DataFrame(np.arange(6).reshape((3,2)), index=[2,0,1])
frame.iloc[2]

0    4
1    5
Name: 1, dtype: int64

### Panel Data

Panel 数据结构，可以将其看成一个三维的 DataFrame

In [198]:
# 移除之前 dict 定义
del dict

In [199]:
import pandas_datareader.data as web

# 用一个 DataFrame 对象组成的字典或一个三维 ndarray 来创建 Panel 对象
pdata = pd.Panel(dict((stk, web.get_data_yahoo(stk, '1/5/2009', '1/10/2009')) 
                      for stk in ['AAPL', 'GOOG', 'MSFT', 'DELL']))

# Panel 的每一项（类似 DataFrame 的列）都是一个 DataFrame
# 项轴：'items'，'AAPL', 'GOOG', 'MSFT', 'DELL'
# 轴0：'major'， 日期
# 轴1：'minor'，'Open','High','Low','Close','Volume','Adj Close'
pdata

<class 'pandas.core.panel.Panel'>
Dimensions: 4 (items) x 5 (major_axis) x 6 (minor_axis)
Items axis: AAPL to MSFT
Major_axis axis: 2009-01-05 00:00:00 to 2009-01-09 00:00:00
Minor_axis axis: Open to Adj Close

In [200]:
pdata.items

Index([u'AAPL', u'DELL', u'GOOG', u'MSFT'], dtype='object')

In [201]:
pdata.major_axis

DatetimeIndex(['2009-01-05', '2009-01-06', '2009-01-07', '2009-01-08',
               '2009-01-09'],
              dtype='datetime64[ns]', name=u'Date', freq='D')

In [202]:
pdata.minor_axis

Index([u'Open', u'High', u'Low', u'Close', u'Volume', u'Adj Close'], dtype='object')

pdata 由四个 item（轴0：'AAPL', 'GOOG', 'MSFT', 'DELL'）构成
- 轴1：日期
- 轴2：'Open','High','Low','Close','Volume','Adj Close'

In [203]:
# 第一个item
pdata['AAPL']

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2009-01-05,93.170003,96.179998,92.709999,94.580002,295402100.0,12.253743
2009-01-06,95.95,97.170001,92.389998,93.02,322327600.0,12.051629
2009-01-07,91.809999,92.500001,90.260003,91.01,188262200.0,11.791215
2009-01-08,90.43,93.150002,90.039998,92.699999,168375200.0,12.01017
2009-01-09,93.209997,93.380001,90.14,90.579997,136711400.0,11.735504


In [204]:
# item 与 minor 轴互换
pdata = pdata.swapaxes('items', 'minor')

# 互换后：
# 项轴：'minor'，'Open','High','Low','Close','Volume','Adj Close'
# 轴0：'major'， 日期
# 轴1：'items'，'AAPL', 'GOOG', 'MSFT', 'DELL'

In [205]:
# 显示 日期 与 股票 在 'Adj Close' 的表现
pdata['Adj Close']

Unnamed: 0_level_0,AAPL,DELL,GOOG,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-01-05,12.253743,10.26359,163.861421,16.555156
2009-01-06,12.051629,10.68922,166.86342,16.748784
2009-01-07,11.791215,10.78596,160.844427,15.740307
2009-01-08,12.01017,10.90204,162.43284,16.232444
2009-01-09,11.735504,10.75694,157.377899,15.748375


In [206]:
# ix[项轴, 轴0, 轴1] 标签索引被推广到三个维度，可以选取指定日期的所有数据
pdata.ix[:, '1/7/2009', :]

Unnamed: 0,Open,High,Low,Close,Volume,Adj Close
AAPL,91.809999,92.500001,90.260003,91.01,188262200.0,11.791215
DELL,10.83,11.43,10.81,11.15,29947300.0,10.78596
GOOG,328.320555,330.910553,318.750561,322.010543,9022600.0,160.844427
MSFT,20.190001,20.290001,19.48,19.51,72709900.0,15.740307


In [207]:
# 选取指定观察对象（Adj Close）的所有数据
pdata.ix['Adj Close', '1/7/2009':, :]

Unnamed: 0_level_0,AAPL,DELL,GOOG,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-01-07,11.791215,10.78596,160.844427,15.740307
2009-01-08,12.01017,10.90204,162.43284,16.232444
2009-01-09,11.735504,10.75694,157.377899,15.748375


In [208]:
# 将 Panel 转换成 DataFrame
stacked = pdata.to_frame()
stacked

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Volume,Adj Close
Date,minor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2009-01-05,AAPL,93.170003,96.179998,92.709999,94.580002,295402100.0,12.253743
2009-01-05,DELL,10.82,10.9,10.53,10.61,18926700.0,10.26359
2009-01-05,GOOG,321.000544,331.240583,315.000528,328.050564,9814500.0,163.861421
2009-01-05,MSFT,20.200001,20.67,20.059999,20.52,61475200.0,16.555156
2009-01-06,AAPL,95.95,97.170001,92.389998,93.02,322327600.0,12.051629
2009-01-06,DELL,10.83,11.34,10.65,11.05,37140800.0,10.68922
2009-01-06,GOOG,332.980573,340.800591,326.390554,334.060566,12898500.0,166.86342
2009-01-06,MSFT,20.75,21.0,20.610001,20.76,58083400.0,16.748784
2009-01-07,AAPL,91.809999,92.500001,90.260003,91.01,188262200.0,11.791215
2009-01-07,DELL,10.83,11.43,10.81,11.15,29947300.0,10.78596


In [209]:
# to_frame() 的逆运算
pdata2 = stacked.to_panel()

pdata2['Adj Close']

minor,AAPL,DELL,GOOG,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-01-05,12.253743,10.26359,163.861421,16.555156
2009-01-06,12.051629,10.68922,166.86342,16.748784
2009-01-07,11.791215,10.78596,160.844427,15.740307
2009-01-08,12.01017,10.90204,162.43284,16.232444
2009-01-09,11.735504,10.75694,157.377899,15.748375
