# Getting Started with pandas

In [1]:
import numpy as np

import pandas as pd
from pandas import Series, DataFrame

## Introduction to pandas Data Structures
### Series

In [2]:
obj = Series([1,2,3])
obj

0    1
1    2
2    3
dtype: int64

In [3]:
obj.values

array([1, 2, 3])

In [4]:
obj.index

RangeIndex(start=0, stop=3, step=1)

In [5]:
obj2 = Series([1,2,3], index=['a','b','c'])
obj2

a    1
b    2
c    3
dtype: int64

In [6]:
obj2['b'] = 6
obj2

a    1
b    6
c    3
dtype: int64

In [7]:
obj2[obj2 > 3]

b    6
dtype: int64

In [8]:
obj2 * 2

a     2
b    12
c     6
dtype: int64

In [9]:
'b' in obj2

True

In [10]:
'd' in obj2

False

In [11]:
# 透过字典建立 Series（Series的索引是字典的键）
dict = {'apple': 1, 'banana': 2, 'cherry': 3}
obj3 = Series(dict)
obj3

apple     1
banana    2
cherry    3
dtype: int64

In [12]:
# 与fruits索引匹配的值会被找出放到相对应的位置
fruits = ['cherry', 'banana', 'apple', 'date']
obj4 = Series(dict, fruits)
obj4

cherry    3.0
banana    2.0
apple     1.0
date      NaN
dtype: float64

In [13]:
pd.isnull(obj4)

cherry    False
banana    False
apple     False
date       True
dtype: bool

In [14]:
pd.notnull(obj4)

cherry     True
banana     True
apple      True
date      False
dtype: bool

In [15]:
obj4.isnull()

cherry    False
banana    False
apple     False
date       True
dtype: bool

In [16]:
obj4.index.name = 'fruit'
obj4.name = "myName"
obj4

fruit
cherry    3.0
banana    2.0
apple     1.0
date      NaN
Name: myName, dtype: float64

In [17]:
# 对index赋值
obj.index = ['A','B','C']
obj

A    1
B    2
C    3
dtype: int64

### DataFrame

In [18]:
data = {
    'value1': ['A','B','C','D','E'],
    'value2': [1,2,3,4,5],
    'value3': [1.1,2.2,3.3,4.4,5.5]
}

frame = DataFrame(data)
frame

Unnamed: 0,value1,value2,value3
0,A,1,1.1
1,B,2,2.2
2,C,3,3.3
3,D,4,4.4
4,E,5,5.5


In [19]:
# 指定顺序
DataFrame(data, columns=['value2','value3','value1'])

Unnamed: 0,value2,value3,value1
0,1,1.1,A
1,2,2.2,B
2,3,3.3,C
3,4,4.4,D
4,5,5.5,E


In [20]:
# 若传入colomn在数据中找不到，会产生NA
frame2 = DataFrame(data,
                   columns=['value3','value2','value1','value0'],
                   index=['one','two','three','four','five'])
frame2

Unnamed: 0,value3,value2,value1,value0
one,1.1,1,A,
two,2.2,2,B,
three,3.3,3,C,
four,4.4,4,D,
five,5.5,5,E,


In [21]:
# 透过字典标记方式存取
# 返回的Series跟DataFrame有相同的索引
frame2['value2']

one      1
two      2
three    3
four     4
five     5
Name: value2, dtype: int64

In [22]:
# 透过属性方式存取
frame2.value2

one      1
two      2
three    3
four     4
five     5
Name: value2, dtype: int64

In [23]:
# row也可以通过位置或名称的方式获取
frame2.ix['three']

value3    3.3
value2      3
value1      C
value0    NaN
Name: three, dtype: object

In [24]:
# column可以通过赋值的方式进行修改
frame2['value0'] = np.arange(5,0,-1)
frame2

Unnamed: 0,value3,value2,value1,value0
one,1.1,1,A,5
two,2.2,2,B,4
three,3.3,3,C,3
four,4.4,4,D,2
five,5.5,5,E,1


In [25]:
# 将列表或数组赋值给某column，长度要跟DataFrame的长度匹配
frame2['value0'] = [1,2,3,4,5]
frame2

Unnamed: 0,value3,value2,value1,value0
one,1.1,1,A,1
two,2.2,2,B,2
three,3.3,3,C,3
four,4.4,4,D,4
five,5.5,5,E,5


In [26]:
frame2['value0'] = np.array(['a','b','c','d','e'])
frame2

Unnamed: 0,value3,value2,value1,value0
one,1.1,1,A,a
two,2.2,2,B,b
three,3.3,3,C,c
four,4.4,4,D,d
five,5.5,5,E,e


In [27]:
# 如果赋值的是Series，会精确匹配DataFrame的索引，空位会被填上缺失值
frame2['value0'] = Series([1,3,5], index=["one",'three','five'])
frame2

Unnamed: 0,value3,value2,value1,value0
one,1.1,1,A,1.0
two,2.2,2,B,
three,3.3,3,C,3.0
four,4.4,4,D,
five,5.5,5,E,5.0


In [28]:
# 为不存在的column赋值，会创建出新的column
frame2['value4'] = frame2.value2 > 3
frame2

Unnamed: 0,value3,value2,value1,value0,value4
one,1.1,1,A,1.0,False
two,2.2,2,B,,False
three,3.3,3,C,3.0,False
four,4.4,4,D,,True
five,5.5,5,E,5.0,True


In [29]:
# del 用于删除 column
del frame2['value4']
frame2

Unnamed: 0,value3,value2,value1,value0
one,1.1,1,A,1.0
two,2.2,2,B,
three,3.3,3,C,3.0
four,4.4,4,D,
five,5.5,5,E,5.0


In [30]:
# 将字典传给DataFrame: 外层字典的键为 column 索引, 内层的健位 row 索引
data = {
    'c1': {'r1': 1, 'r2': 2, 'r3': 3},
    'c2': {'r1': 'A', 'r2': 'B', 'r3': 'C'}
}

DataFrame(data)

Unnamed: 0,c1,c2
r1,1,A
r2,2,B
r3,3,C


In [31]:
DataFrame(data, index=['r0','r1','r2'])

Unnamed: 0,c1,c2
r0,,
r1,1.0,A
r2,2.0,B


In [32]:
# 由Series组成的字典
data = {
    'c1': Series([1,2,3], index=['r1','r2','r3']),
    'c2': Series(['A','B','C'], index=['r1','r2','r3'])
}

DataFrame(data)

Unnamed: 0,c1,c2
r1,1,A
r2,2,B
r3,3,C


可以传给DataFrame构造器的数据

类型 | 说明
---|---
2D ndarray | A matrix of data, passing optional row and column labels
dict of arrays, lists, or tuples | Each sequence becomes a column in the DataFrame. All sequences must be the same length.
NumPy structured/record array | Treated as the “dict of arrays” case
dict of Series | Each value becomes a column. Indexes from each Series are unioned together to form the result’s row index if no explicit index is passed.
dict of dicts | Each inner dict becomes a column. Keys are unioned to form the row index as in the “dict of Series” case.
list of dicts or Series | Each item becomes a row in the DataFrame. Union of dict keys or Series indexes become the DataFrame’s column labels
List of lists or tuples | Treated as the “2D ndarray” case
Another DataFrame | The DataFrame’s indexes are used unless different ones are passed
NumPy MaskedArray | Like the “2D ndarray” case except masked values become NA/missing in the DataFrame result

In [33]:
# 2D ndarray
data = [[1,4],[2,5],[3,6]]
DataFrame(data, columns=['c1','c2'], index=['r1','r2','r3'])

Unnamed: 0,c1,c2
r1,1,4
r2,2,5
r3,3,6


In [34]:
# dict of lists
data = {
    'c1': [1,2,3],
    'c2': [4,5,6]}
DataFrame(data, columns=['c1','c2'], index=['r1','r2','r3'])

Unnamed: 0,c1,c2
r1,1,4
r2,2,5
r3,3,6


In [35]:
# dict of arrays
data = {
    'c1': np.array([1,2,3]),
    'c2': np.array([4,5,6])}
DataFrame(data, columns=['c1','c2'], index=['r1','r2','r3'])

Unnamed: 0,c1,c2
r1,1,4
r2,2,5
r3,3,6


In [36]:
# dict of tuples
data = {
    'c1': (1,2,3),
    'c2': (4,5,6)}
DataFrame(data, columns=['c1','c2'], index=['r1','r2','r3'])

Unnamed: 0,c1,c2
r1,1,4
r2,2,5
r3,3,6


### Index Objects

In [37]:
obj = Series(range(3), index=['a','b','c'])
index = obj.index
index

Index([u'a', u'b', u'c'], dtype='object')

In [38]:
# index 对象是不可修改的
# index[1] = 1

Pandas 中主要的 index 对象

类 | 说明
---|---
`Index` | 最泛化的Index对象，将轴标签表示为一个由Python对象组成的NumPy数组
`Int64Index` | 针对整数的特殊Index
`MultiIndex` | 层次化索引对象，表示单个轴上的多层索引
`DatetimeIndex` | 储存纳秒级时间戳
`PeriodIndex` | 针对Period数据的特殊Index

In [39]:
data = [[1,4],[2,5],[3,6]]
df = DataFrame(data, columns=['c1','c2'], index=['r1','r2','r3'])

'c1' in df.columns, 'r1' in df.index

(True, True)

Index 的方法与属性

方法 | 说明
---|---
`append` | 连接另一个Index对象，产生一个新的Index
`difference` | 计算差集
`intersection` | 计算交集
`union` | 计算并集
`isin` | 计算一个指标各值是否都把含在参数集合中的布尔型数组
`delete` | 删除索引i处的元素，得到新的Index
`drop` | 删除传入的值，得到新的Index
`insert` | 将元素插入到索引i处，得到新的Index
`is_monotonic` | 当个元素均大于等于前一个元素时，返回True
`is_unique` | 当Index没有重复值，返回True
`unique` | 计算Index中唯一的数组

In [40]:
index1 = pd.Index(['a','b','c'])
index2 = pd.Index(['b','c','d'])

In [41]:
index1.append(index2)

Index([u'a', u'b', u'c', u'b', u'c', u'd'], dtype='object')

In [42]:
index1.difference(index2)

Index([u'a'], dtype='object')

In [43]:
index1.intersection(index2)

Index([u'b', u'c'], dtype='object')

In [44]:
index1.union(index2)

Index([u'a', u'b', u'c', u'd'], dtype='object')

In [45]:
index1.isin(['a','c'])

array([ True, False,  True], dtype=bool)

In [46]:
index1.delete(1)

Index([u'a', u'c'], dtype='object')

In [47]:
index1.drop('b')

Index([u'a', u'c'], dtype='object')

## Essential Functionality
### Reindexing

In [48]:
obj = Series([1,2,3,4], index=['a','c','b','d'])
obj

a    1
c    2
b    3
d    4
dtype: int64

In [49]:
obj2 = obj.reindex(['a','b','c','d','e'], fill_value=0)
obj2

a    1
b    3
c    2
d    4
e    0
dtype: int64

In [50]:
# 插值处理
obj3 = Series(['A','B','C'], index=[1,3,5])
obj3.reindex(range(7), method='ffill', fill_value='?')

0    ?
1    A
2    A
3    B
4    B
5    C
6    C
dtype: object

In [51]:
obj3.reindex(range(7), method='bfill', fill_value='?')

0    A
1    A
2    B
3    B
4    C
5    C
6    ?
dtype: object

In [52]:
frame = DataFrame(np.arange(9).reshape((3,3)),
                  columns=['c1','c3','c5'],
                  index=['r1','r3','r5'])
frame

Unnamed: 0,c1,c3,c5
r1,0,1,2
r3,3,4,5
r5,6,7,8


In [53]:
frame.reindex(['r1','r2','r3','r4','r5'], method='ffill')

Unnamed: 0,c1,c3,c5
r1,0,1,2
r2,0,1,2
r3,3,4,5
r4,3,4,5
r5,6,7,8


In [54]:
# 插值只能在轴0
frame.reindex(columns=['c1','c2','c3','c4','c5'])

Unnamed: 0,c1,c2,c3,c4,c5
r1,0,,1,,2
r3,3,,4,,5
r5,6,,7,,8


In [55]:
# 利用ix的标签索引功能，简化索引任务
frame.ix[['r1','r2','r3','r4','r5'], ['c1','c2','c3','c4','c5']]

Unnamed: 0,c1,c2,c3,c4,c5
r1,0.0,,1.0,,2.0
r2,,,,,
r3,3.0,,4.0,,5.0
r4,,,,,
r5,6.0,,7.0,,8.0


reindex 函数的参数

参数 | 说明
---|---
`index` | 用作索引的新序列
`method` | 插值方式
`fill_value` | 重新索引过程中，引入缺失值的替代值
`limit` | 向前或向后填充时的最大填充量
`level` | 在MultiIndex的指定级别上匹配简单索引，否则选取其子集
`copy` | 默认为True，无论如何都要复制；如果为False，则新旧相等就不复制

### Dropping entries from an axis

In [56]:
obj = Series([1,2,3,4,5], index=['a','b','c','d','e'])
obj.drop('c')

a    1
b    2
d    4
e    5
dtype: int64

In [57]:
data = DataFrame(np.arange(16).reshape((4,4)),
                 index=['one','two','three','four'],
                 columns=['a','b','c','d'])
data.drop(['two','four'])

Unnamed: 0,a,b,c,d
one,0,1,2,3
three,8,9,10,11


In [58]:
data.drop(['b','d'], axis=1)

Unnamed: 0,a,c
one,0,2
two,4,6
three,8,10
four,12,14


### Indexing, selection, and filtering

In [59]:
obj = Series([1,2,3,4,5], index=['a','b','c','d','e'])
obj

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [60]:
obj['b'], obj[1]

(2, 2)

In [61]:
obj[['b','c','d']]

b    2
c    3
d    4
dtype: int64

In [62]:
obj[2:4]

c    3
d    4
dtype: int64

In [63]:
# 利用标签的切片运算与普通的Python切片不同
obj['b':'d']

b    2
c    3
d    4
dtype: int64

In [64]:
data = DataFrame(np.arange(16).reshape((4,4)),
                 index=['one','two','three','four'],
                 columns=['a','b','c','d'])
data

Unnamed: 0,a,b,c,d
one,0,1,2,3
two,4,5,6,7
three,8,9,10,11
four,12,13,14,15


In [65]:
data['b']

one       1
two       5
three     9
four     13
Name: b, dtype: int64

In [66]:
data[['b','d']]

Unnamed: 0,b,d
one,1,3
two,5,7
three,9,11
four,13,15


In [67]:
# 通过切片选取 row
data[:2]

Unnamed: 0,a,b,c,d
one,0,1,2,3
two,4,5,6,7


In [68]:
data['c'] > 6

one      False
two      False
three     True
four      True
Name: c, dtype: bool

In [69]:
# 通过布尔型数组选取 row
data[data['c'] > 6]

Unnamed: 0,a,b,c,d
three,8,9,10,11
four,12,13,14,15


In [70]:
# 通过索引字段ix
data.ix[['two','three'],['b','c']]

Unnamed: 0,b,c
two,5,6
three,9,10


In [71]:
data.ix[2]

a     8
b     9
c    10
d    11
Name: three, dtype: int64

In [72]:
data.ix[:, 2]

one       2
two       6
three    10
four     14
Name: c, dtype: int64

In [73]:
data.ix[data.b > 7, :2]

Unnamed: 0,a,b
three,8,9
four,12,13


DataFrame 的索引选项

类型 | 说明
---|---
`obj[val]` | 选取DataFrame的单个列或一组列。布尔型数组（过滤行），切片（行切片）
`obj.ix[val]` | 选取DataFrame的单个行或一组行
`obj.ix[:, val]` | 选取DataFrame的单个列或一组列
`obj.ix[val1, val2]` | 同时选取行和列
`reindex` | 将一个或多个轴匹配到新索引
`xs` | 根据标签选取单行或单列，返回一个Series
~~`icol`,`irow`~~ | ~~根据整数位置选取单列或单行，返回一个Series~~
`get_value`,`set_value` | 根据行标签和列标签选取单个值

In [74]:
data.xs('two')

a    4
b    5
c    6
d    7
Name: two, dtype: int64

In [75]:
data.xs('b', axis=1)

one       1
two       5
three     9
four     13
Name: b, dtype: int64

In [76]:
data.iloc[1]

a    4
b    5
c    6
d    7
Name: two, dtype: int64

In [77]:
data.iloc[:,1]

one       1
two       5
three     9
four     13
Name: b, dtype: int64

In [78]:
data.get_value('two','b')

5

### Arithmetic and data alignment

In [79]:
s1 = Series([1,2,3,4], index=['a','c','d','e'])
s2 = Series([5,4,3,2,1], index=['a','c','e','f','g'])
s1+s2

a    6.0
c    6.0
d    NaN
e    7.0
f    NaN
g    NaN
dtype: float64

In [80]:
df1 = DataFrame(np.arange(9).reshape((3,3)), columns=list('bcd'),index=['o','t','c'])
df2 = DataFrame(np.arange(12).reshape((4,3)), columns=list('bde'), index=['u','o','t','g'])
df1 + df2

Unnamed: 0,b,c,d,e
c,,,,
g,,,,
o,3.0,,6.0,
t,9.0,,12.0,
u,,,,


#### Arithmetic methods with fill values

In [81]:
df1 = DataFrame(np.arange(12).reshape((3,4)), columns=list('abcd'))
df2 = DataFrame(np.arange(20).reshape((4,5)), columns=list('abcde'))
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [82]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [83]:
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,0
1,4,5,6,7,0
2,8,9,10,11,0


#### Operations between DataFrame and Series

In [84]:
arr = np.arange(12).reshape((3,4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [85]:
arr[0]

array([0, 1, 2, 3])

In [86]:
# 广播（broadcasting）
arr - arr[0]

array([[0, 0, 0, 0],
       [4, 4, 4, 4],
       [8, 8, 8, 8]])

In [87]:
frame = DataFrame(np.arange(12).reshape((4,3)), 
                  columns=['a','b','c'],
                  index=['one','two','three','four'])
frame

Unnamed: 0,a,b,c
one,0,1,2
two,3,4,5
three,6,7,8
four,9,10,11


In [88]:
series = frame.ix[0]
series

a    0
b    1
c    2
Name: one, dtype: int64

In [89]:
# DataFrame 和 Series 之间的算术运算会将Series的索引匹配到DataFrame的列，让后沿着行一直向下广播
frame - series

Unnamed: 0,a,b,c
one,0,0,0
two,3,3,3
three,6,6,6
four,9,9,9


In [90]:
series3 = frame['b']
series3

one       1
two       4
three     7
four     10
Name: b, dtype: int64

In [91]:
# 匹配 row 且在 column 上广播
# For Series input, axis to match Series index on
frame.sub(series3, axis='index')

Unnamed: 0,a,b,c
one,-1,0,1
two,-1,0,1
three,-1,0,1
four,-1,0,1


### Function application and mapping

In [92]:
frame = DataFrame(np.random.randn(4,3),
                 columns=list('abc'),
                 index=list('ABCD'))
frame

Unnamed: 0,a,b,c
A,0.342741,-0.11707,-1.407552
B,-0.370967,-0.492514,1.16483
C,-0.56108,0.520674,-0.764324
D,-1.19397,-0.144952,0.616296


In [93]:
np.abs(frame)

Unnamed: 0,a,b,c
A,0.342741,0.11707,1.407552
B,0.370967,0.492514,1.16483
C,0.56108,0.520674,0.764324
D,1.19397,0.144952,0.616296


In [94]:
frame.apply(np.abs)

Unnamed: 0,a,b,c
A,0.342741,0.11707,1.407552
B,0.370967,0.492514,1.16483
C,0.56108,0.520674,0.764324
D,1.19397,0.144952,0.616296


In [95]:
# 将函数应用到由各row或column所形成的一维数组上
frame.apply(lambda x: x.max() - x.min())

a    1.536711
b    1.013189
c    2.572381
dtype: float64

In [96]:
# 传递给apply的函数还可以返回有多个值组成的Series
frame.apply(lambda x: Series([x.max(), x.min()], index=['max','min']))

Unnamed: 0,a,b,c
max,0.342741,0.520674,1.16483
min,-1.19397,-0.492514,-1.407552


In [97]:
# 元素级的Python函数也是可以用
format = lambda x: '%.2f' % x
frame.applymap(format)

Unnamed: 0,a,b,c
A,0.34,-0.12,-1.41
B,-0.37,-0.49,1.16
C,-0.56,0.52,-0.76
D,-1.19,-0.14,0.62


In [98]:
# Series使用map方法应用于元素级函数
frame['b'].map(format)

A    -0.12
B    -0.49
C     0.52
D    -0.14
Name: b, dtype: object

### Sorting and ranking

In [99]:
obj = Series(range(4), index=list('dabc'))

# 对Series的Index排序
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [100]:
frame = DataFrame(np.random.randn(2,4), index=['B','A'], columns=list('dabc'))

# 对DataFrame的Index排序
frame.sort_index()

Unnamed: 0,d,a,b,c
A,-0.372358,-0.223707,-0.012527,-0.767089
B,0.158945,-0.315321,-0.755771,0.071047


In [101]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
B,-0.315321,-0.755771,0.071047,0.158945
A,-0.223707,-0.012527,-0.767089,-0.372358


In [102]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
B,0.158945,0.071047,-0.755771,-0.315321
A,-0.372358,-0.767089,-0.012527,-0.223707


In [103]:
obj = Series([1,3,4,2])

# 对Series的值排序
obj.sort_values()

0    1
3    2
1    3
2    4
dtype: int64

In [104]:
frame = DataFrame({'b': [3,2,2,1], 'a': [0,1,0,1]})

# 对DataFrame的一个列中的值进行排序
frame.sort_values(by='b')

Unnamed: 0,a,b
3,1,1
1,1,2
2,0,2
0,0,3


In [105]:
# 对DataFrame的多个列中的值进行排序
frame.sort_values(by=['b','a'])

Unnamed: 0,a,b
3,1,1
2,0,2
1,1,2
0,0,3


In [106]:
obj = Series([7,-5,7,4,2,0,4])

# rank 是通过“为各组分配一个平均排名”的方式破坏平级关系 ??
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

### Axis indexes with duplicate values

In [107]:
obj = Series(range(5), index=list('aabbc'))
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [108]:
obj.index.is_unique

False

In [109]:
obj['a']

a    0
a    1
dtype: int64

In [110]:
obj['c']

4

In [111]:
df = DataFrame(np.random.randn(4,3), index=list('aabb'))
df.ix['b']

Unnamed: 0,0,1,2
b,0.39563,1.169514,-1.121483
b,-0.529602,0.478716,-0.452029


## Summarizing and Computing Descriptive Statistics

In [112]:
df = DataFrame(np.random.randn(4,5))
df.ix[1:2,1:2] = np.nan
df

Unnamed: 0,0,1,2,3,4
0,-0.315221,0.498536,0.221192,-0.527254,0.781073
1,-0.471472,,,-2.642423,0.180235
2,-0.924587,,,-0.267094,-1.543702
3,-0.532153,0.467926,-0.151014,-0.21138,-0.101802


In [113]:
df.sum()

0   -2.243433
1    0.966462
2    0.070177
3   -3.648151
4   -0.684195
dtype: float64

In [114]:
# NA只会自动排除，除非整个切片都是NA
df.sum(axis=1)

0    0.658325
1   -2.933660
2   -2.735383
3   -0.528422
dtype: float64

In [115]:
# 通过skipna选项可以禁用该功能
df.sum(axis=1, skipna=False)

0    0.658325
1         NaN
2         NaN
3   -0.528422
dtype: float64

In [116]:
# 获得最大值的索引值
df.idxmax()

0    0
1    0
2    0
3    3
4    0
dtype: int64

In [117]:
# 样本值的累计和
df.cumsum()

Unnamed: 0,0,1,2,3,4
0,-0.315221,0.498536,0.221192,-0.527254,0.781073
1,-0.786694,,,-3.169677,0.961308
2,-1.711281,,,-3.436772,-0.582394
3,-2.243433,0.966462,0.070177,-3.648151,-0.684195


In [118]:
# 针对Series或各DataFrame列计算汇总统计
df.describe()

Unnamed: 0,0,1,2,3,4
count,4.0,2.0,2.0,4.0,4.0
mean,-0.560858,0.483231,0.035089,-0.912038,-0.171049
std,0.259133,0.021644,0.263189,1.161775,0.986392
min,-0.924587,0.467926,-0.151014,-2.642423,-1.543702
25%,-0.630261,0.475578,-0.057963,-1.056047,-0.462277
50%,-0.501812,0.483231,0.035089,-0.397174,0.039217
75%,-0.432409,0.490883,0.12814,-0.253166,0.330445
max,-0.315221,0.498536,0.221192,-0.21138,0.781073


### Correlation and Covariance

In [122]:
import pandas_datareader.data as web

all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
    
price = DataFrame({tic: data['Adj Close'] for tic, data, in all_data.iteritems()})
volume = DataFrame({tic: data['Volume'] for tic, data in all_data.iteritems()})

In [125]:
returns = price.pct_change()
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-12-24,0.034339,0.011117,0.004385,0.002587
2009-12-28,0.012294,0.007098,0.013326,0.005484
2009-12-29,-0.011861,-0.005571,-0.003477,0.007058
2009-12-30,0.012147,0.005376,0.005461,-0.013699
2009-12-31,-0.0043,-0.004416,-0.012597,-0.015504


In [127]:
returns.MSFT.corr(returns.IBM)

0.49597963862836758

In [129]:
# corr 将以DataFrame形式返回完整的相关系数
returns.corr()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.470676,0.410011,0.424305
GOOG,0.470676,1.0,0.390689,0.443587
IBM,0.410011,0.390689,1.0,0.49598
MSFT,0.424305,0.443587,0.49598,1.0


In [130]:
# cov 将以DataFrame形式返回完整的协方差矩阵
# 协方差：一种用来度量两个随机变量关系的统计量
returns.cov()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.001027,0.000303,0.000252,0.000309
GOOG,0.000303,0.00058,0.000142,0.000205
IBM,0.000252,0.000142,0.000367,0.000216
MSFT,0.000309,0.000205,0.000216,0.000516


In [132]:
# 利用corrwith，可以计算其列或行跟另一个Series或DataFrame之间的相关系数
# 传入一个Series会返回一个相关系数值Series(针对各列进行计算)
returns.corrwith(returns.IBM)

AAPL    0.410011
GOOG    0.390689
IBM     1.000000
MSFT    0.495980
dtype: float64

In [134]:
# 传入一个DataFrame会计算按列名配对的相关系数
# 范例：计算百分比变化与成交量的相关系数
returns.corrwith(volume)

AAPL   -0.057549
GOOG    0.062647
IBM    -0.007892
MSFT   -0.014245
dtype: float64

### Unique Values, Value Counts, and Membership

In [143]:
obj = Series(['c','a','d','a','a','b','b','c','c'])

In [144]:
# 取得Series中唯一值数组
unique = obj.unique()
unique

array(['c', 'a', 'd', 'b'], dtype=object)

In [145]:
# 排序
unique.sort()
unique

array(['a', 'b', 'c', 'd'], dtype=object)

In [146]:
# 计算个值出现的频率
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [147]:
# 排序(生幂)
pd.value_counts(obj.values, sort=True, ascending=True)

d    1
b    2
a    3
c    3
dtype: int64

In [148]:
# isin 判断矢量化集合的成员资格
mask = obj.isin(['b','c'])
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [150]:
data = DataFrame({'Q1': [1,2,3,3,4],
                  'Q2': [2,3,1,2,3],
                  'Q3': [1,5,2,4,4]})
data

Unnamed: 0,Q1,Q2,Q3
0,1,2,1
1,2,3,5
2,3,1,2
3,3,2,4
4,4,3,4


In [152]:
# 将 pandas.value_counts 传给 DataFrame apply 函数，计算每个 column 各值出现的频率
data.apply(pd.value_counts).fillna(0)

Unnamed: 0,Q1,Q2,Q3
1,1.0,1.0,1.0
2,1.0,2.0,1.0
3,2.0,2.0,0.0
4,1.0,0.0,2.0
5,0.0,0.0,1.0


## Handling Missing Data

In [154]:
string_data = Series(['aaa','bbb',np.nan,'ddd'])
string_data

0    aaa
1    bbb
2    NaN
3    ddd
dtype: object

In [155]:
# pandas 使用浮点值 NAN 表示浮点和非浮点数组中的缺失值
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [156]:
# Python 内置的 None 也会被当做NAN处理
string_data[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

NA 处理方式

方法 | 说明
---|---
`dropna` | 根据各标签的值是否存在缺失数据对轴标签进行过滤
`fillna` | 用指定值或插值方法（ffill, bfill)填充缺失数据
`isnull` | 返回一个含有布尔值的对象，表示那些值是缺失值
`notnull` | isnull的否定式

### Filtering Out Missing Data

In [157]:
from numpy import nan as NA

In [158]:
data = Series([1, NA, 3.5, NA, 7])

# 返回一个仅含非空数据和索引值的Series
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [160]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [164]:
data = DataFrame([[1, 6.5, 3], [1, NA, NA], [NA, NA, NA], [NA, 6.5, 3]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [162]:
# dropna 默认丢弃任何含有缺失值的 row
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [163]:
# 只丢弃全为NA的 row
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [167]:
# 只丢弃两个NA以上（含）的 row
data.dropna(thresh=2)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
3,,6.5,3.0


In [168]:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [169]:
# 丢弃为NA的 column
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


### Filling in Missing Data

In [171]:
# 使用 fillna 替换缺失值
data.fillna(0)

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,6.5,3.0,0.0


In [174]:
# 通过字典调用 fillna，对不同 column 填充不同的值
data.fillna({1:1, 2:2})

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,1.0,2.0,
2,,1.0,2.0,
3,,6.5,3.0,


In [175]:
# 使用 inplace=True 对对象进行修改
data.fillna(0, inplace=True)
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,6.5,3.0,0.0


In [176]:
df = DataFrame(np.random.randn(6,3))
df.ix[2:, 1] = NA; df.ix[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,0.096822,-0.887916,-0.091528
1,-0.497754,-0.016213,0.912858
2,0.66883,,-0.515101
3,-1.266811,,-1.552573
4,0.830767,,
5,-0.839521,,


In [177]:
# 使用 ffill 插值
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,0.096822,-0.887916,-0.091528
1,-0.497754,-0.016213,0.912858
2,0.66883,-0.016213,-0.515101
3,-1.266811,-0.016213,-1.552573
4,0.830767,-0.016213,-1.552573
5,-0.839521,-0.016213,-1.552573


In [178]:
# 使用平均数做插值
data = Series([1, NA, 3.5, NA, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

ffill 函数的参数

参数 | 说明
---|---
`value` | 用于填充缺失值的标量值或字典对象
`method` | 插值方式，如果函数调用时未指定其他参数的话，默认为 ffill
`axis` | 带填充的轴，默认 axis=0
`inplace` | 修改调用者对象而不产生副本
`limit` | 可连续填充的最大数量

## Hierarchical Indexing

In [180]:
data = Series(np.random.randn(10),
             index=[list('aaabbbccdd'),[1,2,3,1,2,3,1,2,2,3]])
data

a  1    0.345037
   2   -0.976427
   3    1.133908
b  1    1.725242
   2    1.257853
   3    2.744078
c  1   -0.446399
   2   -0.033313
d  2    1.276558
   3    0.375159
dtype: float64

In [181]:
data.index

MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [182]:
data['b']

1    1.725242
2    1.257853
3    2.744078
dtype: float64

In [183]:
data['b':'c']

b  1    1.725242
   2    1.257853
   3    2.744078
c  1   -0.446399
   2   -0.033313
dtype: float64

In [184]:
data.ix[['b','c']]

b  1    1.725242
   2    1.257853
   3    2.744078
c  1   -0.446399
   2   -0.033313
dtype: float64

In [188]:
# 在内层选取
data[:, 2]

a   -0.976427
b    1.257853
c   -0.033313
d    1.276558
dtype: float64

In [187]:
data.ix[:, 2]

a   -0.976427
b    1.257853
c   -0.033313
d    1.276558
dtype: float64

In [189]:
# 通过 unstack 重新安排，变成新的 DataFrame
data.unstack()

Unnamed: 0,1,2,3
a,0.345037,-0.976427,1.133908
b,1.725242,1.257853,2.744078
c,-0.446399,-0.033313,
d,,1.276558,0.375159


In [199]:
# DataFrame 分层索引
frame = DataFrame(np.arange(12).reshape((4,3)),
                 index=[['a','a','b','b'], [1,2,1,2]],
                 columns=[['Ohio','Ohio','Colorado'], ['Green','Red','Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [202]:
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [203]:
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [207]:
frame.ix['a']

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,0,1,2
2,3,4,5


### Reordering and Sorting Levels

In [208]:
# swaplevel 接受两个级别编号或名称，返回一个互换级别的新对象
frame.swaplevel('key1','key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [215]:
# 排序
frame.swaplevel(0,1).sortlevel(0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [220]:
# 互换 column's level，并排序
frame.swaplevel(0,1,axis=1).sortlevel(0,axis=1)

Unnamed: 0_level_0,color,Green,Green,Red
Unnamed: 0_level_1,state,Colorado,Ohio,Ohio
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,2,0,1
a,2,5,3,4
b,1,8,6,7
b,2,11,9,10


### Summary Statistics by Level

In [221]:
frame.sum(level='key2')

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [222]:
frame.sum(level='color', axis=1)

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


### Using a DataFrame’s Columns

In [223]:
frame = DataFrame({
    'a': range(7),
    'b': range(7,0,-1),
    'c': ['one','one','one','two','two','two','two'],
    'd': [0,1,2,0,1,2,3]
})

frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [224]:
# set_index 将一个或多个 column 转换成 row index
frame2 = frame.set_index(['c','d'])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [226]:
# 保留 column
frame.set_index(['c','d'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [227]:
# reset_index 功能和 set_index 相反，层次化索引的级别会被转移到 column 里面
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


## Other pandas Topics
### Integer Indexing

In [241]:
ser = Series(np.arange(3))
ser

0    0
1    1
2    2
dtype: int64

In [242]:
# 虽然一个含有 0,1,2 的索引，的是很难推断用户想要什么
# ser[-1]
# ser.ix[-1]

In [245]:
ser2 = Series(np.arange(3), index=['a','b','c'])
ser

0    0
1    1
2    2
dtype: int64

In [246]:
# 对于一个非整数索引，就没有歧义
ser2[-1]

2

In [247]:
ser2.ix[:1]

a    0
dtype: int64

In [263]:
# 使用 iloc 定位取值
ser3 = Series(range(3), index=[-5, 1, 3])
ser3.iloc[2]

2

In [264]:
# 使用 iloc 定位取值
frame = DataFrame(np.arange(6).reshape((3,2)), index=[2,0,1])
frame.iloc[2]

0    4
1    5
Name: 1, dtype: int64

### Panel Data