In [1]:
import pandas as pd
from pandas import Series, DataFrame

In [2]:
import numpy as np
from numpy.random import randn

In [2]:
###pandas的数据结构介绍
##Series
obj=Series([4,7,-5,3])

In [3]:
obj


0    4
1    7
2   -5
3    3
dtype: int64

In [4]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [5]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
#创建一个可自行定义索引的Series
obj2=Series([4,7,-5,3],index=['d','b','a','c'])

In [7]:
obj2


d    4
b    7
a   -5
c    3
dtype: int64

In [8]:
obj2.index

Index([u'd', u'b', u'a', u'c'], dtype='object')

In [9]:
obj2['a']

-5

In [10]:
obj2['d']=6

In [11]:
obj2[['c','a','d']]


c    3
a   -5
d    6
dtype: int64

In [12]:
#所有对ndarray的运算，都适用于Series，且都会保留index和value之间的关系
obj2[obj2>0]


d    6
b    7
c    3
dtype: int64

In [13]:
obj2*2


d    12
b    14
a   -10
c     6
dtype: int64

In [16]:
np.exp(obj2)


d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [17]:
'b' in obj2

True

In [18]:
'e' in obj2

False

In [19]:
#如果数据被存放在一个python字典中，可以直接通过字典来创建Series
sdata={'ohio':35000, 'texas':71000, 'oregon':16000, 'utah':5000}

In [20]:
obj3=Series(sdata)

In [21]:
obj3


ohio      35000
oregon    16000
texas     71000
utah       5000
dtype: int64

In [22]:
#若是指定Series的index为一个列表，value为一个字典，则Series会自动根据列表中内容匹配字典里的数值
states=['california','ohio','oregon','texas']

In [23]:
obj4=Series(sdata, index=states)

In [24]:
obj4


california        NaN
ohio          35000.0
oregon        16000.0
texas         71000.0
dtype: float64

In [25]:
#pd.isnull和.notnull可用于检测缺失数据
pd.isnull(obj4)


california     True
ohio          False
oregon        False
texas         False
dtype: bool

In [26]:
pd.notnull(obj4)


california    False
ohio           True
oregon         True
texas          True
dtype: bool

In [27]:
obj4.isnull()


california     True
ohio          False
oregon        False
texas         False
dtype: bool

In [28]:
#Series会在计算中自动对齐索引后再进行计算
obj3+obj4


california         NaN
ohio           70000.0
oregon         32000.0
texas         142000.0
utah               NaN
dtype: float64

In [29]:
#Series本身和index都有一个name属性
obj4.name='population'

In [30]:
obj4.index.name='state'

In [31]:
obj4


state
california        NaN
ohio          35000.0
oregon        16000.0
texas         71000.0
Name: population, dtype: float64

In [32]:
#Series的index可以通过赋值的方式直接修改
obj.index=['bob','steve','jeff','ryan']

In [33]:
obj


bob      4
steve    7
jeff    -5
ryan     3
dtype: int64

In [34]:
##DataFrame
#是一个表格型的数据结构，既有行索引，也有列索引，可视为用Series组成的字典
#每列可以是不同的数据类型，有一个有序的列
#DataFrame构建方法1：使用字典构建的ndarray数组
data={'state':['ohio','ohio','ohio','nevada','nevada'],
     'year':['2000','2001','2002','2001','2002'],
     'pop':[1.5,1.7,3.6,2.4,2.9]}

In [35]:
frame=DataFrame(data)

In [36]:
frame

Unnamed: 0,pop,state,year
0,1.5,ohio,2000
1,1.7,ohio,2001
2,3.6,ohio,2002
3,2.4,nevada,2001
4,2.9,nevada,2002


In [37]:
#如果指定了列名的序列，DataFrame会按照指定顺序排列
DataFrame(data,columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,ohio,1.5
1,2001,ohio,1.7
2,2002,ohio,3.6
3,2001,nevada,2.4
4,2002,nevada,2.9


In [39]:
#如果传入的列在数据中找不到，则产生NA值
frame2=DataFrame(data, columns=['year','state','pop','debt'],
                index=['one','two','three','four','five'])

In [40]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,ohio,1.5,
two,2001,ohio,1.7,
three,2002,ohio,3.6,
four,2001,nevada,2.4,
five,2002,nevada,2.9,


In [41]:
frame2.columns

Index([u'year', u'state', u'pop', u'debt'], dtype='object')

In [42]:
frame2.index

Index([u'one', u'two', u'three', u'four', u'five'], dtype='object')

In [43]:
#通过字典标记或属性的方式，可以将DataFrame的列获取成为一个Series
frame2['state']


one        ohio
two        ohio
three      ohio
four     nevada
five     nevada
Name: state, dtype: object

In [44]:
frame2.year


one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: object

In [45]:
#行也可以通过位置或名称的方式进行获取，形成一个Series，如使用ix属性（索引字段）
frame2.ix['three']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  



year     2002
state    ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [46]:
#.ix已经不再被Python支持，可以使用.loc对索引进行查询
frame2.loc['three']


year     2002
state    ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [85]:
frame2.iloc[3]
#iloc接受使用数字代替索引值进行查询


ohio          6
texas         7
california    8
Name: d, dtype: int32

In [49]:
#列中的空值可以通过赋值进行修改
frame2['debt']=16.5

In [50]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,ohio,1.5,16.5
two,2001,ohio,1.7,16.5
three,2002,ohio,3.6,16.5
four,2001,nevada,2.4,16.5
five,2002,nevada,2.9,16.5


In [51]:
frame2['debt']=np.arange(5.0)

In [52]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,ohio,1.5,0.0
two,2001,ohio,1.7,1.0
three,2002,ohio,3.6,2.0
four,2001,nevada,2.4,3.0
five,2002,nevada,2.9,4.0


In [53]:
#将列表或数组赋值给某个列时，其长度必须跟DataFrame的长度相匹配；
#如果赋值的是一个Series，就会精确匹配DataFrame的索引，所有空位都会被天上缺失值
val=Series([-1.2,-1.5,-1.7],index=['two','four','five'])

In [54]:
frame2['debt']=val

In [55]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,ohio,1.5,
two,2001,ohio,1.7,-1.2
three,2002,ohio,3.6,
four,2001,nevada,2.4,-1.5
five,2002,nevada,2.9,-1.7


In [56]:
#对不存在的列赋值会创建一个新的列
frame2['eastern']=frame2.state=='ohio'

In [57]:
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,ohio,1.5,,True
two,2001,ohio,1.7,-1.2,True
three,2002,ohio,3.6,,True
four,2001,nevada,2.4,-1.5,False
five,2002,nevada,2.9,-1.7,False


In [58]:
#关键字del用于删除列
del frame2['eastern']

In [59]:
frame2.columns

Index([u'year', u'state', u'pop', u'debt'], dtype='object')

In [60]:
#注意：对从DataFrame中返回的任何Series都是源数据的视图，对齐进行任何修改都会反映到源数据上

In [10]:
#DataFrame构建方式2：嵌套字典
#外层字典的key被作为列索引，内层key作为行索引，并且内层key会进行合并
pop={'nevada':{2001:2.4,2002:2.9},
     'ohio':{2000:1.5,2001:1.7,2002:3.6}}

In [11]:
frame3=DataFrame(pop)

In [63]:
frame3

Unnamed: 0,nevada,ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [65]:
#可以对输出的DataFrame制定索引，从而筛选从嵌套字典中提取的值
DataFrame(pop, index=[2001,2002,2003])

Unnamed: 0,nevada,ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [66]:
#DataFrame中的列为columns，行为index，其中值为values
#对DataFrame进行转置，则行和列索引会互换
frame3.T

Unnamed: 0,2000,2001,2002
nevada,,2.4,2.9
ohio,1.5,1.7,3.6


In [67]:
#DataFrame构建方法3：由Series组成的字典
pdata={'ohio':frame3['ohio'][:-1],
      'nevada':frame3['nevada'][:2]}

In [68]:
DataFrame(pdata)

Unnamed: 0,nevada,ohio
2000,,1.5
2001,2.4,1.7


In [12]:
#如果设置了DataFrame的index和columns的name属性，则这些信息也会显示出来
frame3.index.name='year';frame3.columns.name='state'

In [13]:
frame3

state,nevada,ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [72]:
#DataFrame中的values会以二维数组ndarray的形式返回
frame3.values

array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

In [73]:
frame2.values

array([['2000', 'ohio', 1.5, nan],
       ['2001', 'ohio', 1.7, -1.2],
       ['2002', 'ohio', 3.6, nan],
       ['2001', 'nevada', 2.4, -1.5],
       ['2002', 'nevada', 2.9, -1.7]], dtype=object)

In [74]:
##索引对象
#！！！pandas中的索引一旦被定义，就不可更改
#这样才能使得index对象在多个数据结构之间安全共享
obj=Series(range(3),index=['a','b','c'])

In [75]:
indexs=obj.index

In [76]:
indexs

Index([u'a', u'b', u'c'], dtype='object')

In [78]:
indexs[1:]

Index([u'b', u'c'], dtype='object')

In [79]:
indexs[1]='d'
#index不可修改

TypeError: Index does not support mutable operations

In [7]:
indexs=pd.Index(np.arange(3))
#pd中的Index，I必须大写

In [8]:
obj2=Series([1.5,-2.5,0],index=indexs)

In [9]:
obj2.index is indexs

True

In [14]:
#Index的功能类似一个固定大小的集合
frame3

state,nevada,ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [15]:
'ohio' in frame3.columns

True

In [16]:
2003 in frame3.index

False

In [17]:
#index的方法和属性列表，详见onenote笔记

In [18]:
###基本功能
##重新索引
#.reindex是创建一个适应新索引的新对象，而不是直接在源对象上修改
obj=Series([4.5,7.2,-5.3,3.6],index=['d','b','a','c'])

In [19]:
obj


d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [21]:
#通过调用.reindex，为新对象生成重新排列的索引，若索引值不存在，则显示为null
obj2=obj.reindex(['a','b','c','d','e'])

In [22]:
obj2


a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [23]:
obj.reindex(['a','b','c','d','e'],fill_value=0)


a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [24]:
#使用reindex中的method选项，可以对索引值缺失的部分进行填充
#适合时间序列中的插值处理
obj3=Series(['blue','purple','yellow'],index=[0,2,4])

In [25]:
obj3.reindex(range(6),method='ffill')
#method选项：
#ffill：正序填充
#bfill：逆向填充


0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [26]:
#对于DataFrame而言，reindex可以修改行索引、列索引或两个都改
#如果之传入一个序列，则默认修改行索引
frame=DataFrame(np.arange(9).reshape((3,3)),index=['a','c','d'],
                columns=['ohio','texas','california'])

In [27]:
frame

Unnamed: 0,ohio,texas,california
a,0,1,2
c,3,4,5
d,6,7,8


In [28]:
frame2=frame.reindex(['a','b','c','d'])

In [29]:
frame2

Unnamed: 0,ohio,texas,california
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [30]:
#使用columns关键字可以重新重新索引列
states=['texas','utah','california']

In [31]:
frame.reindex(columns=states)

Unnamed: 0,texas,utah,california
a,1,,2
c,4,,5
d,7,,8


In [36]:
#可以同时对行和列进行重新索引，而插值只能按行应用
frame.reindex(index=['a','b','c','d'], method='ffill')

Unnamed: 0,ohio,texas,california
a,0,1,2
b,0,1,2
c,3,4,5
d,6,7,8


In [37]:
frame.reindex(columns=states)

Unnamed: 0,texas,utah,california
a,1,,2
c,4,,5
d,7,,8


In [40]:
#同时修改行和列索引好像并不支持，pandas推荐行或列索引的单独使用，避免数据重复
frame.reindex(index=['a','b','c','d'], columns=states,method='ffill')

ValueError: index must be monotonic increasing or decreasing

In [41]:
frame2=frame.reindex(index=['a','b','c','d'], method='ffill')

In [42]:
frame2.reindex(columns=states)
#通过复制一下中间过程，将不能同时执行的一个步骤，拆成两个步骤就可以得到想要的结果

Unnamed: 0,texas,utah,california
a,1,,2
b,1,,2
c,4,,5
d,7,,8


In [39]:
#使用.loc()函数功能，可以对行和列索引进行同时修改，但是不能进行缺失值填充
frame.loc[['a','b','c','d'],states]

Unnamed: 0,texas,utah,california
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


In [43]:
#reindex属性的参数说明，详见onenote笔记

In [44]:
##丢弃指定轴上的项，使用.drop()函数
obj=Series(np.arange(5.0),index=['a','b','c','d','e'])

In [45]:
new_obj=obj.drop('c')

In [46]:
new_obj


a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [47]:
obj.drop(['d','c'])


a    0.0
b    1.0
e    4.0
dtype: float64

In [48]:
#对于DataFrame可以删除任意轴上的某个索引值
data=DataFrame(np.arange(16).reshape((4,4)),
              index=['ohio','colorado','utah','new york'],
              columns=['one','two','three','four'])

In [49]:
data.drop(['colorado','ohio'])
#默认是丢弃行索引上的值

Unnamed: 0,one,two,three,four
utah,8,9,10,11
new york,12,13,14,15


In [50]:
#若是要丢弃列索引上的值，需要指定axis=1，否则会出错
data.drop('two',axis=1)

Unnamed: 0,one,three,four
ohio,0,2,3
colorado,4,6,7
utah,8,10,11
new york,12,14,15


In [51]:
data.drop(['two','four'])

ValueError: labels ['two' 'four'] not contained in axis

In [52]:
##索引、选取和过滤
#Series和DataFrame的索引很像ndarray数组的索引，只是他们的索引不再只有数字
obj=Series(np.arange(4.0),index=['a','b','c','d'])

In [53]:
obj['b']

1.0

In [54]:
obj[1]

1.0

In [55]:
obj[2:4]


c    2.0
d    3.0
dtype: float64

In [56]:
obj[['b','a','d']]


b    1.0
a    0.0
d    3.0
dtype: float64

In [57]:
obj[[1,3]]


b    1.0
d    3.0
dtype: float64

In [58]:
obj[obj<2]


a    0.0
b    1.0
dtype: float64

In [59]:
#注意：利用标签进行切片运算，与数字切片不同，标签的末端被包含在内，即全闭区间
obj['b':'c']


b    1.0
c    2.0
dtype: float64

In [60]:
obj['b':'c']=5

In [61]:
obj


a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [62]:
#DataFrame上的索引更加灵活
data=DataFrame(np.arange(16).reshape((4,4)),
              index=['ohio','colorado','utah','new york'],
              columns=['one','two','three','four'])

In [63]:
data

Unnamed: 0,one,two,three,four
ohio,0,1,2,3
colorado,4,5,6,7
utah,8,9,10,11
new york,12,13,14,15


In [64]:
data['two']


ohio         1
colorado     5
utah         9
new york    13
Name: two, dtype: int32

In [65]:
data[['three','one']]

Unnamed: 0,three,one
ohio,2,0
colorado,6,4
utah,10,8
new york,14,12


In [66]:
#因此可以使用切片或布尔型数组对DataFrame选取行、行索引，即index的选取
data[:2]
#dataframe不能像ndarray一样，使用[:2,1:]来选择表中的区域，只能选择轴0

Unnamed: 0,one,two,three,four
ohio,0,1,2,3
colorado,4,5,6,7


In [74]:
data[data['three']>5]

Unnamed: 0,one,two,three,four
colorado,4,5,6,7
utah,8,9,10,11
new york,12,13,14,15


In [75]:
data<5

Unnamed: 0,one,two,three,four
ohio,True,True,True,True
colorado,True,False,False,False
utah,False,False,False,False
new york,False,False,False,False


In [76]:
data[data<5]=0

In [77]:
data

Unnamed: 0,one,two,three,four
ohio,0,0,0,0
colorado,0,5,6,7
utah,8,9,10,11
new york,12,13,14,15


In [78]:
#使用.loc对DataFrame的列索引进行选择
data.loc['colorado',['two','three']]


two      5
three    6
Name: colorado, dtype: int32

In [81]:
data.ix[['colorado','utah'],[3,0,1]]
#.ix可以混合使用索引值或数字对DataFrame进行选择，但现在已经不推荐使用

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  """Entry point for launching an IPython kernel.


Unnamed: 0,four,one,two
colorado,7,0,5
utah,11,8,9


In [83]:
#loc用于指定名称的index和columns选择
#iloc用于使用数字代替index和columns的值，对DataFrame进行选择
#二者均支持布尔型索引值的使用
data.iloc[:,3]


ohio         0
colorado     7
utah        11
new york    15
Name: four, dtype: int32

In [84]:
data.iloc[[1,2],[3,0,1]]

Unnamed: 0,four,one,two
colorado,7,0,5
utah,11,8,9


In [86]:
data.loc[:'utah','two']


ohio        0
colorado    5
utah        9
Name: two, dtype: int32

In [89]:
data.iloc[data.three>5,:3]

ValueError: iLocation based boolean indexing cannot use an indexable as a mask

In [88]:
data.loc[data.three>5,:'three']

Unnamed: 0,one,two,three
colorado,0,5,6
utah,8,9,10
new york,12,13,14


In [90]:
##算术运算和数据对齐
#当两个对象相加时，会将索引对齐后再相加，其他运算也是一样
#如果存在不同索引值，则不相同的索引被计算为NaN，即null
s1=Series([7.3,-2.5,3.4,1.5],index=['a','c','d','e'])

In [91]:
s2=Series([-2.1,3.6,-1.5,4,3.1],index=['a','c','e','f','g'])

In [92]:
s1


a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [93]:
s2


a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [94]:
s1+s2


a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [95]:
#缺失值会在算术运算过程中传播
#对于DataFrame而言，对齐操作会同时发生在行和列上
df1=DataFrame(np.arange(9).reshape((3,3)),columns=list('bcd'),
              index=['ohio','texas','colorado'])

In [96]:
df2=DataFrame(np.arange(12).reshape((4,3)),columns=list('bde'),
             index=['utah','ohio','texas','oregon'])

In [97]:
df1

Unnamed: 0,b,c,d
ohio,0,1,2
texas,3,4,5
colorado,6,7,8


In [98]:
df2

Unnamed: 0,b,d,e
utah,0,1,2
ohio,3,4,5
texas,6,7,8
oregon,9,10,11


In [99]:
#二者相加后，返回的DataFrame包含的行和列，是原来两个DataFrame的并集：
df1+df2

Unnamed: 0,b,c,d,e
colorado,,,,
ohio,3.0,,6.0,
oregon,,,,
texas,9.0,,12.0,
utah,,,,


In [106]:
##在算数方法中填充值
df3=DataFrame(np.arange(12.0).reshape((3,4)),columns=list('abcd'))

In [107]:
df4=DataFrame(np.arange(20.0).reshape((4,5)),columns=list('abcde'))

In [108]:
df3

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [109]:
df4

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [110]:
df3+df4

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [111]:
#使用df3的add函数，传入df4和一个fill_value参数
df3.add(df4, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [112]:
df4.add(df3, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [113]:
#即使两个对象调换位置，使用函数得到的结果也是一样的
#所以说，fill_value是将两个对象的索引补充完整（缺失填写0），之后再对二者相加
#跟多的算术方法还有：add加法，sub减法，div除法，mul乘法
df3.mul(df4,fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,4.0,9.0,0.0
1,20.0,30.0,42.0,56.0,0.0
2,80.0,99.0,120.0,143.0,0.0
3,0.0,0.0,0.0,0.0,0.0


In [114]:
#注意：这些计算只是元素级的计算，不是矩阵运算

In [115]:
##DataFrame和Series之间的运算
#类似二位数组和一维数组间的的运算
arr=np.arange(12.).reshape((3,4))

In [116]:
arr

array([[  0.,   1.,   2.,   3.],
       [  4.,   5.,   6.,   7.],
       [  8.,   9.,  10.,  11.]])

In [117]:
arr[0]

array([ 0.,  1.,  2.,  3.])

In [118]:
arr-arr[0]

array([[ 0.,  0.,  0.,  0.],
       [ 4.,  4.,  4.,  4.],
       [ 8.,  8.,  8.,  8.]])

In [119]:
#这种广播的计算效果，DataFrame和Series之间的运算也是类似的
fra=DataFrame(np.arange(12.0).reshape((4,3)), columns=list('bde'),
             index=['utah','ohio','texas','oregon'])

In [120]:
ser=fra.iloc[0]

In [121]:
fra

Unnamed: 0,b,d,e
utah,0.0,1.0,2.0
ohio,3.0,4.0,5.0
texas,6.0,7.0,8.0
oregon,9.0,10.0,11.0


In [122]:
ser


b    0.0
d    1.0
e    2.0
Name: utah, dtype: float64

In [123]:
#默认情况下，会将Series的索引匹配到DataFrame的列上，然后沿着行一直向下广播
fra-ser

Unnamed: 0,b,d,e
utah,0.0,0.0,0.0
ohio,3.0,3.0,3.0
texas,6.0,6.0,6.0
oregon,9.0,9.0,9.0


In [125]:
#如果某个索引值在DataFrame的列或Series的索引中找不到，
#则ataFrame的列索引就和eries的索引形成并集，之后再进行计算
ser2=Series(range(3), index=list('bef'))

In [126]:
fra+ser2

Unnamed: 0,b,d,e,f
utah,0.0,,3.0,
ohio,3.0,,6.0,
texas,6.0,,9.0,
oregon,9.0,,12.0,


In [127]:
#如果希望匹配DataFrame的行，并在列上推广，则必须使用算术运算关键字
ser3=fra['d']

In [128]:
ser3


utah       1.0
ohio       4.0
texas      7.0
oregon    10.0
Name: d, dtype: float64

In [130]:
fra.sub(ser3, axis=0)

Unnamed: 0,b,d,e
utah,-1.0,0.0,1.0
ohio,-1.0,0.0,1.0
texas,-1.0,0.0,1.0
oregon,-1.0,0.0,1.0


In [7]:
##函数应用和映射
#np的ufunc也可用于操作pd对象，也同样是元素级运算
#注释：我这里用dfr替代了书中的frame
dfr=DataFrame(np.random.randn(4,3), columns=list('bde'),
             index=['utah','ohio','texas','oregon'])

In [133]:
dfr

Unnamed: 0,b,d,e
utah,-0.09228,-0.494399,0.564879
ohio,-0.566703,-0.971769,1.466164
texas,1.683495,-1.273882,-1.492663
oregon,-0.188113,-0.834062,-0.373611


In [134]:
np.abs(dfr)

Unnamed: 0,b,d,e
utah,0.09228,0.494399,0.564879
ohio,0.566703,0.971769,1.466164
texas,1.683495,1.273882,1.492663
oregon,0.188113,0.834062,0.373611


In [136]:
#一个常见操作是，将函数应用到由各列或行所形成的的一维数组上
#使用DataFrame的apply函数
#先定义f函数
f=lambda x: x.max() - x.min()

In [137]:
f

<function __main__.<lambda>>

In [5]:
dfr2=DataFrame(np.arange(12.0).reshape((4,3)), columns=list('bde'),
             index=['utah','ohio','texas','oregon'])

In [139]:
dfr2

Unnamed: 0,b,d,e
utah,0.0,1.0,2.0
ohio,3.0,4.0,5.0
texas,6.0,7.0,8.0
oregon,9.0,10.0,11.0


In [140]:
#将f函数应用到dfr2的行上
dfr2.apply(f)


b    9.0
d    9.0
e    9.0
dtype: float64

In [141]:
#将f函数应用大到dfr2的列上
dfr2.apply(f, axis=1)


utah      2.0
ohio      2.0
texas     2.0
oregon    2.0
dtype: float64

In [142]:
#除标量值外，还可以将Series传递给apply使其广播到各行各列上
def f(x):
    return Series([x.min(), x.max()], index=['min','max'])

In [143]:
dfr2.apply(f)

Unnamed: 0,b,d,e
min,0.0,1.0,2.0
max,9.0,10.0,11.0


In [3]:
#使用元素级pythong函数
#得到dfr中各个浮点值的格式化字符串，使用applymap——即按固定格式输出结果
format=lambda x: '%.2f' % x

In [10]:
dfr

Unnamed: 0,b,d,e
utah,1.577524,-0.27684,-1.128216
ohio,0.156235,1.776612,0.18871
texas,1.075503,-0.577456,0.253957
oregon,0.471491,-0.42654,0.052417


In [8]:
dfr.applymap(format)

Unnamed: 0,b,d,e
utah,1.58,-0.28,-1.13
ohio,0.16,1.78,0.19
texas,1.08,-0.58,0.25
oregon,0.47,-0.43,0.05


In [9]:
#同理，Series有一个应用于元素级函数的map方法
dfr['e'].map(format)


utah      -1.13
ohio       0.19
texas      0.25
oregon     0.05
Name: e, dtype: object

In [11]:
##排序和排名
#对行或列索引进行排序，使用sort_index方法，返回一个已排序的新对象
obj=Series(range(4), index=['d','a','b','c'])

In [12]:
obj.sort_index()


a    1
b    2
c    3
d    0
dtype: int64

In [13]:
#对于DataFrame，可以根据任意轴上的索引进行排序
frame=DataFrame(np.arange(8).reshape((2,4)), index=['three','one'],
               columns=['d','a','b','c'])

In [14]:
frame.sort_index()
#默认针对轴0进行排序

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [15]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [16]:
#若想让结果按降序排列，则需要改变ascending参数的属性为False
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [17]:
#若要对元素值进行排序，则使用order方法
obj=Series([4,7,-3,2])

In [18]:
obj.order()
#order已经被停用，使用sort_values对元素进行排序

AttributeError: 'Series' object has no attribute 'order'

In [19]:
obj.sort_values()
#对于Series而言，只有一列，sort_values不需要设置任何属性可以直接使用


2   -3
3    2
0    4
1    7
dtype: int64

In [43]:
#排序时，任何缺失值都会被放到Series的末尾
obj=Series([4, np.nan, 7, np.nan, -3, 2])

In [44]:
obj.sort_values()


4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [40]:
#对于DataFrame，使用sort_values时需要指定排序的行或列名称，by属性不能省略
#其他属性和sort_index类似
frame.sort_values(by='three',axis=1)

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [42]:
frame.sort_values(by='c',ascending=False)

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [45]:
#在DataFrame中，若要使用多个列进行排序，则直接将过个列名传递给sort_values的by属性
#sort_index的by属性已经被停止使用
frame=DataFrame({'b':[4,7,-3,2], 'a':[0,1,0,1]})

In [48]:
frame.sort_values(by=['a','b'])

Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


In [49]:
#sort_index对标签进行排序
#sort_values对元素值进行排序

In [50]:
#rank()方法:给定源数据的排序顺序排名，
#对于值相等的元素，rank给各元素都分配一个平均排名
obj=Series([7,-5,7,4,2,0,4])

In [51]:
obj.rank()


0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [52]:
#rank有method属性：
#average表示根据元素大小排序后，给出各元素出现的顺序；
#first表示根据序列中元素在源数据中出现的顺序，给出出现位置排序；
obj.rank(method='first')


0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [53]:
#也可以按照降序进行排名
#rank函数的具体用法详见pandas网上的说明文档
obj.rank(method='max', ascending=False)


0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [54]:
#DataFrame可以再行或列上计算排名
frame=DataFrame({'b':[4.3,7,-3,2],'a':[0,1,0,1],
                'c':[-2,5,8,-2.5]})

In [55]:
frame

Unnamed: 0,a,b,c
0,0,4.3,-2.0
1,1,7.0,5.0
2,0,-3.0,8.0
3,1,2.0,-2.5


In [56]:
frame.rank(axis=1)

Unnamed: 0,a,b,c
0,2.0,3.0,1.0
1,1.0,3.0,2.0
2,2.0,1.0,3.0
3,2.0,3.0,1.0


In [57]:
##带有重复值的轴索引
#并不是所有pd对象的索引都满足标签唯一性，很多索引值会有重复的情况
obj=Series(range(5),index=['a','a','b','b','c'])

In [58]:
obj


a    0
a    1
b    2
b    3
c    4
dtype: int64

In [59]:
#使用索引的is_unique属性，可以帮助我们判断索引值是否唯一
obj.is_unique

True

In [60]:
obj.index.is_unique

False

In [61]:
#当索引值不唯一时，数据选取会返回多个值
obj['a']


a    0
a    1
dtype: int64

In [62]:
obj['c']

4

In [63]:
#对DataFrame也是一样
df=DataFrame(np.random.randn(4,3), index=['a','a','b','b'])

In [64]:
df

Unnamed: 0,0,1,2
a,0.466094,-1.396969,-0.80921
a,1.445572,-1.004366,0.926826
b,-0.674764,-0.139745,-0.053678
b,0.539933,0.712559,-0.144517


In [65]:
df.loc['b']

Unnamed: 0,0,1,2
b,-0.674764,-0.139745,-0.053678
b,0.539933,0.712559,-0.144517


In [66]:
###汇总和计算描述性统计
df=DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],
            index=['a','b','c','d'], columns=['one','two'])

In [67]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [68]:
#使用sum()计算汇总，默认基于轴0即列汇总
df.sum()


one    9.25
two   -5.80
dtype: float64

In [69]:
df.sum(axis=1)
#空值被自动排除，除非整行或整列都是null


a    1.40
b    2.60
c     NaN
d   -0.55
dtype: float64

In [70]:
#通过skipna选项可以禁用跳过空值功能,这时候包含空值的行或列则不会计算结果
df.mean(axis=1,skipna=False)


a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [71]:
#间接统计
#idxmax：返回达到最大值的索引
#idxmin：返回达到最小值的索引
df.idxmax()


one    b
two    d
dtype: object

In [72]:
#累计计算cumsum()，会避开空值
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [73]:
#describe可以产生基于数字或字符串的多个汇总统计
#describe只能计算列的统计量，即不能沿着轴1进行横向统计
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [74]:
df.describe(axis=1)

TypeError: describe() got an unexpected keyword argument 'axis'

In [75]:
obj=Series(['a','a','b','c']*4)

In [76]:
obj.describe()


count     16
unique     3
top        a
freq       8
dtype: object

In [77]:
#更多的描述性统计函数，详见onenote笔记

In [78]:
##相关系数和协方差
#看下来自Yahoo！finance的股票价格和成交量
import pandas.io.data as web
#pandas.io库已经更换为pandas_datareader，需要安装pandas_datareader才可以使用
#安装中出现问题，暂时无法解决

ImportError: The pandas.io.data module is moved to a separate package (pandas-datareader). After installing the pandas-datareader package (https://github.com/pydata/pandas-datareader), you can change the import ``from pandas.io import data, wb`` to ``from pandas_datareader import data, wb``.

In [3]:
import pandas_datareader as pdr
from pandas_datareader import data, wb
#待安装成功后，在使用这个库

ImportError: No module named pandas_datareader

In [None]:
all_data={}
for ticker in ['AAPL','IBM','MSFT','GOOG']:
    all_data[ticker]=web.get_data_yahoo(ticker,'1/1/2000','1/1/2010')

In [None]:
price=DataFrame({tic:data['Adj Close']
                for tic, data in all_data.iteritems()})

In [None]:
volume=DataFrame({tic:data['Volume']
                for tic, data in all_data.iteritems()})

In [None]:
returns=price.pct_change()

In [None]:
returns.tail()

In [4]:
#使用corr()来计算相关系数，使用cov()来计算协方差
#Series中，corr用于计算两个Series中重叠的、非null的、按索引对齐的值的相关系数
#Series中，cov的使用规则类似
returns.MSFT.corr(returns.IBM)

In [None]:
returns.MSFT.cov(returns.IBM)

In [5]:
#DataFrame中，corr和cov将以DataFrame的形式返回完整的相关系数矩阵，和协方差矩阵
returns.corr()

In [None]:
returns.cov()

In [6]:
#使用corrwith可以计算DataFrame与另一个pd对象之间的相关系数
#传入一个Series，返回一个相关系数的Series
#传入一个DataFrame，返回一个按列名配对的相关系数，传入axis=1可以按行配对计算相关系统
returns.corrwith(returns.IBM)

In [None]:
returns.corrwith(volume)

In [3]:
##唯一值、值计数以及成员资格
obj=Series(['c','a','d','a','a','b','b','c','c'])

In [4]:
#使用unique可以得到Series中的唯一数组
#为了便于查看，结果是按值的频率降序排列的
uniques=obj.unique()

In [5]:
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [6]:
#使用value_counts来计算数组中各个值出现的次数
#values_counts是个pd的顶级方法，适用于任何数组和序列
pd.value_counts(obj.values, sort=False)


a    3
c    3
b    2
d    1
dtype: int64

In [7]:
#isin用于判断矢量化集合的成员资格，可用于选取Series和DataFrame列中数据的子集
#isin()的含义是，判断pd对象的每个值是否包含在isin后面的数据集中
mask=obj.isin(['b','c'])

In [8]:
mask


0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [9]:
obj[mask]


0    c
5    b
6    b
7    c
8    c
dtype: object

In [10]:
#value_counts会统计DataFrame的值在纵向列上出现的次数
data=DataFrame({'qu1':[1,3,4,3,4],
               'qu2':[2,3,1,2,3],
               'qu3':[1,5,2,4,4]})

In [11]:
data

Unnamed: 0,qu1,qu2,qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [12]:
#将value_counts传给DataFrame的appply函数即可计算
result=data.apply(pd.value_counts).fillna(0)

In [13]:
result

Unnamed: 0,qu1,qu2,qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


In [14]:
###处理缺失数据
#pd使用浮点值NaN来表示浮点和非浮点数组中的缺失数据
string_data=Series(['aardvark','artichoke',np.nan,'avocado'])

In [15]:
string_data


0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [16]:
string_data.isnull()


0    False
1    False
2     True
3    False
dtype: bool

In [17]:
#python内置的none也会被当做NA处理
string_data[0]=None

In [18]:
string_data.isnull()


0     True
1    False
2     True
3    False
dtype: bool

In [19]:
##过滤缺失的数据:
#使用dropna()去掉空值
from numpy import nan as NA

In [20]:
data=Series([1,NA,3.5,NA,7])

In [21]:
data.dropna()


0    1.0
2    3.5
4    7.0
dtype: float64

In [22]:
#使用布尔型索引的效果类似
data[data.notnull()]


0    1.0
2    3.5
4    7.0
dtype: float64

In [25]:
#在DataFrame中，dropna()默认丢弃所有包含缺失值的行
data=DataFrame([[1.,6.5,3.],[1.,NA,NA],[NA,NA,NA],[NA,6.5,3.]])

In [26]:
cleaned=data.dropna()

In [27]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [28]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [29]:
#在dropna()中传入how='all'，将只丢弃全部为null的那些行
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [30]:
#若要丢弃列，传入axis=1即可
data[4]=NA

In [31]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [33]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [34]:
#对于时间序列而言，可以在dropna()中传入thresh属性来保留想要的数据
df=DataFrame(np.random.randn(7,3))

In [35]:
df.iloc[:4,1]=NA; df.iloc[:2,2]=NA

In [36]:
df

Unnamed: 0,0,1,2
0,1.272031,,
1,-0.544541,,
2,-0.782863,,-0.116558
3,-1.786827,,-0.46356
4,-0.069932,0.379649,0.626783
5,0.166327,-0.336019,-1.183878
6,1.030294,0.94425,-2.059563


In [37]:
df.dropna(thresh=3)

Unnamed: 0,0,1,2
4,-0.069932,0.379649,0.626783
5,0.166327,-0.336019,-1.183878
6,1.030294,0.94425,-2.059563


In [38]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-0.782863,,-0.116558
3,-1.786827,,-0.46356
4,-0.069932,0.379649,0.626783
5,0.166327,-0.336019,-1.183878
6,1.030294,0.94425,-2.059563


In [39]:
##填充缺失的数据
#使用fillna()函数进行空值的替换
df.fillna(0)

Unnamed: 0,0,1,2
0,1.272031,0.0,0.0
1,-0.544541,0.0,0.0
2,-0.782863,0.0,-0.116558
3,-1.786827,0.0,-0.46356
4,-0.069932,0.379649,0.626783
5,0.166327,-0.336019,-1.183878
6,1.030294,0.94425,-2.059563


In [40]:
#在fillna中填充一个字典，则可以实现对不同的列填充不同的值
df.fillna({1:0.5, 3:-1})

Unnamed: 0,0,1,2
0,1.272031,0.5,
1,-0.544541,0.5,
2,-0.782863,0.5,-0.116558
3,-1.786827,0.5,-0.46356
4,-0.069932,0.379649,0.626783
5,0.166327,-0.336019,-1.183878
6,1.030294,0.94425,-2.059563


In [41]:
df.fillna({1:0.5, 2:-1})

Unnamed: 0,0,1,2
0,1.272031,0.5,-1.0
1,-0.544541,0.5,-1.0
2,-0.782863,0.5,-0.116558
3,-1.786827,0.5,-0.46356
4,-0.069932,0.379649,0.626783
5,0.166327,-0.336019,-1.183878
6,1.030294,0.94425,-2.059563


In [42]:
#pd对象的fillna函数并不会直接对源数据进行修改，只是返回新的对象
#若要对源数据进行修改，可以传入inplace=True参数
_ =df.fillna(0,inplace=True)

In [43]:
df

Unnamed: 0,0,1,2
0,1.272031,0.0,0.0
1,-0.544541,0.0,0.0
2,-0.782863,0.0,-0.116558
3,-1.786827,0.0,-0.46356
4,-0.069932,0.379649,0.626783
5,0.166327,-0.336019,-1.183878
6,1.030294,0.94425,-2.059563


In [44]:
#对reindex有效的参数属性，同样适用于fillna
df=DataFrame(np.random.randn(6,3))

In [45]:
df.iloc[2:,1]=NA; df.iloc[4:,2]=NA

In [46]:
df

Unnamed: 0,0,1,2
0,1.834089,0.525635,-0.534768
1,-0.291716,-0.072689,0.828943
2,3.134806,,-1.572573
3,0.324685,,0.429933
4,0.885251,,
5,-2.146911,,


In [47]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,1.834089,0.525635,-0.534768
1,-0.291716,-0.072689,0.828943
2,3.134806,-0.072689,-1.572573
3,0.324685,-0.072689,0.429933
4,0.885251,-0.072689,0.429933
5,-2.146911,-0.072689,0.429933


In [48]:
df.fillna(method='ffill', limit=1)

Unnamed: 0,0,1,2
0,1.834089,0.525635,-0.534768
1,-0.291716,-0.072689,0.828943
2,3.134806,-0.072689,-1.572573
3,0.324685,,0.429933
4,0.885251,,0.429933
5,-2.146911,,


In [49]:
#可以使用fillna实现许多别的功能
data=Series([1.,NA,3.5,NA,7.])

In [50]:
data.fillna(data.mean())


0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [51]:
#fill的参数详见onenote笔记

In [52]:
###层次化索引
#这是pd的一个重要功能，使得pd对象在一个轴上能够有多个索引级别，从而帮助我们一低纬度形式处理高纬度数据
#先来看看Series的例子
data=Series(np.random.randn(10),
           index=[['a','a','a','b','b','b','c','c','d','d'],
                 [1,2,3,1,2,3,1,2,2,3]])

In [53]:
data


a  1   -1.383301
   2   -1.227107
   3   -0.764084
b  1    1.200556
   2    0.643961
   3   -0.167832
c  1   -0.504376
   2   -0.165049
d  2    0.130386
   3    1.141251
dtype: float64

In [54]:
data.index

MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [55]:
#由此选择数据子集变得很简单
data['b']


1    1.200556
2    0.643961
3   -0.167832
dtype: float64

In [56]:
data['b':'c']


b  1    1.200556
   2    0.643961
   3   -0.167832
c  1   -0.504376
   2   -0.165049
dtype: float64

In [58]:
data.loc[['b','d']]


b  1    1.200556
   2    0.643961
   3   -0.167832
d  2    0.130386
   3    1.141251
dtype: float64

In [59]:
#也可以在内层索引中进行子集选取
data[:,2]


a   -1.227107
b    0.643961
c   -0.165049
d    0.130386
dtype: float64

In [60]:
#可以使用unstack和stack方法做多层Series和DataFrame的转换
#unstack和stack互为逆运算
data.unstack()

Unnamed: 0,1,2,3
a,-1.383301,-1.227107,-0.764084
b,1.200556,0.643961,-0.167832
c,-0.504376,-0.165049,
d,,0.130386,1.141251


In [61]:
data.unstack().stack()


a  1   -1.383301
   2   -1.227107
   3   -0.764084
b  1    1.200556
   2    0.643961
   3   -0.167832
c  1   -0.504376
   2   -0.165049
d  2    0.130386
   3    1.141251
dtype: float64

In [62]:
#对于DataFrame每个轴都可以有分层索引
#（好像默认是写在前面的索引是外层索引）
frame=DataFrame(np.arange(12).reshape((4,3)),
               index=[['a','a','b','b'],[1,2,1,2]],
               columns=[['ohio','ohio','colorado'],['green','red','green']])

In [63]:
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,ohio,ohio,colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,green,red,green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [64]:
#每一层索引都可以命名
frame.index.names=['key1','key2']

In [65]:
frame.columns.names=['state','color']

In [66]:
frame

Unnamed: 0_level_0,state,ohio,ohio,colorado
Unnamed: 0_level_1,color,green,red,green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [67]:
#多层索引DataFrame做子集选取也更方面
frame['ohio']

Unnamed: 0_level_0,color,green,red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [69]:
#可以单独创建multiindex，然后复用
from pandas import MultiIndex
MultiIndex.from_arrays([['ohio','ohio','colorado'],['green','red','green']],
                       names=['state','color'])

MultiIndex(levels=[[u'colorado', u'ohio'], [u'green', u'red']],
           labels=[[1, 1, 0], [0, 1, 0]],
           names=[u'state', u'color'])

In [70]:
##重排分级顺序
#使用swaplevel，接受两个索引的编号或名称，并返回一个互换了的新对象
frame.swaplevel('key1','key2')

Unnamed: 0_level_0,state,ohio,ohio,colorado
Unnamed: 0_level_1,color,green,red,green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [71]:
#使用sor_index对某一索引中的值进行排序
#sortlevel已经被停用
frame.sortlevel(1)

  


Unnamed: 0_level_0,state,ohio,ohio,colorado
Unnamed: 0_level_1,color,green,red,green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [74]:
frame.sort_index(level='key2')

Unnamed: 0_level_0,state,ohio,ohio,colorado
Unnamed: 0_level_1,color,green,red,green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [77]:
frame.sort_index(level=1)
#由此可以看出层次化索引中，索引的顺序是沿着轴的顺序依次从外到内排序的
#没有axis属性时，默认为axis=0

Unnamed: 0_level_0,state,ohio,ohio,colorado
Unnamed: 0_level_1,color,green,red,green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [79]:
frame.sort_index(axis=1,level=1)

Unnamed: 0_level_0,state,colorado,ohio,ohio
Unnamed: 0_level_1,color,green,green,red
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,2,0,1
a,2,5,3,4
b,1,8,6,7
b,2,11,9,10


In [76]:
frame.swaplevel(0,1).sort_index(0)

Unnamed: 0_level_0,state,ohio,ohio,colorado
Unnamed: 0_level_1,color,green,red,green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [80]:
##根据级别汇总统计
#使用汇总函数中的level属性
frame.sum(level='key2')

state,ohio,ohio,colorado
color,green,red,green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [82]:
frame.sum(axis=1, level='color')
#这其实是使用了pd的groupby功能

Unnamed: 0_level_0,color,green,red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


In [83]:
##使用DataFrame的列
frame=DataFrame({'a':range(7),
                 'b':range(7,0,-1),
                 'c':['one','one','one','two','two','two','two'],
                'd':[0,1,2,0,1,2,3]})

In [84]:
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [85]:
#使用set_index将一个或多个列转为行索引
#使用reset_index将行索引转到列中
frame2=frame.set_index(['c','d'])

In [86]:
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [87]:
#被转换为行索引的列默认会被移除，可以设置drop=Frlse来将其保留下来
frame.set_index(['c','d'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [88]:
frame2.reset_index()
#若reset_index()中没有设置任何参数，则将所有行索引都转为列

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


In [90]:
###其他有关pandas的话题
##整数索引
ser=Series(np.arange(3.))

In [92]:
ser


0    0.0
1    1.0
2    2.0
dtype: float64

In [95]:
ser[-1]
#因为ser的索引是整数徐索引，所以不能使用反向顺序符号

KeyError: -1L

In [96]:
#而对于非整数索引，就可以使用反向顺序符号
ser2=Series(np.arange(3.), index=['a','b','c'])

In [97]:
ser2[-1]

2.0

In [98]:
#当时用整数索引时，ix()所使用的数字切片会直接将切片位置对应到整数索引上，引起闭区间切片
ser.ix[:1]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  



0    0.0
1    1.0
dtype: float64

In [99]:
#所以现在都是用：iat,at,iloc,loc来对数据进行选择
#iget_value；irow；icol都已经停止使用
ser3=Series(range(3), index=[-5,1,3])

In [103]:
ser3


-5    0
 1    1
 3    2
dtype: int64

In [107]:
ser3.iloc[2]

2

In [108]:
ser3.iat[2]

2

In [109]:
frame=DataFrame(np.arange(6).reshape((3,2)), index=[2,0,1])

In [110]:
frame

Unnamed: 0,0,1
2,0,1
0,2,3
1,4,5


In [112]:
frame.iloc[0]


0    0
1    1
Name: 2, dtype: int32

In [113]:
frame.iloc[2]


0    4
1    5
Name: 1, dtype: int32

In [114]:
##面板数据
#pd中包含一个panel数据结构，可视为三维的DataFrame或三维的ndarray
#可以使用一个由DataFrame组成的字典或一个三维ndarray来创建Panel
import pandas.io.data as web

ImportError: The pandas.io.data module is moved to a separate package (pandas-datareader). After installing the pandas-datareader package (https://github.com/pydata/pandas-datareader), you can change the import ``from pandas.io import data, wb`` to ``from pandas_datareader import data, wb``.