《[利用Python进行数据分析](https://book.douban.com/subject/25779298/)》读书笔记。
 
 [第7章](/2017/03/10/python_data_analysis7.html)  第2节：重塑和轴向旋转

所有用到的数据可以从[作者的 github](https://github.com/wesm/pydata-book)下载。


In [38]:
%pylab inline
import pandas as pd
from pandas import Series, DataFrame
import json

Populating the interactive namespace from numpy and matplotlib


pandas 提供了许多用于重新排列表格型数据的运算，这些运算称为重塑（reshape）或轴向旋转（pivot）。

# 重塑层次化索引

stack：将数据的列“旋转”为行

unstack：将数据的行“旋转”为列

In [10]:
# Build a 2x3 integer frame; both axes get a name ('state' for the rows,
# 'number' for the columns) so later stack/unstack calls can refer to them.
data = DataFrame(
    data=np.arange(6).reshape(2, 3),
    index=pd.Index(['Ohio', 'Colorado'], name='state'),
    columns=pd.Index(['one', 'two', 'three'], name='number'),
)
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [11]:
# stack() pivots the columns ('number') into the innermost level of the row
# index, turning the frame into a Series with a (state, number) MultiIndex.
result = data.stack()
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int64

In [12]:
# With no argument, unstack() pivots the innermost index level ('number')
# back into columns, which restores the original frame.
result.unstack()

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [16]:
# unstack() works on the innermost index level by default; pass a level
# number or a level name to pivot a different one. The outermost level is 0.

result.unstack(0)

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [17]:
# The level can also be selected by its name instead of its number.
result.unstack('state')

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [18]:
# Two Series whose labels only partly overlap ('c' and 'd' are shared).
s1 = Series([0, 1, 2, 3], index=list('abcd'))
s2 = Series([4, 5, 6], index=list('cde'))
# Concatenate under the outer keys 'one' / 'two' to get a two-level index.
data2 = pd.concat([s1, s2], keys=['one', 'two'])
# Labels absent from one of the groups show up as missing values.
data2.unstack()

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2.0,3.0,
two,,,4.0,5.0,6.0


In [20]:
# stack() filters out missing data by default, so the round trip yields
# only the values that were originally present.
data2.unstack().stack()

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64

In [21]:
# Pass dropna=False to keep the missing values in the stacked result.
# NOTE(review): the `dropna` argument may be deprecated in newer pandas
# releases -- confirm against the version in use.
data2.unstack().stack(dropna=False)

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64

In [22]:
# Two-column frame built from `result`; the column axis is named 'side'.
# When a DataFrame is unstacked, the pivoted level ends up as the lowest
# (innermost) level of the resulting columns.
df = DataFrame(
    data={'left': result, 'right': result + 5},
    columns=pd.Index(['left', 'right'], name='side'),
)
df

Unnamed: 0_level_0,side,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,one,0,5
Ohio,two,1,6
Ohio,three,2,7
Colorado,one,3,8
Colorado,two,4,9
Colorado,three,5,10


In [23]:
# Pivot the 'state' row level into the columns; it becomes the innermost
# column level, below 'side'.
df.unstack('state')

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [24]:
# Then stack the 'side' column level back into the rows: the rows are now
# indexed by (number, side) and the columns by state.
df.unstack('state').stack('side')

Unnamed: 0_level_0,state,Ohio,Colorado
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,0,3
one,right,5,8
two,left,1,4
two,right,6,9
three,left,2,5
three,right,7,10


# pivot: 将“长格式”转换为“宽格式”

In [32]:
# Load the macro data CSV (part of the book's downloadable data set).
data = pd.read_csv('data/ch07/macrodata.csv')
# Combine the year and quarter columns into a PeriodIndex named 'date'.
periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')
# Rebuild the frame keeping only three columns (column axis named 'item'),
# indexed by the timestamp at the end of each period.
data = DataFrame(data.to_records(),
                 columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'),
                 index=periods.to_timestamp('D', 'end'))

# Long format: one row per (date, item) pair, with the measurement in 'value'.
ldata = data.stack().reset_index().rename(columns={0: 'value'})
ldata[:10]

Unnamed: 0,date,item,value
0,1959-03-31,realgdp,2710.349
1,1959-03-31,infl,0.0
2,1959-03-31,unemp,5.8
3,1959-06-30,realgdp,2778.801
4,1959-06-30,infl,2.34
5,1959-06-30,unemp,5.1
6,1959-09-30,realgdp,2775.488
7,1959-09-30,infl,2.74
8,1959-09-30,unemp,5.3
9,1959-12-31,realgdp,2785.204


In [33]:
# Use 'date' as the row labels and 'item' as the column labels, and fill the
# resulting two-dimensional table with 'value'.
# The arguments are passed by keyword: this states which column plays which
# role, and newer pandas releases no longer accept positional arguments to
# pivot().
pivoted = ldata.pivot(index='date', columns='item', values='value')
pivoted.head()

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31,0.0,2710.349,5.8
1959-06-30,2.34,2778.801,5.1
1959-09-30,2.74,2775.488,5.3
1959-12-31,0.27,2785.204,5.6
1960-03-31,2.31,2847.699,5.2


In [34]:
# Add a second value column filled with random normal draws, one per row.
ldata['value2'] = np.random.randn(len(ldata))
ldata[:10]

Unnamed: 0,date,item,value,value2
0,1959-03-31,realgdp,2710.349,-0.535021
1,1959-03-31,infl,0.0,1.783525
2,1959-03-31,unemp,5.8,2.835291
3,1959-06-30,realgdp,2778.801,-1.444524
4,1959-06-30,infl,2.34,1.728538
5,1959-06-30,unemp,5.1,1.100782
6,1959-09-30,realgdp,2775.488,-1.371209
7,1959-09-30,infl,2.74,1.069021
8,1959-09-30,unemp,5.3,-0.658462
9,1959-12-31,realgdp,2785.204,-2.165827


In [35]:
# Without `values`, every remaining column is pivoted and the result gets
# hierarchical columns: (value | value2) on top, item underneath.
# Keyword arguments are used because newer pandas releases no longer accept
# positional arguments to pivot().
pivoted = ldata.pivot(index='date', columns='item')
pivoted[:5]

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-03-31,0.0,2710.349,5.8,1.783525,-0.535021,2.835291
1959-06-30,2.34,2778.801,5.1,1.728538,-1.444524,1.100782
1959-09-30,2.74,2775.488,5.3,1.069021,-1.371209,-0.658462
1959-12-31,0.27,2785.204,5.6,-0.322622,-2.165827,-1.525572
1960-03-31,2.31,2847.699,5.2,-1.386987,-0.456043,-0.392422


In [36]:
# Selecting the top-level 'value' key returns the sub-frame for that column.
pivoted['value'][:5]

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31,0.0,2710.349,5.8
1959-06-30,2.34,2778.801,5.1
1959-09-30,2.74,2775.488,5.3
1959-12-31,0.27,2785.204,5.6
1960-03-31,2.31,2847.699,5.2


In [37]:
# pivot is just a shortcut: it amounts to building a hierarchical index with
# set_index and then reshaping with unstack.
unstacked = ldata.set_index(['date', 'item']).unstack('item')
unstacked[:7]

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-03-31,0.0,2710.349,5.8,1.783525,-0.535021,2.835291
1959-06-30,2.34,2778.801,5.1,1.728538,-1.444524,1.100782
1959-09-30,2.74,2775.488,5.3,1.069021,-1.371209,-0.658462
1959-12-31,0.27,2785.204,5.6,-0.322622,-2.165827,-1.525572
1960-03-31,2.31,2847.699,5.2,-1.386987,-0.456043,-0.392422
1960-06-30,0.14,2834.39,5.2,-2.086858,0.316907,-1.49259
1960-09-30,2.7,2839.022,5.6,1.509653,-0.776808,0.520116
