## Hierarchical Indices and pandas DataFrames

資料如果需要「多重索引」(multiIndexing)則稱為 Hierarchical Indices。在 pandas 中， 「索引」(index)用來存取資料的參考。

In [98]:
import pandas as pd
import numpy as np

In [99]:
data = [1, 2, 3, 4, 5]
df = pd.Series(data)
df.index.name='index'
df

index
0    1
1    2
2    3
3    4
4    5
dtype: int64

In [100]:
indexss = list('abcde')
df=pd.Series(data, index=indexss)
df

a    1
b    2
c    3
d    4
e    5
dtype: int64

multiIndexing範例

In [101]:
df = pd.read_csv('data/user_ex.csv')

In [102]:
df.head()

Unnamed: 0,date,language,ex_complete
0,2017-01-01,python,6
1,2017-01-02,python,5
2,2017-01-03,python,10
3,2017-01-01,r,8
4,2017-01-02,r,8


In [103]:
df.set_index(['date','language'], inplace=True)

In [104]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,ex_complete
date,language,Unnamed: 2_level_1
2017-01-01,python,6
2017-01-02,python,5
2017-01-03,python,10
2017-01-01,r,8
2017-01-02,r,8
2017-01-03,r,8


In [105]:
df.index

MultiIndex(levels=[['2017-01-01', '2017-01-02', '2017-01-03'], ['python', 'r']],
           labels=[[0, 1, 2, 0, 1, 2], [0, 0, 0, 1, 1, 1]],
           names=['date', 'language'])

In [106]:
df.sort_index(inplace=True) #根據索引將資料排序

In [107]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,ex_complete
date,language,Unnamed: 2_level_1
2017-01-01,python,6
2017-01-01,r,8
2017-01-02,python,5
2017-01-02,r,8
2017-01-03,python,10
2017-01-03,r,8


In [108]:
df.loc[('2017-01-02','r')]

ex_complete    8
Name: (2017-01-02, r), dtype: int64

In [109]:
df.loc['2017-01-02':'2017-01-03']

Unnamed: 0_level_0,Unnamed: 1_level_0,ex_complete
date,language,Unnamed: 2_level_1
2017-01-02,python,5
2017-01-02,r,8
2017-01-03,python,10
2017-01-03,r,8


城市人口範例

In [110]:
index = [('台北', 2000), ('台北', 2010),('高雄', 2000), ('高雄', 2010),('台中', 2000),('台中', 2010)]
population = [4000000, 4400000, 1800000, 2200000, 1000000, 2500000]
pop = pd.Series(population, index=index)
pop.index.name = 'city/year'
pop

city/year
(台北, 2000)    4000000
(台北, 2010)    4400000
(高雄, 2000)    1800000
(高雄, 2010)    2200000
(台中, 2000)    1000000
(台中, 2010)    2500000
dtype: int64

In [111]:
pop[('台北', 2010):('台中', 2000)]

city/year
(台北, 2010)    4400000
(高雄, 2000)    1800000
(高雄, 2010)    2200000
(台中, 2000)    1000000
dtype: int64

In [112]:
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex(levels=[['台中', '台北', '高雄'], [2000, 2010]],
           labels=[[1, 1, 2, 2, 0, 0], [0, 1, 0, 1, 0, 1]])

In [113]:
pop = pop.reindex(index)
pop.index.name = ['city/year','population']

In [114]:
pop[:, 2000]

台北    4000000
高雄    1800000
台中    1000000
dtype: int64

In [115]:
pop.head()

台北  2000    4000000
    2010    4400000
高雄  2000    1800000
    2010    2200000
台中  2000    1000000
dtype: int64

unstack for data

In [116]:
pop_df = pop.unstack()

In [117]:
pop_df

Unnamed: 0,2000,2010
台中,1000000,2500000
台北,4000000,4400000
高雄,1800000,2200000


In [118]:
pop_df.stack()

台中  2000    1000000
    2010    2500000
台北  2000    4000000
    2010    4400000
高雄  2000    1800000
    2010    2200000
dtype: int64

In [119]:
pop

台北  2000    4000000
    2010    4400000
高雄  2000    1800000
    2010    2200000
台中  2000    1000000
    2010    2500000
dtype: int64

In [120]:
pop_df = pd.DataFrame({'total':pop,
                      'under18':[9289021, 9284001, 4686892, 4312233, 5907311, 7861134]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
台北,2000,4000000,9289021
台北,2010,4400000,9284001
高雄,2000,1800000,4686892
高雄,2010,2200000,4312233
台中,2000,1000000,5907311
台中,2010,2500000,7861134


In [121]:
f_u18 = pop_df['under18']/pop_df['total']

In [122]:
f_u18.unstack()

Unnamed: 0,2000,2010
台中,5.907311,3.144454
台北,2.322255,2.11
高雄,2.603829,1.960106


### 建立multiIndex

In [123]:
df = pd.DataFrame(np.random.rand(4,2),index=[['a','b','c','d'],[1,2,1,2]],
                 columns = ['data1', 'data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.521641,0.731592
b,2,0.838169,0.872393
c,1,0.914378,0.082406
d,2,0.162389,0.284528


In [124]:
data = {('台北', 2010):4444000, ('台北', 2017):4833010, 
        ('台中', 2010):2000123, ('台中', 2017):2800145,('高雄',2010):1800234, ('高雄',2017):2500875}
pd.Series(data)

台中  2010    2000123
    2017    2800145
台北  2010    4444000
    2017    4833010
高雄  2010    1800234
    2017    2500875
dtype: int64

### 明確的MultiIndex建構子

In [125]:
pd.MultiIndex.from_arrays([['a', 'b', 'c', 'd'],[1,2,1,2]])

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2]],
           labels=[[0, 1, 2, 3], [0, 1, 0, 1]])

In [126]:
pd.MultiIndex.from_tuples([('a', 1), ('b', 2), ('c', 1), ('d', 2)])

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2]],
           labels=[[0, 1, 2, 3], [0, 1, 0, 1]])

In [127]:
pd.MultiIndex.from_product([['x','y'],[1,2]])

MultiIndex(levels=[['x', 'y'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [128]:
pop

台北  2000    4000000
    2010    4400000
高雄  2000    1800000
    2010    2200000
台中  2000    1000000
    2010    2500000
dtype: int64

In [129]:
pop.index.names=['city','year']
pop

city  year
台北    2000    4000000
      2010    4400000
高雄    2000    1800000
      2010    2200000
台中    2000    1000000
      2010    2500000
dtype: int64

### MultiIndex for Columns

In [130]:
index = pd.MultiIndex.from_product([[2013, 2014],[1,2]],
                                  names=['year','visit'])
columns = pd.MultiIndex.from_product([['bob','guido','sue'],['hr','temp']],
                                    names=['subject','type'])
data = np.round(np.random.randn(4,6), 1)
data[:, ::2] *= 10
data += 37
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,bob,bob,guido,guido,sue,sue
Unnamed: 0_level_1,type,hr,temp,hr,temp,hr,temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,24.0,37.5,42.0,38.1,37.0,35.7
2013,2,37.0,36.4,38.0,39.1,35.0,36.8
2014,1,29.0,36.5,37.0,36.3,36.0,37.6
2014,2,23.0,37.2,19.0,37.1,41.0,36.4


In [131]:
health_data['guido']

Unnamed: 0_level_0,type,hr,temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,42.0,38.1
2013,2,38.0,39.1
2014,1,37.0,36.3
2014,2,19.0,37.1


### slicing and Indexing a MultiIndex

In [132]:
pop

city  year
台北    2000    4000000
      2010    4400000
高雄    2000    1800000
      2010    2200000
台中    2000    1000000
      2010    2500000
dtype: int64

In [133]:
pop['台北', 2000]

4000000

In [134]:
pop['台北']

year
2000    4000000
2010    4400000
dtype: int64

In [135]:
pop.loc['台中', '台北']

KeyError: ('台中', '台北')

In [None]:
pop[:, 2000]

In [None]:
pop.loc[['台北'：'高雄']]

In [None]:
pop[pop>2000000]

In [None]:
pop1 = pop.sort_index()

In [None]:
pop1

In [None]:
pop1.loc['台北'：'高雄']

In [None]:
pop1[['台北','台中']]

In [None]:
health_data

In [None]:
health_data['guido','hr']

In [None]:
health_data.loc[:, ('bob','hr')]

### sorted and unsorted indices

In [None]:
index = pd.MultiIndex.from_product([['a', 'c', 'b'],[1,2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names=['char', 'int']
data

In [None]:
data = data.sort_index()
data

In [None]:
data['a':'b']

In [None]:
pop

In [None]:
pop.unstack()

In [None]:
pop.unstack(level=0)

In [None]:
pop.unstack(level=1)

In [None]:
health_data

In [None]:
data_mean = health_data.mean(level='year')
data_mean

In [None]:
data_mean.mean(axis=1, level='type')