## Hierarchical Indices and pandas DataFrames

資料如果需要「多重索引」(multiIndexing)則稱為 Hierarchical Indices。在 pandas 中， 「索引」(index)用來存取資料的參考。

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = [1, 2, 3, 4, 5]
df = pd.Series(data)
df.index.name='index'
df

index
0    1
1    2
2    3
3    4
4    5
dtype: int64

In [5]:
indexss = ['中華科大','台科大','北科大','雲科大','龍華科大']
df=pd.Series(data, index=indexss)
df

中華科大    1
台科大     2
北科大     3
雲科大     4
龍華科大    5
dtype: int64

multiIndexing範例

In [6]:
df = pd.read_csv('data/user_ex.csv')

In [7]:
df.head()

Unnamed: 0,date,language,ex_complete
0,2017-01-01,python,6
1,2017-01-02,python,5
2,2017-01-03,python,10
3,2017-01-01,r,8
4,2017-01-02,r,8


In [8]:
df.set_index(['date','language'], inplace=True)

In [9]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,ex_complete
date,language,Unnamed: 2_level_1
2017-01-01,python,6
2017-01-02,python,5
2017-01-03,python,10
2017-01-01,r,8
2017-01-02,r,8
2017-01-03,r,8


In [10]:
df.index

MultiIndex(levels=[['2017-01-01', '2017-01-02', '2017-01-03'], ['python', 'r']],
           labels=[[0, 1, 2, 0, 1, 2], [0, 0, 0, 1, 1, 1]],
           names=['date', 'language'])

In [11]:
df.sort_index(inplace=True) #根據索引將資料排序

In [12]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,ex_complete
date,language,Unnamed: 2_level_1
2017-01-01,python,6
2017-01-01,r,8
2017-01-02,python,5
2017-01-02,r,8
2017-01-03,python,10
2017-01-03,r,8


In [13]:
df.loc[('2017-01-03','python')]

ex_complete    10
Name: (2017-01-03, python), dtype: int64

In [14]:
df.loc['2017-01-02':'2017-01-03']

Unnamed: 0_level_0,Unnamed: 1_level_0,ex_complete
date,language,Unnamed: 2_level_1
2017-01-02,python,5
2017-01-02,r,8
2017-01-03,python,10
2017-01-03,r,8


城市人口範例

In [38]:
index = [('台北', 2000), ('台北', 2010),('高雄', 2000), ('高雄', 2010),('台中', 2000),('台中', 2010)]
population = [4000000, 4400000, 1800000, 2200000, 1000000, 2500000]
pop = pd.Series(population, index=index)
pop.index.name = 'city/year'
pop

city/year
(台北, 2000)    4000000
(台北, 2010)    4400000
(高雄, 2000)    1800000
(高雄, 2010)    2200000
(台中, 2000)    1000000
(台中, 2010)    2500000
dtype: int64

In [39]:
pop[('台北', 2010):('台中', 2000)]

city/year
(台北, 2010)    4400000
(高雄, 2000)    1800000
(高雄, 2010)    2200000
(台中, 2000)    1000000
dtype: int64

In [40]:
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex(levels=[['台中', '台北', '高雄'], [2000, 2010]],
           labels=[[1, 1, 2, 2, 0, 0], [0, 1, 0, 1, 0, 1]])

In [41]:
pop = pop.reindex(index)
#pop.index.name = ['city/year','population']

In [48]:
pop[:,2000]

台北    4000000
高雄    1800000
台中    1000000
dtype: int64

In [43]:
pop.head()

台北  2000    4000000
    2010    4400000
高雄  2000    1800000
    2010    2200000
台中  2000    1000000
dtype: int64

unstack for data

In [44]:
pop_df = pop.unstack()

In [45]:
pop_df

Unnamed: 0,2000,2010
台中,1000000,2500000
台北,4000000,4400000
高雄,1800000,2200000


In [46]:
pop_df.stack()

台中  2000    1000000
    2010    2500000
台北  2000    4000000
    2010    4400000
高雄  2000    1800000
    2010    2200000
dtype: int64

In [47]:
pop

台北  2000    4000000
    2010    4400000
高雄  2000    1800000
    2010    2200000
台中  2000    1000000
    2010    2500000
dtype: int64

In [49]:
pop_df = pd.DataFrame({'total':pop,
                      'under18':[9289021, 9284001, 4686892, 4312233, 5907311, 7861134]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
台北,2000,4000000,9289021
台北,2010,4400000,9284001
高雄,2000,1800000,4686892
高雄,2010,2200000,4312233
台中,2000,1000000,5907311
台中,2010,2500000,7861134


In [50]:
f_u18 = pop_df['under18']/pop_df['total']

In [53]:
f_u18.unstack()

Unnamed: 0,2000,2010
台中,5.907311,3.144454
台北,2.322255,2.11
高雄,2.603829,1.960106


### 建立multiIndex

In [54]:
df = pd.DataFrame(np.random.rand(4,2),index=[['a','b','c','d'],[1,2,1,2]],
                 columns = ['data1', 'data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.242607,0.33316
b,2,0.173339,0.380204
c,1,0.43631,0.645894
d,2,0.05595,0.717942


In [55]:
data = {('台北', 2010):4444000, ('台北', 2017):4833010, 
        ('台中', 2010):2000123, ('台中', 2017):2800145,('高雄',2010):1800234, ('高雄',2017):2500875}
pd.Series(data)

台中  2010    2000123
    2017    2800145
台北  2010    4444000
    2017    4833010
高雄  2010    1800234
    2017    2500875
dtype: int64

### 明確的MultiIndex建構子

In [125]:
pd.MultiIndex.from_arrays([['a', 'b', 'c', 'd'],[1,2,1,2]])

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2]],
           labels=[[0, 1, 2, 3], [0, 1, 0, 1]])

In [126]:
pd.MultiIndex.from_tuples([('a', 1), ('b', 2), ('c', 1), ('d', 2)])

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2]],
           labels=[[0, 1, 2, 3], [0, 1, 0, 1]])

In [127]:
pd.MultiIndex.from_product([['x','y'],[1,2]])

MultiIndex(levels=[['x', 'y'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [56]:
pop

台北  2000    4000000
    2010    4400000
高雄  2000    1800000
    2010    2200000
台中  2000    1000000
    2010    2500000
dtype: int64

In [57]:
pop.index.names=['city','year']
pop

city  year
台北    2000    4000000
      2010    4400000
高雄    2000    1800000
      2010    2200000
台中    2000    1000000
      2010    2500000
dtype: int64

### MultiIndex for Columns

In [58]:
index = pd.MultiIndex.from_product([[2013, 2014],[1,2]],
                                  names=['year','visit'])
columns = pd.MultiIndex.from_product([['bob','guido','sue'],['hr','temp']],
                                    names=['subject','type'])
data = np.round(np.random.randn(4,6), 1)
data[:, ::2] *= 10
data += 37
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,bob,bob,guido,guido,sue,sue
Unnamed: 0_level_1,type,hr,temp,hr,temp,hr,temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,25.0,35.6,25.0,37.8,37.0,36.6
2013,2,46.0,38.0,46.0,37.1,34.0,36.5
2014,1,27.0,37.0,54.0,37.4,39.0,38.3
2014,2,50.0,37.5,38.0,37.2,30.0,36.1


In [59]:
health_data['guido']

Unnamed: 0_level_0,type,hr,temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,25.0,37.8
2013,2,46.0,37.1
2014,1,54.0,37.4
2014,2,38.0,37.2


### slicing and Indexing a MultiIndex

In [132]:
pop

city  year
台北    2000    4000000
      2010    4400000
高雄    2000    1800000
      2010    2200000
台中    2000    1000000
      2010    2500000
dtype: int64

In [133]:
pop['台北', 2000]

4000000

In [134]:
pop['台北']

year
2000    4000000
2010    4400000
dtype: int64

In [62]:
pop.loc[['台中', '台北'],:]

UnsortedIndexError: 'MultiIndex Slicing requires the index to be fully lexsorted tuple len (2), lexsort depth (0)'

In [63]:
pop[:, 2000]

city
台北    4000000
高雄    1800000
台中    1000000
dtype: int64

In [67]:
pop.loc['台北':'高雄']

UnsortedIndexError: 'Key length (1) was greater than MultiIndex lexsort depth (0)'

In [68]:
pop[pop>2000000]

city  year
台北    2000    4000000
      2010    4400000
高雄    2010    2200000
台中    2010    2500000
dtype: int64

In [69]:
pop1 = pop.sort_index()

In [70]:
pop1

city  year
台中    2000    1000000
      2010    2500000
台北    2000    4000000
      2010    4400000
高雄    2000    1800000
      2010    2200000
dtype: int64

In [72]:
pop1.loc['台北'：'高雄']

SyntaxError: invalid character in identifier (<ipython-input-72-104848a22c16>, line 1)

In [73]:
pop1[['台北','台中']]

city  year
台中    2000    1000000
      2010    2500000
台北    2000    4000000
      2010    4400000
dtype: int64

In [74]:
health_data

Unnamed: 0_level_0,subject,bob,bob,guido,guido,sue,sue
Unnamed: 0_level_1,type,hr,temp,hr,temp,hr,temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,25.0,35.6,25.0,37.8,37.0,36.6
2013,2,46.0,38.0,46.0,37.1,34.0,36.5
2014,1,27.0,37.0,54.0,37.4,39.0,38.3
2014,2,50.0,37.5,38.0,37.2,30.0,36.1


In [75]:
health_data['guido','hr']

year  visit
2013  1        25.0
      2        46.0
2014  1        54.0
      2        38.0
Name: (guido, hr), dtype: float64

In [78]:
health_data.loc[(2013,:), ('bob','hr')]

SyntaxError: invalid syntax (<ipython-input-78-bf0376d334ad>, line 1)

### sorted and unsorted indices

In [79]:
index = pd.MultiIndex.from_product([['a', 'c', 'b'],[1,2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names=['char', 'int']
data

char  int
a     1      0.677996
      2      0.599065
c     1      0.889262
      2      0.958633
b     1      0.232110
      2      0.087626
dtype: float64

In [80]:
data = data.sort_index()
data

char  int
a     1      0.677996
      2      0.599065
b     1      0.232110
      2      0.087626
c     1      0.889262
      2      0.958633
dtype: float64

In [81]:
data['a':'b']

char  int
a     1      0.677996
      2      0.599065
b     1      0.232110
      2      0.087626
dtype: float64

In [82]:
pop

city  year
台北    2000    4000000
      2010    4400000
高雄    2000    1800000
      2010    2200000
台中    2000    1000000
      2010    2500000
dtype: int64

In [83]:
pop.unstack()

year,2000,2010
city,Unnamed: 1_level_1,Unnamed: 2_level_1
台中,1000000,2500000
台北,4000000,4400000
高雄,1800000,2200000


In [84]:
pop.unstack(level=0)

city,台中,台北,高雄
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,1000000,4000000,1800000
2010,2500000,4400000,2200000


In [85]:
pop.unstack(level=1)

year,2000,2010
city,Unnamed: 1_level_1,Unnamed: 2_level_1
台中,1000000,2500000
台北,4000000,4400000
高雄,1800000,2200000


In [86]:
health_data

Unnamed: 0_level_0,subject,bob,bob,guido,guido,sue,sue
Unnamed: 0_level_1,type,hr,temp,hr,temp,hr,temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,25.0,35.6,25.0,37.8,37.0,36.6
2013,2,46.0,38.0,46.0,37.1,34.0,36.5
2014,1,27.0,37.0,54.0,37.4,39.0,38.3
2014,2,50.0,37.5,38.0,37.2,30.0,36.1


In [87]:
data_mean = health_data.mean(level='year')
data_mean

subject,bob,bob,guido,guido,sue,sue
type,hr,temp,hr,temp,hr,temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,35.5,36.8,35.5,37.45,35.5,36.55
2014,38.5,37.25,46.0,37.3,34.5,37.2


In [88]:
data_mean.mean(axis=1, level='type')

type,hr,temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,35.5,36.933333
2014,39.666667,37.25
