## Hierarchical Indices and pandas DataFrames

In [2]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('data/user_ex.csv')

In [5]:
df

Unnamed: 0,date,language,ex_complete
0,2017-01-01,python,6
1,2017-01-02,python,5
2,2017-01-03,python,10
3,2017-01-01,r,8
4,2017-01-02,r,8
5,2017-01-03,r,8


In [6]:
df.set_index(['date','language'], inplace=True)

In [7]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,ex_complete
date,language,Unnamed: 2_level_1
2017-01-01,python,6
2017-01-02,python,5
2017-01-03,python,10
2017-01-01,r,8
2017-01-02,r,8
2017-01-03,r,8


In [8]:
df.index

MultiIndex(levels=[['2017-01-01', '2017-01-02', '2017-01-03'], ['python', 'r']],
           labels=[[0, 1, 2, 0, 1, 2], [0, 0, 0, 1, 1, 1]],
           names=['date', 'language'])

In [9]:
df.sort_index(inplace=True) #需要先將資料排序

In [10]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,ex_complete
date,language,Unnamed: 2_level_1
2017-01-01,python,6
2017-01-01,r,8
2017-01-02,python,5
2017-01-02,r,8
2017-01-03,python,10
2017-01-03,r,8


In [11]:
df.loc[('2017-01-02','r')]

ex_complete    8
Name: (2017-01-02, r), dtype: int64

In [12]:
df.loc['2017-01-02':'2017-01-03']

Unnamed: 0_level_0,Unnamed: 1_level_0,ex_complete
date,language,Unnamed: 2_level_1
2017-01-02,python,5
2017-01-02,r,8
2017-01-03,python,10
2017-01-03,r,8


In [5]:
index = [('台北', 2000), ('台北', 2010),('高雄', 2000), ('高雄', 2010),('台中', 2000),('台中', 2010)]
population = [4000000, 4400000, 1800000, 2200000, 1000000, 2500000]
pop = pd.Series(population, index=index)
pop

(台北, 2000)    4000000
(台北, 2010)    4400000
(高雄, 2000)    1800000
(高雄, 2010)    2200000
(台中, 2000)    1000000
(台中, 2010)    2500000
dtype: int64

In [15]:
pop[('台北', 2010):('台中', 2000)]

(台北, 2010)    4400000
(高雄, 2000)    1800000
(高雄, 2010)    2200000
(台中, 2000)    1000000
dtype: int64

In [6]:
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex(levels=[['台中', '台北', '高雄'], [2000, 2010]],
           labels=[[1, 1, 2, 2, 0, 0], [0, 1, 0, 1, 0, 1]])

In [7]:
pop = pop.reindex(index)

In [18]:
pop[:, 2000]

台北    4000000
高雄    1800000
台中    1000000
dtype: int64

unstack for data

In [19]:
pop_df = pop.unstack()

In [20]:
pop_df

Unnamed: 0,2000,2010
台中,1000000,2500000
台北,4000000,4400000
高雄,1800000,2200000


In [21]:
pop_df.stack()

台中  2000    1000000
    2010    2500000
台北  2000    4000000
    2010    4400000
高雄  2000    1800000
    2010    2200000
dtype: int64

In [24]:
pop

台北  2000    4000000
    2010    4400000
高雄  2000    1800000
    2010    2200000
台中  2000    1000000
    2010    2500000
dtype: int64

In [30]:
pop_df = pd.DataFrame({'total':pop,
                      'under18':[9289021, 9284001, 4686892, 4312233, 5907311, 7861134]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
台北,2000,4000000,9289021
台北,2010,4400000,9284001
高雄,2000,1800000,4686892
高雄,2010,2200000,4312233
台中,2000,1000000,5907311
台中,2010,2500000,7861134


In [31]:
f_u18 = pop_df['under18']/pop_df['total']

In [33]:
f_u18.unstack()

Unnamed: 0,2000,2010
台中,5.907311,3.144454
台北,2.322255,2.11
高雄,2.603829,1.960106


### 建立multiIndex

In [34]:
df = pd.DataFrame(np.random.rand(4,2),index=[['a','b','c','d'],[1,2,1,2]],
                 columns = ['data1', 'data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.747247,0.353145
b,2,0.180379,0.26125
c,1,0.964087,0.120583
d,2,0.405095,0.641315


In [35]:
data = {('台北', 2010):4444000, ('台北', 2017):4833010, 
        ('台中', 2010):2000123, ('台中', 2017):2800145,('高雄',2010):1800234, ('高雄',2017):2500875}
pd.Series(data)

台中  2010    2000123
    2017    2800145
台北  2010    4444000
    2017    4833010
高雄  2010    1800234
    2017    2500875
dtype: int64

### 明確的MultiIndex建構子

In [36]:
pd.MultiIndex.from_arrays([['a', 'b', 'c', 'd'],[1,2,1,2]])

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2]],
           labels=[[0, 1, 2, 3], [0, 1, 0, 1]])

In [37]:
pd.MultiIndex.from_tuples([('a', 1), ('b', 2), ('c', 1), ('d', 2)])

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2]],
           labels=[[0, 1, 2, 3], [0, 1, 0, 1]])

In [38]:
pd.MultiIndex.from_product([['x','y'],[1,2]])

MultiIndex(levels=[['x', 'y'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [39]:
pop

台北  2000    4000000
    2010    4400000
高雄  2000    1800000
    2010    2200000
台中  2000    1000000
    2010    2500000
dtype: int64

In [41]:
pop.index.names=['city','year']
pop

city  year
台北    2000    4000000
      2010    4400000
高雄    2000    1800000
      2010    2200000
台中    2000    1000000
      2010    2500000
dtype: int64

### MultiIndex for Columns

In [3]:
index = pd.MultiIndex.from_product([[2013, 2014],[1,2]],
                                  names=['year','visit'])
columns = pd.MultiIndex.from_product([['bob','guido','sue'],['hr','temp']],
                                    names=['subject','type'])
data = np.round(np.random.randn(4,6), 1)
data[:, ::2] *= 10
data += 37
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,bob,bob,guido,guido,sue,sue
Unnamed: 0_level_1,type,hr,temp,hr,temp,hr,temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,30.0,35.6,38.0,35.7,34.0,37.0
2013,2,30.0,36.5,43.0,37.4,50.0,36.8
2014,1,30.0,36.6,33.0,39.1,26.0,36.4
2014,2,35.0,34.9,23.0,37.1,39.0,38.2


In [4]:
health_data['guido']

Unnamed: 0_level_0,type,hr,temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,38.0,35.7
2013,2,43.0,37.4
2014,1,33.0,39.1
2014,2,23.0,37.1


### slicing and Indexing a MultiIndex

In [8]:
pop

台北  2000    4000000
    2010    4400000
高雄  2000    1800000
    2010    2200000
台中  2000    1000000
    2010    2500000
dtype: int64

In [9]:
pop['台北', 2000]

4000000

In [10]:
pop['台北']

2000    4000000
2010    4400000
dtype: int64

In [16]:
pop.loc['台北', '台中']

KeyError: ('台北', '台中')

In [17]:
pop[:, 2000]

台北    4000000
高雄    1800000
台中    1000000
dtype: int64

In [23]:
pop.loc[['台北'：'高雄']]

SyntaxError: invalid character in identifier (<ipython-input-23-b192017f5398>, line 1)

In [24]:
pop[pop>2000000]

台北  2000    4000000
    2010    4400000
高雄  2010    2200000
台中  2010    2500000
dtype: int64

In [27]:
pop1 = pop.sort_index()

In [28]:
pop1

台中  2000    1000000
    2010    2500000
台北  2000    4000000
    2010    4400000
高雄  2000    1800000
    2010    2200000
dtype: int64

In [31]:
pop1.loc['台北'：'高雄']

SyntaxError: invalid character in identifier (<ipython-input-31-3caafa97399d>, line 1)

In [33]:
pop1[['台北','台中']]

台中  2000    1000000
    2010    2500000
台北  2000    4000000
    2010    4400000
dtype: int64

In [34]:
health_data

Unnamed: 0_level_0,subject,bob,bob,guido,guido,sue,sue
Unnamed: 0_level_1,type,hr,temp,hr,temp,hr,temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,30.0,35.6,38.0,35.7,34.0,37.0
2013,2,30.0,36.5,43.0,37.4,50.0,36.8
2014,1,30.0,36.6,33.0,39.1,26.0,36.4
2014,2,35.0,34.9,23.0,37.1,39.0,38.2


In [35]:
health_data['guido','hr']

year  visit
2013  1        38.0
      2        43.0
2014  1        33.0
      2        23.0
Name: (guido, hr), dtype: float64

In [36]:
health_data.loc[:, ('bob','hr')]

year  visit
2013  1        30.0
      2        30.0
2014  1        30.0
      2        35.0
Name: (bob, hr), dtype: float64

### sorted and unsorted indices

In [37]:
index = pd.MultiIndex.from_product([['a', 'c', 'b'],[1,2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names=['char', 'int']
data

char  int
a     1      0.209470
      2      0.073162
c     1      0.498546
      2      0.809314
b     1      0.236179
      2      0.342004
dtype: float64

In [38]:
data = data.sort_index()
data

char  int
a     1      0.209470
      2      0.073162
b     1      0.236179
      2      0.342004
c     1      0.498546
      2      0.809314
dtype: float64

In [39]:
data['a':'b']

char  int
a     1      0.209470
      2      0.073162
b     1      0.236179
      2      0.342004
dtype: float64

In [40]:
pop

台北  2000    4000000
    2010    4400000
高雄  2000    1800000
    2010    2200000
台中  2000    1000000
    2010    2500000
dtype: int64

In [41]:
pop.unstack()

Unnamed: 0,2000,2010
台中,1000000,2500000
台北,4000000,4400000
高雄,1800000,2200000


In [42]:
pop.unstack(level=0)

Unnamed: 0,台中,台北,高雄
2000,1000000,4000000,1800000
2010,2500000,4400000,2200000


In [43]:
pop.unstack(level=1)

Unnamed: 0,2000,2010
台中,1000000,2500000
台北,4000000,4400000
高雄,1800000,2200000


In [44]:
health_data

Unnamed: 0_level_0,subject,bob,bob,guido,guido,sue,sue
Unnamed: 0_level_1,type,hr,temp,hr,temp,hr,temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,30.0,35.6,38.0,35.7,34.0,37.0
2013,2,30.0,36.5,43.0,37.4,50.0,36.8
2014,1,30.0,36.6,33.0,39.1,26.0,36.4
2014,2,35.0,34.9,23.0,37.1,39.0,38.2


In [46]:
data_mean = health_data.mean(level='year')
data_mean

subject,bob,bob,guido,guido,sue,sue
type,hr,temp,hr,temp,hr,temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,30.0,36.05,40.5,36.55,42.0,36.9
2014,32.5,35.75,28.0,38.1,32.5,37.3


In [47]:
data_mean.mean(axis=1, level='type')

type,hr,temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,37.5,36.5
2014,31.0,37.05
