*This notebook contains an excerpt from the [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas; the content is available [on GitHub](https://github.com/jakevdp/PythonDataScienceHandbook).*

# Hierarchical Indexing

In [1]:
import pandas as pd
import numpy as np

In [2]:
population_data = [{'state': 'California', 'year': '2000', 'population': 33871648},
 {'state': 'California', 'year': '2010', 'population': 37253956},
 {'state': 'New York', 'year': '2000', 'population': 18976457},
 {'state': 'New York', 'year': '2010', 'population': 19378102},
 {'state': 'Texas', 'year': '2000', 'population': 20851820},
 {'state': 'Texas', 'year': '2010', 'population': 25145561}]

In [3]:
pop_df = pd.DataFrame(population_data)
pop_df

Unnamed: 0,state,year,population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


Select records of California

In [12]:
pop_df[pop_df['state']=='California']

Unnamed: 0,state,year,population
0,California,2000,33871648
1,California,2010,37253956


In [6]:
pop_df2 = pop_df.set_index('state')
pop_df2

Unnamed: 0_level_0,year,population
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [13]:
pop_df2.loc['California']

Unnamed: 0_level_0,year,population
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956


In [20]:
pop_df2.reset_index()

Unnamed: 0,state,year,population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


What is the population of California in year 2000?

In [21]:
q = pop_df[(pop_df['year']=='2000') & (pop_df['state']=='California')]
q

Unnamed: 0,state,year,population
0,California,2000,33871648


In [22]:
q.loc[0, 'population']

33871648

In [10]:
pop_df2

Unnamed: 0_level_0,population,year
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,2000
California,37253956,2010
New York,18976457,2000
New York,19378102,2010
Texas,20851820,2000
Texas,25145561,2010


In [23]:
pop_df2[pop_df2['year']=='2000'].loc['California', 'population']

33871648

In [12]:
pop_df

Unnamed: 0,population,state,year
0,33871648,California,2000
1,37253956,California,2010
2,18976457,New York,2000
3,19378102,New York,2010
4,20851820,Texas,2000
5,25145561,Texas,2010


In [24]:
pop_df3 = pop_df.set_index(['state', 'year'])
pop_df3

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [25]:
pop_df3.index

MultiIndex([('California', '2000'),
            ('California', '2010'),
            (  'New York', '2000'),
            (  'New York', '2010'),
            (     'Texas', '2000'),
            (     'Texas', '2010')],
           names=['state', 'year'])

In [26]:
pop_df3.loc['California']

Unnamed: 0_level_0,population
year,Unnamed: 1_level_1
2000,33871648
2010,37253956


In [27]:
pop_df3.loc[('California', '2010')]

population    37253956
Name: (California, 2010), dtype: int64

In [30]:
#pop_df.set_index(['year', 'state']).loc['2010']

In [20]:
pop_df3.loc[(:, '2010')]

SyntaxError: invalid syntax (<ipython-input-20-cf78904ad9b7>, line 1)

In [35]:
pop_df3

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [32]:
idx = pd.IndexSlice
pop_df3.loc[idx[:, '2010'], :]

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2010,37253956
New York,2010,19378102
Texas,2010,25145561


In [39]:
pop_df3.loc[idx[['Texas', 'New York'], '2000':'2010'], :]

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [24]:
pop_df3.T.loc[:,idx[:, '2010']]

state,California,New York,Texas
year,2010,2010,2010
population,37253956,19378102,25145561


### MultiIndex as extra dimension

You might notice something else here: we could easily have stored the same data using a simple ``DataFrame`` with index and column labels.
In fact, Pandas is built with this equivalence in mind. The ``unstack()`` method will quickly convert a multiply indexed ``Series`` into a conventionally indexed ``DataFrame``:

In [40]:
pop_df3['under18'] = [9267089, 9284094,
                      4687374, 4318033,
                      5906301, 6879014]
pop_df3

Unnamed: 0_level_0,Unnamed: 1_level_0,population,under18
state,year,Unnamed: 2_level_1,Unnamed: 3_level_1
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [46]:
f_u18 = pop_df3['under18'] / pop_df3['population']
f_u18

state       year
California  2000    0.273594
            2010    0.249211
New York    2000    0.247010
            2010    0.222831
Texas       2000    0.283251
            2010    0.273568
dtype: float64

In [47]:
f_u18.unstack()

year,2000,2010
state,Unnamed: 1_level_1,Unnamed: 2_level_1
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


In [23]:
f_u18.unstack('state')

state,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,0.273594,0.24701,0.283251
2010,0.249211,0.222831,0.273568


In [24]:
f_u18.unstack('state').stack()

year  state     
2000  California    0.273594
      New York      0.247010
      Texas         0.283251
2010  California    0.249211
      New York      0.222831
      Texas         0.273568
dtype: float64

# Using multi-index to calculate aggregations

In [25]:
f_u18

state       year
California  2000    0.273594
            2010    0.249211
New York    2000    0.247010
            2010    0.222831
Texas       2000    0.283251
            2010    0.273568
dtype: float64

In [48]:
f_u18.mean()

0.25824409535188225

In [49]:
f_u18.unstack('state')

state,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,0.273594,0.24701,0.283251
2010,0.249211,0.222831,0.273568


In [50]:
f_u18.unstack('state').mean()

state
California    0.261403
New York      0.234920
Texas         0.278409
dtype: float64

In [51]:
f_u18.unstack('year').mean()

year
2000    0.267952
2010    0.248536
dtype: float64