In [None]:
import pandas as pd
import numpy as np

from IPython.core.interactiveshell import InteractiveShell
# Show the result of every top-level expression in a cell, not only the last
# one; several cells below rely on this to display more than one result.
InteractiveShell.ast_node_interactivity = "all"

* stack v. unstack
* pd.MultiIndex


## Creating a MultiIndex

### Implicit MultiIndex constructors

In [None]:
# Passing a list of two label arrays as `index` implicitly builds a two-level
# MultiIndex. A seeded generator keeps the displayed values identical across
# "Restart Kernel -> Run All" executions.
rng = np.random.default_rng(seed=0)
df = pd.DataFrame(
    rng.random((4, 2)),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=['col1', 'col2'],
)
df

In [None]:
# Population counts keyed by (state, year) tuples; handing a dict with tuple
# keys to pd.Series yields a Series indexed by a two-level MultiIndex.
data = dict([
    (('California', 2000), 33871648),
    (('California', 2010), 37253956),
    (('Texas', 2000), 20851820),
    (('Texas', 2010), 25145561),
    (('New York', 2000), 18976457),
    (('New York', 2010), 19378102),
])

ds = pd.Series(data)
ds

### Explicit MultiIndex constructors

In [None]:
# Build a MultiIndex from parallel arrays: one list of labels per level.
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])

In [None]:
# Build a MultiIndex from a list of tuples: one tuple per entry, one element per level.
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

In [None]:
# Build a MultiIndex from the Cartesian product of the per-level label lists.
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])

### Level names

In [None]:
# (state, year) pairs in state-major order, with the population count that
# belongs to each pair listed in the same order.
index = [(state, year)
         for state in ('California', 'New York', 'Texas')
         for year in (2000, 2010)]
populations = [
    33871648, 37253956,  # California 2000, 2010
    18976457, 19378102,  # New York   2000, 2010
    20851820, 25145561,  # Texas      2000, 2010
]
pop = pd.Series(data=populations, index=pd.MultiIndex.from_tuples(index))
pop

### With more involved datasets, naming the index levels can be a useful way to keep track of the meaning of various index values.

In [None]:
# Label the two index levels (this mutates pop's index in place), then
# display the series to show the level names.
pop.index.names = ['state', 'year']
pop

In [None]:
# Small demo frame: two numeric columns plus 'c' and 'd', whose value pairs
# are unique per row.
df = pd.DataFrame(dict(
    a=range(7),
    b=range(7, 0, -1),
    c=['one'] * 3 + ['two'] * 4,
    d=[0, 1, 2, 0, 1, 2, 3],
))
df

In [None]:
# Use columns 'c' and 'd' as a two-level row index. The result is only
# displayed, not assigned, so df itself keeps its original index.
df.set_index(['c','d'])

In [None]:
# Turn the index levels back into ordinary columns; the series values go
# into a column called 'population'. The result is displayed, not stored.
pop.reset_index(name='population')

## Indexing and Slicing a MultiIndex

In [None]:
# Rebuild the population series from scratch, so the index level names are
# unset again: (state, year) pairs in state-major order plus their counts.
index = [(state, year)
         for state in ('California', 'New York', 'Texas')
         for year in (2000, 2010)]
populations = [
    33871648, 37253956,  # California 2000, 2010
    18976457, 19378102,  # New York   2000, 2010
    20851820, 25145561,  # Texas      2000, 2010
]
pop = pd.Series(data=populations, index=pd.MultiIndex.from_tuples(index))
pop

In [None]:
# Display the series once more as a reference for the lookups below.
# NOTE(review): the previous cell already ends by displaying pop.
pop

In [None]:
# Look up a single value by giving one label for each of the two index levels.
pop['California', 2000]

In [None]:
# Partial indexing: only the first-level label (the state) is given.
pop['California']

In [None]:
# Label-based slice on the first index level, from 'California' to 'New York'.
pop.loc['California':'New York']

### Partial slicing is available as well, as long as the `MultiIndex` is sorted

In [None]:
# Same data as before, but with 'Texas' placed ahead of 'New York', so the
# resulting MultiIndex is deliberately NOT in sorted order.
index = [(state, year)
         for state in ('California', 'Texas', 'New York')
         for year in (2000, 2010)]
populations = [
    33871648, 37253956,  # California 2000, 2010
    20851820, 25145561,  # Texas      2000, 2010
    18976457, 19378102,  # New York   2000, 2010
]
pop2 = pd.Series(data=populations, index=pd.MultiIndex.from_tuples(index))
pop2

In [None]:
# Label-slicing the unsorted MultiIndex of pop2 is expected to fail. Catch
# the error and print its type and message so the failure is shown as the
# demonstration instead of aborting a Run-All execution.
try:
    pop2.loc['California':'New York']
except KeyError as err:
    print(type(err))
    print(err)

### Pandas provides convenience routines to perform this type of sorting; the main one is the `sort_index()` method of `Series` and `DataFrame` (older pandas versions also offered `sortlevel()`, which has since been removed).

In [None]:
# Inspect the MultiIndex of pop2; its entries are in the order they were
# supplied ('Texas' before 'New York'), not in sorted order.
pop2.index

In [None]:
# Sort the index first, then take the label slice from 'California' to 'New York'.
pop2.sort_index()['California':'New York']

In [None]:
# Sort the index first, then take the label slice from 'California' to 'New York'.
# NOTE(review): this cell is an exact duplicate of the previous one; consider removing it.
pop2.sort_index()['California':'New York']

In [None]:
# Keep every first-level label (the empty slice) but only the entries whose
# second-level label is 2000.
pop2[:,2000]

In [None]:
# Select by a list of first-level labels ('California' and 'Texas').
pop[['California', 'Texas']]

In [None]:
# Pivot one index level into columns: first level 0, then level 1.
# Both results are displayed because the notebook sets
# InteractiveShell.ast_node_interactivity = "all" in its first cell.
pop.unstack(level=0)
pop.unstack(level=1)

## Index setting and resetting

In [None]:
# Name the two index levels (mutates pop's index in place); the following
# cells use 'state' and 'year' as column names after reset_index.
pop.index.names = ['state','year']
pop

In [None]:
# Flatten the series into a DataFrame: the index levels become ordinary
# columns and the values go into a column called 'population'.
pop_flat = pop.reset_index(name='population')
pop_flat

### Often when working with data in the real world, the raw input data looks like this and it's useful to build a `MultiIndex` from the column values.

In [None]:
# Rebuild a two-level row index from the 'state' and 'year' columns.
# The result is only displayed; pop_flat itself is not reassigned.
pop_flat.set_index(['state', 'year'])

## MultiIndex for columns

In [None]:
# Row index: every (year, visit) combination; column index: every
# (subject, type) combination.
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                     names=['subject', 'type'])

# Mock some data. A seeded generator keeps the table identical across
# "Restart Kernel -> Run All" executions.
rng = np.random.default_rng(seed=0)
data = np.round(rng.standard_normal((4, 6)), 1)
data[:, ::2] *= 10  # widen the spread of every other column (the 'HR' ones)
data += 37          # shift all values so they are centred on 37

# Create the DataFrame with hierarchical rows and columns.
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

In [None]:
# Purely positional selection: rows 0-1 and columns 3-4.
health_data.iloc[0:2,3:5]

In [None]:
# Label-based selection: rows for year 2013, columns for the subjects
# 'Guido' and 'Sue' (first level of the column index).
health_data.loc[2013,['Guido','Sue']]

In [None]:
# All rows, and the single column addressed by the (subject, type) tuple ('Bob', 'HR').
health_data.loc[:, ('Bob', 'HR')]

[multiIndex](https://jakevdp.github.io/PythonDataScienceHandbook/03.05-hierarchical-indexing.html)

### Using slicers

In [None]:
# slice(None) stands in for ':' inside the row tuple: every year, visit 1,
# restricted to the ('Bob', 'HR') column.
health_data.loc[(slice(None),1), ('Bob', 'HR')]

In [None]:
# Short alias for pd.IndexSlice, which allows ':' slice syntax inside index tuples.
idx = pd.IndexSlice

In [None]:
# Same kind of selection written with IndexSlice: every year, visit 1,
# column ('Bob', 'HR').
health_data.loc[idx[:,1], idx['Bob','HR']]

In [None]:
# Inspect the plain Python object that the IndexSlice expression evaluates to.
idx[:,1]

## Data Aggregation

In [None]:
# Display the full table again as a reference for the aggregations below.
health_data

In [None]:
# Average over the 'visit' level so that one row per year remains.
# groupby(level=...) replaces DataFrame.mean(level=...): the `level`
# argument of the reductions was deprecated in pandas 1.3 and removed in 2.0.
# sort=False keeps the groups in order of first appearance.
data_mean = health_data.groupby(level='year', sort=False).mean()
data_mean

In [None]:
# Average across subjects for each measurement type, i.e. aggregate along
# the columns by the 'type' level. DataFrame.mean(axis=1, level=...) is no
# longer available (the `level` argument was removed in pandas 2.0), so
# transpose, group the former column level, and transpose back.
health_data.T.groupby(level='type', sort=False).mean().T

### How to calculate the mean of each `type` for each `year`

In [None]:
# Move both column levels into the row index to get one long Series, then
# average it per (year, type) combination using the index level names.
(
    health_data
    .stack()
    .stack()
    .groupby(level=['year', 'type'])
    .mean()
)