In [1]:
import pandas as pd
import numpy as np

### A Multiply Indexed Series

#### The Bad Way

In [2]:
# "The bad way": use plain Python tuples of (state, year) as the index labels.
index = [
    ('California', 2000), ('California', 2010),
    ('New York', 2000), ('New York', 2010),
    ('Texas', 2000), ('Texas', 2010),
]

# One population figure per (state, year) label, in the same order as `index`.
populations = [
    33871648, 37253956,  # California
    18976457, 19378102,  # New York
    20851820, 25145561,  # Texas
]

pop = pd.Series(data=populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [3]:
# Still easy: slicing between two tuple labels works on the tuple-based index,
# and both endpoint labels are included in the result.
pop[('California', 2010):('Texas', 2000)]

(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64

In [4]:
# Convenience ends here: selecting every 2010 entry requires a manual scan
# over the tuple labels to collect the ones whose second element is 2010.
labels_2010 = [label for label in pop.index if label[1] == 2010]
pop[labels_2010]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

#### The Better Way: Pandas MultiIndex

In [5]:
# Build a two-level MultiIndex from the (state, year) tuples.
# NOTE: this rebinds `index`; the original list of tuples is no longer
# reachable under that name after this cell runs.
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex(levels=[['California', 'New York', 'Texas'], [2000, 2010]],
           codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

In [6]:
# Re-label the Series with the MultiIndex. The values are unchanged (compare the
# output with the tuple-indexed version), but the index is now shown hierarchically.
pop = pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [7]:
# Select every entry whose second index level (the year) is 2010;
# the result is indexed by the remaining level (the state).
pop[:,2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

#### MultiIndex as extra Dimension

In [15]:
# unstack() converts a multiply indexed Series into a conventionally indexed
# DataFrame: here the year level becomes the columns and the states stay as rows.
pop_df = pop.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [17]:
# stack() is the opposite of unstack(): the columns are folded back into an
# inner index level, giving the multiply indexed Series again.
# Each extra level in a MultiIndex represents an extra dimension of the data.
pop_df.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [18]:
# With a MultiIndex, adding a further measurement is as easy as adding a column.
# The list is lined up with the six (state, year) rows of `pop` in order.
under_18_counts = [9267089, 9284094,   # California 2000, 2010
                   4687374, 4318033,   # New York   2000, 2010
                   5906301, 6879014]   # Texas      2000, 2010
pop_df = pd.DataFrame({'Total': pop, 'Under 18': under_18_counts})
pop_df

Unnamed: 0,Unnamed: 1,Total,Under 18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [11]:
# Element-wise arithmetic works with hierarchical indices too: the result keeps
# the (state, year) index. Here: fraction of the population that is under 18.
minors = pop_df['Under 18']
everyone = pop_df['Total']
f_u18 = minors / everyone
f_u18

California  2000    0.273594
            2010    0.249211
New York    2000    0.247010
            2010    0.222831
Texas       2000    0.283251
            2010    0.273568
dtype: float64

In [12]:
# Pivot the year level into columns to get a state-by-year table of the ratio.
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


### Methods of MultiIndex Creation

#### The simplest way: pass a list of two or more index arrays to the constructor

In [20]:
# Two parallel label arrays passed together as `index` produce a two-level
# MultiIndex on the rows; the values are random, so they differ on every run.
outer_labels = ['a', 'a', 'b', 'b']
inner_labels = [1, 2, 1, 2]
df = pd.DataFrame(
    np.random.rand(4, 2),
    index=[outer_labels, inner_labels],
    columns=["data1", "data2"],
)
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.424633,0.743632
a,2,0.789501,0.684653
b,1,0.725445,0.502226
b,2,0.102552,0.951637


#### Pass a dictionary with tuples as keys: pandas recognizes this and uses a MultiIndex by default

In [22]:
# Tuple keys are turned into a MultiIndex automatically when the dict is
# passed to pd.Series; the entries keep the order in which they are written.
data = {
    ('California', 2000): 33871648,
    ('California', 2010): 37253956,
    ('Texas', 2000): 20851820,
    ('Texas', 2010): 25145561,
    ('New York', 2000): 18976457,
    ('New York', 2010): 19378102,
}
df = pd.Series(data)
df

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

#### Note: Sometimes it's useful to explicitly create a MultiIndex

### Explicit MultiIndex Constructors

#### - Creating from arrays

In [23]:
# One array per level: the first array holds the outer labels, the second the inner labels.
pd.MultiIndex.from_arrays([['a','a','b','b'],[1,2,1,2]])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

#### - Creating from tuples

In [24]:
# One tuple per entry, each giving the full (outer, inner) label of that entry.
pd.MultiIndex.from_tuples([('a',1),('a', 2),('b',1),('b',2)])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

#### - Creating even from Cartesian Product

In [28]:
# Every combination of the given level values (Cartesian product), outer level varying slowest.
pd.MultiIndex.from_product([['a','b'],[1,2]])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

### MultiIndex Level Names

In [30]:
# Name the two index levels in place; the names then appear as a header
# line above the index in the displayed output.
pop.index.names = ['State', 'Year']
pop

State       Year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

### MultiIndex For Columns

In [40]:
# Hierarchical labels on both axes: rows are (Year, Visit), columns are (Subject, Type).
index = pd.MultiIndex.from_product(
    [[2013, 2014], [1, 2]],
    names=['Year', 'Visit'],
)
columns = pd.MultiIndex.from_product(
    [['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
    names=['Subject', 'Type'],
)

# Mock readings: standard-normal noise rounded to one decimal place.
data = np.round(np.random.randn(len(index), len(columns)), 1)
data[:, ::2] *= 10  # every other column (the HR columns) gets a 10x wider spread
data += 37          # shift all readings so they are centred on 37

health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,Subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,Type,HR,Temp,HR,Temp,HR,Temp
Year,Visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,41.0,36.6,55.0,36.8,39.0,36.1
2013,2,23.0,36.7,47.0,38.1,30.0,35.7
2014,1,48.0,36.3,65.0,35.3,42.0,37.0
2014,2,33.0,38.1,60.0,38.0,31.0,36.7


In [41]:
# Fundamentally, this is four-dimensional data with the dimensions Year, Visit, Subject and Type.

In [60]:
# Indexing with a top-level column label returns the sub-frame holding just that subject's columns.
health_data['Bob']

Unnamed: 0_level_0,Type,HR,Temp
Year,Visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,41.0,36.6
2013,2,23.0,36.7
2014,1,48.0,36.3
2014,2,33.0,38.1


## Indexing and Slicing in MultiIndex

### Multiply Indexed Series

In [61]:
# Redisplay the (State, Year) population Series used in the indexing examples that follow.
pop

State       Year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

#### - Can access single element by indexing multiple terms

In [62]:
# Supplying a label for every level returns a single scalar value.
pop['California', 2000]

33871648

#### - MultiIndex also supports Partial Indexing

In [63]:
# Supplying only the first-level label returns a Series keyed by the remaining level (Year).
pop['California']

Year
2000    33871648
2010    37253956
dtype: int64

#### - Partial Slicing

In [64]:
# Label slice on the first level; both endpoint states are included in the result.
pop['California':'New York']

State       Year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

#### - With sorted indices, we can perform partial indexing on lower levels by passing an empty slice in the first index:

In [66]:
# Empty slice for the first level plus a label for the second: the year 2000 for every state.
pop[:,2000]

State
California    33871648
New York      18976457
Texas         20851820
dtype: int64

#### - Boolean Mask

In [69]:
# Boolean mask: keep only the entries with a population above 22 million.
pop[pop > 22000000]

State       Year
California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64

#### - Fancy Indexing

In [70]:
# A list of first-level labels selects all rows belonging to those states.
pop[['California','Texas']]

State       Year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64

### Multiply Indexed DataFrames

In [71]:
# Redisplay the frame with hierarchical rows (Year, Visit) and columns (Subject, Type).
health_data

Unnamed: 0_level_0,Subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,Type,HR,Temp,HR,Temp,HR,Temp
Year,Visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,41.0,36.6,55.0,36.8,39.0,36.1
2013,2,23.0,36.7,47.0,38.1,30.0,35.7
2014,1,48.0,36.3,65.0,35.3,42.0,37.0
2014,2,33.0,38.1,60.0,38.0,31.0,36.7


#### - Note: columns are primary in a DataFrame, and the syntax used for multiply indexed Series applies to the columns.

In [72]:
# A label for each column level (Subject, Type) picks out a single column as a Series.
health_data['Guido', 'HR']

Year  Visit
2013  1        55.0
      2        47.0
2014  1        65.0
      2        60.0
Name: (Guido, HR), dtype: float64

#### As in the single-index case, the loc and iloc indexers can be used (the older ix indexer was removed in pandas 1.0)

In [76]:
# Positional selection: the first two rows and the first two columns.
health_data.iloc[:2, :2]

Unnamed: 0_level_0,Subject,Bob,Bob
Unnamed: 0_level_1,Type,HR,Temp
Year,Visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,41.0,36.6
2013,2,23.0,36.7


#### Each individual index in loc and iloc can be passed a tuple of multiple indices

In [79]:
# Label-based selection: all rows, and the one column identified by the tuple ('Bob', 'HR').
health_data.loc[:,('Bob', 'HR')]

Year  Visit
2013  1        41.0
      2        23.0
2014  1        48.0
      2        33.0
Name: (Bob, HR), dtype: float64

In [80]:
#Working with slices within these index tuples is not especially convenient; trying to
#create a slice within a tuple will lead to a syntax error:

In [81]:
# A bare slice such as `:` is not legal inside a tuple literal, so the expression
# below cannot even be parsed. It is compiled from a string so that the resulting
# SyntaxError can be displayed without aborting a "Run All" of the notebook.
slice_in_tuple = "health_data.loc[(:,1), (:,'HR')]"
try:
    compile(slice_in_tuple, '<slice-in-tuple demo>', 'eval')
except SyntaxError as err:
    print('{}: {}'.format(type(err).__name__, err.msg))

SyntaxError: invalid syntax (<ipython-input-81-67963efa6544>, line 1)

#### Note: you can accomplish this by building the slices explicitly with Python's built-in slice() function

#### A better way in this context is to use IndexSlice

In [83]:
# pd.IndexSlice lets slice syntax be written inside a multi-level key.
# Rows: any year, visit 1. Columns: any subject, the 'HR' type.
idx = pd.IndexSlice
health_data.loc[idx[:,1], idx[:,'HR']]

Unnamed: 0_level_0,Subject,Bob,Guido,Sue
Unnamed: 0_level_1,Type,HR,HR,HR
Year,Visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,41.0,55.0,39.0
2014,1,48.0,65.0,42.0


## Rearranging Multi-Indices

### Sorted and unsorted indices

#### Many of the MultiIndex slicing operations will fail if the index is not sorted.

In [87]:
# Build a Series whose first index level is deliberately out of order ('a', 'c', 'b');
# the values are random, so they differ on every run.
levels = [['a', 'c', 'b'], [1, 2]]
index = pd.MultiIndex.from_product(levels)
data = pd.Series(np.random.rand(len(index)), index=index)
data.index.names = ['char', 'int']
data

char  int
a     1      0.353206
      2      0.517852
c     1      0.415259
      2      0.274167
b     1      0.233315
      2      0.765991
dtype: float64

In [88]:
# Slicing on the first level fails while the index is unsorted.
# The exception is caught so that its type and message can be displayed.
try:
    data['a':'b']
except KeyError as err:
    print(type(err))
    print(err)

<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'


#### Unsorted indices can be sorted with the sort_index() method (the older sortlevel() method has been removed from pandas)

In [91]:
# Sort the index so that slicing works.
# NOTE: this rebinds `data`; the unsorted Series is no longer available under that name.
data = data.sort_index()
data

char  int
a     1      0.353206
      2      0.517852
b     1      0.233315
      2      0.765991
c     1      0.415259
      2      0.274167
dtype: float64

In [93]:
# With the index sorted, the first-level slice now succeeds (both endpoints included).
data['a':'b']

char  int
a     1      0.353206
      2      0.517852
b     1      0.233315
      2      0.765991
dtype: float64

### Stacking and Unstacking Indices

In [94]:
# Redisplay the (State, Year) population Series before reshaping it.
pop

State       Year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [96]:
# Move index level 0 (State) into the columns; Year remains as the row index.
pop.unstack(level=0)

State,California,New York,Texas
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [97]:
# Move index level 1 (Year) into the columns; State remains as the row index.
pop.unstack(level=1)

Year,2000,2010
State,Unnamed: 1_level_1,Unnamed: 2_level_1
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [98]:
# stack() reverses unstack(): the round trip gives back the original Series.
pop.unstack().stack()

State       Year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

### Index Setting and resetting

In [99]:
# Redisplay the (State, Year) population Series before flattening its index.
pop

State       Year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [101]:
# Raw real-world data often looks like this: the index labels are ordinary columns.
# reset_index() turns the State and Year levels into columns; `name` labels the values column.
pop_flat = pop.reset_index(name = 'population')
pop_flat

Unnamed: 0,State,Year,population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


In [103]:
# Rebuild the (State, Year) MultiIndex from the flat columns.
# The result is only displayed here; `pop_flat` itself is left unchanged.
pop_flat.set_index(['State', 'Year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
State,Year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


#### Note: this type of reindexing is one of the more useful patterns when working with real-world datasets.

### Data Aggregations on Multi-Indices

#### - level parameter controls which subset of the data the aggregate is computed on.

In [115]:
# Redisplay the health readings before aggregating over index levels.
health_data

Unnamed: 0_level_0,Subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,Type,HR,Temp,HR,Temp,HR,Temp
Year,Visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,41.0,36.6,55.0,36.8,39.0,36.1
2013,2,23.0,36.7,47.0,38.1,30.0,35.7
2014,1,48.0,36.3,65.0,35.3,42.0,37.0
2014,2,33.0,38.1,60.0,38.0,31.0,36.7


In [122]:
# Average the visits within each year by grouping the rows on the 'Year' index level.
# groupby(level=...) is used because the `level` keyword of DataFrame.mean was
# deprecated in pandas 1.3 and removed in pandas 2.0; the result is the same table.
data_mean_y = health_data.groupby(level='Year').mean()
data_mean_y

Subject,Bob,Bob,Guido,Guido,Sue,Sue
Type,HR,Temp,HR,Temp,HR,Temp
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,32.0,36.65,51.0,37.45,34.5,35.9
2014,40.5,37.2,62.5,36.65,36.5,36.85


In [121]:
# Average the years for each visit number by grouping the rows on the 'Visit' index level.
# groupby(level=...) is used because the `level` keyword of DataFrame.mean was
# deprecated in pandas 1.3 and removed in pandas 2.0; the result is the same table.
data_mean = health_data.groupby(level='Visit').mean()
data_mean

Subject,Bob,Bob,Guido,Guido,Sue,Sue
Type,HR,Temp,HR,Temp,HR,Temp
Visit,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,44.5,36.45,60.0,36.05,40.5,36.55
2,28.0,37.4,53.5,38.05,30.5,36.2


In [125]:
# Average the per-year means across subjects, keeping the HR and Temp measurements apart.
# The columns are grouped on their 'Type' level: the frame is transposed so that the
# grouping runs along the rows, then transposed back so Year is the row index again.
# This replaces mean(axis=1, level='Type'), whose `level` keyword was removed in pandas 2.0.
data_mean_y.T.groupby(level='Type').mean().T

Type,HR,Temp
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,39.166667,36.666667
2014,46.5,36.9
