# Multi-Index

In [7]:
import pandas as pd
import numpy as np

In [1]:
def make_df(cols, ind):
    """Build a small demo DataFrame whose cells spell out their own position.

    Each element ``c`` of ``cols`` becomes a column and each element ``i`` of
    ``ind`` becomes a row label; the cell at (i, c) holds ``str(c) + str(i)``.
    """
    columns = {}
    for label in cols:
        columns[label] = [str(label) + str(row) for row in ind]
    return pd.DataFrame(columns, ind)

In [3]:
make_df('ABC', range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [4]:
# A nested list standing in for a 2x2 array.
x = [[1, 2], [3, 4]]
# axis=1 joins the two copies side by side, giving a 2x4 array.
np.concatenate([x, x], axis=1)

array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

In [5]:
# axis=0 stacks the two copies on top of each other, giving a 4x2 array.
np.concatenate([x, x], axis=0)

array([[1, 2],
       [3, 4],
       [1, 2],
       [3, 4]])

In [6]:
x

[[1, 2], [3, 4]]

In [7]:
# Two Series with disjoint integer labels. Concatenating them keeps every
# original label, so the combined index runs 1..6.
# (Try index=[1, 2, 3] for the second Series to see how overlapping labels
# are handled, and check len(ser) afterwards.)
ser1 = pd.Series(list('ABC'), index=[1, 2, 3])
ser2 = pd.Series(list('DEF'), index=[4, 5, 6])
ser = pd.concat((ser1, ser2))
ser.index

Int64Index([1, 2, 3, 4, 5, 6], dtype='int64')

In [8]:
# Two frames that share columns A and B but cover different row labels,
# printed one after the other.
df1, df2 = make_df('AB', [1, 2]), make_df('AB', [3, 4])
print(df1, df2, sep='\n')

    A   B
1  A1  B1
2  A2  B2
    A   B
3  A3  B3
4  A4  B4


In [9]:
# Row-wise concatenation: df2's rows follow df1's and the original labels 1-4 are kept.
print(pd.concat([df1, df2]))

    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4


In [6]:
# One (state, year) tuple per observation; used as plain tuple labels for now.
index = [
    ('California', 2000),
    ('California', 2010),
    ('New York', 2000),
    ('New York', 2010),
    ('Texas', 2000),
    ('Texas', 2010),
]
# Population figures, in the same order as the labels above.
populations = [
    33871648, 37253956,
    18976457, 19378102,
    20851820, 25145561,
]
pop = pd.Series(populations, index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [7]:
# At this point the index is a flat Index of tuples (dtype object), not a MultiIndex.
pop.index

Index([('California', 2000), ('California', 2010),   ('New York', 2000),
         ('New York', 2010),      ('Texas', 2000),      ('Texas', 2010)],
      dtype='object')

In [11]:
# Each index label is a (state, year) tuple; list the states that have a 2010 entry.
for state_name, yr in pop.index:
    if yr == 2010:
        print(state_name)

California
New York
Texas


In [12]:
# Each index label is a (state, year) tuple; list the years recorded for California.
for state_name, yr in pop.index:
    if state_name == 'California':
        print(yr)

2000
2010


In [12]:
# Convert the list of (state, year) tuples into a two-level MultiIndex.
# Note that this rebinds `index` from a list to a MultiIndex.
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [13]:
# The distinct labels of each level: states first, then years.
index.levels

FrozenList([['California', 'New York', 'Texas'], [2000, 2010]])

In [14]:
# One integer array per level; each entry is a position into the matching `levels` list.
index.codes

FrozenList([[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

In [15]:
# Re-label `pop` with the MultiIndex; each value stays with its (state, year) pair.
pop = pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [16]:
# Partial indexing: every state on the first level, year fixed to 2010 on the second.
pop[:, 2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [17]:
# Passing a list of first-level labels keeps both index levels in the result.
pop[['Texas']]

Texas  2000    20851820
       2010    25145561
dtype: int64

In [18]:
# Move the inner (year) level into the columns, giving a state x year DataFrame.
pop_df = pop.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [19]:
# Rebuild pop_df with two columns on the (state, year) MultiIndex: the totals
# from `pop`, plus an under-18 count given as a plain list in the same row order.
pop_df = pd.DataFrame({'total': pop,
                       'under18': [9267089, 9284094, 4687374, 4318033, 5906301, 6879014]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [21]:
# Fraction of each (state, year) population that is under 18, computed row by row.
f_u18 = pop_df['under18'] / pop_df['total']
f_u18

California  2000    0.273594
            2010    0.249211
New York    2000    0.247010
            2010    0.222831
Texas       2000    0.283251
            2010    0.273568
dtype: float64

In [22]:
# Same fractions as a state x year table.
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


In [15]:
# A list of two parallel label arrays is turned into a two-level row index
# by the DataFrame constructor.
index = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]]
# Values are random and no seed is set in this cell, so they differ between runs.
df = pd.DataFrame(np.random.rand(4, 2), index=index, columns=['data1', 'data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.38514,0.246362
a,2,0.035291,0.420658
b,1,0.731305,0.885236
b,2,0.913793,0.007135


In [16]:
index

[['a', 'a', 'b', 'b'], [1, 2, 1, 2]]

In [5]:
# Hierarchical row labels (year x visit) and column labels (subject x type).
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]], names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']], names=['subject', 'type'])

# Use a seeded generator so that re-running the notebook reproduces the same
# table; the unseeded global np.random state gave different values on each run,
# which is why saved outputs of this frame could disagree with each other.
rng = np.random.default_rng(0)
data = np.round(rng.standard_normal((4, 6)), 1)
data[:, ::2] *= 10  # widen the spread of every other column (the 'HR' ones)
data += 37          # centre all readings around 37
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,48.0,38.6,40.0,37.6,34.0,36.5
2013,2,43.0,36.8,25.0,37.3,48.0,36.5
2014,1,34.0,36.0,37.0,35.5,33.0,37.3
2014,2,39.0,35.7,34.0,38.0,36.0,36.8


In [36]:
index

MultiIndex([(2013, 1),
            (2013, 2),
            (2014, 1),
            (2014, 2)],
           names=['year', 'visit'])

In [37]:
columns

MultiIndex([(  'Bob',   'HR'),
            (  'Bob', 'Temp'),
            ('Guido',   'HR'),
            ('Guido', 'Temp'),
            (  'Sue',   'HR'),
            (  'Sue', 'Temp')],
           names=['subject', 'type'])

In [41]:
# Selecting on the outer column level returns the sub-frame of Sue's HR and Temp columns.
health_data['Sue']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,38.0,37.3
2013,2,42.0,34.8
2014,1,39.0,37.0
2014,2,27.0,35.6


In [52]:
# A full (subject, type) tuple selects a single column, returned as a Series.
health_data.loc[:, ('Bob', 'Temp')]

year  visit
2013  1        37.2
      2        36.9
2014  1        37.1
      2        37.6
Name: (Bob, Temp), dtype: float64

In [54]:
# With no argument, the innermost row level ('visit') is moved into the columns,
# leaving 'year' as the only row level.
health_data.unstack()

subject,Bob,Bob,Bob,Bob,Guido,Guido,Guido,Guido,Sue,Sue,Sue,Sue
type,HR,HR,Temp,Temp,HR,HR,Temp,Temp,HR,HR,Temp,Temp
visit,1,2,1,2,1,2,1,2,1,2,1,2
year,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
2013,31.0,35.0,37.2,36.9,49.0,38.0,36.5,39.0,38.0,42.0,37.3,34.8
2014,31.0,21.0,37.1,37.6,39.0,30.0,37.0,36.1,39.0,27.0,37.0,35.6


In [55]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,31.0,37.2,49.0,36.5,38.0,37.3
2013,2,35.0,36.9,38.0,39.0,42.0,34.8
2014,1,31.0,37.1,39.0,37.0,39.0,37.0
2014,2,21.0,37.6,30.0,36.1,27.0,35.6


In [56]:
# Average the visits within each year.
# The `level=` keyword of DataFrame.mean() was deprecated in pandas 1.3 and
# removed in 2.0; grouping on the index level is the supported way to compute
# the same per-level reduction.
data_mean = health_data.groupby(level='year').mean()
data_mean

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,33.0,37.05,43.5,37.75,40.0,36.05
2014,26.0,37.35,34.5,36.55,33.0,36.3


In [57]:
# Average across subjects for each measurement type (HR, Temp).
# mean(axis=1, level=...) no longer exists in pandas >= 2.0 and groupby(axis=1)
# is deprecated as well, so group the transposed frame on the 'type' level of
# the (former) columns and transpose the result back to year x type.
data_mean.T.groupby(level='type').mean().T

type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,38.833333,36.95
2014,31.166667,36.733333


In [60]:
# Sum across subjects for each measurement type (HR, Temp).
# sum(axis=1, level=...) no longer exists in pandas >= 2.0 and groupby(axis=1)
# is deprecated as well, so group the transposed frame on the 'type' level of
# the (former) columns and transpose the result back to year x type.
data_mean.T.groupby(level='type').sum().T

type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,116.5,110.85
2014,93.5,110.2


In [55]:
# Parallel label lists: one state and one year per population figure.
state = ['California'] * 2 + ['New York'] * 2 + ['Texas'] * 2
year = [2000, 2010] * 3
arrays = [state, year]
populations = [
    33871648, 37253956,
    18976457, 19378102,
    20851820, 25145561,
]
# from_arrays pairs the lists element-wise into (state, year) index entries.
index = pd.MultiIndex.from_arrays(arrays)
pop2 = pd.DataFrame({'Population': populations}, index=index)
pop2

Unnamed: 0,Unnamed: 1,Population
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [62]:
# Same construction as pop2, with the label arrays passed inline instead of
# through the intermediate `arrays` list; the resulting frame is identical.
index2 = pd.MultiIndex.from_arrays([state, year])
pop3 = pd.DataFrame(populations, index =index2, columns=['Population'])
pop3

Unnamed: 0,Unnamed: 1,Population
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [48]:
# A first-level label alone selects that state's rows, indexed by the remaining year level.
pop2.loc['California']

Unnamed: 0,Population
2000,33871648
2010,37253956


In [61]:
# A full (state, year) tuple selects a single row, returned as a Series.
pop2.loc[('California', 2010)]

Population    37253956
Name: (California, 2010), dtype: int64

In [50]:
# Mean population per state (index level 0), averaged over the two years.
# DataFrame.mean(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level computes the same result.
data_mean = pop2.groupby(level=0).mean()
data_mean

Unnamed: 0,Population
California,35562802.0
New York,19177279.5
Texas,22998690.5


In [51]:
# Mean population per year (index level 1), averaged over the three states.
# DataFrame.mean(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level computes the same result.
data_mean = pop2.groupby(level=1).mean()
data_mean

Unnamed: 0,Population
2000,24566640.0
2010,27259210.0


## Creating a MultiIndex (hierarchical index) object

Reference: [pandas user guide, "MultiIndex / advanced indexing"](https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html)

In [1]:
# Two parallel lists: the first-level and the second-level label of each row.
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
          ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]

In [9]:
# Unpack `arrays` so zip pairs the two lists element-wise: one (first, second) tuple per row.
tuples = list(zip(*arrays))

In [3]:
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [10]:
# Build the MultiIndex from the tuples and name its two levels.
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

In [11]:
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [12]:
# Eight random values, one per (first, second) pair; no seed is set in this cell.
s = pd.Series(np.random.randn(8), index=index)

In [13]:
s

first  second
bar    one      -0.062268
       two       0.014994
baz    one      -1.386046
       two      -0.639692
foo    one      -1.123991
       two      -2.147776
qux    one      -0.255966
       two       0.756697
dtype: float64

In [14]:
s.index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [17]:
s

first  second
bar    one      -0.062268
       two       0.014994
baz    one      -1.386046
       two      -0.639692
foo    one      -1.123991
       two      -2.147776
qux    one      -0.255966
       two       0.756697
dtype: float64

In [16]:
# Addition aligns on the index: the two 'qux' rows are absent from s[:-2],
# so their sums come out as NaN.
s + s[:-2]

first  second
bar    one      -0.124535
       two       0.029989
baz    one      -2.772091
       two      -1.279384
foo    one      -2.247982
       two      -4.295552
qux    one            NaN
       two            NaN
dtype: float64

In [26]:
# Drop the last six rows, leaving only the two 'bar' entries.
s[:-6]

first  second
bar    one      -0.062268
       two       0.014994
dtype: float64

In [28]:
# Integer slice: the first four rows by position.
s[0:4]

first  second
bar    one      -0.062268
       two       0.014994
baz    one      -1.386046
       two      -0.639692
dtype: float64

In [31]:
# reindex() called without a target index: the displayed result matches s unchanged.
s.reindex()

first  second
bar    one      -0.062268
       two       0.014994
baz    one      -1.386046
       two      -0.639692
foo    one      -1.123991
       two      -2.147776
qux    one      -0.255966
       two       0.756697
dtype: float64

## Advanced reindexing and alignment

In [34]:
# Build the index directly from its parts: each entry of `codes` is a position
# into the matching `levels` list, so the rows come out as
# (one, y), (one, x), (zero, y), (zero, x).
midx = pd.MultiIndex(levels=[['zero', 'one'], ['x', 'y']],
                     codes=[[1, 1, 0, 0], [1, 0, 1, 0]])

In [33]:
# Random 4x2 frame on that index; the columns get the default labels 0 and 1.
# No seed is set in this cell, so the values differ between runs.
df = pd.DataFrame(np.random.randn(4, 2), index=midx)
df

Unnamed: 0,Unnamed: 1,0,1
one,y,-0.318801,-0.490244
one,x,-1.35566,-2.155919
zero,y,0.812477,-0.989396
zero,x,0.334909,-0.765347


In [36]:
# Sum the rows within each first-level group ('one', 'zero').
# DataFrame.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level is the supported equivalent.
df2 = df.groupby(level=0).sum()
df2

Unnamed: 0,0,1
one,-1.674461,-2.646163
zero,1.147386,-1.754742


In [37]:
# Broadcast the per-group sums back onto df's full index, matching on level 0,
# so each group's sum is repeated for every row of that group.
df2.reindex(df.index, level=0)

Unnamed: 0,Unnamed: 1,0,1
one,y,-1.674461,-2.646163
one,x,-1.674461,-2.646163
zero,y,1.147386,-1.754742
zero,x,1.147386,-1.754742


In [38]:
# align() returns both frames conformed to a common index, matching df2 to df on level 0.
df_aligned, df2_aligned = df.align(df2, level=0)

In [39]:
df_aligned

Unnamed: 0,Unnamed: 1,0,1
one,y,-0.318801,-0.490244
one,x,-1.35566,-2.155919
zero,y,0.812477,-0.989396
zero,x,0.334909,-0.765347


In [40]:
df2_aligned

Unnamed: 0,Unnamed: 1,0,1
one,y,-1.674461,-2.646163
one,x,-1.674461,-2.646163
zero,y,1.147386,-1.754742
zero,x,1.147386,-1.754742


In [41]:
# df has only four rows, so slicing up to position 5 returns all of them.
df[:5]

Unnamed: 0,Unnamed: 1,0,1
one,y,-0.318801,-0.490244
one,x,-1.35566,-2.155919
zero,y,0.812477,-0.989396
zero,x,0.334909,-0.765347


In [42]:
# Swap the two row-index levels; the rows themselves keep their order.
df[:5].swaplevel(0, 1, axis=0)

Unnamed: 0,Unnamed: 1,0,1
y,one,-0.318801,-0.490244
x,one,-1.35566,-2.155919
y,zero,0.812477,-0.989396
x,zero,0.334909,-0.765347


In [45]:
# Called with no arguments, swaplevel() produces the same result here as swaplevel(0, 1, axis=0).
df[:5].swaplevel()

Unnamed: 0,Unnamed: 1,0,1
y,one,-0.318801,-0.490244
x,one,-1.35566,-2.155919
y,zero,0.812477,-0.989396
x,zero,0.334909,-0.765347


## Sorting a MultiIndex

In [47]:
import random

In [48]:
# Shuffles `tuples` in place. No seed is set in this cell, so the order differs between runs.
random.shuffle(tuples)

In [49]:
# New Series on a MultiIndex built from the shuffled tuples (level names are not set here).
s = pd.Series(np.random.randn(8), index=pd.MultiIndex.from_tuples(tuples))

In [50]:
s

foo  two   -0.375401
baz  two   -0.386903
     one    1.226636
bar  two    0.155815
qux  one   -0.044772
bar  one    0.228158
qux  two    0.465785
foo  one   -0.744137
dtype: float64

In [61]:
# Sort the rows by their index labels and keep the sorted Series as `s`.
s = s.sort_index()

In [53]:
# Order the rows by the second index level first ('one' rows before 'two' rows).
s.sort_index(level=1)

bar  one    0.228158
baz  one    1.226636
foo  one   -0.744137
qux  one   -0.044772
bar  two    0.155815
baz  two   -0.386903
foo  two   -0.375401
qux  two    0.465785
dtype: float64

In [54]:
# Name the two index levels. Index.set_names() no longer accepts inplace=True
# (removed in pandas 2.0), so assign the renamed index back instead.
s.index = s.index.set_names(['L1', 'L2'])

In [55]:
# Levels can be referred to by name instead of position when sorting.
s.sort_index(level='L1')

L1   L2 
bar  one    0.228158
     two    0.155815
baz  one    1.226636
     two   -0.386903
foo  one   -0.744137
     two   -0.375401
qux  one   -0.044772
     two    0.465785
dtype: float64

In [62]:
# Check that the index is sorted. MultiIndex.is_lexsorted() was deprecated in
# pandas 1.3 and removed in 2.0; is_monotonic_increasing is the documented
# replacement.
s.index.is_monotonic_increasing

True

In [63]:
# How many leading index levels are lexicographically sorted.
# The public MultiIndex.lexsort_depth attribute was removed in pandas 2.0; the
# value is still available as the private _lexsort_depth, which the pandas user
# guide itself uses. NOTE(review): private API, may change without notice.
s.index._lexsort_depth

2

## Take methods

In [66]:
# An Index holding ten random integers drawn from 0 (inclusive) to 1000 (exclusive).
index = pd.Index(np.random.randint(0, 1000, 10))
index

Int64Index([536, 724, 108, 343, 570, 658, 997, 954, 516, 297], dtype='int64')

In [67]:
positions = [0, 9, 3]

In [68]:
# Indexing with a list of integer positions picks those elements, in the order given.
index[positions]

Int64Index([536, 297, 343], dtype='int64')

In [69]:
# take() selects the same positions and returns the same elements as index[positions].
index.take(positions)

Int64Index([536, 297, 343], dtype='int64')

In [70]:
# Ten random values on the default 0..9 integer index.
ser = pd.Series(np.random.randn(10))
ser

0   -2.296926
1    0.508348
2   -0.265144
3   -1.301210
4   -0.608755
5    1.453753
6    1.534709
7    0.466236
8   -0.954846
9    0.911553
dtype: float64

In [71]:
# Positional selection of rows 0, 9 and 3, returned in that order.
ser.iloc[positions]

0   -2.296926
9    0.911553
3   -1.301210
dtype: float64

In [73]:
# take() is also positional and returns the same rows as ser.iloc[positions].
ser.take(positions)

0   -2.296926
9    0.911553
3   -1.301210
dtype: float64