In [1]:
import numpy as np
import pandas as pd

In [12]:
a = np.random.randint(10,size=4)
a

array([8, 2, 1, 5])

In [13]:
index = [
    ('a',0),
    ('a',1),
    ('b',0),
    ('b',1)    
]
index

[('a', 0), ('a', 1), ('b', 0), ('b', 1)]

In [14]:
pd.Series(a, index = index)

(a, 0)    8
(a, 1)    2
(b, 0)    1
(b, 1)    5
dtype: int32

In [17]:
multiindex = pd.MultiIndex.from_tuples(index)
multiindex

MultiIndex(levels=[['a', 'b'], [0, 1]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [42]:
s = pd.Series(a, index=multiindex)
print(s)
s.loc[:,0] # double indexing works on series. But not on data frame

a  0    8
   1    2
b  0    1
   1    5
dtype: int32


a    8
b    1
dtype: int32

In [21]:
s.unstack()

Unnamed: 0,0,1
a,8,2
b,1,5


# Ways to create multi index

## 1. from tuples

In [39]:
pd.MultiIndex.from_tuples([    ('a',0),
    ('a',1),
    ('b',0),
    ('b',1) ])

MultiIndex(levels=[['a', 'b'], [0, 1]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

## 2. From arrays

In [38]:
pd.MultiIndex.from_arrays([
    ['a','a','b','b'],
    ['0','1','0','1']
]    )

MultiIndex(levels=[['a', 'b'], ['0', '1']],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

## 3. From cartesian products

In [37]:
pd.MultiIndex.from_product([['a','b'],[0,1]])

MultiIndex(levels=[['a', 'b'], [0, 1]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

## 4. Directly raw

In [36]:
pd.MultiIndex(levels=[['a', 'b'], [0, 1]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

MultiIndex(levels=[['a', 'b'], [0, 1]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

# Multi index dataframe


In [222]:
df = pd.DataFrame(np.random.randint(10,size=(6,2)),
                 index = pd.MultiIndex.from_product([list('ABC'),[0,1]]),
                 columns = ['col1','col2'])
df

Unnamed: 0,Unnamed: 1,col1,col2
A,0,3,1
A,1,8,8
B,0,2,1
B,1,6,1
C,0,6,3
C,1,7,8


In [78]:
df['col1'] # no loc iloc needed for columns

A  0    4
   1    7
B  0    6
   1    4
C  0    2
   1    5
Name: col1, dtype: int32

In [79]:
df.loc['A',0] 

col1    4
col2    5
Name: (A, 0), dtype: int32

In [132]:
df.loc['A',0:1,:] 

Unnamed: 0,Unnamed: 1,col1,col2
A,0,4,5
A,1,7,5


In [83]:
df.iloc[0:3]

Unnamed: 0,Unnamed: 1,col1,col2
A,0,4,5
A,1,7,5
B,0,6,9


In [87]:
df.loc[df['col1']<6]

Unnamed: 0,Unnamed: 1,col1,col2
A,0,4,5
B,1,4,6
C,0,2,5
C,1,5,6


In [94]:
df.iloc[:2,:1] #iloc treats dataframe as numpy 2d array

Unnamed: 0,Unnamed: 1,col1
A,0,4
A,1,7


In [102]:
df.loc[[('A',0),('B',1)]]

Unnamed: 0,Unnamed: 1,col1,col2
A,0,4,5
B,1,4,6


In [156]:
#df.loc['A':'B',0] # does not work
idx = pd.IndexSlice
df.loc[idx['A':'B',0],:]

Unnamed: 0,Unnamed: 1,col1,col2
A,0,4,5
B,0,6,9


In [227]:
#df.stack().loc[idx['A':'B',0:1,:]]
df.stack().loc['A':'B',0:1,:] # not need to create indexslice for series

A  0  col1    3
      col2    1
   1  col1    8
      col2    8
B  0  col1    2
      col2    1
   1  col1    6
      col2    1
dtype: int32

# Sorting

In [228]:
s = pd.Series(np.random.normal(size=6),
                 index = pd.MultiIndex.from_product([['a','b'],[2,0,1]], names = ['char','int'])
              )
s

char  int
a     2     -1.146190
      0      0.039736
      1     -0.180852
b     2     -0.440489
      0      0.167521
      1      1.087358
dtype: float64

In [229]:
s2 = s*2
df=pd.DataFrame({'s1':s,'s2':s2})
df

Unnamed: 0_level_0,Unnamed: 1_level_0,s1,s2
char,int,Unnamed: 2_level_1,Unnamed: 3_level_1
a,2,-1.14619,-2.292379
a,0,0.039736,0.079472
a,1,-0.180852,-0.361704
b,2,-0.440489,-0.880977
b,0,0.167521,0.335043
b,1,1.087358,2.174717


In [230]:
s.loc['a':'b',0]

#df.loc['a':'b',0] #does not work
# df['s1'].loc['a':'b',0] #works

char  int
a     0      0.039736
b     0      0.167521
dtype: float64

In [231]:
#s.loc['a':,0:1]#UnsortedIndexError 
s.sort_index().loc['a':,0:1]

char  int
a     0      0.039736
      1     -0.180852
b     0      0.167521
      1      1.087358
dtype: float64

# stacking unstacking

In [238]:
df.unstack().stack().stack()

char  int    
a     0    s1    0.039736
           s2    0.079472
      1    s1   -0.180852
           s2   -0.361704
      2    s1   -1.146190
           s2   -2.292379
b     0    s1    0.167521
           s2    0.335043
      1    s1    1.087358
           s2    2.174717
      2    s1   -0.440489
           s2   -0.880977
dtype: float64

In [237]:
df.unstack().stack().stack().unstack(level=1)

Unnamed: 0_level_0,int,0,1,2
char,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,s1,0.039736,-0.180852,-1.14619
a,s2,0.079472,-0.361704,-2.292379
b,s1,0.167521,1.087358,-0.440489
b,s2,0.335043,2.174717,-0.880977


In [242]:
df.unstack().stack().stack().unstack(level=0)# default is last level

Unnamed: 0_level_0,char,a,b
int,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,s1,0.039736,0.167521
0,s2,0.079472,0.335043
1,s1,-0.180852,1.087358
1,s2,-0.361704,2.174717
2,s1,-1.14619,-0.440489
2,s2,-2.292379,-0.880977


In [240]:
df.unstack().unstack()

    int  char
s1  0    a       0.039736
         b       0.167521
    1    a      -0.180852
         b       1.087358
    2    a      -1.146190
         b      -0.440489
s2  0    a       0.079472
         b       0.335043
    1    a      -0.361704
         b       2.174717
    2    a      -2.292379
         b      -0.880977
dtype: float64

# set/reset index

Converts indexes in columns

In [243]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,s1,s2
char,int,Unnamed: 2_level_1,Unnamed: 3_level_1
a,2,-1.14619,-2.292379
a,0,0.039736,0.079472
a,1,-0.180852,-0.361704
b,2,-0.440489,-0.880977
b,0,0.167521,0.335043
b,1,1.087358,2.174717


In [246]:
df.reset_index()

Unnamed: 0,char,int,s1,s2
0,a,2,-1.14619,-2.292379
1,a,0,0.039736,0.079472
2,a,1,-0.180852,-0.361704
3,b,2,-0.440489,-0.880977
4,b,0,0.167521,0.335043
5,b,1,1.087358,2.174717


In [247]:
df.reset_index(level=1)

Unnamed: 0_level_0,int,s1,s2
char,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,2,-1.14619,-2.292379
a,0,0.039736,0.079472
a,1,-0.180852,-0.361704
b,2,-0.440489,-0.880977
b,0,0.167521,0.335043
b,1,1.087358,2.174717


In [249]:
df_flat = df.reset_index(level=[0,1])
df_flat

Unnamed: 0,char,int,s1,s2
0,a,2,-1.14619,-2.292379
1,a,0,0.039736,0.079472
2,a,1,-0.180852,-0.361704
3,b,2,-0.440489,-0.880977
4,b,0,0.167521,0.335043
5,b,1,1.087358,2.174717


## Setting column(s) as index

In [251]:
df_flat.set_index(['char','int']) #existing index goes away

Unnamed: 0_level_0,Unnamed: 1_level_0,s1,s2
char,int,Unnamed: 2_level_1,Unnamed: 3_level_1
a,2,-1.14619,-2.292379
a,0,0.039736,0.079472
a,1,-0.180852,-0.361704
b,2,-0.440489,-0.880977
b,0,0.167521,0.335043
b,1,1.087358,2.174717
