<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/pandas_multiindex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pandas MultiIndex
- multiple layers/tiers of indices, i.e., a hierarchy
- a DataFrame usually has just 1 index level (rows) and 1 level of columns; a MultiIndex adds more levels
- melting is opposite of pivoting
- stack vs. unstack

In [2]:
# libraries needed
import numpy as np
import pandas as pd

In [3]:
# load the Big Mac price data and tidy it in one method chain
bigmac = (
    pd.read_csv(
        'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/bigmac.csv'
    )
      # parse the 'Date' strings into datetimes
      .assign(Date = lambda frame: pd.to_datetime(frame['Date']))
      # snake_case headers: lowercase, spaces -> underscores
      .rename(columns = lambda name: name.lower().replace(' ', '_'))
)

# peek at the first five rows
bigmac.head()

Unnamed: 0,date,country,price_in_us_dollars
0,2016-01-01,Argentina,2.39
1,2016-01-01,Australia,3.74
2,2016-01-01,Brazil,3.35
3,2016-01-01,Britain,4.22
4,2016-01-01,Canada,4.14


In [4]:
# order rows by date first, then by country within each date (both ascending)
bigmac.sort_values(by = ['date', 'country'], ascending = True)

Unnamed: 0,date,country,price_in_us_dollars
609,2010-01-01,Argentina,1.84
610,2010-01-01,Australia,3.98
611,2010-01-01,Brazil,4.76
612,2010-01-01,Britain,3.67
613,2010-01-01,Canada,3.97
...,...,...,...
39,2016-01-01,Ukraine,1.54
40,2016-01-01,United States,4.93
41,2016-01-01,Uruguay,3.74
42,2016-01-01,Venezuela,0.66


## create multi-index with .set_index() method
- best practice is for outer layer/index to have fewer unique values than inner layer/index

In [5]:
# reload the Big Mac price data so this section starts from a fresh frame
bigmac = (
    pd.read_csv(
        'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/bigmac.csv'
    )
      # 'Date' arrives as text; turn it into datetimes
      .assign(Date = lambda frame: pd.to_datetime(frame['Date']))
      # lowercase the headers and swap spaces for underscores
      .rename(columns = lambda name: name.lower().replace(' ', '_'))
)

# peek at the first five rows
bigmac.head()

Unnamed: 0,date,country,price_in_us_dollars
0,2016-01-01,Argentina,2.39
1,2016-01-01,Australia,3.74
2,2016-01-01,Brazil,3.35
3,2016-01-01,Britain,4.22
4,2016-01-01,Canada,4.14


In [6]:
# move 'date' (outer level) and 'country' (inner level) from columns into a two-level row index
bigmac.set_index(keys = ['date', 'country'])

Unnamed: 0_level_0,Unnamed: 1_level_0,price_in_us_dollars
date,country,Unnamed: 2_level_1
2016-01-01,Argentina,2.39
2016-01-01,Australia,3.74
2016-01-01,Brazil,3.35
2016-01-01,Britain,4.22
2016-01-01,Canada,4.14
...,...,...
2010-01-01,Turkey,3.83
2010-01-01,UAE,2.99
2010-01-01,Ukraine,1.83
2010-01-01,United States,3.58


In [7]:
# the order of the list matters: here 'country' becomes the outer level and 'date' the inner one
bigmac.set_index(keys = ['country', 'date'])

Unnamed: 0_level_0,Unnamed: 1_level_0,price_in_us_dollars
country,date,Unnamed: 2_level_1
Argentina,2016-01-01,2.39
Australia,2016-01-01,3.74
Brazil,2016-01-01,3.35
Britain,2016-01-01,4.22
Canada,2016-01-01,4.14
...,...,...
Turkey,2010-01-01,3.83
UAE,2010-01-01,2.99
Ukraine,2010-01-01,1.83
United States,2010-01-01,3.58


In [8]:
# best practice is for the outer level to have fewer unique values than the inner level,
# so count the distinct values in each column to decide which one goes outside

# number of unique values per column/Series
bigmac.nunique()

# 12 unique dates --> outer level
# 58 unique countries --> inner level

date                    12
country                 58
price_in_us_dollars    330
dtype: int64

In [9]:
# keep the multi-indexed frame under its own name: 'date' is the outer level, 'country' the inner
bigmac_multiindex = bigmac.set_index(keys = ['date', 'country'])

bigmac_multiindex

Unnamed: 0_level_0,Unnamed: 1_level_0,price_in_us_dollars
date,country,Unnamed: 2_level_1
2016-01-01,Argentina,2.39
2016-01-01,Australia,3.74
2016-01-01,Brazil,3.35
2016-01-01,Britain,4.22
2016-01-01,Canada,4.14
...,...,...
2010-01-01,Turkey,3.83
2010-01-01,UAE,2.99
2010-01-01,Ukraine,1.83
2010-01-01,United States,3.58


In [10]:
# sorting a MultiIndex orders the outer level first, then the inner level within each outer value
bigmac_multiindex = bigmac_multiindex.sort_index()

bigmac_multiindex

Unnamed: 0_level_0,Unnamed: 1_level_0,price_in_us_dollars
date,country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97
...,...,...
2016-01-01,Ukraine,1.54
2016-01-01,United States,4.93
2016-01-01,Uruguay,3.74
2016-01-01,Venezuela,0.66


In [11]:
# view the MultiIndex object itself
bigmac_multiindex.index
  # note the tuples: each row label is a (date, country) pair, one element per level

MultiIndex([('2010-01-01',      'Argentina'),
            ('2010-01-01',      'Australia'),
            ('2010-01-01',         'Brazil'),
            ('2010-01-01',        'Britain'),
            ('2010-01-01',         'Canada'),
            ('2010-01-01',          'Chile'),
            ('2010-01-01',          'China'),
            ('2010-01-01',       'Colombia'),
            ('2010-01-01',     'Costa Rica'),
            ('2010-01-01', 'Czech Republic'),
            ...
            ('2016-01-01',    'Switzerland'),
            ('2016-01-01',         'Taiwan'),
            ('2016-01-01',       'Thailand'),
            ('2016-01-01',         'Turkey'),
            ('2016-01-01',            'UAE'),
            ('2016-01-01',        'Ukraine'),
            ('2016-01-01',  'United States'),
            ('2016-01-01',        'Uruguay'),
            ('2016-01-01',      'Venezuela'),
            ('2016-01-01',        'Vietnam')],
           names=['date', 'country'], length=652)

In [12]:
# to access a particular row, give .loc a tuple with one label per level (outer level first)
bigmac_multiindex.loc[('2010-01-01', 'Argentina')]

price_in_us_dollars    1.84
Name: (2010-01-01 00:00:00, Argentina), dtype: float64

In [13]:
# first label of the MultiIndex: a tuple holding one value per level
bigmac_multiindex.index[0]

(Timestamp('2010-01-01 00:00:00'), 'Argentina')

## .get_level_values() method
- used to get the index values (labels) of one level in a multiindex; the per-level equivalent of df.index in a single-index dataframe
- i.e., extract level values of 1 index in a multiindex
- parameter/argument input is name of index, e.g., 'Date' or integer for position; actual string name is more readable IMO

In [14]:
# read the CSV with 'Date' parsed as datetime and (Date, Country) as the row MultiIndex,
# sorting the index as the last step of the same chain
bigmac = (
    pd.read_csv(
        'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/bigmac.csv',
        parse_dates = ['Date'],
        index_col = ['Date', 'Country'],
    )
      .sort_index()
)

In [15]:
# bigmac dataframe has a two-level index; the outer level is 'Date' and the inner level is 'Country'
bigmac.index

MultiIndex([('2010-01-01',      'Argentina'),
            ('2010-01-01',      'Australia'),
            ('2010-01-01',         'Brazil'),
            ('2010-01-01',        'Britain'),
            ('2010-01-01',         'Canada'),
            ('2010-01-01',          'Chile'),
            ('2010-01-01',          'China'),
            ('2010-01-01',       'Colombia'),
            ('2010-01-01',     'Costa Rica'),
            ('2010-01-01', 'Czech Republic'),
            ...
            ('2016-01-01',    'Switzerland'),
            ('2016-01-01',         'Taiwan'),
            ('2016-01-01',       'Thailand'),
            ('2016-01-01',         'Turkey'),
            ('2016-01-01',            'UAE'),
            ('2016-01-01',        'Ukraine'),
            ('2016-01-01',  'United States'),
            ('2016-01-01',        'Uruguay'),
            ('2016-01-01',      'Venezuela'),
            ('2016-01-01',        'Vietnam')],
           names=['Date', 'Country'], length=652)

In [16]:
# extract the values of the outer level, 'Date', by name (one value per row)
bigmac.index.get_level_values('Date')

DatetimeIndex(['2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01',
               ...
               '2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01',
               '2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01',
               '2016-01-01', '2016-01-01'],
              dtype='datetime64[ns]', name='Date', length=652, freq=None)

In [17]:
# above is equivalent to selecting the level by position:
bigmac.index.get_level_values(0)   # position 0 since it's the outermost/leftmost level

DatetimeIndex(['2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01',
               ...
               '2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01',
               '2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01',
               '2016-01-01', '2016-01-01'],
              dtype='datetime64[ns]', name='Date', length=652, freq=None)

In [18]:
# extract the values of the inner level, 'Country', by name (one value per row)
bigmac.index.get_level_values('Country')

Index(['Argentina', 'Australia', 'Brazil', 'Britain', 'Canada', 'Chile',
       'China', 'Colombia', 'Costa Rica', 'Czech Republic',
       ...
       'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'UAE', 'Ukraine',
       'United States', 'Uruguay', 'Venezuela', 'Vietnam'],
      dtype='object', name='Country', length=652)

In [19]:
# above is equivalent to selecting the level by position (1 = one level in from the outermost):
bigmac.index.get_level_values(1)

Index(['Argentina', 'Australia', 'Brazil', 'Britain', 'Canada', 'Chile',
       'China', 'Colombia', 'Costa Rica', 'Czech Republic',
       ...
       'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'UAE', 'Ukraine',
       'United States', 'Uruguay', 'Venezuela', 'Vietnam'],
      dtype='object', name='Country', length=652)

## .set_names() method

- changes the names of the levels of a multi-index; akin to renaming columns, except it applies to the index

```
.set_names()
```



In [20]:
# load data: parse 'Date' as datetime and use (Date, Country) as the row MultiIndex
bigmac = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/bigmac.csv',
    index_col = ['Date', 'Country'],
    parse_dates = ['Date'],            # column names to parse go in a list
)

# order the rows by Date, then Country
bigmac = bigmac.sort_index()

bigmac

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97
...,...,...
2016-01-01,Ukraine,1.54
2016-01-01,United States,4.93
2016-01-01,Uruguay,3.74
2016-01-01,Venezuela,0.66


In [21]:
# current names of the index levels, listed from the outer level inward
bigmac.index.names

FrozenList(['Date', 'Country'])

In [22]:
# rename both index levels at once: 'Date' -> 'date' and 'Country' -> 'country'
# set_names() returns a renamed copy of the index; assign it back instead of
# mutating the existing index object with inplace = True
bigmac.index = bigmac.index.set_names(['date', 'country'])

In [23]:
# check that both level names are now lowercase
bigmac.index.names

FrozenList(['date', 'country'])

In [24]:
# change 'date' back to 'Date'; `level` picks the single level to rename (here by its current name)
# assign the renamed index back rather than mutating it with inplace = True
bigmac.index = bigmac.index.set_names('Date', level = 'date')

In [25]:
# check that only the outer level name changed
bigmac.index.names

FrozenList(['Date', 'country'])

In [26]:
# change 'country' back to 'Country'; `level` picks the single level to rename
# assign the renamed index back rather than mutating it with inplace = True
bigmac.index = bigmac.index.set_names('Country', level = 'country')

# confirm the resulting level names
bigmac.index.names

FrozenList(['Date', 'Country'])

## .sort_index() method
- ascending parameter, just like .sort_values() method
- level parameter
  - argument is name of index or 0, 1, etc.
  - 0 for outermost layer
  - 1 for one layer in, etc.

In [27]:
# load data with a (Date, Country) MultiIndex, then sort that index
bigmac = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/bigmac.csv',
    index_col = ['Date', 'Country'],
    parse_dates = ['Date'],            # column names to parse go in a list
)
bigmac = bigmac.sort_index()

bigmac

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97
...,...,...
2016-01-01,Ukraine,1.54
2016-01-01,United States,4.93
2016-01-01,Uruguay,3.74
2016-01-01,Venezuela,0.66


In [28]:
# sort by descending: a single False applies to every index level (Date and Country both descend)
bigmac.sort_index(ascending = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2016-01-01,Vietnam,2.67
2016-01-01,Venezuela,0.66
2016-01-01,Uruguay,3.74
2016-01-01,United States,4.93
2016-01-01,Ukraine,1.54
...,...,...
2010-01-01,Canada,3.97
2010-01-01,Britain,3.67
2010-01-01,Brazil,4.76
2010-01-01,Australia,3.98


In [29]:
# date ASC but country DESC: pass one boolean per index level, in level order
bigmac.sort_index(ascending = [True, False])   # True refers to the Date level; False refers to the Country level

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Uruguay,3.32
2010-01-01,United States,3.58
2010-01-01,Ukraine,1.83
2010-01-01,UAE,2.99
2010-01-01,Turkey,3.83
...,...,...
2016-01-01,Brazil,3.35
2016-01-01,Belgium,4.25
2016-01-01,Austria,3.76
2016-01-01,Australia,3.74


In [30]:
# sort on the 'Date' level; sort_remaining defaults to True, so the other
# level ('Country') is also sorted within each date
# the level can be given by name ('Date') or by position (0): verify both give the same frame
assert bigmac.sort_index(level = 'Date').equals(bigmac.sort_index(level = 0))

bigmac.sort_index(level = 'Date')

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97
...,...,...
2016-01-01,Ukraine,1.54
2016-01-01,United States,4.93
2016-01-01,Uruguay,3.74
2016-01-01,Venezuela,0.66


In [31]:
# sort on the 'Country' level; sort_remaining defaults to True, so the other
# level ('Date') is also sorted within each country
# the level can be given by name ('Country') or by position (1): verify both give the same frame
assert bigmac.sort_index(level = 'Country').equals(bigmac.sort_index(level = 1))

bigmac.sort_index(level = 'Country')

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-07-01,Argentina,3.56
2011-07-01,Argentina,4.84
2012-01-01,Argentina,4.64
2012-07-01,Argentina,4.16
...,...,...
2014-01-01,Vietnam,2.84
2014-07-01,Vietnam,2.83
2015-01-01,Vietnam,2.81
2015-07-01,Vietnam,2.75


## Extract rows from multiindex dataframe
- key takeaway is for .loc[ ] accessor, first argument is rows and second argument is columns
- .iloc[ ] accessor isn't affected by a multiindex, since it selects rows by integer position (0, 1, 2, ...) regardless of the index labels

In [32]:
# load data: (Date, Country) becomes the row MultiIndex and 'Date' is parsed as datetime
bigmac = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/bigmac.csv',
    index_col = ['Date', 'Country'],
    parse_dates = ['Date'],            # column names to parse go in a list
)

# sorted index: Date first, then Country
bigmac = bigmac.sort_index()

bigmac

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97
...,...,...
2016-01-01,Ukraine,1.54
2016-01-01,United States,4.93
2016-01-01,Uruguay,3.74
2016-01-01,Venezuela,0.66


In [34]:
# let's say we want to extract the bigmac price on 2010-01-01 in Argentina

# here's a bad way to do it: two bare arguments to .loc.
# it works (both are treated as index labels here), but it's not desirable.
bigmac.loc['2010-01-01', 'Argentina']

Price in US Dollars    1.84
Name: (2010-01-01 00:00:00, Argentina), dtype: float64

In [36]:
# here's why this is bad: it's ambiguous.
# above, the second argument, 'Argentina', refers to the inner index level.
# but the second argument could also refer to a column.
# below also works.
bigmac.loc['2010-01-01', 'Price in US Dollars']    # all prices on 2010-01-01, i.e., for all countries

Date        Country       
2010-01-01  Argentina         1.84
            Australia         3.98
            Brazil            4.76
            Britain           3.67
            Canada            3.97
            Chile             3.18
            China             1.83
            Colombia          3.91
            Costa Rica        3.52
            Czech Republic    3.71
            Denmark           5.99
            Egypt             2.38
            Euro area         4.84
            Hong Kong         1.91
            Hungary           3.86
            Indonesia         2.24
            Israel            3.99
            Japan             3.50
            Latvia            3.09
            Lithuania         2.87
            Malaysia          2.08
            Mexico            2.50
            New Zealand       3.61
            Norway            7.02
            Pakistan          2.42
            Peru              2.81
            Philippines       2.21
            Poland          

In [41]:
# better way to do it; have the first argument refer to rows/index and the second argument refer to columns
# parentheses make clear that the row selector is a single first argument, in this case a tuple
bigmac.loc[('2010-01-01', 'Argentina'), :]    # this is a DataFrame

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84


In [43]:
# or just the tuple, with no column selector:
bigmac.loc[('2010-01-01', 'Argentina')]        # this is a Series

Price in US Dollars    1.84
Name: (2010-01-01 00:00:00, Argentina), dtype: float64

In [44]:
# the second argument is a list of columns; a name can be repeated, which repeats that column in the result
bigmac.loc[('2010-01-01', 'Argentina'), ['Price in US Dollars', 'Price in US Dollars']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars,Price in US Dollars
Date,Country,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,Argentina,1.84,1.84


In [45]:
# need a comma for Python to recognize something as a tuple
# without it, the parentheses are just grouping and .loc receives a plain string;
# the result below keeps both index levels (Date and Country)
bigmac.loc[('2010-01-01')]   # not a tuple

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97
2010-01-01,Chile,3.18
2010-01-01,China,1.83
2010-01-01,Colombia,3.91
2010-01-01,Costa Rica,3.52
2010-01-01,Czech Republic,3.71


In [46]:
# this is a tuple; it has a comma
# selecting with a one-element tuple drops the matched 'Date' level, leaving 'Country' as the index
bigmac.loc[('2010-01-01',)]

Unnamed: 0_level_0,Price in US Dollars
Country,Unnamed: 1_level_1
Argentina,1.84
Australia,3.98
Brazil,4.76
Britain,3.67
Canada,3.97
Chile,3.18
China,1.83
Colombia,3.91
Costa Rica,3.52
Czech Republic,3.71


In [47]:
# for iloc[], nothing has changed: rows are still selected by integer position
# show the first five rows for reference
bigmac.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97


In [48]:
# position 0 --> the first row, returned as a Series
bigmac.iloc[0]

Price in US Dollars    1.84
Name: (2010-01-01 00:00:00, Argentina), dtype: float64

In [51]:
# a list of positions --> those rows (2nd, 4th and 5th), returned as a DataFrame
bigmac.iloc[[1, 3, 4]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Australia,3.98
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97


## .transpose() method on a multiindex dataframe