<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/pandas_multiindex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pandas MultiIndex
- multiple layers/tiers of indices, i.e., a hierarchy
- a typical DataFrame has just 1 layer of index (rows) and 1 layer of columns
- melting is opposite of pivoting
- stack vs. unstack

In [1]:
# libraries needed
import numpy as np
import pandas as pd

In [10]:
# load the Big Mac price data and tidy it in a single chain:
#   1. convert the 'Date' column/Series to datetime
#   2. lowercase the column names and replace spaces with underscores
bigmac = (
    pd.read_csv(
        'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/bigmac.csv'
    )
      .assign(Date = lambda frame: pd.to_datetime(frame['Date']))
      .rename(columns = lambda name: name.lower().replace(' ', '_'))
)

# peek at the first few rows
bigmac.head()

Unnamed: 0,date,country,price_in_us_dollars
0,2016-01-01,Argentina,2.39
1,2016-01-01,Australia,3.74
2,2016-01-01,Brazil,3.35
3,2016-01-01,Britain,4.22
4,2016-01-01,Canada,4.14


In [14]:
# order rows by date first, then by country within each date (both ascending)
bigmac.sort_values(by = ['date', 'country'], ascending = True)

Unnamed: 0,date,country,price_in_us_dollars
609,2010-01-01,Argentina,1.84
610,2010-01-01,Australia,3.98
611,2010-01-01,Brazil,4.76
612,2010-01-01,Britain,3.67
613,2010-01-01,Canada,3.97
...,...,...,...
39,2016-01-01,Ukraine,1.54
40,2016-01-01,United States,4.93
41,2016-01-01,Uruguay,3.74
42,2016-01-01,Venezuela,0.66


## create multi-index with .set_index() method
- best practice is for outer layer/index to have fewer unique values than inner layer/index

In [15]:
# read the raw Big Mac data again, cleaning it in one chain:
#   - 'Date' column/Series becomes datetime
#   - column names become lowercase with underscores instead of spaces
bigmac = (
    pd.read_csv(
        'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/bigmac.csv'
    )
      .assign(Date = lambda frame: pd.to_datetime(frame['Date']))
      .rename(columns = lambda name: name.lower().replace(' ', '_'))
)

# first few rows
bigmac.head()

Unnamed: 0,date,country,price_in_us_dollars
0,2016-01-01,Argentina,2.39
1,2016-01-01,Australia,3.74
2,2016-01-01,Brazil,3.35
3,2016-01-01,Britain,4.22
4,2016-01-01,Canada,4.14


In [17]:
# move 'date' (outer level) and 'country' (inner level) from columns into the index
bigmac.set_index(keys = ['date', 'country'])

Unnamed: 0_level_0,Unnamed: 1_level_0,price_in_us_dollars
date,country,Unnamed: 2_level_1
2016-01-01,Argentina,2.39
2016-01-01,Australia,3.74
2016-01-01,Brazil,3.35
2016-01-01,Britain,4.22
2016-01-01,Canada,4.14
...,...,...
2010-01-01,Turkey,3.83
2010-01-01,UAE,2.99
2010-01-01,Ukraine,1.83
2010-01-01,United States,3.58


In [18]:
# the order of the list matters: the first name becomes the outer level,
# so here 'country' is outer and 'date' is inner
bigmac.set_index(keys = ['country', 'date'])

Unnamed: 0_level_0,Unnamed: 1_level_0,price_in_us_dollars
country,date,Unnamed: 2_level_1
Argentina,2016-01-01,2.39
Australia,2016-01-01,3.74
Brazil,2016-01-01,3.35
Britain,2016-01-01,4.22
Canada,2016-01-01,4.14
...,...,...
Turkey,2010-01-01,3.83
UAE,2010-01-01,2.99
Ukraine,2010-01-01,1.83
United States,2010-01-01,3.58


In [19]:
# best practice: the outer level should have fewer unique values than the inner level,
# so count the unique values first to decide which column goes where

# number of unique values per column/Series
bigmac.nunique()

# 12 unique dates --> fewer values, so 'date' is the outer level
# 58 unique countries --> more values, so 'country' is the inner level

date                    12
country                 58
price_in_us_dollars    330
dtype: int64

In [20]:
# keep the multiindexed frame under its own name; 'date' is outer, 'country' is inner
bigmac_multiindex = bigmac.set_index(keys = ['date', 'country'])

bigmac_multiindex

Unnamed: 0_level_0,Unnamed: 1_level_0,price_in_us_dollars
date,country,Unnamed: 2_level_1
2016-01-01,Argentina,2.39
2016-01-01,Australia,3.74
2016-01-01,Brazil,3.35
2016-01-01,Britain,4.22
2016-01-01,Canada,4.14
...,...,...
2010-01-01,Turkey,3.83
2010-01-01,UAE,2.99
2010-01-01,Ukraine,1.83
2010-01-01,United States,3.58


In [22]:
# sorting a multiindex: the outer level is ordered first,
# then the inner level is ordered within each outer value
bigmac_multiindex = bigmac_multiindex.sort_index()

bigmac_multiindex

Unnamed: 0_level_0,Unnamed: 1_level_0,price_in_us_dollars
date,country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97
...,...,...
2016-01-01,Ukraine,1.54
2016-01-01,United States,4.93
2016-01-01,Uruguay,3.74
2016-01-01,Venezuela,0.66


In [23]:
# view the index object of the multiindexed frame
bigmac_multiindex.index
  # note the tuples: each row label is an (outer, inner) pair, here (date, country)

MultiIndex([('2010-01-01',      'Argentina'),
            ('2010-01-01',      'Australia'),
            ('2010-01-01',         'Brazil'),
            ('2010-01-01',        'Britain'),
            ('2010-01-01',         'Canada'),
            ('2010-01-01',          'Chile'),
            ('2010-01-01',          'China'),
            ('2010-01-01',       'Colombia'),
            ('2010-01-01',     'Costa Rica'),
            ('2010-01-01', 'Czech Republic'),
            ...
            ('2016-01-01',    'Switzerland'),
            ('2016-01-01',         'Taiwan'),
            ('2016-01-01',       'Thailand'),
            ('2016-01-01',         'Turkey'),
            ('2016-01-01',            'UAE'),
            ('2016-01-01',        'Ukraine'),
            ('2016-01-01',  'United States'),
            ('2016-01-01',        'Uruguay'),
            ('2016-01-01',      'Venezuela'),
            ('2016-01-01',        'Vietnam')],
           names=['date', 'country'], length=652)

In [24]:
# to access a particular row, pass one tuple with a label for every level: (date, country);
# the result is a Series holding that row's column values
bigmac_multiindex.loc[('2010-01-01', 'Argentina')]

price_in_us_dollars    1.84
Name: (2010-01-01 00:00:00, Argentina), dtype: float64

In [26]:
# first entry of the multiindex: a single (date, country) tuple
bigmac_multiindex.index[0]

(Timestamp('2010-01-01 00:00:00'), 'Argentina')

## .get_level_values() method
- used to get the values (labels) of one level of a multiindex; the per-level counterpart of df.index in a single-index dataframe
- i.e., extract the values of 1 level from a multiindex
- argument is the level's name, e.g., 'Date', or its integer position (0 = outermost); the string name is more readable IMO

In [30]:
# import data, building the multiindex at read time:
#   - parse_dates converts the 'Date' column to datetime
#   - index_col order sets the levels: 'Date' is outer, 'Country' is inner
bigmac = (
    pd.read_csv(
        'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/bigmac.csv',
        parse_dates = ['Date'],
        index_col = ['Date', 'Country']
    )
      .sort_index()   # sort the outer level first, then the inner level within it
)

# examine; without a trailing expression the cell would display nothing
bigmac

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97
...,...,...
2016-01-01,Ukraine,1.54
2016-01-01,United States,4.93
2016-01-01,Uruguay,3.74
2016-01-01,Venezuela,0.66


In [31]:
# bigmac dataframe has a two-level index; outer level is 'Date' and inner level is 'Country'
bigmac.index

MultiIndex([('2010-01-01',      'Argentina'),
            ('2010-01-01',      'Australia'),
            ('2010-01-01',         'Brazil'),
            ('2010-01-01',        'Britain'),
            ('2010-01-01',         'Canada'),
            ('2010-01-01',          'Chile'),
            ('2010-01-01',          'China'),
            ('2010-01-01',       'Colombia'),
            ('2010-01-01',     'Costa Rica'),
            ('2010-01-01', 'Czech Republic'),
            ...
            ('2016-01-01',    'Switzerland'),
            ('2016-01-01',         'Taiwan'),
            ('2016-01-01',       'Thailand'),
            ('2016-01-01',         'Turkey'),
            ('2016-01-01',            'UAE'),
            ('2016-01-01',        'Ukraine'),
            ('2016-01-01',  'United States'),
            ('2016-01-01',        'Uruguay'),
            ('2016-01-01',      'Venezuela'),
            ('2016-01-01',        'Vietnam')],
           names=['Date', 'Country'], length=652)

In [32]:
# extract the values of the outer level, 'Date', by name (one value per row)
bigmac.index.get_level_values('Date')

DatetimeIndex(['2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01',
               ...
               '2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01',
               '2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01',
               '2016-01-01', '2016-01-01'],
              dtype='datetime64[ns]', name='Date', length=652, freq=None)

In [33]:
# above is equivalent to selecting the level by integer position:
bigmac.index.get_level_values(0)   # position 0 since it's the outermost/leftmost

DatetimeIndex(['2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01',
               ...
               '2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01',
               '2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01',
               '2016-01-01', '2016-01-01'],
              dtype='datetime64[ns]', name='Date', length=652, freq=None)

In [35]:
# extract the values of the inner level, 'Country', by name (one value per row)
bigmac.index.get_level_values('Country')

Index(['Argentina', 'Australia', 'Brazil', 'Britain', 'Canada', 'Chile',
       'China', 'Colombia', 'Costa Rica', 'Czech Republic',
       ...
       'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'UAE', 'Ukraine',
       'United States', 'Uruguay', 'Venezuela', 'Vietnam'],
      dtype='object', name='Country', length=652)

In [36]:
# above is equivalent to selecting the level by integer position (1 = second level):
bigmac.index.get_level_values(1)

Index(['Argentina', 'Australia', 'Brazil', 'Britain', 'Canada', 'Chile',
       'China', 'Colombia', 'Costa Rica', 'Czech Republic',
       ...
       'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'UAE', 'Ukraine',
       'United States', 'Uruguay', 'Venezuela', 'Vietnam'],
      dtype='object', name='Country', length=652)

## .set_names() method

- ...
- ...

```
.set_names()
```



In [39]:
# load data with the multiindex built at read time, then sort the index
bigmac = (
    pd.read_csv(
        'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/bigmac.csv',
        index_col = ['Date', 'Country'],
        parse_dates = ['Date']             # a list of column names (a bare string is not accepted)
    )
      .sort_index()
)

bigmac

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97
...,...,...
2016-01-01,Ukraine,1.54
2016-01-01,United States,4.93
2016-01-01,Uruguay,3.74
2016-01-01,Venezuela,0.66
