<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/pandas_multiindex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pandas MultiIndex
- multiple layers/tiers of indices, i.e., a hierarchy
- usually just 1 layer index and 1 layer of columns
- melting is opposite of pivoting
- stack vs. unstack

In [1]:
# libraries needed
import numpy as np
import pandas as pd

In [2]:
# dataset we'll work with
bigmac = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/bigmac.csv'
)

# convert 'Date' column/Series to datetime
bigmac['Date'] = pd.to_datetime(bigmac['Date'])

# column names to lowercase and replace spaces with underscores
bigmac.columns = (
    bigmac.columns
      .str.lower()
      .str.replace(' ', '_')
)

# examine
bigmac.head()

Unnamed: 0,date,country,price_in_us_dollars
0,2016-01-01,Argentina,2.39
1,2016-01-01,Australia,3.74
2,2016-01-01,Brazil,3.35
3,2016-01-01,Britain,4.22
4,2016-01-01,Canada,4.14


In [3]:
# sort by date and country, ascending
(
    bigmac
      .sort_values(['date', 'country'], ascending = True)
)

Unnamed: 0,date,country,price_in_us_dollars
609,2010-01-01,Argentina,1.84
610,2010-01-01,Australia,3.98
611,2010-01-01,Brazil,4.76
612,2010-01-01,Britain,3.67
613,2010-01-01,Canada,3.97
...,...,...,...
39,2016-01-01,Ukraine,1.54
40,2016-01-01,United States,4.93
41,2016-01-01,Uruguay,3.74
42,2016-01-01,Venezuela,0.66


## create multi-index with .set_index() method
- best practice is for outer layer/index to have fewer unique values than inner layer/index

In [4]:
# dataset we'll work with
bigmac = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/bigmac.csv'
)

# convert 'Date' column/Series to datetime
bigmac['Date'] = pd.to_datetime(bigmac['Date'])

# column names to lowercase and replace spaces with underscores
bigmac.columns = (
    bigmac.columns
      .str.lower()
      .str.replace(' ', '_')
)

# examine
bigmac.head()

Unnamed: 0,date,country,price_in_us_dollars
0,2016-01-01,Argentina,2.39
1,2016-01-01,Australia,3.74
2,2016-01-01,Brazil,3.35
3,2016-01-01,Britain,4.22
4,2016-01-01,Canada,4.14


In [5]:
# set date and country as indices
(
    bigmac
      .set_index(['date', 'country'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,price_in_us_dollars
date,country,Unnamed: 2_level_1
2016-01-01,Argentina,2.39
2016-01-01,Australia,3.74
2016-01-01,Brazil,3.35
2016-01-01,Britain,4.22
2016-01-01,Canada,4.14
...,...,...
2010-01-01,Turkey,3.83
2010-01-01,UAE,2.99
2010-01-01,Ukraine,1.83
2010-01-01,United States,3.58


In [6]:
# order matters
(
    bigmac
      .set_index(['country', 'date'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,price_in_us_dollars
country,date,Unnamed: 2_level_1
Argentina,2016-01-01,2.39
Australia,2016-01-01,3.74
Brazil,2016-01-01,3.35
Britain,2016-01-01,4.22
Canada,2016-01-01,4.14
...,...,...
Turkey,2010-01-01,3.83
UAE,2010-01-01,2.99
Ukraine,2010-01-01,1.83
United States,2010-01-01,3.58


In [7]:
# but best practice is for outer layer to have fewer unique values than inner layer

# number of unique values per column/Series
bigmac.nunique()

# 12 unique dates --> outer
# 58 unique countries --> inner

date                    12
country                 58
price_in_us_dollars    330
dtype: int64

In [8]:
# new bigmac dataframe with multiindex
bigmac_multiindex = (
    bigmac
      .set_index(['date', 'country'])
)

bigmac_multiindex

Unnamed: 0_level_0,Unnamed: 1_level_0,price_in_us_dollars
date,country,Unnamed: 2_level_1
2016-01-01,Argentina,2.39
2016-01-01,Australia,3.74
2016-01-01,Brazil,3.35
2016-01-01,Britain,4.22
2016-01-01,Canada,4.14
...,...,...
2010-01-01,Turkey,3.83
2010-01-01,UAE,2.99
2010-01-01,Ukraine,1.83
2010-01-01,United States,3.58


In [9]:
# sort indices; first outer layer is sorted, then inner within each outer chunk
bigmac_multiindex = (
    bigmac_multiindex
      .sort_index()
)

bigmac_multiindex

Unnamed: 0_level_0,Unnamed: 1_level_0,price_in_us_dollars
date,country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97
...,...,...
2016-01-01,Ukraine,1.54
2016-01-01,United States,4.93
2016-01-01,Uruguay,3.74
2016-01-01,Venezuela,0.66


In [10]:
# view indices
bigmac_multiindex.index
  # note the tuples

MultiIndex([('2010-01-01',      'Argentina'),
            ('2010-01-01',      'Australia'),
            ('2010-01-01',         'Brazil'),
            ('2010-01-01',        'Britain'),
            ('2010-01-01',         'Canada'),
            ('2010-01-01',          'Chile'),
            ('2010-01-01',          'China'),
            ('2010-01-01',       'Colombia'),
            ('2010-01-01',     'Costa Rica'),
            ('2010-01-01', 'Czech Republic'),
            ...
            ('2016-01-01',    'Switzerland'),
            ('2016-01-01',         'Taiwan'),
            ('2016-01-01',       'Thailand'),
            ('2016-01-01',         'Turkey'),
            ('2016-01-01',            'UAE'),
            ('2016-01-01',        'Ukraine'),
            ('2016-01-01',  'United States'),
            ('2016-01-01',        'Uruguay'),
            ('2016-01-01',      'Venezuela'),
            ('2016-01-01',        'Vietnam')],
           names=['date', 'country'], length=652)

In [11]:
# to access a particular value
bigmac_multiindex.loc[('2010-01-01', 'Argentina')]

price_in_us_dollars    1.84
Name: (2010-01-01 00:00:00, Argentina), dtype: float64

In [12]:
bigmac_multiindex.index[0]

(Timestamp('2010-01-01 00:00:00'), 'Argentina')

## .get_level_values() method
- used to get index names in a multiindex; equivalent to df.index in a single-index dataframe
- i.e., extract level values of 1 index in a multiindex
- parameter/argument input is name of index, e.g., 'Date' or integer for position; actual string name is more readable IMO

In [13]:
# import data
bigmac = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/bigmac.csv',
    parse_dates = ['Date'],
    index_col = ['Date', 'Country']
)

# sort index
bigmac = bigmac.sort_index()

In [14]:
# bigmac dataframe has two indices; outer is 'Date' and inner is 'Country'
bigmac.index

MultiIndex([('2010-01-01',      'Argentina'),
            ('2010-01-01',      'Australia'),
            ('2010-01-01',         'Brazil'),
            ('2010-01-01',        'Britain'),
            ('2010-01-01',         'Canada'),
            ('2010-01-01',          'Chile'),
            ('2010-01-01',          'China'),
            ('2010-01-01',       'Colombia'),
            ('2010-01-01',     'Costa Rica'),
            ('2010-01-01', 'Czech Republic'),
            ...
            ('2016-01-01',    'Switzerland'),
            ('2016-01-01',         'Taiwan'),
            ('2016-01-01',       'Thailand'),
            ('2016-01-01',         'Turkey'),
            ('2016-01-01',            'UAE'),
            ('2016-01-01',        'Ukraine'),
            ('2016-01-01',  'United States'),
            ('2016-01-01',        'Uruguay'),
            ('2016-01-01',      'Venezuela'),
            ('2016-01-01',        'Vietnam')],
           names=['Date', 'Country'], length=652)

In [15]:
# extract outer index, 'Date'
bigmac.index.get_level_values('Date')

DatetimeIndex(['2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01',
               ...
               '2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01',
               '2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01',
               '2016-01-01', '2016-01-01'],
              dtype='datetime64[ns]', name='Date', length=652, freq=None)

In [16]:
# above is equivalent to:
bigmac.index.get_level_values(0)   # position 0 since it's the outermost/leftmost

DatetimeIndex(['2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01',
               ...
               '2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01',
               '2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01',
               '2016-01-01', '2016-01-01'],
              dtype='datetime64[ns]', name='Date', length=652, freq=None)

In [17]:
# extract country index
bigmac.index.get_level_values('Country')

Index(['Argentina', 'Australia', 'Brazil', 'Britain', 'Canada', 'Chile',
       'China', 'Colombia', 'Costa Rica', 'Czech Republic',
       ...
       'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'UAE', 'Ukraine',
       'United States', 'Uruguay', 'Venezuela', 'Vietnam'],
      dtype='object', name='Country', length=652)

In [18]:
# above is equilvaent to:
bigmac.index.get_level_values(1)

Index(['Argentina', 'Australia', 'Brazil', 'Britain', 'Canada', 'Chile',
       'China', 'Colombia', 'Costa Rica', 'Czech Republic',
       ...
       'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'UAE', 'Ukraine',
       'United States', 'Uruguay', 'Venezuela', 'Vietnam'],
      dtype='object', name='Country', length=652)

## .set_names() method

- to change names of a multi-index, akin to column names, except it's the index

```
.set_names()
```



In [19]:
# load data
bigmac = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/bigmac.csv',
    parse_dates = ['Date'],            # must be a list
    index_col = ['Date', 'Country']
).sort_index()

bigmac

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97
...,...,...
2016-01-01,Ukraine,1.54
2016-01-01,United States,4.93
2016-01-01,Uruguay,3.74
2016-01-01,Venezuela,0.66


In [20]:
# current index names
bigmac.index.names

FrozenList(['Date', 'Country'])

In [21]:
# change to 'date' and 'country'
bigmac.index.set_names(['date', 'country'], inplace = True)

In [22]:
# check to make sure names have changed
bigmac.index.names

FrozenList(['date', 'country'])

In [23]:
# change 'date' back to 'Date'
bigmac.index.set_names('Date', level = 'date', inplace = True)

In [24]:
# check to make sure change occurred
bigmac.index.names

FrozenList(['Date', 'country'])

In [25]:
# change 'country' back to 'Country'
bigmac.index.set_names('Country', level = 'country', inplace = True)
bigmac.index.names

FrozenList(['Date', 'Country'])

## .sort_index() method
- ascending parameter, just like .sort_values() method
- level parameter
  - argument is name of index or 0, 1, etc.
  - 0 for outermost layer
  - 1 for one layer in, etc.

In [26]:
# load data
bigmac = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/bigmac.csv',
    parse_dates = ['Date'],            # must be a list
    index_col = ['Date', 'Country']
).sort_index()

bigmac

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97
...,...,...
2016-01-01,Ukraine,1.54
2016-01-01,United States,4.93
2016-01-01,Uruguay,3.74
2016-01-01,Venezuela,0.66


In [27]:
# sort by descending
bigmac.sort_index(ascending = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2016-01-01,Vietnam,2.67
2016-01-01,Venezuela,0.66
2016-01-01,Uruguay,3.74
2016-01-01,United States,4.93
2016-01-01,Ukraine,1.54
...,...,...
2010-01-01,Canada,3.97
2010-01-01,Britain,3.67
2010-01-01,Brazil,4.76
2010-01-01,Australia,3.98


In [28]:
# date ASC but country DESC
bigmac.sort_index(ascending = [True, False])   # True refers to Date index; False refers to Country Index

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Uruguay,3.32
2010-01-01,United States,3.58
2010-01-01,Ukraine,1.83
2010-01-01,UAE,2.99
2010-01-01,Turkey,3.83
...,...,...
2016-01-01,Brazil,3.35
2016-01-01,Belgium,4.25
2016-01-01,Austria,3.76
2016-01-01,Australia,3.74


In [29]:
# just sort the date
bigmac.sort_index(level = 'Date')

# code below does the same thing
bigmac.sort_index(level = 0)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97
...,...,...
2016-01-01,Ukraine,1.54
2016-01-01,United States,4.93
2016-01-01,Uruguay,3.74
2016-01-01,Venezuela,0.66


In [30]:
# just sort the country
bigmac.sort_index(level = 'Country')

# equivalent to
bigmac.sort_index(level = 1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-07-01,Argentina,3.56
2011-07-01,Argentina,4.84
2012-01-01,Argentina,4.64
2012-07-01,Argentina,4.16
...,...,...
2014-01-01,Vietnam,2.84
2014-07-01,Vietnam,2.83
2015-01-01,Vietnam,2.81
2015-07-01,Vietnam,2.75


## Extract rows from multiindex dataframe
- key takeaway is for .loc[ ] accessor, first argument is rows and second argument is columns
- .iloc[ ] accessor isn't affected by multiindex since each row is given an index position in line

In [31]:
# load data
bigmac = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/bigmac.csv',
    parse_dates = ['Date'],            # must be a list
    index_col = ['Date', 'Country']
).sort_index()

bigmac

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97
...,...,...
2016-01-01,Ukraine,1.54
2016-01-01,United States,4.93
2016-01-01,Uruguay,3.74
2016-01-01,Venezuela,0.66


In [32]:
# let's say we want to extract bigmac price on 2010-01-01 in Argentina

# here's a bad way to do it. it works, but it's not desirable.
bigmac.loc['2010-01-01', 'Argentina']

Price in US Dollars    1.84
Name: (2010-01-01 00:00:00, Argentina), dtype: float64

In [33]:
# here's why this is a bad: it's ambiguous.
# above, second argument, 'Argentina' refers to the inner index.
# but second argument could also refer to a column.
# below also works.
bigmac.loc['2010-01-01', 'Price in US Dollars']    # all prices on 2010-01-01, i.e., for all countries

Date        Country       
2010-01-01  Argentina         1.84
            Australia         3.98
            Brazil            4.76
            Britain           3.67
            Canada            3.97
            Chile             3.18
            China             1.83
            Colombia          3.91
            Costa Rica        3.52
            Czech Republic    3.71
            Denmark           5.99
            Egypt             2.38
            Euro area         4.84
            Hong Kong         1.91
            Hungary           3.86
            Indonesia         2.24
            Israel            3.99
            Japan             3.50
            Latvia            3.09
            Lithuania         2.87
            Malaysia          2.08
            Mexico            2.50
            New Zealand       3.61
            Norway            7.02
            Pakistan          2.42
            Peru              2.81
            Philippines       2.21
            Poland          

In [34]:
# better way to do it; have first argument refer to row/index and second argument refer to columns
# parantheses makes clear that it's the first argument, in this case, a tuple
bigmac.loc[('2010-01-01', 'Argentina'), :]    # this is a DataFrame

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84


In [35]:
# or just:
bigmac.loc[('2010-01-01', 'Argentina')]        # this is a Series

Price in US Dollars    1.84
Name: (2010-01-01 00:00:00, Argentina), dtype: float64

In [36]:
# can repeat column names
bigmac.loc[('2010-01-01', 'Argentina'), ['Price in US Dollars', 'Price in US Dollars']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars,Price in US Dollars
Date,Country,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01,Argentina,1.84,1.84


In [37]:
# need comma for Python to recognize something as a tuple
bigmac.loc[('2010-01-01')]   # not a tuple

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97
2010-01-01,Chile,3.18
2010-01-01,China,1.83
2010-01-01,Colombia,3.91
2010-01-01,Costa Rica,3.52
2010-01-01,Czech Republic,3.71


In [38]:
# this is a tuple; it has a comma
bigmac.loc[('2010-01-01',)]

Unnamed: 0_level_0,Price in US Dollars
Country,Unnamed: 1_level_1
Argentina,1.84
Australia,3.98
Brazil,4.76
Britain,3.67
Canada,3.97
Chile,3.18
China,1.83
Colombia,3.91
Costa Rica,3.52
Czech Republic,3.71


In [39]:
# for iloc[], nothing has changed
bigmac.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97


In [40]:
bigmac.iloc[0]

Price in US Dollars    1.84
Name: (2010-01-01 00:00:00, Argentina), dtype: float64

In [41]:
bigmac.iloc[[1, 3, 4]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Australia,3.98
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97


## .transpose() method on a multiindex dataframe
- rotate 90 degrees so rows become columns and vice versa

In [42]:
# load data
bigmac = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/bigmac.csv',
    parse_dates = ['Date'],            # must be a list
    index_col = ['Date', 'Country']
).sort_index()

bigmac

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97
...,...,...
2016-01-01,Ukraine,1.54
2016-01-01,United States,4.93
2016-01-01,Uruguay,3.74
2016-01-01,Venezuela,0.66


In [43]:
# above, multi-index rows, single index column
# transpose this: rows become columns and columns become rows
# so multi-index columns, and a single row
bigmac.transpose()

# 1 row, 652 columns

Date,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,...,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01
Country,Argentina,Australia,Brazil,Britain,Canada,Chile,China,Colombia,Costa Rica,Czech Republic,...,Switzerland,Taiwan,Thailand,Turkey,UAE,Ukraine,United States,Uruguay,Venezuela,Vietnam
Price in US Dollars,1.84,3.98,4.76,3.67,3.97,3.18,1.83,3.91,3.52,3.71,...,6.44,2.08,3.09,3.41,3.54,1.54,4.93,3.74,0.66,2.67


In [44]:
# above, columns are multi-index: outer layer (0) is Date and inner layer (1) is Country

In [45]:
# extract big mac price in Argentina on Jan 1, 2010
(
    bigmac
      .transpose()
      .loc[('Price in US Dollars',), ('2010-01-01', 'Argentina')]  # first argument is ('Price in US Dollars',) which is a tuple
)

Date,2010-01-01
Country,Argentina
Price in US Dollars,1.84


In [46]:
# all prices on Jan 1, 2010
(
    bigmac
      .transpose()
      .loc[('Price in US Dollars',), ('2010-01-01',)]   # two arguments, each one is a tuple
)

Date,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01
Country,Argentina,Australia,Brazil,Britain,Canada,Chile,China,Colombia,Costa Rica,Czech Republic,...,Sri Lanka,Sweden,Switzerland,Taiwan,Thailand,Turkey,UAE,Ukraine,United States,Uruguay
Price in US Dollars,1.84,3.98,4.76,3.67,3.97,3.18,1.83,3.91,3.52,3.71,...,1.83,5.51,6.3,2.36,2.11,3.83,2.99,1.83,3.58,3.32


In [47]:
# Sri Lanka thru Ukraine for Jan 1, 2010
(
    bigmac
      .transpose()
      .loc[('Price in US Dollars',), ('2010-01-01', 'Sri Lanka'):('2010-01-01', 'Ukraine')]
)

# ('Price in US Dollars',) --> first argument, indicates row
# ('2010-01-01', 'Sri Lanka') --> part of second argument, indicates starting column
# ('2010-01-01', 'Ukraine') --> part of second argument, go through this column

Date,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01
Country,Sri Lanka,Sweden,Switzerland,Taiwan,Thailand,Turkey,UAE,Ukraine
Price in US Dollars,1.83,5.51,6.3,2.36,2.11,3.83,2.99,1.83


In [48]:
# Switzerland for each time point
(
    bigmac
      .transpose()
      .loc[('Price in US Dollars',), bigmac.transpose().columns.get_level_values(1) == 'Switzerland']
)

# had to google the above: https://stackoverflow.com/questions/25189575/pandas-dataframe-select-columns-in-multiindex

Date,2010-01-01,2010-07-01,2011-07-01,2012-01-01,2012-07-01,2013-01-01,2013-07-01,2014-01-01,2014-07-01,2015-01-01,2015-07-01,2016-01-01
Country,Switzerland,Switzerland,Switzerland,Switzerland,Switzerland,Switzerland,Switzerland,Switzerland,Switzerland,Switzerland,Switzerland,Switzerland
Price in US Dollars,6.3,6.19,8.06,6.81,6.56,7.12,6.72,7.14,6.83,7.54,6.82,6.44


In [49]:
# prices for Vietnam
(
    bigmac
      .loc[
          bigmac.index.get_level_values('Country') == 'Vietnam', # rows where value in index position 1 ('Country') is Vietnam
           ('Price in US Dollars',)                              # this column
          ]
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2014-01-01,Vietnam,2.84
2014-07-01,Vietnam,2.83
2015-01-01,Vietnam,2.81
2015-07-01,Vietnam,2.75
2016-01-01,Vietnam,2.67


## .swaplevel() method

In [50]:
# load data
bigmac = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/bigmac.csv',
    parse_dates = ['Date'],            # must be a list
    index_col = ['Date', 'Country']
).sort_index()

bigmac

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97
...,...,...
2016-01-01,Ukraine,1.54
2016-01-01,United States,4.93
2016-01-01,Uruguay,3.74
2016-01-01,Venezuela,0.66


In [51]:
# outer index is 'Date' and inner is 'Country'; reverse these
bigmac.swaplevel()

# assign back to bigmac to override bigmac; no inplace = True argument

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Country,Date,Unnamed: 2_level_1
Argentina,2010-01-01,1.84
Australia,2010-01-01,3.98
Brazil,2010-01-01,4.76
Britain,2010-01-01,3.67
Canada,2010-01-01,3.97
...,...,...
Ukraine,2016-01-01,1.54
United States,2016-01-01,4.93
Uruguay,2016-01-01,3.74
Venezuela,2016-01-01,0.66


## .stack() method
- converts columns to a row index
- what's left is one column

In [52]:
# load data
world = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/worldstats.csv',
    index_col = ['country', 'year']
)

world.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Population,GDP
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Arab World,2015,392022276.0,2530102000000.0
Arab World,2014,384222592.0,2873600000000.0
Arab World,2013,376504253.0,2846994000000.0
Arab World,2012,368802611.0,2773270000000.0
Arab World,2011,361031820.0,2497945000000.0


In [53]:
# create a third index where values are 'Population' and 'GDP'
# only column contains values for 'Population' and 'GDP'
world.stack()

# returns a series

country     year            
Arab World  2015  Population    3.920223e+08
                  GDP           2.530102e+12
            2014  Population    3.842226e+08
                  GDP           2.873600e+12
            2013  Population    3.765043e+08
                                    ...     
Zimbabwe    1962  GDP           1.117602e+09
            1961  Population    3.876638e+06
                  GDP           1.096647e+09
            1960  Population    3.752390e+06
                  GDP           1.052990e+09
Length: 22422, dtype: float64

In [54]:
# world stacked and as dataframe
world_stacked = (
    world
      .stack()
      .to_frame()
)

world_stacked

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Arab World,2015,Population,3.920223e+08
Arab World,2015,GDP,2.530102e+12
Arab World,2014,Population,3.842226e+08
Arab World,2014,GDP,2.873600e+12
Arab World,2013,Population,3.765043e+08
...,...,...,...
Zimbabwe,1962,GDP,1.117602e+09
Zimbabwe,1961,Population,3.876638e+06
Zimbabwe,1961,GDP,1.096647e+09
Zimbabwe,1960,Population,3.752390e+06


In [55]:
# rename index
(
    world_stacked
      .index
      .set_names(['country', 'year', 'variable'], inplace = True)
)

world_stacked

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0
country,year,variable,Unnamed: 3_level_1
Arab World,2015,Population,3.920223e+08
Arab World,2015,GDP,2.530102e+12
Arab World,2014,Population,3.842226e+08
Arab World,2014,GDP,2.873600e+12
Arab World,2013,Population,3.765043e+08
...,...,...,...
Zimbabwe,1962,GDP,1.117602e+09
Zimbabwe,1961,Population,3.876638e+06
Zimbabwe,1961,GDP,1.096647e+09
Zimbabwe,1960,Population,3.752390e+06


In [56]:
# rename column from 0 to value
world_stacked = (
    world_stacked
      .rename(columns = {0: 'value'})
)

world_stacked

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,value
country,year,variable,Unnamed: 3_level_1
Arab World,2015,Population,3.920223e+08
Arab World,2015,GDP,2.530102e+12
Arab World,2014,Population,3.842226e+08
Arab World,2014,GDP,2.873600e+12
Arab World,2013,Population,3.765043e+08
...,...,...,...
Zimbabwe,1962,GDP,1.117602e+09
Zimbabwe,1961,Population,3.876638e+06
Zimbabwe,1961,GDP,1.096647e+09
Zimbabwe,1960,Population,3.752390e+06


In [57]:
# Zimbabwe GDP and Population in 1962
(
    world_stacked
      .loc[
          (
            world_stacked.index.get_level_values('country') == 'Zimbabwe',
            world_stacked.index.get_level_values('year') == 1962,
            world_stacked.index.get_level_values('variable').isin(['GDP', 'Population'])
          ),
           :
      ]
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,value
country,year,variable,Unnamed: 3_level_1
Zimbabwe,1962,Population,4006262.0
Zimbabwe,1962,GDP,1117602000.0


In [58]:
# Zimbabwe population over the years
#(
 #   world_stacked
  #    .loc[
   #       (
    #         world_stacked.index.get_level_values('country') == 'Zimbabwe', 
     #        ,
      #       world_stacked.index.get_level_values('variable') == 'Population'
       #   )
      #]
#)

## .unstack() method
- reverses .stack() method
- can specify which row index to move to columns and fill value for what to place in cell if value is missing

In [59]:
world.head()

# this DataFrame is a multiindex in the rows

Unnamed: 0_level_0,Unnamed: 1_level_0,Population,GDP
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Arab World,2015,392022276.0,2530102000000.0
Arab World,2014,384222592.0,2873600000000.0
Arab World,2013,376504253.0,2846994000000.0
Arab World,2012,368802611.0,2773270000000.0
Arab World,2011,361031820.0,2497945000000.0


In [60]:
# stack world such that it's one column and three indices
# Population/GDP moved to innermost layer in row index
(
    world
      .stack()
      .to_frame()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Arab World,2015,Population,3.920223e+08
Arab World,2015,GDP,2.530102e+12
Arab World,2014,Population,3.842226e+08
Arab World,2014,GDP,2.873600e+12
Arab World,2013,Population,3.765043e+08
...,...,...,...
Zimbabwe,1962,GDP,1.117602e+09
Zimbabwe,1961,Population,3.876638e+06
Zimbabwe,1961,GDP,1.096647e+09
Zimbabwe,1960,Population,3.752390e+06


In [61]:
# unstack it such that it's back to how it was originally
(
    world
      .stack()
      #.to_frame()    # this messes up the reversal
      .unstack()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Population,GDP
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,1960,8994793.0,5.377778e+08
Afghanistan,1961,9164945.0,5.488889e+08
Afghanistan,1962,9343772.0,5.466667e+08
Afghanistan,1963,9531555.0,7.511112e+08
Afghanistan,1964,9728645.0,8.000000e+08
...,...,...,...
Zimbabwe,2011,14255592.0,1.095623e+10
Zimbabwe,2012,14565482.0,1.239272e+10
Zimbabwe,2013,14898092.0,1.349023e+10
Zimbabwe,2014,15245855.0,1.419691e+10


In [62]:
# keep unstacking
(
    world
      .stack()        # Population and GDP become the innermost row index
      #.to_frame()    # this messes up the reversal
      .unstack()      # Population and GDP are now columns
      .unstack()      # year is a column index; innermost column index
      .unstack()      # keep going
      .unstack()
)

Unnamed: 0_level_0,country,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Arab World,Argentina,Armenia,Aruba,...,Uzbekistan,Vanuatu,"Venezuela, RB",Vietnam,Virgin Islands (U.S.),West Bank and Gaza,World,"Yemen, Rep.",Zambia,Zimbabwe
Unnamed: 0_level_1,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Population,1960,8.994793e+06,,1.112489e+07,,,,,,,,...,,,8.146845e+06,,32000.0,,3.035056e+09,,3.049586e+06,3.752390e+06
Population,1961,9.164945e+06,,1.140486e+07,,,,,,,,...,,,8.461684e+06,,34100.0,,3.076121e+09,,3.142848e+06,3.876638e+06
Population,1962,9.343772e+06,,1.169015e+07,,,,,2.128768e+07,,,...,,,8.790590e+06,,36300.0,,3.129064e+09,,3.240664e+06,4.006262e+06
Population,1963,9.531555e+06,,1.198513e+07,,,,,2.162184e+07,,,...,,,9.130346e+06,,38700.0,,3.193947e+09,,3.342894e+06,4.140804e+06
Population,1964,9.728645e+06,,1.229597e+07,,,,,2.195393e+07,,,...,,,9.476255e+06,,41300.0,,3.259355e+09,,3.449266e+06,4.279561e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GDP,2011,1.793024e+10,1.289087e+10,2.000131e+11,3.427236e+09,1.041159e+11,1.129918e+09,2.497945e+12,5.578902e+11,1.014211e+10,2.584464e+09,...,4.532432e+10,7.921497e+08,3.164822e+11,1.355395e+11,,1.045985e+10,7.284314e+13,3.107886e+10,2.345952e+10,1.095623e+10
GDP,2012,2.053654e+10,1.231978e+10,2.090474e+11,3.146178e+09,1.153984e+11,1.204713e+09,2.773270e+12,6.043785e+11,1.061932e+10,,...,5.118344e+10,7.817029e+08,3.812862e+11,1.558200e+11,,1.127940e+10,7.442836e+13,3.207477e+10,2.550306e+10,1.239272e+10
GDP,2013,2.004633e+10,1.278103e+10,2.097035e+11,3.249101e+09,1.249121e+11,1.200588e+09,2.846994e+12,6.239320e+11,1.112147e+10,,...,5.679566e+10,8.017876e+08,3.713366e+11,1.712220e+11,,1.247600e+10,7.643132e+13,3.595450e+10,2.804552e+10,1.349023e+10
GDP,2014,2.005019e+10,1.327796e+10,2.135185e+11,,1.267751e+11,1.220976e+09,2.873600e+12,5.480549e+11,1.164444e+10,,...,6.313285e+10,8.149546e+08,,1.862047e+11,,1.271560e+10,7.810634e+13,,2.713464e+10,1.419691e+10


In [63]:
world_stacked.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,value
country,year,variable,Unnamed: 3_level_1
Arab World,2015,Population,392022300.0
Arab World,2015,GDP,2530102000000.0
Arab World,2014,Population,384222600.0
Arab World,2014,GDP,2873600000000.0
Arab World,2013,Population,376504300.0


In [64]:
# make 'Population' and 'GDP' the columns
world_stacked.unstack('variable').head()

Unnamed: 0_level_0,Unnamed: 1_level_0,value,value
Unnamed: 0_level_1,variable,Population,GDP
country,year,Unnamed: 2_level_2,Unnamed: 3_level_2
Afghanistan,1960,8994793.0,537777800.0
Afghanistan,1961,9164945.0,548888900.0
Afghanistan,1962,9343772.0,546666700.0
Afghanistan,1963,9531555.0,751111200.0
Afghanistan,1964,9728645.0,800000000.0


In [65]:
# make year the column
world_stacked.unstack('year').head()

Unnamed: 0_level_0,Unnamed: 1_level_0,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
Unnamed: 0_level_1,year,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
country,variable,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
Afghanistan,Population,8994793.0,9164945.0,9343772.0,9531555.0,9728645.0,9935358.0,10148840.0,10368600.0,10599790.0,10849510.0,...,25183620.0,25877540.0,26528740.0,27207290.0,27962210.0,28809170.0,29726800.0,30682500.0,31627510.0,32526560.0
Afghanistan,GDP,537777800.0,548888900.0,546666700.0,751111200.0,800000000.0,1006667000.0,1400000000.0,1673333000.0,1373333000.0,1408889000.0,...,7057598000.0,9843842000.0,10190530000.0,12486940000.0,15936800000.0,17930240000.0,20536540000.0,20046330000.0,20050190000.0,19199440000.0
Albania,Population,,,,,,,,,,,...,2992547.0,2970017.0,2947314.0,2927519.0,2913021.0,2904780.0,2900247.0,2896652.0,2893654.0,2889167.0
Albania,GDP,,,,,,,,,,,...,8992642000.0,10701010000.0,12881350000.0,12044210000.0,11926950000.0,12890870000.0,12319780000.0,12781030000.0,13277960000.0,11455600000.0
Algeria,Population,11124890.0,11404860.0,11690150.0,11985130.0,12295970.0,12626950.0,12980270.0,13354200.0,13744380.0,14144440.0,...,33749330.0,34261970.0,34811060.0,35401790.0,36036160.0,36717130.0,37439430.0,38186140.0,38934330.0,39666520.0


In [66]:
# make year and varaible the columns
world_stacked.unstack(['year', 'variable'])

Unnamed: 0_level_0,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
year,2015,2015,2014,2014,2013,2013,2012,2012,2011,2011,...,1964,1964,1963,1963,1962,1962,1961,1961,1960,1960
variable,Population,GDP,Population,GDP,Population,GDP,Population,GDP,Population,GDP,...,Population,GDP,Population,GDP,Population,GDP,Population,GDP,Population,GDP
country,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
Afghanistan,3.252656e+07,1.919944e+10,3.162751e+07,2.005019e+10,3.068250e+07,2.004633e+10,2.972680e+07,2.053654e+10,2.880917e+07,1.793024e+10,...,9.728645e+06,8.000000e+08,9.531555e+06,7.511112e+08,9.343772e+06,5.466667e+08,9.164945e+06,5.488889e+08,8.994793e+06,5.377778e+08
Albania,2.889167e+06,1.145560e+10,2.893654e+06,1.327796e+10,2.896652e+06,1.278103e+10,2.900247e+06,1.231978e+10,2.904780e+06,1.289087e+10,...,,,,,,,,,,
Algeria,3.966652e+07,1.668386e+11,3.893433e+07,2.135185e+11,3.818614e+07,2.097035e+11,3.743943e+07,2.090474e+11,3.671713e+07,2.000131e+11,...,1.229597e+07,2.909340e+09,1.198513e+07,2.703004e+09,1.169015e+07,2.001461e+09,1.140486e+07,2.434767e+09,1.112489e+07,2.723638e+09
Andorra,,,,,7.590200e+04,3.249101e+09,7.931600e+04,3.146178e+09,8.232600e+04,3.427236e+09,...,,,,,,,,,,
Angola,2.502197e+07,1.026431e+11,2.422752e+07,1.267751e+11,2.344820e+07,1.249121e+11,2.268563e+07,1.153984e+11,2.194230e+07,1.041159e+11,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
West Bank and Gaza,4.422143e+06,1.267740e+10,4.294682e+06,1.271560e+10,4.169506e+06,1.247600e+10,4.046901e+06,1.127940e+10,3.927051e+06,1.045985e+10,...,,,,,,,,,,
World,7.346633e+09,7.343364e+13,7.260780e+09,7.810634e+13,7.176092e+09,7.643132e+13,7.089452e+09,7.442836e+13,7.006908e+09,7.284314e+13,...,3.259355e+09,1.799675e+12,3.193947e+09,1.638187e+12,3.129064e+09,1.524573e+12,3.076121e+09,1.420440e+12,3.035056e+09,1.364643e+12
"Yemen, Rep.",,,,,2.553322e+07,3.595450e+10,2.488279e+07,3.207477e+10,2.423494e+07,3.107886e+10,...,,,,,,,,,,
Zambia,1.621177e+07,2.120156e+10,1.572134e+07,2.713464e+10,1.524609e+07,2.804552e+10,1.478658e+07,2.550306e+10,1.434353e+07,2.345952e+10,...,3.449266e+06,8.226397e+08,3.342894e+06,7.043397e+08,3.240664e+06,6.792797e+08,3.142848e+06,6.823597e+08,3.049586e+06,6.987397e+08


## .pivot() method

In [69]:
# load data
sales = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/salesmen.csv'
)

# convert Date to datetime
sales['Date'] = pd.to_datetime(sales['Date'])

# convert Salesman to category
sales['Salesman'] = sales['Salesman'].astype('category')

sales.head()

Unnamed: 0,Date,Salesman,Revenue
0,2016-01-01,Bob,7172
1,2016-01-02,Bob,6362
2,2016-01-03,Bob,5982
3,2016-01-04,Bob,7917
4,2016-01-05,Bob,7837


In [92]:
# sales DataFrame is in long form: one row per Date per Salesman
# convert to wide form: one row per Date, columns are Salesman, and entries are revenue by that Salesman on that Date
sales_wide = (
    sales
      .pivot(
          index = 'Date',
          columns = 'Salesman',
          values = 'Revenue'
      )
      .reset_index()
)

sales_wide.head()

# Salesman is name of column axis

Salesman,Date,Bob,Dave,Jeb,Oscar,Ronald
0,2016-01-01,7172,1864,4430,5250,2639
1,2016-01-02,6362,8278,8026,8661,4951
2,2016-01-03,5982,4226,5188,7075,2703
3,2016-01-04,7917,3868,3144,2524,4258
4,2016-01-05,7837,2287,938,2793,7771


In [95]:
# get ride of 'Salesman' name of columns
(
    sales_wide
      .columns
      .rename(None, inplace = True)
)

sales_wide

Unnamed: 0,Date,Bob,Dave,Jeb,Oscar,Ronald
0,2016-01-01,7172,1864,4430,5250,2639
1,2016-01-02,6362,8278,8026,8661,4951
2,2016-01-03,5982,4226,5188,7075,2703
3,2016-01-04,7917,3868,3144,2524,4258
4,2016-01-05,7837,2287,938,2793,7771
...,...,...,...,...,...,...
361,2016-12-27,2045,2843,6666,835,2981
362,2016-12-28,100,8888,1243,3073,6129
363,2016-12-29,4115,9490,3498,6424,7662
364,2016-12-30,2577,3594,8858,7088,2570


## .pivot_table() method to get aggregate summary

In [105]:
# load data
foods = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/foods.csv'
)

foods.head()

Unnamed: 0,First Name,Gender,City,Frequency,Item,Spend
0,Wanda,Female,Stamford,Weekly,Burger,15.66
1,Eric,Male,Stamford,Daily,Chalupa,10.56
2,Charles,Male,New York,Never,Sushi,42.14
3,Anna,Female,Philadelphia,Once,Ice Cream,11.01
4,Deborah,Female,Philadelphia,Daily,Chalupa,23.49


In [107]:
# average spend per Gender
(
    foods
      .pivot_table(
          values = 'Spend', 
          aggfunc = 'mean',
          index = 'Gender'
      )
      .reset_index()
)

Unnamed: 0,Gender,Spend
0,Female,50.709629
1,Male,49.397623


In [111]:
# average spend per Gender for each Item
(
    foods
      .pivot_table(
          values = 'Spend',
          aggfunc = 'mean',
          index = ['Gender', 'Item']
      )
      .reset_index()
      .sort_values('Spend', ascending = False)
)

Unnamed: 0,Gender,Item,Spend
11,Male,Sushi,55.614384
2,Female,Chalupa,54.635
10,Male,Ice Cream,51.096
5,Female,Sushi,50.355699
1,Female,Burrito,50.092
0,Female,Burger,49.930488
3,Female,Donut,49.926316
4,Female,Ice Cream,49.788519
6,Male,Burger,49.613919
8,Male,Chalupa,49.186761


In [121]:
# total spend by Gender and City
(
    foods
      .pivot_table(
          values = 'Spend',
          aggfunc = 'sum',
          index = 'City', 
          columns = 'Gender'
      )
)

Gender,Female,Male
City,Unnamed: 1_level_1,Unnamed: 2_level_1
New York,7543.26,8266.31
Philadelphia,9632.69,8201.85
Stamford,8787.38,7637.88


In [122]:
# aggfunc
# sum
# mean
# count
# max
# min

In [126]:
# highest daily spend for each item by city
(
    foods[foods['Frequency'] == 'Daily']
      .pivot_table(
          values = 'Spend',
          aggfunc = 'max',
          index = 'Item',
          columns = 'City'
      )
)

City,New York,Philadelphia,Stamford
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Burger,87.3,91.12,75.51
Burrito,98.04,91.58,81.2
Chalupa,43.19,92.25,99.87
Donut,95.63,92.25,77.71
Ice Cream,97.83,79.18,78.24
Sushi,83.07,99.02,86.94


In [133]:
# cheapest item by City and Frequency
(
    foods
      .pivot_table(
          values = 'Spend', 
          aggfunc = 'min',
          columns = 'City',
          index = ['Frequency', 'Item']
      )
      #.loc[('Daily',),:]   # for filtering
)

Unnamed: 0_level_0,City,New York,Philadelphia,Stamford
Frequency,Item,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Daily,Burger,10.87,6.77,30.55
Daily,Burrito,10.83,27.39,1.18
Daily,Chalupa,25.73,23.49,10.56
Daily,Donut,4.39,19.79,1.68
Daily,Ice Cream,22.5,6.73,14.64
Daily,Sushi,6.35,2.01,25.17
Monthly,Burger,30.74,2.65,8.95
Monthly,Burrito,16.36,1.04,7.35
Monthly,Chalupa,28.56,17.65,14.88
Monthly,Donut,20.7,12.58,9.07


## pd.melt()
- convert DataFrame from wide to long

In [136]:
# load data
quarters = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/quarters.csv'
)

quarters.head()

Unnamed: 0,Salesman,Q1,Q2,Q3,Q4
0,Boris,602908,233879,354479,32704
1,Bob,43790,514863,297151,544493
2,Tommy,392668,113579,430882,247231
3,Travis,834663,266785,749238,570524
4,Donald,580935,411379,110390,651572


In [145]:
# convert quarters DataFrame from wide to long
# one row per Salesman per quarter (Q1, Q2, etc.)
# currently one row per Salesman

(
    quarters
      .melt(
          id_vars = 'Salesman',                       # preserve this column
          value_vars = ['Q1', 'Q2', 'Q3', 'Q4'],      # ['Q1', 'Q2', ...] become entries in variable column; values go in 'values' column
          var_name = 'Quarter',                       # rename 'variable'; defaults to same as R
          value_name = 'Revenue'                      # rename 'value'; defaults to same as R
      )
      .sort_values(by = ['Salesman', 'Quarter'], ascending = True)
      .reset_index(drop = True)
)

Unnamed: 0,Salesman,Quarter,Revenue
0,Bob,Q1,43790
1,Bob,Q2,514863
2,Bob,Q3,297151
3,Bob,Q4,544493
4,Boris,Q1,602908
5,Boris,Q2,233879
6,Boris,Q3,354479
7,Boris,Q4,32704
8,Donald,Q1,580935
9,Donald,Q2,411379


In [146]:
## that's it for this section