In [1]:
import pandas as pd
import numpy as np

In [2]:
# Population figures in (state, year) order: 2010 then 2020 for each state.
populations = [
    37243956, 39538223,   # California
    19378102, 20201249,   # New York
    25145561, 29145505,   # Texas
]

# Plain Python tuples used as row labels, one (state, year) pair per value.
index = [(state, year)
         for state in ('California', 'New York', 'Texas')
         for year in (2010, 2020)]

pop = pd.Series(populations, index=index)
print(pop)

# Slice the Series using tuple labels as the two end points.
pop[('California', 2020):('Texas', 2010)]

(California, 2010)    37243956
(California, 2020)    39538223
(New York, 2010)      19378102
(New York, 2020)      20201249
(Texas, 2010)         25145561
(Texas, 2020)         29145505
dtype: int64


(California, 2020)    39538223
(New York, 2010)      19378102
(New York, 2020)      20201249
(Texas, 2010)         25145561
dtype: int64

In [3]:
# With tuple labels, selecting one year means scanning every label by hand
# and keeping those whose second element is 2010.
keys_2010 = [key for key in pop.index if key[1] == 2010]
print(pop[keys_2010])

(California, 2010)    37243956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64


In [4]:
# Build a pandas MultiIndex from the list of (state, year) tuples.
index = pd.MultiIndex.from_tuples(index)

# Re-label the Series with the MultiIndex. In the printed result a blank entry
# in the outer level means "same as the row above".
pop = pop.reindex(index)
print(pop)

# A year can now be selected with plain slicing syntax:
# every outer label, inner label 2020.
pop_2020 = pop[:, 2020]
print(pop_2020)

California  2010    37243956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64
California    39538223
New York      20201249
Texas         29145505
dtype: int64


In [5]:
# unstack() pivots the innermost index level into columns,
# turning the multiply indexed Series into a DataFrame.
pop_df = pop.unstack(level=-1)
print(pop_df)

# stack() does the opposite: the DataFrame becomes a Series again.
restacked = pop_df.stack()
print(restacked)

                2010      2020
California  37243956  39538223
New York    19378102  20201249
Texas       25145561  29145505
California  2010    37243956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64


In [6]:
# Keep the MultiIndex and add a second column next to the totals.
under18 = [9284094, 8898092,
           4318033, 4181528,
           6879014, 7432474]
pop_df = pd.DataFrame({'total': pop, 'under18': under18})
print(pop_df)

# Element-wise arithmetic works as usual on the hierarchical index; afterwards
# unstack() moves the inner index level into the column labels.
f_u18 = pop_df['under18'].div(pop_df['total'])
print(f_u18.unstack())



                    total  under18
California 2010  37243956  9284094
           2020  39538223  8898092
New York   2010  19378102  4318033
           2020  20201249  4181528
Texas      2010  25145561  6879014
           2020  29145505  7432474
                2010      2020
California  0.249278  0.225050
New York    0.222831  0.206994
Texas       0.273568  0.255013


In [7]:
# Passing a list of two (or more) index arrays to the constructor produces
# a hierarchical index on the rows.
# A locally seeded generator makes the random values identical on every run,
# so the cell is reproducible under Restart & Run All.
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.random((4, 2)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])
print(df)

# pandas also recognises a dict keyed by tuples and uses the keys
# as a multiple index.
data = {('California', 2010): 37243956,
        ('California', 2020): 39538223,
        ('New York', 2010): 19378102,
        ('New York', 2020): 20201249,
        ('Texas', 2010): 25145561,
        ('Texas', 2020): 29145505}
print(pd.Series(data))

        data1     data2
a 1  0.167804  0.646344
  2  0.819423  0.565153
b 1  0.836086  0.008126
  2  0.255162  0.280514
California  2010    37243956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64


In [10]:
# Build a MultiIndex from a list of arrays.
mi_from_arrays = pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])
print(mi_from_arrays)

# Build a MultiIndex from a list of tuples.
mi_from_tuples = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])
print(mi_from_tuples)

# Build a MultiIndex from the Cartesian product of the level values.
mi_from_product = pd.MultiIndex.from_product([['a', 'b'], [1, 2]])
print(mi_from_product)

# Build a MultiIndex from levels (a list of lists of index values)
# and codes (a list of lists that refer to positions in levels).
mi_from_codes = pd.MultiIndex(
    levels=[['a', 'b'], [1, 2]],
    codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
)
print(mi_from_codes)

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )


In [11]:
# Name the two index levels; this modifies the existing index of pop in place.
level_names = ['state', 'year']
pop.index.names = level_names
print(pop)

state       year
California  2010    37243956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64


In [18]:
# Hierarchical labels for both the rows and the columns.
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names=['year', 'visit'])

columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'],
                                      ['HR', 'Temp']],
                                     names=['subject', 'type'])

# Mock some medical data. A locally seeded generator makes the values
# identical on every run, so the cell is reproducible under Restart & Run All.
rng = np.random.default_rng(0)
data = np.round(rng.standard_normal((4, 6)), 1)
data[:, ::2] *= 10  # columns 0, 2, 4 are the 'HR' columns: scale them by 10
data += 37          # shift every value by 37

# Assemble the DataFrame with a MultiIndex on both axes.
health_data = pd.DataFrame(data, index=index, columns=columns)
print(health_data)

# Selecting on the top column level returns one subject's data.
print(health_data['Guido'])

subject      Bob       Guido         Sue      
type          HR  Temp    HR  Temp    HR  Temp
year visit                                    
2013 1      30.0  35.6  42.0  36.3  40.0  37.6
     2      49.0  36.9  33.0  36.3  35.0  37.2
2014 1      50.0  35.5  49.0  38.3  23.0  36.2
     2      32.0  36.8  46.0  36.4  41.0  35.6
type          HR  Temp
year visit            
2013 1      42.0  36.3
     2      33.0  36.3
2014 1      49.0  38.3
     2      46.0  36.4


In [26]:
print(pop)

# Indexing with one label per level returns a single element.
ca_2010 = pop['California', 2010]
print(ca_2010)

# Indexing with only the outer label keeps every inner label under it.
california = pop['California']
print(california)

# Label slicing on the outer level works too, but it raises an error when the
# outer index is not in lexicographic order; sort the index first in that case.
ca_to_ny = pop.loc['California':'New York']
print(ca_to_ny)

# With a sorted index, an empty slice for the outer level selects on the inner level.
all_2010 = pop[:, 2010]
print(all_2010)

# Selection with a Boolean mask.
over_22m = pop[pop > 22000000]
print(over_22m)

# Selection with fancy indexing (a list of outer labels).
ca_and_tx = pop[['California', 'Texas']]
print(ca_and_tx)

state       year
California  2010    37243956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64
37243956
year
2010    37243956
2020    39538223
dtype: int64
state       year
California  2010    37243956
            2020    39538223
New York    2010    19378102
            2020    20201249
dtype: int64
state
California    37243956
New York      19378102
Texas         25145561
dtype: int64
state       year
California  2010    37243956
            2020    39538223
Texas       2010    25145561
            2020    29145505
dtype: int64
state       year
California  2010    37243956
            2020    39538223
Texas       2010    25145561
            2020    29145505
dtype: int64


In [27]:
print(health_data)

# On a DataFrame, plain [] indexing with several labels addresses the columns,
# so this picks the ('Guido', 'HR') column.
guido_hr = health_data['Guido', 'HR']
print(guido_hr)

# Rows are addressed through loc / iloc; here the first two rows and columns by position.
top_left = health_data.iloc[:2, :2]
print(top_left)

# Each position in the indexer can be given a tuple of MultiIndex labels.
bob_hr = health_data.loc[:, ('Bob', 'HR')]
print(bob_hr)

# The preferred approach is the built-in IndexSlice helper, which allows
# slices inside each position: every year at visit 1, every subject's HR.
idx = pd.IndexSlice
first_visit = idx[:, 1]
hr_only = idx[:, 'HR']
print(health_data.loc[first_visit, hr_only])

subject      Bob       Guido         Sue      
type          HR  Temp    HR  Temp    HR  Temp
year visit                                    
2013 1      30.0  35.6  42.0  36.3  40.0  37.6
     2      49.0  36.9  33.0  36.3  35.0  37.2
2014 1      50.0  35.5  49.0  38.3  23.0  36.2
     2      32.0  36.8  46.0  36.4  41.0  35.6
year  visit
2013  1        42.0
      2        33.0
2014  1        49.0
      2        46.0
Name: (Guido, HR), dtype: float64
subject      Bob      
type          HR  Temp
year visit            
2013 1      30.0  35.6
     2      49.0  36.9
year  visit
2013  1        30.0
      2        49.0
2014  1        50.0
      2        32.0
Name: (Bob, HR), dtype: float64
subject      Bob Guido   Sue
type          HR    HR    HR
year visit                  
2013 1      30.0  42.0  40.0
2014 1      50.0  49.0  23.0


In [28]:
# Build a data set whose index is deliberately NOT in lexicographic order
# ('c' comes before 'b' in the outer level).
# The level names are passed to from_product directly, and a locally seeded
# generator makes the values identical on every run.
rng = np.random.default_rng(0)
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]],
                                   names=['char', 'int'])
data = pd.Series(rng.random(6), index=index)
print(data)

char  int
a     1      0.933669
      2      0.773842
c     1      0.842450
      2      0.349751
b     1      0.237959
      2      0.734540
dtype: float64


In [30]:
# Slicing by letter on the unsorted index raises an error;
# catch it and show the message instead of a traceback.
try:
    unsorted_slice = data['a':'b']
except KeyError as err:
    print("KeyError", err)
else:
    print(unsorted_slice)

KeyError 'Key length (1) was greater than MultiIndex lexsort depth (0)'


In [32]:
# Sort the index of the Series with the built-in sort_index() method.
data = data.sort_index()
print(data)

# Slicing in alphabetical order now works.
a_to_b = data['a':'b']
print(a_to_b)

char  int
a     1      0.933669
      2      0.773842
b     1      0.237959
      2      0.734540
c     1      0.842450
      2      0.349751
dtype: float64
char  int
a     1      0.933669
      2      0.773842
b     1      0.237959
      2      0.734540
dtype: float64


In [38]:
print(pop)

# Choose which index level moves to the columns.
# With no argument the innermost level is used.
print(pop.unstack())
for level in (0, 1):
    print(pop.unstack(level=level))

# stack() restores the multiple index.
round_trip = pop.unstack().stack()
print(round_trip)

state       year
California  2010    37243956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64
year            2010      2020
state                         
California  37243956  39538223
New York    19378102  20201249
Texas       25145561  29145505
state  California  New York     Texas
year                                 
2010     37243956  19378102  25145561
2020     39538223  20201249  29145505
year            2010      2020
state                         
California  37243956  39538223
New York    19378102  20201249
Texas       25145561  29145505
state       year
California  2010    37243956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64


In [40]:
# Turn the index levels into ordinary columns; the values go into 'population'.
pop_flat = pop.reset_index(name='population')
print(pop_flat)

# Build a multiple index back from the chosen columns.
pop_by_state_year = pop_flat.set_index(['state', 'year'])
print(pop_by_state_year)

        state  year  population
0  California  2010    37243956
1  California  2020    39538223
2    New York  2010    19378102
3    New York  2020    20201249
4       Texas  2010    25145561
5       Texas  2020    29145505
                 population
state      year            
California 2010    37243956
           2020    39538223
New York   2010    19378102
           2020    20201249
Texas      2010    25145561
           2020    29145505
