In [1]:
import numpy as np, pandas as pd

In [2]:
# hierarchical indexing / multi-indexing
# MultiIndex Object

In [3]:
# Series with multi-indexing

# POOR method: plain Python tuples used as the index labels
states = ("California", "New York", "Texas")
years = (2000, 2010)
index = [(state, year) for state in states for year in years]

populations = [
    33871648, 37253956,  # California: 2000, 2010
    18976457, 19378102,  # New York:   2000, 2010
    20851820, 25145561,  # Texas:      2000, 2010
]

pop = pd.Series(populations, index=index)
print(pop, "\n")

# Slicing between two tuple labels (the end label is included in the result)
first_label, last_label = ("California", 2000), ("New York", 2010)
print(pop[first_label:last_label], "\n")

# Not effective: selecting every 2010 entry means scanning the labels by hand
labels_2010 = [label for label in pop.index if label[1] == 2010]
print(pop[labels_2010])

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64 

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
dtype: int64 

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64


In [4]:
# BETTER method: promote the tuple labels to a real pandas MultiIndex
index = pd.MultiIndex.from_tuples(index)
print(index, "\n")

# Re-label the Series against the MultiIndex
pop = pop.reindex(index)
print(pop, "\n")

# Effective: an empty slice on the first level plus a label on the second
# selects every 2010 entry directly
pop_2010 = pop[:, 2010]
print(pop_2010)

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           ) 

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64 

California    37253956
New York      19378102
Texas         25145561
dtype: int64


In [5]:
# MultiIndex: extra conveniences

# .unstack() turns the multi-indexed Series into a 2D DataFrame
pop_wide = pop.unstack()
print(pop_wide, "\n")

# .stack() is the reverse operation
pop_long = pop_wide.stack()
print(pop_long, "\n")

# Add information: each dict entry becomes one column of the DataFrame
under18 = [
    9267089, 9284094,
    4687374, 4318033,
    5906301, 6879014,
]
pop_df = pd.DataFrame({"total": pop, "under18": under18})
print(pop_df, "\n")

# Fraction of the population under 18, shown as a state-by-year table
f_u18 = pop_df["under18"] / pop_df["total"]
f_u18.unstack()

                2000      2010
California  33871648  37253956
New York    18976457  19378102
Texas       20851820  25145561 

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64 

                    total  under18
California 2000  33871648  9267089
           2010  37253956  9284094
New York   2000  18976457  4687374
           2010  19378102  4318033
Texas      2000  20851820  5906301
           2010  25145561  6879014 



Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


In [6]:
# MultiIndex construction methods

# Passing a list of two (or more) index arrays builds the MultiIndex implicitly.
# A locally seeded generator keeps the sample values identical on every run,
# so the displayed output can be reproduced with Restart & Run All.
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.random((4, 2)),
                  index=[["a", "b", "c", "d"], [1, 2, 1, 2]],
                  columns=["data1", "data2"])
print(df, "\n")

# Implicit MultiIndex: a dict keyed by tuples also yields a MultiIndex
data = {("California", 2000): 33871648, ("California", 2010): 37253956,
        ("New York", 2000): 18976457, ("New York", 2010): 19378102,
        ("Texas", 2000): 20851820, ("Texas", 2010): 25145561}

# Build the Series once and reuse it for both displays
population_series = pd.Series(data)
print(population_series, "\n")
print(population_series.index)

        data1     data2
a 1  0.065311  0.408769
b 2  0.998321  0.691757
c 1  0.701362  0.082941
d 2  0.438135  0.406391 

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64 

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )


In [7]:
# Explicit MultiIndex constructors: three routes to the same four labels
from_arrays = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [1, 2, 1, 2]])
from_tuples = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)])
# Cartesian product of the per-level values
from_product = pd.MultiIndex.from_product([["a", "b"], [1, 2]])

for built_index in (from_arrays, from_tuples, from_product):
    print(built_index, "\n")

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           ) 

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           ) 

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           ) 



In [8]:
# MultiIndex level names
pop.index.names = ["state", "year"]
print(pop, "\n")

# MultiIndex for columns
# Hierarchical labels on both the rows and the columns
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names=["year", "visit"])
columns = pd.MultiIndex.from_product([["Bob", "Guido", "Sue"], ["HR", "Temp"]],
                                     names=["subject", "type"])

# Mock some data. A locally seeded generator keeps the values (and every
# later cell that displays health_data) reproducible across runs.
rng = np.random.default_rng(0)
data = np.round(rng.standard_normal((4, 6)), 1)
data[:, ::2] *= 10  # every other column (the "HR" ones) gets a wider spread
data += 37

# Build the DataFrame: (year, visit) rows by (subject, type) columns,
# i.e. four labelled dimensions in total
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64 



Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,35.0,37.5,30.0,36.9,42.0,34.9
2013,2,29.0,39.8,46.0,37.0,35.0,38.4
2014,1,27.0,37.3,33.0,37.1,35.0,38.7
2014,2,15.0,36.0,39.0,38.0,35.0,37.0


In [9]:
# Labelled dimensions: subject, type, year, visit

# Select by the top-level column label (subject)
guido_readings = health_data["Guido"]
print(guido_readings, "\n")

# Select by the top-level row label (year)
visits_2013 = health_data.loc[2013]
print(visits_2013, "\n")

type          HR  Temp
year visit            
2013 1      30.0  36.9
     2      46.0  37.0
2014 1      33.0  37.1
     2      39.0  38.0 

subject   Bob       Guido         Sue      
type       HR  Temp    HR  Temp    HR  Temp
visit                                      
1        35.0  37.5  30.0  36.9  42.0  34.9
2        29.0  39.8  46.0  37.0  35.0  38.4 



In [10]:
# Indexing and slicing a MultiIndex

# Series with a MultiIndex: gather each selection, then print them in order
selections = (
    pop,                           # the full Series, for reference
    pop["California", 2000],       # single element access
    pop["California"],             # single index level (partial indexing)
    pop["California":"New York"],  # slicing
    pop[:, 2000],                  # empty first slice -> index on the lower level
    pop[pop > 22_000_000],         # boolean masking
    pop[["California", "Texas"]],  # fancy indexing
)
for selected in selections:
    print(selected, "\n")

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64 

33871648 

year
2000    33871648
2010    37253956
dtype: int64 

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64 

state
California    33871648
New York      18976457
Texas         20851820
dtype: int64 

state       year
California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64 

state       year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64 



In [11]:
# DataFrame with a MultiIndex
guido_hr = health_data["Guido", "HR"]       # DataFrame[a, b] selects a column
top_left = health_data.iloc[:2, :2]         # .iloc[row, col]; end index is exclusive
bob_hr = health_data.loc[:, ("Bob", "HR")]  # .loc[row, column given as a tuple]

for view in (health_data, guido_hr, top_left, bob_hr):
    print(view, "\n")

# IndexSlice object: lets slices be written inside the per-axis tuples
idx = pd.IndexSlice
first_visit_rows = idx[:, 1]  # every year, visit 1
hr_columns = idx[:, "HR"]     # every subject, HR only
health_data.loc[first_visit_rows, hr_columns]

subject      Bob       Guido         Sue      
type          HR  Temp    HR  Temp    HR  Temp
year visit                                    
2013 1      35.0  37.5  30.0  36.9  42.0  34.9
     2      29.0  39.8  46.0  37.0  35.0  38.4
2014 1      27.0  37.3  33.0  37.1  35.0  38.7
     2      15.0  36.0  39.0  38.0  35.0  37.0 

year  visit
2013  1        30.0
      2        46.0
2014  1        33.0
      2        39.0
Name: (Guido, HR), dtype: float64 

subject      Bob      
type          HR  Temp
year visit            
2013 1      35.0  37.5
     2      29.0  39.8 

year  visit
2013  1        35.0
      2        29.0
2014  1        27.0
      2        15.0
Name: (Bob, HR), dtype: float64 



Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,35.0,30.0,42.0
2014,1,27.0,33.0,35.0


In [12]:
# Rearranging multi-indices
# Label slicing on a MultiIndex needs a sorted index

# The outer labels are given as a, c, b, so this index is deliberately unsorted
index = pd.MultiIndex.from_product([["a", "c", "b"], [1, 2]])
rng = np.random.default_rng(0)  # seeded locally so the values are reproducible
data = pd.Series(rng.random(6), index=index)
data.index.names = ["char", "int"]
print(data, "\n")

# Slicing the unsorted index raises an error. The error is the point of this
# demonstration, so catch and display it instead of letting it abort
# "Run All" and leave the following cells unexecuted.
try:
    print(data["a":"b"], "\n")
except pd.errors.UnsortedIndexError as err:
    print(f"{type(err).__name__}: {err}")

char  int
a     1      0.948211
      2      0.688066
c     1      0.971403
      2      0.135903
b     1      0.414315
      2      0.894176
dtype: float64 



UnsortedIndexError: 'Key length (1) was greater than MultiIndex lexsort depth (0)'

In [None]:
# Sorting the index with .sort_index()
# NOTE(review): the original comment also listed .sortlevel(); it appears to be
# unavailable in recent pandas releases - confirm before relying on it.
data = data.sort_index()
print(data, "\n")
print(data["a":"b"], "\n")

# Stacking and unstacking indices: pivot each index level into columns in turn
for level in (0, 1):
    print(pop.unstack(level=level), "\n")

# unstack followed by stack
round_trip = pop.unstack().stack()
print(round_trip)

# Setting and resetting the index
# reset_index: index levels become ordinary columns, values go to "population"
pop_flat = pop.reset_index(name="population")
print(pop_flat, "\n")

# set_index: build a MultiIndex back from the two label columns
pop_reindexed = pop_flat.set_index(["state", "year"])
print(pop_reindexed, "\n")

In [None]:
# Aggregating multi-indexed data
print(health_data, "\n")

# Group by a row level: average the visits within each year
data_mean = health_data.groupby(level="year").mean()
print(data_mean, "\n")

# Group by a column level. The `axis` argument of groupby is deprecated in
# recent pandas, so transpose, group on the (now row) level "type", and
# transpose back; the result has the same layout as groupby(axis=1) gave.
type_mean = data_mean.T.groupby(level="type").mean().T
print(type_mean, "\n")