In [2]:
import pandas as pd
import numpy as np

In [5]:
# Pandas MultiIndex: a two-level (state, year) index built from explicit tuples.
index = pd.MultiIndex.from_tuples(
    [
        ("California", 2000),
        ("California", 2010),
        ("New York", 2000),
        ("New York", 2010),
        ("Texas", 2000),
        ("Texas", 2010),
    ]
)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [9]:
# Population counts, one per entry of `index` (matched by position).
populations = [
    33871648,
    37253956,
    18976457,
    19378102,
    20851820,
    25145561,
]
pop = pd.Series(data=populations, index=index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [12]:
# unstack() converts the multi-indexed Series into a conventionally indexed
# DataFrame (the inner level becomes the columns); stack() converts it back.
pop_df = pop.unstack()
print(pop_df, pop_df.stack(), sep="\n")

                2000      2010
California  33871648  37253956
New York    18976457  19378102
Texas       20851820  25145561
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64


In [15]:
## Why hierarchical indexing is useful
# Add an under-18 column next to the totals; both share the same MultiIndex,
# so the element-wise ratio lines up row by row.
pop_df = pd.DataFrame(
    {
        "total": pop,
        "under18": [9267089, 9284094, 4687374, 4318033, 5906301, 6879014],
    }
)
f_u18 = pop_df["under18"].div(pop_df["total"])
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


In [17]:
## Constructing a MultiIndex
# Three constructors that describe the same four (letter, number) pairs.
# Only the last expression of the cell is echoed by the notebook.
pd.MultiIndex.from_arrays(
    [
        ["a", "a", "b", "b"],
        [1, 2, 1, 2],
    ]
)
pd.MultiIndex.from_tuples(
    [("a", 1), ("a", 2), ("b", 1), ("b", 2)]
)
pd.MultiIndex.from_product(
    [["a", "b"], [1, 2]]
)

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [19]:
## Named levels for a MultiIndex on both the rows and the columns
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]], names=["year", "visit"])
columns = pd.MultiIndex.from_product(
    [["Bob", "Guido", "Sue"], ["HR", "Temp"]], names=["subject", "type"]
)

# Mock readings. A locally seeded generator makes the table (and everything
# derived from it later) identical on every Restart & Run All, without
# touching NumPy's global random state.
rng = np.random.default_rng(0)
data = np.round(rng.standard_normal((4, 6)), 1)
data[:, ::2] *= 10  # every other column (the "HR" ones) gets a 10x wider spread
data += 37          # shift all readings so they are centred on 37

health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,32.0,36.9,33.0,39.0,31.0,36.2
2013,2,21.0,37.1,22.0,37.2,24.0,38.1
2014,1,51.0,36.5,49.0,36.5,36.0,37.7
2014,2,44.0,35.5,38.0,37.8,35.0,35.5


In [24]:
## Indexing and slicing a MultiIndex
### Series with a MultiIndex
# Partial indexing on the outer level, followed by a boolean mask.
print(
    pop["California"],
    pop[pop > 20_000_000],
    sep="\n",
)

2000    33871648
2010    37253956
dtype: int64
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64


In [28]:
### DataFrame with a MultiIndex
# Plain [] indexing on a multi-indexed DataFrame addresses the columns,
# so the (subject, type) pair picks out a single column.
health_data[("Guido", "HR")]

year  visit
2013  1        33.0
      2        22.0
2014  1        49.0
      2        38.0
Name: (Guido, HR), dtype: float64

In [30]:
# Positional selection (first two rows and columns), then label-based
# selection of one (subject, type) column across all rows.
print(
    health_data.iloc[0:2, 0:2],
    health_data.loc[:, ("Bob", "HR")],
    sep="\n",
)

subject      Bob      
type          HR  Temp
year visit            
2013 1      32.0  36.9
     2      21.0  37.1
year  visit
2013  1        32.0
      2        21.0
2014  1        51.0
      2        44.0
Name: (Bob, HR), dtype: float64


In [31]:
# pd.IndexSlice is shorthand for building per-level slice tuples; the explicit
# tuples below are exactly what idx[:, 1] and idx[:, "HR"] expand to:
# every year with visit == 1, and every subject's "HR" column.
idx = pd.IndexSlice
health_data.loc[(slice(None), 1), (slice(None), "HR")]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,32.0,33.0,31.0
2014,1,51.0,49.0,36.0


In [32]:
### MultiIndex slicing assumes a sorted index, so an unsorted one must be sorted first.
# Deliberately unsorted outer level (a, c, b) to demonstrate the problem.
index = pd.MultiIndex.from_product([["a", "c", "b"], [1, 2]])

# Seeded local generator: the example values are the same on every
# Restart & Run All and NumPy's global random state is left alone.
rng = np.random.default_rng(1)
data = pd.Series(rng.random(6), index=index)
data

a  1    0.079042
   2    0.377538
c  1    0.232420
   2    0.131169
b  1    0.148481
   2    0.343979
dtype: float64

In [35]:
# Label-slicing a MultiIndex needs a sorted index, so the partial slice below
# fails on the unsorted `data`. The error is caught and printed so the cell
# demonstrates the problem without aborting a full run of the notebook.
try:
    data["a":"b"]
except KeyError as err:  # pandas' UnsortedIndexError is a KeyError subclass
    print(type(err).__name__, err, sep=": ")

In [37]:
# Sort the index lexicographically; afterwards the partial slice "a":"b" works.
data = data.sort_index()
print(
    data,
    data["a":"b"],
    sep="\n",
)

a  1    0.079042
   2    0.377538
b  1    0.148481
   2    0.343979
c  1    0.232420
   2    0.131169
dtype: float64
a  1    0.079042
   2    0.377538
b  1    0.148481
   2    0.343979
dtype: float64


In [42]:
# unstack(level=k) moves index level k into the columns:
# level 0 is the outer level, level 1 the inner one.
print(
    pop.unstack(level=0),
    pop.unstack(level=1),
    sep="\n",
)

      California  New York     Texas
2000    33871648  18976457  20851820
2010    37253956  19378102  25145561
                2000      2010
California  33871648  37253956
New York    18976457  19378102
Texas       20851820  25145561


In [54]:
# Name the two index levels (in place) so they become proper column headers.
pop.index.names = ["state", "year"]

# reset_index() turns every index level into an ordinary column and returns a
# DataFrame; `name` sets the header of the column that holds the values.
pop_flat = pop.reset_index(name="population")
pop_flat

Unnamed: 0,state,year,population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


In [57]:
# The reverse direction: promote the two columns back into a MultiIndex.
pop_flat.set_index(keys=["state", "year"])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


In [61]:
# Aggregating multi-indexed data: average the visits within each year.
# The `level=` argument of DataFrame.mean() was deprecated in pandas 1.3 and
# removed in 2.0; grouping on the index level is the supported spelling.
# sort=False leaves the groups in the order they appear in the index.
data_mean = health_data.groupby(level="year", sort=False).mean()
data_mean

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,26.5,37.0,27.5,38.1,27.5,37.15
2014,47.5,36.0,43.5,37.15,35.5,36.6


In [65]:
# Average across subjects for each measurement type (a column level).
# mean(axis=1, level=...) was removed in pandas 2.0 and groupby(axis=1) is
# deprecated as well, so transpose, group on the "type" level of what is now
# the row index, and transpose back.
data_mean.T.groupby(level="type", sort=False).mean().T

type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,27.166667,37.416667
2014,42.166667,36.583333
