In [1]:
import pandas as pd

## 예제 4-10 멀티인덱스를 이용한 시리즈 활용 

In [2]:
index = [
    ('서울',2008),
    ('서울',2010),
    ('부산',2008),
    ('부산',2010), 
    ('인천',2008),
    ('인천',2010)
]

In [3]:
mul_index = pd.MultiIndex.from_tuples(index)
mul_index

MultiIndex([('서울', 2008),
            ('서울', 2010),
            ('부산', 2008),
            ('부산', 2010),
            ('인천', 2008),
            ('인천', 2010)],
           )

In [4]:
mul_index.values

array([('서울', 2008), ('서울', 2010), ('부산', 2008), ('부산', 2010),
       ('인천', 2008), ('인천', 2010)], dtype=object)

In [5]:
mul_index.value_counts()

(서울, 2008)    1
(서울, 2010)    1
(부산, 2008)    1
(부산, 2010)    1
(인천, 2008)    1
(인천, 2010)    1
dtype: int64

In [6]:
mul_index.levels

FrozenList([['부산', '서울', '인천'], [2008, 2010]])

In [7]:
populations = [30000,37000,18970,19370,20850,25140]

In [8]:
pop = pd.Series(populations,index = mul_index)
pop

서울  2008    30000
    2010    37000
부산  2008    18970
    2010    19370
인천  2008    20850
    2010    25140
dtype: int64

In [9]:
pop['서울']

2008    30000
2010    37000
dtype: int64

In [10]:
pop['서울',2008]

30000

In [11]:
pop[ : ,2010]

서울    37000
부산    19370
인천    25140
dtype: int64

문자열 인덱스인 경우 정렬이 되어 있지 않다면 슬라이싱 연산이 불가능하다.

In [12]:
try :
    pop["서울" : "인천"]
except Exception as e :
    print(e)

'Key length (1) was greater than MultiIndex lexsort depth (0)'


슬라이싱 연산을 하려면 인덱스 정렬이 필요!!

In [13]:
pop = pop.sort_index()
pop

부산  2008    18970
    2010    19370
서울  2008    30000
    2010    37000
인천  2008    20850
    2010    25140
dtype: int64

In [14]:
pop['서울':'인천']

서울  2008    30000
    2010    37000
인천  2008    20850
    2010    25140
dtype: int64

행 인덱스 정보를 index.names 에 직접 지정할 수 있다.

In [15]:
pop.index.names = ['시','년도']
pop

시   년도  
부산  2008    18970
    2010    19370
서울  2008    30000
    2010    37000
인천  2008    20850
    2010    25140
dtype: int64

In [16]:
df = pd.DataFrame(pop,columns=['인구수'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,인구수
시,년도,Unnamed: 2_level_1
부산,2008,18970
부산,2010,19370
서울,2008,30000
서울,2010,37000
인천,2008,20850
인천,2010,25140


In [17]:
df.loc['부산', : ]

Unnamed: 0_level_0,인구수
년도,Unnamed: 1_level_1
2008,18970
2010,19370


2010년도 인구수 집계하려면?  
데이터 프레임워크에서는 2010 인덱스로 바로 접근이 불가능하다  
시리즈로 가져와서 부분합을 구할 수 있다.

In [18]:
df['인구수'][ : ,2010]

시
부산    19370
서울    37000
인천    25140
Name: 인구수, dtype: int64

데이터 프레임으로 보고싶다면 시리즈의 분석결과를 다시 데이터 프레임으로 만들어 분석할 수 있다.

In [19]:
analysis = pd.DataFrame(df['인구수'][ : ,2010])
analysis

Unnamed: 0_level_0,인구수
시,Unnamed: 1_level_1
부산,19370
서울,37000
인천,25140


## 예제 4-11 멀티인덱스를 이용한 데이터프레임  활용 

In [20]:
r_inx = pd.MultiIndex.from_product([[2017,2018],[1,2]],
                                   names = ['년도','과제점수'])
r_inx

MultiIndex([(2017, 1),
            (2017, 2),
            (2018, 1),
            (2018, 2)],
           names=['년도', '과제점수'])

In [21]:
c_inx = pd.MultiIndex.from_product([['철수','영희','지원'],['컴공','경제']],
                                   names = ['학생','학과'])
c_inx

MultiIndex([('철수', '컴공'),
            ('철수', '경제'),
            ('영희', '컴공'),
            ('영희', '경제'),
            ('지원', '컴공'),
            ('지원', '경제')],
           names=['학생', '학과'])

In [22]:
import numpy as np

In [23]:
data = np.round(np.abs(np.random.randn(4,6)),1)
data

array([[1.4, 1.6, 0. , 0.3, 0.5, 1.1],
       [1. , 1.9, 1.4, 1.3, 0.7, 2.2],
       [0.4, 0. , 0.6, 0. , 0.2, 0. ],
       [0.8, 0.6, 2.3, 0.2, 1.1, 0.4]])

In [24]:
study_data = pd.DataFrame(data,index = r_inx, columns = c_inx)
study_data

Unnamed: 0_level_0,학생,철수,철수,영희,영희,지원,지원
Unnamed: 0_level_1,학과,컴공,경제,컴공,경제,컴공,경제
년도,과제점수,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2017,1,1.4,1.6,0.0,0.3,0.5,1.1
2017,2,1.0,1.9,1.4,1.3,0.7,2.2
2018,1,0.4,0.0,0.6,0.0,0.2,0.0
2018,2,0.8,0.6,2.3,0.2,1.1,0.4


In [25]:
study_data.index

MultiIndex([(2017, 1),
            (2017, 2),
            (2018, 1),
            (2018, 2)],
           names=['년도', '과제점수'])

In [26]:
study_data.columns

MultiIndex([('철수', '컴공'),
            ('철수', '경제'),
            ('영희', '컴공'),
            ('영희', '경제'),
            ('지원', '컴공'),
            ('지원', '경제')],
           names=['학생', '학과'])

In [28]:
study_data.index[0]

(2017, 1)

In [29]:
study_data.index.names[0]

'년도'

In [30]:
study_data.values

array([[1.4, 1.6, 0. , 0.3, 0.5, 1.1],
       [1. , 1.9, 1.4, 1.3, 0.7, 2.2],
       [0.4, 0. , 0.6, 0. , 0.2, 0. ],
       [0.8, 0.6, 2.3, 0.2, 1.1, 0.4]])

In [31]:
study_data['지원']

Unnamed: 0_level_0,학과,컴공,경제
년도,과제점수,Unnamed: 2_level_1,Unnamed: 3_level_1
2017,1,0.5,1.1
2017,2,0.7,2.2
2018,1,0.2,0.0
2018,2,1.1,0.4


In [34]:
study_data.loc[ : , '지원']

Unnamed: 0_level_0,학과,컴공,경제
년도,과제점수,Unnamed: 2_level_1,Unnamed: 3_level_1
2017,1,0.5,1.1
2017,2,0.7,2.2
2018,1,0.2,0.0
2018,2,1.1,0.4


In [32]:
study_data['지원','컴공']

년도    과제점수
2017  1       0.5
      2       0.7
2018  1       0.2
      2       1.1
Name: (지원, 컴공), dtype: float64

In [33]:
study_data.loc[ : , ('지원','컴공')]

년도    과제점수
2017  1       0.5
      2       0.7
2018  1       0.2
      2       1.1
Name: (지원, 컴공), dtype: float64

멀티 인덱스를 사용할 경우 pd.IndexSlice 를 사용해서 슬라이스를 만들어서 처리  
* loc[행,열] 로 표시하므로 멀티 인덱스 자체가 행과 열을 구성
* 멀티 인덱스 내의 특정 정보를 가져오려고 해도 슬라이스로 처리가 필요

In [35]:
study_data

Unnamed: 0_level_0,학생,철수,철수,영희,영희,지원,지원
Unnamed: 0_level_1,학과,컴공,경제,컴공,경제,컴공,경제
년도,과제점수,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2017,1,1.4,1.6,0.0,0.3,0.5,1.1
2017,2,1.0,1.9,1.4,1.3,0.7,2.2
2018,1,0.4,0.0,0.6,0.0,0.2,0.0
2018,2,0.8,0.6,2.3,0.2,1.1,0.4


정렬이 되어있어야 에러안남

In [36]:
try :
    analysis_ky = study_data.loc[:2018, '철수':'영희']
except Exception as e :
    print(e)
analysis_ky

'Key length (1) was greater than MultiIndex lexsort depth (0)'


NameError: name 'analysis_ky' is not defined

In [37]:
study_data = study_data.sort_index(axis = 1)
study_data

Unnamed: 0_level_0,학생,영희,영희,지원,지원,철수,철수
Unnamed: 0_level_1,학과,경제,컴공,경제,컴공,경제,컴공
년도,과제점수,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2017,1,0.3,0.0,1.1,0.5,1.6,1.4
2017,2,1.3,1.4,2.2,0.7,1.9,1.0
2018,1,0.0,0.6,0.0,0.2,0.0,0.4
2018,2,0.2,2.3,0.4,1.1,0.6,0.8


열 이름이 정렬이 되었기 때문에 에러는 안나지만 조회는 되지 않는다.

In [38]:
try :
    analysis_ky = study_data.loc[:2018, '철수':'영희']
except Exception as e :
    print(e)
analysis_ky

년도,과제점수
2017,1
2017,2
2018,1
2018,2


In [39]:
study_data.loc[ :2018,'영희':'지원']

Unnamed: 0_level_0,학생,영희,영희,지원,지원
Unnamed: 0_level_1,학과,경제,컴공,경제,컴공
년도,과제점수,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2017,1,0.3,0.0,1.1,0.5
2017,2,1.3,1.4,2.2,0.7
2018,1,0.0,0.6,0.0,0.2
2018,2,0.2,2.3,0.4,1.1


IndexSlice 속성을 사용하면 복합인덱스 차원에 맞게 검색이 가능하다.  
슬라이싱으로 시작과 끝의 조건을 지정할 수 도 있다.

In [40]:
study_data.loc[pd.IndexSlice[ : ,1],pd.IndexSlice[ : ,'컴공']]

Unnamed: 0_level_0,학생,영희,지원,철수
Unnamed: 0_level_1,학과,컴공,컴공,컴공
년도,과제점수,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2017,1,0.0,0.5,1.4
2018,1,0.6,0.2,0.4


In [41]:
study_data.loc[pd.IndexSlice[ : ,1:2],pd.IndexSlice[ : ,'컴공']]

Unnamed: 0_level_0,학생,영희,지원,철수
Unnamed: 0_level_1,학과,컴공,컴공,컴공
년도,과제점수,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2017,1,0.0,0.5,1.4
2017,2,1.4,0.7,1.0
2018,1,0.6,0.2,0.4
2018,2,2.3,1.1,0.8


In [42]:
study_data.index

MultiIndex([(2017, 1),
            (2017, 2),
            (2018, 1),
            (2018, 2)],
           names=['년도', '과제점수'])

In [43]:
study_data.index.levels[0].dtype

dtype('int64')

In [44]:
study_data.xs(('지원','컴공'),axis = 1)

년도    과제점수
2017  1       0.5
      2       0.7
2018  1       0.2
      2       1.1
Name: (지원, 컴공), dtype: float64