# Ch.05 Pandas

## 개요

  주로 머신러닝에 이용하는 데이터는 스프레드시트와 같은 2차원 형식으로 주어진다.
  - 행 : Data Point로서 하나의 객체에 대한 관찰 값들을 이야기한다.
  - 열 : 특성(Feature)으로서 해당 객체의 특성 값들을 이야기한다.

## Pandas의 대표적인 데이터타입

- Series: 어떤 관찰값(특성 한개)을 시간 순서나, 객체에 대해 나열해 놓은 것
- DataFrame : Series가 열로 합쳐진 것으로 생각하면 된다. '여러 특성'에 대해 시간 순서나 객체에 대해 저장한 것

In [1]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import random

os.path 모듈을 이용하면 현재 시스템에 저장되어 있는 파일의 경로정보를 조작할 수 있으며, 해당 경로에 특정 파일이 존재하는지 등의 정보를 얻을 수 있습니다.

보통 파일의 경로(path)는 문자열로 표현하기에 경로를 조작할때에는 문자열 연산을 많이 사용하게 됩니다. 따라서 os.path 모듈에서 대부분의 함수는 문자열을 매개변수로 받거나 결과로 문자열을 반환하게 됩니다.

In [2]:
import os
os.path

<module 'ntpath' from 'C:\\Anaconda\\lib\\ntpath.py'>

In [3]:
# Draw four random integers from the inclusive range [0, 10]
# and wrap them in a Series (default integer index 0..3).
data = [random.randint(0, 10) for _ in range(4)]

obj = Series(data)
obj

0    8
1    5
2    8
3    2
dtype: int64

In [4]:
data

[8, 5, 8, 2]

## 행 색인 추가: df.index =[]

In [5]:
obj

0    8
1    5
2    8
3    2
dtype: int64

In [6]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
obj.index = ['a','b','c','d']
obj

a    8
b    5
c    8
d    2
dtype: int64

## Series(딕셔너리)
- key: index에 해당
- value : 데이터 요소값

## Series 인덱싱
  
- Series 객체[[해당 인덱스 이름 리스트]]
- Series 객체[[해당 인덱스 정수]]

In [8]:
# Map each label 'A'..'E' to its position 0..4 (insertion order preserved).
dic = {label: position for position, label in enumerate('ABCDE')}
dic

{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}

In [9]:
ds = Series(dic)
ds

A    0
B    1
C    2
D    3
E    4
dtype: int64

In [10]:
ds[['B','A','E']]

B    1
A    0
E    4
dtype: int64

In [11]:
ds['A':'C']

A    0
B    1
C    2
dtype: int64

In [12]:
ds[0:2]

A    0
B    1
dtype: int64

## 연산 규칙
같은 인덱스끼리 연산을 수행한다.

In [13]:
# Second Series whose labels only partly overlap with ds ('A', 'B', 'D' are shared).
dic2 = {'A': 10, 'D': 11, 'B': 12, 'Z': 13}
ds2 = Series(dic2)

In [14]:
ds2

A    10
D    11
B    12
Z    13
dtype: int64

In [15]:
#if the indices don't match, returns NaN 
#인덱스가 다른 경우는 연산이 실행되지 않고 NaN 값 반환한다.
ds + ds2

A    10.0
B    13.0
C     NaN
D    14.0
E     NaN
Z     NaN
dtype: float64

## DataFrame : Series가 여러 개의 열(Columns)로 존재하는 스프레드 시트

## 데이터 접근
- df[[column]] : 기본적으로 DataFrame은 열을 우선으로 택하는 방식을 사용
- df[column][row] : 내부에는 각각 index name이 들어가야한다. 만약 index가 사용자가 설정한 것으로 되어 있는 경우 default로 적용되는 숫자 인덱스는 사용이 불가
- df.loc[row name, column name]
- df.iloc[row index, column index]

###### Making DataFrame Way 1

In [16]:
# Three rows of four consecutive integers, starting at 0, 5 and 10.
l = [list(range(start, start + 4)) for start in (0, 5, 10)]
l

[[0, 1, 2, 3], [5, 6, 7, 8], [10, 11, 12, 13]]

In [17]:
df = DataFrame(l)
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,5,6,7,8
2,10,11,12,13


In [18]:
df.columns = ['col0','col1','col2','col3']
df.index = ['row0','row1','row2']
df

Unnamed: 0,col0,col1,col2,col3
row0,0,1,2,3
row1,5,6,7,8
row2,10,11,12,13


###### Making DataFrame Way 2

In [19]:
df = DataFrame(l, index = ['row0','row1','row2'], 
               columns = ['col0','col1','col2','col3'])
df

Unnamed: 0,col0,col1,col2,col3
row0,0,1,2,3
row1,5,6,7,8
row2,10,11,12,13


In [20]:
df[['col1','col3']]

Unnamed: 0,col1,col3
row0,1,3
row1,6,8
row2,11,13


In [21]:
#df['col1','col3'] >> error

In [22]:
df['col1']['row0']

1

In [23]:
#df[['col0':'col3']]['row0'] >> error

In [24]:
df[['col0','col3']]['row0':]
# This works because a slice inside [] on a DataFrame selects rows, not columns:
# 'row0': is a label slice that keeps every row from 'row0' onward.

Unnamed: 0,col0,col3
row0,0,3
row1,5,8
row2,10,13


In [25]:
df.loc['row0',['col0','col3']]

col0    0
col3    3
Name: row0, dtype: int64

In [26]:
df.loc[:,['col0','col3']]

Unnamed: 0,col0,col3
row0,0,3
row1,5,8
row2,10,13


In [27]:
df.iloc[:2,:3]

Unnamed: 0,col0,col1,col2
row0,0,1,2
row1,5,6,7


In [28]:
df.iloc[1:3, 2:4]

Unnamed: 0,col2,col3
row1,7,8
row2,12,13


In [29]:
#change column names
#컬럼명 변경
df.columns=['c0','c1','c2','c3']
df

Unnamed: 0,c0,c1,c2,c3
row0,0,1,2,3
row1,5,6,7,8
row2,10,11,12,13


In [30]:
#give a name for indices and columns
df.index.name = 'rowName'
df.columns.name = 'colName'
df

colName,c0,c1,c2,c3
rowName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
row0,0,1,2,3
row1,5,6,7,8
row2,10,11,12,13


## 재색인 Reindex
index(row)나 column의 순서를 바꾸어 보고 싶은 경우

- dataframe객체.reindex([새 순서의 index 또는 column 이름 리스트], axis = 적용할 축(0은 index, 1은 columns))

In [31]:
df.reindex(['c1','c0','c3','c2'], axis=1)

colName,c1,c0,c3,c2
rowName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
row0,1,0,3,2
row1,6,5,8,7
row2,11,10,13,12


In [32]:
df.reindex(['c1','c0','c3','c2'], axis=0)

colName,c0,c1,c2,c3
rowName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c1,,,,
c0,,,,
c3,,,,
c2,,,,


In [33]:
df.reindex(['row2','row1','row0'])

colName,c0,c1,c2,c3
rowName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
row2,10,11,12,13
row1,5,6,7,8
row0,0,1,2,3


In [34]:
# original df won't be changed. need to assign to object
# 기존 객체는 변경되지 않는다. 변경을 원할 경우 결과를 새로운 객체에 저장해야 한다.
df

colName,c0,c1,c2,c3
rowName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
row0,0,1,2,3
row1,5,6,7,8
row2,10,11,12,13


In [35]:
# indices have to exist, if not returns NaN .
# 기존에 있는 인덱스로 reindex 하지 않으면 NaN값 반환
df.reindex(['a','b','c'])

colName,c0,c1,c2,c3
rowName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,,,,
b,,,,
c,,,,


## 행이나 열 삭제
- df.drop([행 or 열 이름 리스트], axis=0/1 , inplace=True/False )

default: axis =0, inplace = False
inplace : 기존 객체를 변경할지 여부. True일 경우 반환값이 없으며 객체 자체가 변경된다.

In [36]:
df.drop(['row1'])

colName,c0,c1,c2,c3
rowName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
row0,0,1,2,3
row2,10,11,12,13


In [37]:
df

colName,c0,c1,c2,c3
rowName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
row0,0,1,2,3
row1,5,6,7,8
row2,10,11,12,13


In [38]:
#df.drop('c2') >> error, default: axis=0

In [39]:
df.drop(['c2'], axis=1)

colName,c0,c1,c3
rowName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
row0,0,1,3
row1,5,6,8
row2,10,11,13


In [40]:
df.drop(['c1','c2'], axis =1)

colName,c0,c3
rowName,Unnamed: 1_level_1,Unnamed: 2_level_1
row0,0,3
row1,5,8
row2,10,13


In [41]:
df_copy = df.copy()
df_copy.drop(['c1','c2'], axis=1, inplace=True)

In [42]:
df_copy

colName,c0,c3
rowName,Unnamed: 1_level_1,Unnamed: 2_level_1
row0,0,3
row1,5,8
row2,10,13


## DataFrame 연산 규칙
같은 행, 열 이름인 객체끼리 연산, 없는 경우 NaN값 반환

In [43]:
df2 = DataFrame(np.arange(1,13).reshape(3,4))
df2

Unnamed: 0,0,1,2,3
0,1,2,3,4
1,5,6,7,8
2,9,10,11,12


In [44]:
df2.index = ['row2','row1','row2']
df2.columns = ['c1','c0','c2','c10']
df2

Unnamed: 0,c1,c0,c2,c10
row2,1,2,3,4
row1,5,6,7,8
row2,9,10,11,12


In [45]:
df

colName,c0,c1,c2,c3
rowName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
row0,0,1,2,3
row1,5,6,7,8
row2,10,11,12,13


In [46]:
df + df2

Unnamed: 0,c0,c1,c10,c2,c3
row0,,,,,
row1,11.0,11.0,,14.0,
row2,12.0,12.0,,15.0,
row2,20.0,20.0,,23.0,


## 정렬

data 값으로 정렬
- dataframe객체.sort_values(by=[기준이 될 행이나 열 이름 리스트], axis = 0/1, ascending = True(오름차순)/False(내림차순), inplace = True/False)

index 값으로 정렬
- dataframe객체.sort_index(axis = 0/1, ascending = True/False, inplace = True/False)

In [57]:
df_srt = DataFrame(np.random.randint(0,11,(3,4)))
df_srt

Unnamed: 0,0,1,2,3
0,10,10,2,5
1,3,8,6,4
2,8,9,1,3


In [58]:
df_srt.index = list('abc')
df_srt.columns = list('ABCD')
df_srt

Unnamed: 0,A,B,C,D
a,10,10,2,5
b,3,8,6,4
c,8,9,1,3


In [59]:
df_srt.sort_values(by = ['A'], ascending= True)

Unnamed: 0,A,B,C,D
b,3,8,6,4
c,8,9,1,3
a,10,10,2,5


In [62]:
# 'by=' can be omitted
# when sorting by two or more columns,
## rows are sorted by the first column, then ties are broken by the second column, and so on
# 두 개 이상의 기준일 때는 맨 앞의 기준으로 먼저 정렬하고, 값이 같으면 그다음 기준으로 정렬한다.
# 'by=' 생략 가능

df_srt.sort_values(['A','C'], ascending=True)

Unnamed: 0,A,B,C,D
b,3,8,6,4
c,8,9,1,3
a,10,10,2,5


In [67]:
df_srt.sort_values(by = ['a'], axis = 1, ascending=False)

Unnamed: 0,A,B,D,C
a,10,10,5,2
b,3,8,4,6
c,8,9,3,1


In [71]:
df_srt.sort_values(['a','b'],axis = 1, ascending=False)

Unnamed: 0,B,A,D,C
a,10,10,5,2
b,8,3,4,6
c,9,8,3,1


In [72]:
df_srt.sort_values(['a','b'],axis = 1, ascending=True)

Unnamed: 0,C,D,A,B
a,2,5,10,10
b,6,4,3,8
c,1,3,8,9


In [73]:
df_srt2=df_srt.sort_values(['a','b'],axis = 1, ascending=True)
df_srt2

Unnamed: 0,C,D,A,B
a,2,5,10,10
b,6,4,3,8
c,1,3,8,9


In [74]:
df_srt2.sort_index(axis=1)

Unnamed: 0,A,B,C,D
a,10,10,2,5
b,3,8,6,4
c,8,9,1,3


In [75]:
df_srt2.sort_index(axis=0)

Unnamed: 0,C,D,A,B
a,2,5,10,10
b,6,4,3,8
c,1,3,8,9


In [76]:
df_srt2.sort_index(axis=0, ascending=False)

Unnamed: 0,C,D,A,B
c,1,3,8,9
b,6,4,3,8
a,2,5,10,10


In [78]:
df_srt2

Unnamed: 0,C,D,A,B
a,2,5,10,10
b,6,4,3,8
c,1,3,8,9


In [77]:
#데이터 ranking
df_srt2.rank(method = 'first')

Unnamed: 0,C,D,A,B
a,2.0,3.0,3.0,3.0
b,3.0,2.0,1.0,1.0
c,1.0,1.0,2.0,2.0


## 중복 색인 : 색인 값이 같은 것이 있는 것

색인 데이터가 많은 경우 색인 값이 중복되는 것이 있는지 확인해야 한다.
- dataframe객체.index.is_unique

In [79]:
# 5x5 integer table whose row labels repeat ('a' and 'b' each appear twice),
# used to demonstrate a non-unique index.
rows = [
    list(range(5)),
    [3, 6, 2, 3, 5],
    [6, 1, 2, 3, 4],
    [9, 3, 5, 7, 8],
    [5, 4, 6, 6, 7],
]
arr = np.array(rows)
df3 = DataFrame(arr, index=list('aabbc'))
df3

Unnamed: 0,0,1,2,3,4
a,0,1,2,3,4
a,3,6,2,3,5
b,6,1,2,3,4
b,9,3,5,7,8
c,5,4,6,6,7


In [80]:
df3.index

Index(['a', 'a', 'b', 'b', 'c'], dtype='object')

In [82]:
df3.index.unique()

Index(['a', 'b', 'c'], dtype='object')

In [83]:
df3.index.is_unique

False

In [86]:
A = Series(['a','b','c'])
A.is_unique

True

In [87]:
df3.loc['a'][:]

Unnamed: 0,0,1,2,3,4
a,0,1,2,3,4
a,3,6,2,3,5


## 통계 및 데이터 프레임 요약 정보

- dataframe객체.describe() : 평균, 분위값, 중앙값, 최소, 최댓값 등등
- dataframe객체.info() : type, non-null 개수
- dataframe객체.isnull().sum() : column의 null 값 수

In [90]:
df3

Unnamed: 0,0,1,2,3,4
a,0,1,2,3,4
a,3,6,2,3,5
b,6,1,2,3,4
b,9,3,5,7,8
c,5,4,6,6,7


In [91]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, a to c
Data columns (total 5 columns):
0    5 non-null int32
1    5 non-null int32
2    5 non-null int32
3    5 non-null int32
4    5 non-null int32
dtypes: int32(5)
memory usage: 216.0+ bytes


In [93]:
df3.describe()

Unnamed: 0,0,1,2,3,4
count,5.0,5.0,5.0,5.0,5.0
mean,4.6,3.0,3.4,4.4,5.6
std,3.361547,2.12132,1.949359,1.949359,1.81659
min,0.0,1.0,2.0,3.0,4.0
25%,3.0,1.0,2.0,3.0,4.0
50%,5.0,3.0,2.0,3.0,5.0
75%,6.0,4.0,5.0,6.0,7.0
max,9.0,6.0,6.0,7.0,8.0


In [94]:
#corr matrix within each column
#각 열끼리의 corr matrix
df3.corr()

Unnamed: 0,0,1,2,3,4
0,1.0,0.070117,0.564639,0.717244,0.704161
1,0.070117,1.0,0.241825,0.181369,0.389249
2,0.564639,0.241825,1.0,0.934211,0.903652
3,0.717244,0.181369,0.934211,1.0,0.97425
4,0.704161,0.389249,0.903652,0.97425,1.0


In [95]:
#corr matrix within each rows
#각 행끼리의 corr matrix
df3.T.corr()

Unnamed: 0,a,a.1,b,b.1,c
a,1.0,0.096225,-0.164399,0.131306,0.83205
a,0.096225,1.0,-0.300567,-0.353779,-0.320256
b,-0.164399,-0.300567,1.0,0.949812,0.273576
b,0.131306,-0.353779,0.949812,1.0,0.528059
c,0.83205,-0.320256,0.273576,0.528059,1.0


In [96]:
#correlation with selected column
#지정한 열과 나머지 열끼리의 corr
df3.corrwith(df3[2])

0    0.564639
1    0.241825
2    1.000000
3    0.934211
4    0.903652
dtype: float64

## 값이 있는지 확인하기

In [98]:
df3

Unnamed: 0,0,1,2,3,4
a,0,1,2,3,4
a,3,6,2,3,5
b,6,1,2,3,4
b,9,3,5,7,8
c,5,4,6,6,7


In [99]:
df3.isin([1,4,5])

Unnamed: 0,0,1,2,3,4
a,False,True,False,False,True
a,False,False,False,False,True
b,False,True,False,False,True
b,False,False,True,False,False
c,True,True,False,False,False


## NaN값 처리하기

##### 삭제
- dataframe객체.dropna(axis = 0/1, how = 'any'/'all', inplace = True/False, subset(optional) =[고려할 columns])

default: axis=0, how='any', inplace = False

how='any' 하나라도 NaN값 있으면 그 행(axis=0) 또는 열(axis=1)을 삭제
how='all' 모든 값이 NaN값이면 그 행(axis=0) 또는 열(axis=1)을 삭제

<br></br>
##### 다른값으로 채우기
- dataframe객체.fillna(value = , method = , axis =0/1, inplace=True/False, limit= , downcast =)

In [131]:
df_na = DataFrame(np.random.randint(10, size = 12).reshape(3,4),
                 index = ['A','B','C'],
                 columns = ['c0','c1','c2','c3'])
df_na.loc['B','c1'] = np.nan
df_na

Unnamed: 0,c0,c1,c2,c3
A,5,7.0,5,1
B,9,,1,0
C,1,4.0,6,5


In [102]:
df_na.isnull()

Unnamed: 0,c0,c1,c2,c3
A,False,False,False,False
B,False,True,False,False
C,False,False,False,False


In [101]:
df_na.isnull().sum()

c0    0
c1    1
c2    0
c3    0
dtype: int64

In [104]:
df_na.dropna(how='all', axis=0)
#nothing changed
#해당 axis의 모든 요소가 Nan인 경우에 삭제

Unnamed: 0,c0,c1,c2,c3
A,2,9.0,8,5
B,5,,8,9
C,9,2.0,6,7


In [105]:
df_na.dropna()
#default : axis=0

Unnamed: 0,c0,c1,c2,c3
A,2,9.0,8,5
C,9,2.0,6,7


In [106]:
df_na.dropna(axis=1)

Unnamed: 0,c0,c2,c3
A,2,8,5
B,5,8,9
C,9,6,7


In [108]:
# original df won't be changed. need to assign to object or inplace=True
# 기존 객체는 변경되지 않는다. 변경을 원할 경우 결과를 새로운 객체에 저장하거나 inplace=True를 사용해야 한다.
df_na

Unnamed: 0,c0,c1,c2,c3
A,2,9.0,8,5
B,5,,8,9
C,9,2.0,6,7


In [132]:
df_na.dropna(subset=['c1'])

Unnamed: 0,c0,c1,c2,c3
A,5,7.0,5,1
C,1,4.0,6,5


In [135]:
df_na.dropna(subset=['c2'])

Unnamed: 0,c0,c1,c2,c3
A,5,7.0,5,1
B,9,,1,0
C,1,4.0,6,5


In [137]:
df_na

Unnamed: 0,c0,c1,c2,c3
A,5,7.0,5,1
B,9,,1,0
C,1,4.0,6,5


In [None]:
#df_na.dropna(subset=['c1'], axis=1) >> error

In [140]:
df_na.dropna(subset=['B'], axis=1)

Unnamed: 0,c0,c2,c3
A,5,5,1
B,9,1,0
C,1,6,5


In [109]:
df_na.dropna(axis=1, inplace=True)
df_na

Unnamed: 0,c0,c2,c3
A,2,8,5
B,5,8,9
C,9,6,7


In [119]:
df_na2 = DataFrame(np.random.randint(0,11,(3,4)),
                  index = ['A','B','C'],
                  columns= ['c0','c1','c2','c3'])
df_na2.loc['B'] = np.nan
df_na2

Unnamed: 0,c0,c1,c2,c3
A,10.0,8.0,3.0,5.0
B,,,,
C,5.0,0.0,3.0,3.0


In [120]:
df_na2.dropna(how='all', axis=0)

Unnamed: 0,c0,c1,c2,c3
A,10.0,8.0,3.0,5.0
C,5.0,0.0,3.0,3.0


In [121]:
df_na2.dropna(how='all', axis=1)

Unnamed: 0,c0,c1,c2,c3
A,10.0,8.0,3.0,5.0
B,,,,
C,5.0,0.0,3.0,3.0


In [122]:
df_na2.dropna(how='all')

Unnamed: 0,c0,c1,c2,c3
A,10.0,8.0,3.0,5.0
C,5.0,0.0,3.0,3.0


###### 채우기

In [141]:
df_fill = DataFrame(np.random.randint(10, size=12).reshape(3,4),
                   index = ['A','B','C'],
                   columns = ['c0','c1','c2','c3'])
df_fill.loc['B', 'c2'] = np.nan
df_fill.loc['B':, :"c1"] = np.nan
df_fill

Unnamed: 0,c0,c1,c2,c3
A,5.0,8.0,7.0,8
B,,,,3
C,,,2.0,5


In [142]:
df_fill.fillna({'c1':-10, 'c2':-20})

Unnamed: 0,c0,c1,c2,c3
A,5.0,8.0,7.0,8
B,,-10.0,-20.0,3
C,,-10.0,2.0,5


In [144]:
df_fill.fillna(-100)

Unnamed: 0,c0,c1,c2,c3
A,5.0,8.0,7.0,8
B,-100.0,-100.0,-100.0,3
C,-100.0,-100.0,2.0,5


In [146]:
# not applied to rows
# 행에는 적용이 안된다.
df_fill.fillna({'B':-20})

Unnamed: 0,c0,c1,c2,c3
A,5.0,8.0,7.0,8
B,,,,3
C,,,2.0,5


In [157]:
# Backward fill along each row (axis=1): every NaN takes the next valid value to its right.
# fillna(method=...) is deprecated since pandas 2.1; bfill() is the equivalent call.
df_fill.bfill(axis=1)

Unnamed: 0,c0,c1,c2,c3
A,5.0,8.0,7.0,8.0
B,3.0,3.0,3.0,3.0
C,2.0,2.0,2.0,5.0


# 5.5 계층적 색인

row나 column에 색인이 여러겹으로 들어가는 것

계층적 색인 함수
- stack() : 계층적 색인으로 만들어준다.
- unstack() : stack()의 반대

In [169]:
S = Series(np.random.randn(12),
          index = [['a','a','a','b','b','b','c','c','c','d','d','d'],
                  [1,2,3,1,2,3,1,2,3,1,2,3]])
S.index.names = ['type1', 'type2']
S

type1  type2
a      1       -0.966896
       2        1.825665
       3        0.450235
b      1        1.038776
       2       -0.321837
       3       -0.181660
c      1       -0.598803
       2        0.449096
       3        0.567830
d      1        0.864688
       2        0.217794
       3       -0.786055
dtype: float64

In [170]:
S.unstack(level=0)

type1,a,b,c,d
type2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-0.966896,1.038776,-0.598803,0.864688
2,1.825665,-0.321837,0.449096,0.217794
3,0.450235,-0.18166,0.56783,-0.786055


In [171]:
S.unstack(level=1)

type2,1,2,3
type1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,-0.966896,1.825665,0.450235
b,1.038776,-0.321837,-0.18166
c,-0.598803,0.449096,0.56783
d,0.864688,0.217794,-0.786055


In [172]:
S.unstack('type1')

type1,a,b,c,d
type2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-0.966896,1.038776,-0.598803,0.864688
2,1.825665,-0.321837,0.449096,0.217794
3,0.450235,-0.18166,0.56783,-0.786055


In [173]:
S.unstack('type2')

type2,1,2,3
type1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,-0.966896,1.825665,0.450235
b,1.038776,-0.321837,-0.18166
c,-0.598803,0.449096,0.56783
d,0.864688,0.217794,-0.786055


In [174]:
S.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 2),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('c', 3),
            ('d', 1),
            ('d', 2),
            ('d', 3)],
           names=['type1', 'type2'])

In [175]:
S['b':'c']

type1  type2
b      1        1.038776
       2       -0.321837
       3       -0.181660
c      1       -0.598803
       2        0.449096
       3        0.567830
dtype: float64

In [178]:
S[:][:5]

type1  type2
a      1       -0.966896
       2        1.825665
       3        0.450235
b      1        1.038776
       2       -0.321837
dtype: float64

In [180]:
S2 = Series(np.random.randn(10),
           index =
           [[i for i in range(10)],
           'a,a,a,b,c,d,c,a,d,d'.split(','),
           'i0,i1,i2,i3,i0,i1,i2,i3,i4,i5'.split(','),
           'k0,k1,k2,k1,k2,k3,k4,k0,k1,k5'.split(',')])
S2

0  a  i0  k0   -0.560222
1  a  i1  k1    1.326100
2  a  i2  k2   -0.585858
3  b  i3  k1   -0.501922
4  c  i0  k2   -0.484825
5  d  i1  k3   -0.618886
6  c  i2  k4    2.060959
7  a  i3  k0   -0.316059
8  d  i4  k1    1.404494
9  d  i5  k5    1.872224
dtype: float64

In [181]:
S2.unstack()

Unnamed: 0,Unnamed: 1,Unnamed: 2,k0,k1,k2,k3,k4,k5
0,a,i0,-0.560222,,,,,
1,a,i1,,1.3261,,,,
2,a,i2,,,-0.585858,,,
3,b,i3,,-0.501922,,,,
4,c,i0,,,-0.484825,,,
5,d,i1,,,,-0.618886,,
6,c,i2,,,,,2.060959,
7,a,i3,-0.316059,,,,,
8,d,i4,,1.404494,,,,
9,d,i5,,,,,,1.872224


In [182]:
S2.unstack(level=0)

Unnamed: 0,Unnamed: 1,Unnamed: 2,0,1,2,3,4,5,6,7,8,9
a,i0,k0,-0.560222,,,,,,,,,
a,i1,k1,,1.3261,,,,,,,,
a,i2,k2,,,-0.585858,,,,,,,
a,i3,k0,,,,,,,,-0.316059,,
b,i3,k1,,,,-0.501922,,,,,,
c,i0,k2,,,,,-0.484825,,,,,
c,i2,k4,,,,,,,2.060959,,,
d,i1,k3,,,,,,-0.618886,,,,
d,i4,k1,,,,,,,,,1.404494,
d,i5,k5,,,,,,,,,,1.872224


In [184]:
S2.unstack(level=1)

Unnamed: 0,Unnamed: 1,Unnamed: 2,a,b,c,d
0,i0,k0,-0.560222,,,
1,i1,k1,1.3261,,,
2,i2,k2,-0.585858,,,
3,i3,k1,,-0.501922,,
4,i0,k2,,,-0.484825,
5,i1,k3,,,,-0.618886
6,i2,k4,,,2.060959,
7,i3,k0,-0.316059,,,
8,i4,k1,,,,1.404494
9,i5,k5,,,,1.872224


In [185]:
S2.unstack(level =[0,1,2])

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
Unnamed: 0_level_1,a,a,a,b,c,d,c,a,d,d
Unnamed: 0_level_2,i0,i1,i2,i3,i0,i1,i2,i3,i4,i5
k0,-0.560222,,,,,,,-0.316059,,
k1,,1.3261,,-0.501922,,,,,1.404494,
k2,,,-0.585858,,-0.484825,,,,,
k3,,,,,,-0.618886,,,,
k4,,,,,,,2.060959,,,
k5,,,,,,,,,,1.872224


In [187]:
S2_st = S2.unstack(level = [0,1,2])

In [189]:
S2_st_2=S2_st.stack(level = [0,2])
S2_st_2

Unnamed: 0,Unnamed: 1,Unnamed: 2,a,b,c,d
k0,0,i0,-0.560222,,,
k0,7,i3,-0.316059,,,
k1,1,i1,1.3261,,,
k1,3,i3,,-0.501922,,
k1,8,i4,,,,1.404494
k2,2,i2,-0.585858,,,
k2,4,i0,,,-0.484825,
k3,5,i1,,,,-0.618886
k4,6,i2,,,2.060959,
k5,9,i5,,,,1.872224


In [191]:
S2_st_2.index.names = ['ind1','ind2','ind3']
S2_st_2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,a,b,c,d
ind1,ind2,ind3,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
k0,0,i0,-0.560222,,,
k0,7,i3,-0.316059,,,
k1,1,i1,1.3261,,,
k1,3,i3,,-0.501922,,
k1,8,i4,,,,1.404494
k2,2,i2,-0.585858,,,
k2,4,i0,,,-0.484825,
k3,5,i1,,,,-0.618886
k4,6,i2,,,2.060959,
k5,9,i5,,,,1.872224


In [192]:
S2_st_2.unstack(level='ind3')

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,a,a,a,b,b,b,b,...,c,c,c,c,d,d,d,d,d,d
Unnamed: 0_level_1,ind3,i0,i1,i2,i3,i4,i5,i0,i1,i2,i3,...,i2,i3,i4,i5,i0,i1,i2,i3,i4,i5
ind1,ind2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
k0,0,-0.560222,,,,,,,,,,...,,,,,,,,,,
k0,7,,,,-0.316059,,,,,,,...,,,,,,,,,,
k1,1,,1.3261,,,,,,,,,...,,,,,,,,,,
k1,3,,,,,,,,,,-0.501922,...,,,,,,,,,,
k1,8,,,,,,,,,,,...,,,,,,,,,1.404494,
k2,2,,,-0.585858,,,,,,,,...,,,,,,,,,,
k2,4,,,,,,,,,,,...,,,,,,,,,,
k3,5,,,,,,,,,,,...,,,,,,-0.618886,,,,
k4,6,,,,,,,,,,,...,2.060959,,,,,,,,,
k5,9,,,,,,,,,,,...,,,,,,,,,,1.872224


# 5.5.2 단계별 요약통계

In [194]:
S2_st_2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,a,b,c,d
ind1,ind2,ind3,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
k0,0,i0,-0.560222,,,
k0,7,i3,-0.316059,,,
k1,1,i1,1.3261,,,
k1,3,i3,,-0.501922,,
k1,8,i4,,,,1.404494
k2,2,i2,-0.585858,,,
k2,4,i0,,,-0.484825,
k3,5,i1,,,,-0.618886
k4,6,i2,,,2.060959,
k5,9,i5,,,,1.872224


In [195]:
# Sum the rows that share the same 'ind3' label.
# DataFrame.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0, so group by
# the index level instead; sort=False keeps the groups in order of first appearance.
S2_st_2.groupby(level='ind3', sort=False).sum()

Unnamed: 0_level_0,a,b,c,d
ind3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
i0,-0.560222,0.0,-0.484825,0.0
i3,-0.316059,-0.501922,0.0,0.0
i1,1.3261,0.0,0.0,-0.618886
i4,0.0,0.0,0.0,1.404494
i2,-0.585858,0.0,2.060959,0.0
i5,0.0,0.0,0.0,1.872224
