<a href="https://colab.research.google.com/github/hanna-joo/bigdata_edu/blob/master/python_edu/pandas_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[10 minutes to pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html)

In [1]:
import numpy as np
import pandas as pd

# 1. 변수 생성 (Object creation)

In [2]:
# Create a Series from a list; np.nan marks a missing entry
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
# Create a DataFrame from a NumPy array, labelled with a DatetimeIndex and columns A-D.
# The global NumPy RNG is seeded first so that Restart & Run All reproduces the same
# random values in this cell and in every later cell that draws from np.random.
np.random.seed(42)
dates = pd.date_range('20130101', periods=6)
print(dates)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')


Unnamed: 0,A,B,C,D
2013-01-01,1.253047,-0.479571,0.211165,0.114843
2013-01-02,-0.532468,-0.972869,0.61319,-0.700103
2013-01-03,0.63694,-0.703702,-0.512657,0.33335
2013-01-04,1.108272,-1.729209,-1.444019,0.022568
2013-01-05,0.110328,-1.978644,-0.539618,0.463889
2013-01-06,1.330928,0.215869,0.469587,0.242428


In [4]:
# Create a DataFrame from a dict: each key becomes a column, and the scalar
# entries are repeated for every row of the array-like entries
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3] * 4, dtype='int32'),
        'E': pd.Categorical(['test', 'train', 'test', 'train']),
        'F': 'foo',
    }
)
print(df2.dtypes)
df2

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


# 2. 데이터 확인 (Viewing data)


In [5]:
# Show only the first two rows
df.head(2)

Unnamed: 0,A,B,C,D
2013-01-01,1.253047,-0.479571,0.211165,0.114843
2013-01-02,-0.532468,-0.972869,0.61319,-0.700103


In [6]:
# Row labels of df
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [7]:
# Column labels of df
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [8]:
# A NumPy array has exactly one dtype for the whole array,
# whereas a pandas DataFrame has one dtype per column
df.to_numpy()

array([[ 1.25304659, -0.47957128,  0.21116508,  0.11484311],
       [-0.53246754, -0.97286903,  0.61318998, -0.70010272],
       [ 0.63694039, -0.70370211, -0.51265678,  0.33334992],
       [ 1.10827239, -1.7292086 , -1.44401923,  0.0225675 ],
       [ 0.11032805, -1.97864413, -0.53961775,  0.46388882],
       [ 1.33092799,  0.21586877,  0.46958678,  0.24242845]])

In [9]:
# df2 mixes several dtypes, so converting it to NumPy picks a single dtype that can hold them all
# DataFrame.to_numpy(): the index and column labels are not included in the result
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [10]:
# Summary statistics of df
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.651175,-0.941354,-0.200392,0.079496
std,0.738509,0.813212,0.779661,0.412556
min,-0.532468,-1.978644,-1.444019,-0.700103
25%,0.241981,-1.540124,-0.532878,0.045636
50%,0.872606,-0.838286,-0.150746,0.178636
75%,1.216853,-0.535604,0.404981,0.31062
max,1.330928,0.215869,0.61319,0.463889


In [11]:
# Transpose df (swap rows and columns)
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,1.253047,-0.532468,0.63694,1.108272,0.110328,1.330928
B,-0.479571,-0.972869,-0.703702,-1.729209,-1.978644,0.215869
C,0.211165,0.61319,-0.512657,-1.444019,-0.539618,0.469587
D,0.114843,-0.700103,0.33335,0.022568,0.463889,0.242428


In [12]:
# Sort along the column axis by label, in descending order (D, C, B, A)
df.sort_index(axis='columns', ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.114843,0.211165,-0.479571,1.253047
2013-01-02,-0.700103,0.61319,-0.972869,-0.532468
2013-01-03,0.33335,-0.512657,-0.703702,0.63694
2013-01-04,0.022568,-1.444019,-1.729209,1.108272
2013-01-05,0.463889,-0.539618,-1.978644,0.110328
2013-01-06,0.242428,0.469587,0.215869,1.330928


In [13]:
# Sort by the row index, in descending order
df.sort_index(ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,1.330928,0.215869,0.469587,0.242428
2013-01-05,0.110328,-1.978644,-0.539618,0.463889
2013-01-04,1.108272,-1.729209,-1.444019,0.022568
2013-01-03,0.63694,-0.703702,-0.512657,0.33335
2013-01-02,-0.532468,-0.972869,0.61319,-0.700103
2013-01-01,1.253047,-0.479571,0.211165,0.114843


In [14]:
# Sort the rows by the values in column B
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-05,0.110328,-1.978644,-0.539618,0.463889
2013-01-04,1.108272,-1.729209,-1.444019,0.022568
2013-01-02,-0.532468,-0.972869,0.61319,-0.700103
2013-01-03,0.63694,-0.703702,-0.512657,0.33335
2013-01-01,1.253047,-0.479571,0.211165,0.114843
2013-01-06,1.330928,0.215869,0.469587,0.242428


In [15]:
# Same sort on column B, largest value first
df.sort_values(by="B", ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,1.330928,0.215869,0.469587,0.242428
2013-01-01,1.253047,-0.479571,0.211165,0.114843
2013-01-03,0.63694,-0.703702,-0.512657,0.33335
2013-01-02,-0.532468,-0.972869,0.61319,-0.700103
2013-01-04,1.108272,-1.729209,-1.444019,0.022568
2013-01-05,0.110328,-1.978644,-0.539618,0.463889


# 3. 데이터 선택 (Selection)
- .at / .iat
- .loc / .iloc
- .isin

In [16]:
# Select a single column
df['A']  # equivalent to df.A

2013-01-01    1.253047
2013-01-02   -0.532468
2013-01-03    0.636940
2013-01-04    1.108272
2013-01-05    0.110328
2013-01-06    1.330928
Freq: D, Name: A, dtype: float64

In [17]:
# Select rows with a slice
df[2:4]  # same rows as df['20130103':'20130104']

Unnamed: 0,A,B,C,D
2013-01-03,0.63694,-0.703702,-0.512657,0.33335
2013-01-04,1.108272,-1.729209,-1.444019,0.022568


## 인덱스 또는 컬럼명으로 가져오기 (.loc)

In [18]:
# Select one row by its index label
df.loc['20130101']

A    1.253047
B   -0.479571
C    0.211165
D    0.114843
Name: 2013-01-01 00:00:00, dtype: float64

In [19]:
# Select by row and column labels: all rows, columns A and B
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,1.253047,-0.479571
2013-01-02,-0.532468,-0.972869
2013-01-03,0.63694,-0.703702
2013-01-04,1.108272,-1.729209
2013-01-05,0.110328,-1.978644
2013-01-06,1.330928,0.215869


In [20]:
# Label slices include both endpoints: rows 2013-01-02 to 2013-01-04, columns A to C
df.loc['20130102':'20130104', 'A':'C']

Unnamed: 0,A,B,C
2013-01-02,-0.532468,-0.972869,0.61319
2013-01-03,0.63694,-0.703702,-0.512657
2013-01-04,1.108272,-1.729209,-1.444019


In [21]:
# Label slice for the rows, explicit list of labels for the columns
df.loc['20130102':'20130104', ['A','C']]

Unnamed: 0,A,C
2013-01-02,-0.532468,0.61319
2013-01-03,0.63694,-0.512657
2013-01-04,1.108272,-1.444019


In [22]:
# One row label plus one column label gives a single scalar
df.loc[dates[0],'A']

1.2530465933588295

In [23]:
# .at: a faster alternative to .loc when fetching a single scalar
df.at[dates[0],'A']

1.2530465933588295

## 좌표값으로 가져오기 (.iloc)

In [24]:
# Select one row by integer position
df.iloc[2]

A    0.636940
B   -0.703702
C   -0.512657
D    0.333350
Name: 2013-01-03 00:00:00, dtype: float64

In [25]:
# Select rows by a positional slice
df.iloc[3:5]  # same rows as df[3:5] / df['20130104':'20130105']

Unnamed: 0,A,B,C,D
2013-01-04,1.108272,-1.729209,-1.444019,0.022568
2013-01-05,0.110328,-1.978644,-0.539618,0.463889


In [26]:
# Positional slices on both axes: rows 3-4, columns 0-1
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,1.108272,-1.729209
2013-01-05,0.110328,-1.978644


In [27]:
# Lists of integer positions for the rows and for the columns
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,-0.532468,0.61319
2013-01-03,0.63694,-0.512657
2013-01-05,0.110328,-0.539618


In [28]:
# Positional row slice, all columns
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,-0.532468,-0.972869,0.61319,-0.700103
2013-01-03,0.63694,-0.703702,-0.512657,0.33335


In [29]:
# Two integer positions give a single scalar
df.iloc[1,1]

-0.9728690327119722

In [30]:
# .iat: a faster alternative to .iloc when fetching a single scalar
df.iat[1,1]

-0.9728690327119722

## 조건에 해당하는 값 가져오기 (불리언 인덱싱, .isin)

In [31]:
# Keep only the rows where column A is positive
df[df['A']>0]

Unnamed: 0,A,B,C,D
2013-01-01,1.253047,-0.479571,0.211165,0.114843
2013-01-03,0.63694,-0.703702,-0.512657,0.33335
2013-01-04,1.108272,-1.729209,-1.444019,0.022568
2013-01-05,0.110328,-1.978644,-0.539618,0.463889
2013-01-06,1.330928,0.215869,0.469587,0.242428


In [32]:
# Element-wise condition: entries that are not positive come back as NaN
df[df>0]

Unnamed: 0,A,B,C,D
2013-01-01,1.253047,,0.211165,0.114843
2013-01-02,,,0.61319,
2013-01-03,0.63694,,,0.33335
2013-01-04,1.108272,,,0.022568
2013-01-05,0.110328,,,0.463889
2013-01-06,1.330928,0.215869,0.469587,0.242428


In [33]:
# Build a copy of df with an extra label column E; df itself is not modified
df2 = df.assign(E=['one', 'one', 'two', 'three', 'four', 'three'])
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,1.253047,-0.479571,0.211165,0.114843,one
2013-01-02,-0.532468,-0.972869,0.61319,-0.700103,one
2013-01-03,0.63694,-0.703702,-0.512657,0.33335,two
2013-01-04,1.108272,-1.729209,-1.444019,0.022568,three
2013-01-05,0.110328,-1.978644,-0.539618,0.463889,four
2013-01-06,1.330928,0.215869,0.469587,0.242428,three


In [34]:
# Keep the rows whose E value is one of the listed labels
df2[df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.63694,-0.703702,-0.512657,0.33335,two
2013-01-05,0.110328,-1.978644,-0.539618,0.463889,four


## 값 설정하기

In [35]:
# Values 1..6 on six consecutive days starting 2013-01-02
s1 = pd.Series(
    list(range(1, 7)),
    index=pd.date_range(start='20130102', periods=6),
)
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [36]:
# Display the current contents of df
df

Unnamed: 0,A,B,C,D
2013-01-01,1.253047,-0.479571,0.211165,0.114843
2013-01-02,-0.532468,-0.972869,0.61319,-0.700103
2013-01-03,0.63694,-0.703702,-0.512657,0.33335
2013-01-04,1.108272,-1.729209,-1.444019,0.022568
2013-01-05,0.110328,-1.978644,-0.539618,0.463889
2013-01-06,1.330928,0.215869,0.469587,0.242428


In [37]:
# Add a new column from a Series; values are matched by index label, so only
# the labels that exist in df are taken and rows with no match in s1 become NaN
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,1.253047,-0.479571,0.211165,0.114843,
2013-01-02,-0.532468,-0.972869,0.61319,-0.700103,1.0
2013-01-03,0.63694,-0.703702,-0.512657,0.33335,2.0
2013-01-04,1.108272,-1.729209,-1.444019,0.022568,3.0
2013-01-05,0.110328,-1.978644,-0.539618,0.463889,4.0
2013-01-06,1.330928,0.215869,0.469587,0.242428,5.0


In [38]:
# Overwrite existing values
df.at[dates[0], 'A'] = 0  # set one cell by label
df.iat[0, 1] = 0          # set one cell by integer position
# Replace column D with an integer array. Plain column assignment swaps in the new
# array together with its integer dtype, whereas `df.loc[:, 'D'] = ...` writes into
# the existing float column on pandas >= 2.0 and would leave D as 5.0 instead of 5.
df['D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.211165,5,
2013-01-02,-0.532468,-0.972869,0.61319,5,1.0
2013-01-03,0.63694,-0.703702,-0.512657,5,2.0
2013-01-04,1.108272,-1.729209,-1.444019,5,3.0
2013-01-05,0.110328,-1.978644,-0.539618,5,4.0
2013-01-06,1.330928,0.215869,0.469587,5,5.0


In [39]:
# Work on a copy so that df itself is left unchanged
df2 = df.copy()
# Wherever a value is positive, replace it with its negation
df2[df2>0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.211165,-5,
2013-01-02,-0.532468,-0.972869,-0.61319,-5,-1.0
2013-01-03,-0.63694,-0.703702,-0.512657,-5,-2.0
2013-01-04,-1.108272,-1.729209,-1.444019,-5,-3.0
2013-01-05,-0.110328,-1.978644,-0.539618,-5,-4.0
2013-01-06,-1.330928,-0.215869,-0.469587,-5,-5.0


# 4. 결측치 처리 (Missing data)

In [40]:
# reindex changes, adds, or drops labels: keep the first four dates and add a new column E
df1 = df.reindex(index=dates[:4], columns=[*df.columns, 'E'])
# Fill E only for the first two dates, leaving the rest of E missing
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,0.211165,5,,1.0
2013-01-02,-0.532468,-0.972869,0.61319,5,1.0,1.0
2013-01-03,0.63694,-0.703702,-0.512657,5,2.0,
2013-01-04,1.108272,-1.729209,-1.444019,5,3.0,


In [41]:
# Drop every row that contains at least one missing value
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2013-01-02,-0.532468,-0.972869,0.61319,5,1.0,1.0


In [42]:
# Fill every missing value with 5
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,0.211165,5,5.0,1.0
2013-01-02,-0.532468,-0.972869,0.61319,5,1.0,1.0
2013-01-03,0.63694,-0.703702,-0.512657,5,2.0,5.0
2013-01-04,1.108272,-1.729209,-1.444019,5,3.0,5.0


In [43]:
# Fill the missing values of column F only
df1['F'].fillna(value=5)

2013-01-01    5.0
2013-01-02    1.0
2013-01-03    2.0
2013-01-04    3.0
Freq: D, Name: F, dtype: float64

In [44]:
# Boolean mask showing where values are missing
pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


# 5. 조작 (Operations)

## 데이터 프레임의 기본 통계 구하기

In [45]:
print(df.mean(axis=0))  # mean of each column
print(df.mean(axis=1))  # mean of each row

A    0.442334
B   -0.861426
C   -0.200392
D    5.000000
F    3.000000
dtype: float64
2013-01-01    1.302791
2013-01-02    1.021571
2013-01-03    1.284116
2013-01-04    1.187009
2013-01-05    1.318413
2013-01-06    2.403277
Freq: D, dtype: float64


In [46]:
# Series on the same dates index, shifted down by two positions
# (the first two entries become NaN)
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [47]:
# Print df for reference before the subtraction
print(df)
# Subtract s from every column, aligning on the row index
df.sub(s, axis='index')

                   A         B         C  D    F
2013-01-01  0.000000  0.000000  0.211165  5  NaN
2013-01-02 -0.532468 -0.972869  0.613190  5  1.0
2013-01-03  0.636940 -0.703702 -0.512657  5  2.0
2013-01-04  1.108272 -1.729209 -1.444019  5  3.0
2013-01-05  0.110328 -1.978644 -0.539618  5  4.0
2013-01-06  1.330928  0.215869  0.469587  5  5.0


Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,,,,,
2013-01-03,-0.36306,-1.703702,-1.512657,4.0,1.0
2013-01-04,-1.891728,-4.729209,-4.444019,2.0,0.0
2013-01-05,-4.889672,-6.978644,-5.539618,0.0,-1.0
2013-01-06,,,,,


## 데이터 프레임에 함수 적용하기: df.apply()

In [48]:
# Display df for reference
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.211165,5,
2013-01-02,-0.532468,-0.972869,0.61319,5,1.0
2013-01-03,0.63694,-0.703702,-0.512657,5,2.0
2013-01-04,1.108272,-1.729209,-1.444019,5,3.0
2013-01-05,0.110328,-1.978644,-0.539618,5,4.0
2013-01-06,1.330928,0.215869,0.469587,5,5.0


In [49]:
# Cumulative sum of each column of df
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.211165,5,
2013-01-02,-0.532468,-0.972869,0.824355,10,1.0
2013-01-03,0.104473,-1.676571,0.311698,15,3.0
2013-01-04,1.212745,-3.40578,-1.132321,20,6.0
2013-01-05,1.323073,-5.384424,-1.671939,25,10.0
2013-01-06,2.654001,-5.168555,-1.202352,30,15.0


In [50]:
# Apply a lambda to each column: the gap between its largest and smallest value
df.apply(lambda col: col.max() - col.min())

A    1.863396
B    2.194513
C    2.057209
D    0.000000
F    4.000000
dtype: float64

## 열의 value 개수 세기: s.value_counts()

In [51]:
# Ten random integers in the range 0..6 (the upper bound 7 is exclusive)
s = pd.Series(np.random.randint(low=0, high=7, size=10))
s

0    1
1    2
2    1
3    2
4    4
5    6
6    5
7    4
8    0
9    3
dtype: int64

In [52]:
# Count how many times each value occurs
s.value_counts()

4    2
2    2
1    2
6    1
5    1
3    1
0    1
dtype: int64

## 문자열 조작하기

In [53]:
# Sample strings in mixed case, with one missing value
s = pd.Series(
    ['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']
)
s

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [54]:
# Convert uppercase letters to lowercase
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

# 6. 합치기 (Merge)
- pd.concat()
- pd.merge()

In [55]:
df = pd.DataFrame(np.random.randn(10, 4))
print(df)
# Split the frame row-wise into three pieces, then concatenate them back together
pieces = [df.iloc[:3], df.iloc[3:7], df.iloc[7:]]
pd.concat(pieces)

          0         1         2         3
0  0.356017 -0.075286 -0.663907 -0.001398
1  0.728654 -0.020000  0.732012  0.831441
2  0.889750 -0.216586  0.038212 -1.671312
3  1.963342  0.925283 -1.623007  0.627622
4 -1.774434 -0.213286  1.079440 -0.730634
5  0.481552 -0.237820 -0.919514 -1.988382
6 -0.693658  1.211927 -0.078029  0.415056
7 -1.194455 -1.444781  1.134752 -0.463116
8 -1.131898 -0.592422 -0.646861 -0.513575
9  0.370331 -0.297225  0.622273  1.136625


Unnamed: 0,0,1,2,3
0,0.356017,-0.075286,-0.663907,-0.001398
1,0.728654,-0.02,0.732012,0.831441
2,0.88975,-0.216586,0.038212,-1.671312
3,1.963342,0.925283,-1.623007,0.627622
4,-1.774434,-0.213286,1.07944,-0.730634
5,0.481552,-0.23782,-0.919514,-1.988382
6,-0.693658,1.211927,-0.078029,0.415056
7,-1.194455,-1.444781,1.134752,-0.463116
8,-1.131898,-0.592422,-0.646861,-0.513575
9,0.370331,-0.297225,0.622273,1.136625


In [56]:
# Every key is 'foo' on both sides, so the merge pairs each left row with each right row
left = pd.DataFrame({'key': ['foo'] * 2, 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo'] * 2, 'rval': [4, 5]})
print(left, '\n')
print(right, '\n')
pd.merge(left, right, on='key')

   key  lval
0  foo     1
1  foo     2 

   key  rval
0  foo     4
1  foo     5 



Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


In [57]:
# Keys are unique on both sides, so each left row matches exactly one right row
left = pd.DataFrame(dict(key=['foo', 'bar'], lval=[1, 2]))
right = pd.DataFrame(dict(key=['foo', 'bar'], rval=[4, 5]))
print(left, '\n')
print(right, '\n')
pd.merge(left, right, on='key')

   key  lval
0  foo     1
1  bar     2 

   key  rval
0  foo     4
1  bar     5 



Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


# 7. 그룹화하기 (Grouping)
- step1: 데이터 쪼개기
- step2: 각 그룹별로 함수 적용하기
- step3: 결과 결합하기

In [58]:
# Two label columns (A, B) to group on and two random numeric columns (C, D)
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
df

Unnamed: 0,A,B,C,D
0,foo,one,1.671758,-0.094676
1,bar,one,0.318379,-1.262779
2,foo,two,-3.040889,-0.311384
3,bar,three,0.459314,-0.003056
4,foo,two,-0.022407,-0.628637
5,bar,two,-2.226172,-0.883716
6,foo,one,1.276158,-1.128245
7,foo,three,0.829766,1.558866


In [59]:
# Sum the numeric columns within each group of A. C and D are selected explicitly
# because pandas >= 2.0 no longer drops non-numeric columns silently: a bare .sum()
# would also "sum" (concatenate) the strings in column B.
df.groupby('A')[['C', 'D']].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-1.448479,-2.14955
foo,0.714385,-0.604076


In [60]:
# Group by the (A, B) pairs and sum the remaining columns
df.groupby(['A','B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.318379,-1.262779
bar,three,0.459314,-0.003056
bar,two,-2.226172,-0.883716
foo,one,2.947916,-1.222922
foo,three,0.829766,1.558866
foo,two,-3.063296,-0.94002


# 8. 재구조화하기 (Reshaping)

## df.stack()

In [67]:
# Build a two-level index by pairing the first-level labels with the second-level labels
tuples = list(zip(
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [66]:
# Random 8x2 frame labelled by `index`; df2 keeps only its first four rows
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df2 = df.iloc[:4]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.014607,0.814861
bar,two,0.881272,0.6306
baz,one,-0.085705,0.525107
baz,two,0.14265,0.462959


In [70]:
# Compress the DataFrame into a Series
stacked = df2.stack()
stacked  # the result has a MultiIndex

first  second   
bar    one     A   -0.014607
               B    0.814861
       two     A    0.881272
               B    0.630600
baz    one     A   -0.085705
               B    0.525107
       two     A    0.142650
               B    0.462959
dtype: float64

In [71]:
# Undo the stack
stacked.unstack()  # by default, the last index level is unstacked

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.014607,0.814861
bar,two,0.881272,0.6306
baz,one,-0.085705,0.525107
baz,two,0.14265,0.462959


In [72]:
# Unstack index level 1 ('second') into the columns
stacked.unstack(1)

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,-0.014607,0.881272
bar,B,0.814861,0.6306
baz,A,-0.085705,0.14265
baz,B,0.525107,0.462959


In [73]:
# Unstack index level 0 ('first') into the columns
stacked.unstack(0)

Unnamed: 0_level_0,first,bar,baz
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.014607,-0.085705
one,B,0.814861,0.525107
two,A,0.881272,0.14265
two,B,0.6306,0.462959


## pd.pivot_table()

In [74]:
# Twelve rows: three repeating label columns (A, B, C) and two random numeric columns (D, E)
df = pd.DataFrame({
    'A': ['one', 'one', 'two', 'three'] * 3,
    'B': ['A', 'B', 'C'] * 4,
    'C': (['foo'] * 3 + ['bar'] * 3) * 2,
    'D': np.random.randn(12),
    'E': np.random.randn(12),
})
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,1.541496,2.465006
1,one,B,foo,1.924193,-1.623384
2,two,C,foo,1.236278,-1.364102
3,three,A,bar,0.330156,-0.085709
4,one,B,bar,-2.017819,-0.701222
5,one,C,bar,0.098026,0.743434
6,two,A,foo,1.200625,0.291445
7,three,B,foo,1.706227,-1.839364
8,one,C,foo,0.005816,1.775501
9,one,A,bar,-0.847646,0.899179


In [75]:
# Pivot df: rows indexed by (A, B), one column per value of C, cells taken from D
pd.pivot_table(df, values='D', index=['A','B'], columns=['C'])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.847646,1.541496
one,B,-2.017819,1.924193
one,C,0.098026,0.005816
three,A,0.330156,
three,B,,1.706227
three,C,-0.834763,
two,A,,1.200625
two,B,-0.000388,
two,C,,1.236278
