<a href="https://colab.research.google.com/github/hanna-joo/bigdata_edu/blob/master/python_edu/pandas_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[10 minutes to pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html)

In [None]:
import numpy as np
import pandas as pd

# 1. 변수 생성 (Object creation)

In [None]:
# Build a Series from a plain list; the np.nan entry is a missing value
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [None]:
# Create a DataFrame from a NumPy array, with a date index and labelled columns.
# Seed NumPy's global RNG first so this frame (and the later np.random cells)
# comes out the same on every Restart & Run All.
np.random.seed(0)
dates = pd.date_range('20130101', periods=6)
print(dates)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')


Unnamed: 0,A,B,C,D
2013-01-01,-0.277851,0.860671,1.088026,-1.933634
2013-01-02,1.946802,-0.62939,1.236082,-1.358554
2013-01-03,-0.946695,-0.20577,2.49612,-0.488063
2013-01-04,-1.109423,0.721189,-0.636119,0.294468
2013-01-05,-1.16639,-0.409825,0.556258,0.365038
2013-01-06,0.252645,0.106958,-0.615045,1.335791


In [None]:
# Create a DataFrame from a dict: each key becomes a column, and scalar
# values are repeated to match the length of the array-like columns
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
print(df2.dtypes)
df2

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


# 2. 데이터 확인 (Viewing data)


In [None]:
# Show the first two rows of df
df.head(2)

Unnamed: 0,A,B,C,D
2013-01-01,-0.277851,0.860671,1.088026,-1.933634
2013-01-02,1.946802,-0.62939,1.236082,-1.358554


In [None]:
# Row labels (index) of df
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [None]:
# Column labels of df
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [None]:
# A NumPy array has exactly one dtype for the whole array,
# whereas a pandas DataFrame has one dtype per column
df.to_numpy()

array([[-0.2778514 ,  0.86067105,  1.08802642, -1.93363369],
       [ 1.94680189, -0.62939043,  1.23608178, -1.35855402],
       [-0.94669542, -0.20577029,  2.4961201 , -0.48806319],
       [-1.10942325,  0.72118856, -0.63611922,  0.29446835],
       [-1.16638974, -0.4098251 ,  0.55625792,  0.36503844],
       [ 0.25264505,  0.1069578 , -0.61504474,  1.33579106]])

In [None]:
# df2 holds several dtypes, so converting it to NumPy picks one dtype that can hold them all
# DataFrame.to_numpy(): the index and column labels are not part of the result
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [None]:
# Summary statistics of df
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.216819,0.073972,0.687554,-0.297492
std,1.194699,0.607603,1.199857,1.207982
min,-1.16639,-0.62939,-0.636119,-1.933634
25%,-1.068741,-0.358811,-0.322219,-1.140931
50%,-0.612273,-0.049406,0.822142,-0.096797
75%,0.120021,0.567631,1.199068,0.347396
max,1.946802,0.860671,2.49612,1.335791


In [None]:
# Transpose df (swap rows and columns)
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.277851,1.946802,-0.946695,-1.109423,-1.16639,0.252645
B,0.860671,-0.62939,-0.20577,0.721189,-0.409825,0.106958
C,1.088026,1.236082,2.49612,-0.636119,0.556258,-0.615045
D,-1.933634,-1.358554,-0.488063,0.294468,0.365038,1.335791


In [None]:
# Sort by axis labels: axis=1 orders the columns by name, here in descending order
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-1.933634,1.088026,0.860671,-0.277851
2013-01-02,-1.358554,1.236082,-0.62939,1.946802
2013-01-03,-0.488063,2.49612,-0.20577,-0.946695
2013-01-04,0.294468,-0.636119,0.721189,-1.109423
2013-01-05,0.365038,0.556258,-0.409825,-1.16639
2013-01-06,1.335791,-0.615045,0.106958,0.252645


In [None]:
# Sort the rows by index label, in descending order
df.sort_index(ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,0.252645,0.106958,-0.615045,1.335791
2013-01-05,-1.16639,-0.409825,0.556258,0.365038
2013-01-04,-1.109423,0.721189,-0.636119,0.294468
2013-01-03,-0.946695,-0.20577,2.49612,-0.488063
2013-01-02,1.946802,-0.62939,1.236082,-1.358554
2013-01-01,-0.277851,0.860671,1.088026,-1.933634


In [None]:
# Sort the rows by the values in column B
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-02,1.946802,-0.62939,1.236082,-1.358554
2013-01-05,-1.16639,-0.409825,0.556258,0.365038
2013-01-03,-0.946695,-0.20577,2.49612,-0.488063
2013-01-06,0.252645,0.106958,-0.615045,1.335791
2013-01-04,-1.109423,0.721189,-0.636119,0.294468
2013-01-01,-0.277851,0.860671,1.088026,-1.933634


In [None]:
# Same sort on column B, in descending order
df.sort_values(by="B", ascending=False)

Unnamed: 0,A,B,C,D
2013-01-01,-0.277851,0.860671,1.088026,-1.933634
2013-01-04,-1.109423,0.721189,-0.636119,0.294468
2013-01-06,0.252645,0.106958,-0.615045,1.335791
2013-01-03,-0.946695,-0.20577,2.49612,-0.488063
2013-01-05,-1.16639,-0.409825,0.556258,0.365038
2013-01-02,1.946802,-0.62939,1.236082,-1.358554


# 3. 데이터 선택 (Selection)
- .at / .iat
- .loc / .iloc
- .isin

In [None]:
# Select a single column
df['A']  # attribute access, df.A, selects the same column

2013-01-01   -0.277851
2013-01-02    1.946802
2013-01-03   -0.946695
2013-01-04   -1.109423
2013-01-05   -1.166390
2013-01-06    0.252645
Freq: D, Name: A, dtype: float64

In [None]:
# Select rows by slicing
df[2:4]  # same rows as the label slice df['20130103':'20130104']

Unnamed: 0,A,B,C,D
2013-01-03,-0.946695,-0.20577,2.49612,-0.488063
2013-01-04,-1.109423,0.721189,-0.636119,0.294468


## 인덱스 또는 컬럼명으로 가져오기 (.loc)

In [None]:
# Select one row by its index label
df.loc['20130101']

A   -0.277851
B    0.860671
C    1.088026
D   -1.933634
Name: 2013-01-01 00:00:00, dtype: float64

In [None]:
# Select by label on both axes: all rows, columns A and B
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,-0.277851,0.860671
2013-01-02,1.946802,-0.62939
2013-01-03,-0.946695,-0.20577
2013-01-04,-1.109423,0.721189
2013-01-05,-1.16639,-0.409825
2013-01-06,0.252645,0.106958


In [None]:
# Label slices with .loc include both endpoints (rows 01-02 through 01-04, columns A through C)
df.loc['20130102':'20130104', 'A':'C']

Unnamed: 0,A,B,C
2013-01-02,1.946802,-0.62939,1.236082
2013-01-03,-0.946695,-0.20577,2.49612
2013-01-04,-1.109423,0.721189,-0.636119


In [None]:
# A row label slice combined with an explicit list of column labels
df.loc['20130102':'20130104', ['A','C']]

Unnamed: 0,A,C
2013-01-02,1.946802,1.236082
2013-01-03,-0.946695,2.49612
2013-01-04,-1.109423,-0.636119


In [None]:
# One row label plus one column label returns a single scalar
df.loc[dates[0],'A']

-0.2778513990228104

In [None]:
# .at: a faster alternative to .loc when fetching a single scalar
df.at[dates[0],'A']

-0.2778513990228104

## 좌표값으로 가져오기 (.iloc)

In [None]:
# Select the row at integer position 2 (the third row)
df.iloc[2]

A   -0.946695
B   -0.205770
C    2.496120
D   -0.488063
Name: 2013-01-03 00:00:00, dtype: float64

In [None]:
# Positional row slice; the end position (5) is excluded
df.iloc[3:5]  # same rows as df[3:5] or df['20130104':'20130105']

Unnamed: 0,A,B,C,D
2013-01-04,-1.109423,0.721189,-0.636119,0.294468
2013-01-05,-1.16639,-0.409825,0.556258,0.365038


In [None]:
# Positional slices on both axes: rows 3-4, columns 0-1
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-1.109423,0.721189
2013-01-05,-1.16639,-0.409825


In [None]:
# Lists of integer positions pick specific rows and columns
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,1.946802,1.236082
2013-01-03,-0.946695,2.49612
2013-01-05,-1.16639,0.556258


In [None]:
# Rows at positions 1-2, every column
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,1.946802,-0.62939,1.236082,-1.358554
2013-01-03,-0.946695,-0.20577,2.49612,-0.488063


In [None]:
# Scalar at row position 1, column position 1
df.iloc[1,1]

-0.6293904272806885

In [None]:
# .iat: a faster alternative to .iloc when fetching a single scalar
df.iat[1,1]

-0.6293904272806885

## 조건에 해당하는 값 가져오기 (불리언 인덱싱 / .isin)

In [None]:
# Keep only the rows where column A is positive
df[df['A']>0]

Unnamed: 0,A,B,C,D
2013-01-02,1.946802,-0.62939,1.236082,-1.358554
2013-01-06,0.252645,0.106958,-0.615045,1.335791


In [None]:
# Element-wise mask: entries that are not > 0 are shown as NaN
df[df>0]

Unnamed: 0,A,B,C,D
2013-01-01,,0.860671,1.088026,
2013-01-02,1.946802,,1.236082,
2013-01-03,,,2.49612,
2013-01-04,,0.721189,,0.294468
2013-01-05,,,0.556258,0.365038
2013-01-06,0.252645,0.106958,,1.335791


In [None]:
# Work on a copy of df and add a string column E (rebinds the name df2)
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.277851,0.860671,1.088026,-1.933634,one
2013-01-02,1.946802,-0.62939,1.236082,-1.358554,one
2013-01-03,-0.946695,-0.20577,2.49612,-0.488063,two
2013-01-04,-1.109423,0.721189,-0.636119,0.294468,three
2013-01-05,-1.16639,-0.409825,0.556258,0.365038,four
2013-01-06,0.252645,0.106958,-0.615045,1.335791,three


In [None]:
# Keep the rows whose E value is one of the listed values
df2[df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-0.946695,-0.20577,2.49612,-0.488063,two
2013-01-05,-1.16639,-0.409825,0.556258,0.365038,four


## 값 설정하기

In [None]:
# Integers 1..6 labelled with six consecutive days starting 2013-01-02
s1 = pd.Series(
    data=list(range(1, 7)),
    index=pd.date_range(start="20130102", periods=6),
)
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [None]:
# Display the current contents of df
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.277851,0.860671,1.088026,-1.933634
2013-01-02,1.946802,-0.62939,1.236082,-1.358554
2013-01-03,-0.946695,-0.20577,2.49612,-0.488063
2013-01-04,-1.109423,0.721189,-0.636119,0.294468
2013-01-05,-1.16639,-0.409825,0.556258,0.365038
2013-01-06,0.252645,0.106958,-0.615045,1.335791


In [None]:
# Add a new column: s1 is aligned on df's index, so only dates present in df receive a value
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-0.277851,0.860671,1.088026,-1.933634,
2013-01-02,1.946802,-0.62939,1.236082,-1.358554,1.0
2013-01-03,-0.946695,-0.20577,2.49612,-0.488063,2.0
2013-01-04,-1.109423,0.721189,-0.636119,0.294468,3.0
2013-01-05,-1.16639,-0.409825,0.556258,0.365038,4.0
2013-01-06,0.252645,0.106958,-0.615045,1.335791,5.0


In [None]:
# Overwrite existing values
df.at[dates[0], 'A'] = 0   # set one cell by label
df.iat[0, 1] = 0           # set one cell by integer position
# Replace column D wholesale with an integer array. Plain column assignment
# swaps in the new array, whereas df.loc[:, 'D'] = ... writes into the existing
# float64 column on pandas >= 2.0 and would leave 5.0 instead of 5.
df['D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,1.088026,5,
2013-01-02,1.946802,-0.62939,1.236082,5,1.0
2013-01-03,-0.946695,-0.20577,2.49612,5,2.0
2013-01-04,-1.109423,0.721189,-0.636119,5,3.0
2013-01-05,-1.16639,-0.409825,0.556258,5,4.0
2013-01-06,0.252645,0.106958,-0.615045,5,5.0


In [None]:
# Work on a copy of df and replace every positive entry with its negative
df2 = df.copy()
df2[df2>0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.088026,-5,
2013-01-02,-1.946802,-0.62939,-1.236082,-5,-1.0
2013-01-03,-0.946695,-0.20577,-2.49612,-5,-2.0
2013-01-04,-1.109423,-0.721189,-0.636119,-5,-3.0
2013-01-05,-1.16639,-0.409825,-0.556258,-5,-4.0
2013-01-06,-0.252645,-0.106958,-0.615045,-5,-5.0


# 4. 결측치 처리 (Missing data)

In [None]:
# reindex: change/add/drop labels on an axis (here: the first four dates, plus a new column E)
df1 = df.reindex(index=dates[0:4], columns=list(df.columns)+['E'])
# Fill E for the first two dates only; the remaining rows keep NaN
df1.loc[dates[0]:dates[1],'E'] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,1.088026,5,,1.0
2013-01-02,1.946802,-0.62939,1.236082,5,1.0,1.0
2013-01-03,-0.946695,-0.20577,2.49612,5,2.0,
2013-01-04,-1.109423,0.721189,-0.636119,5,3.0,


In [None]:
# Drop every row that contains at least one missing value
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2013-01-02,1.946802,-0.62939,1.236082,5,1.0,1.0


In [None]:
# Fill every missing value with 5
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,1.088026,5,5.0,1.0
2013-01-02,1.946802,-0.62939,1.236082,5,1.0,1.0
2013-01-03,-0.946695,-0.20577,2.49612,5,2.0,5.0
2013-01-04,-1.109423,0.721189,-0.636119,5,3.0,5.0


In [None]:
# Fill the missing values of a single column (F) with 5
df1.F.fillna(value=5)

2013-01-01    5.0
2013-01-02    1.0
2013-01-03    2.0
2013-01-04    3.0
Freq: D, Name: F, dtype: float64

In [None]:
# Boolean mask showing where values are missing
pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


# 5. 조작 (Operations)

## 데이터 프레임의 기본 통계 구하기

In [None]:
print(df.mean())  # mean of each column
print(df.mean(1)) # mean of each row (axis 1)

A   -0.170510
B   -0.069473
C    0.687554
D    5.000000
F    3.000000
dtype: float64
2013-01-01    1.522007
2013-01-02    1.710699
2013-01-03    1.668731
2013-01-04    1.395129
2013-01-05    1.596009
2013-01-06    1.948912
Freq: D, dtype: float64


In [None]:
# Series on the same dates as df, shifted down by two positions:
# the first two entries become NaN and the last two values are shifted out
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [None]:
print(df)
# Subtract s from every column, matching on the row index; rows where s is NaN become NaN
df.sub(s, axis='index')

                   A         B         C  D    F
2013-01-01  0.000000  0.000000  1.088026  5  NaN
2013-01-02  1.946802 -0.629390  1.236082  5  1.0
2013-01-03 -0.946695 -0.205770  2.496120  5  2.0
2013-01-04 -1.109423  0.721189 -0.636119  5  3.0
2013-01-05 -1.166390 -0.409825  0.556258  5  4.0
2013-01-06  0.252645  0.106958 -0.615045  5  5.0


Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,,,,,
2013-01-03,-1.946695,-1.20577,1.49612,4.0,1.0
2013-01-04,-4.109423,-2.278811,-3.636119,2.0,0.0
2013-01-05,-6.16639,-5.409825,-4.443742,0.0,-1.0
2013-01-06,,,,,


## 데이터 프레임에 함수 적용하기: df.apply()

In [None]:
# Display the current contents of df
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,1.088026,5,
2013-01-02,1.946802,-0.62939,1.236082,5,1.0
2013-01-03,-0.946695,-0.20577,2.49612,5,2.0
2013-01-04,-1.109423,0.721189,-0.636119,5,3.0
2013-01-05,-1.16639,-0.409825,0.556258,5,4.0
2013-01-06,0.252645,0.106958,-0.615045,5,5.0


In [None]:
# Cumulative sum down each column of df
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,1.088026,5,
2013-01-02,1.946802,-0.62939,2.324108,10,1.0
2013-01-03,1.000106,-0.835161,4.820228,15,3.0
2013-01-04,-0.109317,-0.113972,4.184109,20,6.0
2013-01-05,-1.275707,-0.523797,4.740367,25,10.0
2013-01-06,-1.023061,-0.416839,4.125322,30,15.0


In [None]:
# Apply a lambda to each column of df: the column's range (max minus min)
df.apply(lambda x:x.max()-x.min())

A    3.113192
B    1.350579
C    3.132239
D    0.000000
F    4.000000
dtype: float64

## 열의 value 개수 세기: s.value_counts()

In [None]:
# Ten random integers drawn from 0..6 (the upper bound 7 is exclusive)
s = pd.Series(np.random.randint(low=0, high=7, size=10))
s

0    4
1    4
2    5
3    4
4    4
5    6
6    3
7    5
8    3
9    2
dtype: int64

In [None]:
# Count how many times each distinct value occurs in s
s.value_counts()

4    4
5    2
3    2
6    1
2    1
dtype: int64

## 문자열 조작하기

In [None]:
# Sample strings in mixed case, with one missing value (np.nan)
s = pd.Series(
    ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"]
)
s

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [None]:
# Convert uppercase letters to lowercase
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

# 6. 합치기 (Merge)
- pd.concat()
- pd.merge()

In [None]:
# Random 10x4 frame used to demonstrate concatenation
df = pd.DataFrame(np.random.randn(10, 4))
print(df)
# Cut the frame into three consecutive row chunks, then join them back together
pieces = [df.iloc[:3], df.iloc[3:7], df.iloc[7:]]
pd.concat(pieces)

          0         1         2         3
0 -0.290037 -0.352691 -0.013832  0.039879
1  0.565060 -0.417012 -0.766269 -0.436266
2 -0.158984 -0.397452 -0.579108 -2.071592
3  0.890815  0.872739  0.106786 -0.855924
4  0.642836 -1.291229  0.100336  2.389512
5 -1.459743  0.124338  0.127871 -2.006707
6  0.836864  0.103717  1.138092  0.160308
7 -0.887658  0.585038  0.278150  0.918716
8 -0.791738  2.045596 -1.429301  0.624231
9  0.745012 -1.430064  2.169319 -2.001200


Unnamed: 0,0,1,2,3
0,-0.290037,-0.352691,-0.013832,0.039879
1,0.56506,-0.417012,-0.766269,-0.436266
2,-0.158984,-0.397452,-0.579108,-2.071592
3,0.890815,0.872739,0.106786,-0.855924
4,0.642836,-1.291229,0.100336,2.389512
5,-1.459743,0.124338,0.127871,-2.006707
6,0.836864,0.103717,1.138092,0.160308
7,-0.887658,0.585038,0.27815,0.918716
8,-0.791738,2.045596,-1.429301,0.624231
9,0.745012,-1.430064,2.169319,-2.0012


In [None]:
# Both frames repeat the same key, so the merge pairs every left row with every right row
left = pd.DataFrame({"key": ["foo", "foo"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo", "foo"], "rval": [4, 5]})
print(left, "\n")
print(right, "\n")
pd.merge(left, right, on="key")

   key  lval
0  foo     1
1  foo     2 

   key  rval
0  foo     4
1  foo     5 



Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


In [None]:
# With distinct keys, each left row matches exactly one right row
left = pd.DataFrame({"key": ["foo", "bar"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo", "bar"], "rval": [4, 5]})
print(left, "\n")
print(right, "\n")
pd.merge(left, right, on="key")

   key  lval
0  foo     1
1  bar     2 

   key  rval
0  foo     4
1  bar     5 



Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


# 7. 그룹화하기 (Grouping)
- step1: 데이터 쪼개기
- step2: 각 그룹별로 함수 적용하기
- step3: 결과 결합하기

In [None]:
# Sample frame for grouping: two label columns (A, B) and two random numeric columns (C, D)
df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)
df

Unnamed: 0,A,B,C,D
0,foo,one,1.153266,0.860959
1,bar,one,-0.798814,-0.46177
2,foo,two,-0.249175,2.034137
3,bar,three,-0.410769,-2.396592
4,foo,two,-0.927034,-0.429109
5,bar,two,1.321037,0.038596
6,foo,one,-1.026243,-1.627693
7,foo,three,-0.528087,-0.440169


In [None]:
# Sum the numeric columns within each group of A.
# C and D are selected explicitly: since pandas 2.0, sum() no longer silently
# drops non-numeric columns, so the string column B would otherwise be
# concatenated into the result.
df.groupby('A')[['C', 'D']].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.111454,-2.819766
foo,-1.577273,0.398125


In [None]:
# Group by the (A, B) pairs and sum each group
df.groupby(['A','B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.798814,-0.46177
bar,three,-0.410769,-2.396592
bar,two,1.321037,0.038596
foo,one,0.127023,-0.766733
foo,three,-0.528087,-0.440169
foo,two,-1.176209,1.605027


# 8. 재구조화하기 (Reshaping)

## df.stack()

In [None]:
# Build a two-level index: every first-level label paired with 'one' and 'two'
tuples = [
    (first, second)
    for first in ["bar", "baz", "foo", "qux"]
    for second in ["one", "two"]
]
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [None]:
# Random 8x2 frame on the two-level index; df2 keeps only its first four rows
df = pd.DataFrame(np.random.randn(8,2), index=index, columns=['A','B'])
df2 = df[:4]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-1.130463,0.491396
bar,two,-0.078867,0.534167
baz,one,0.327464,0.135092
baz,two,-2.32252,1.859768


In [None]:
# 데이터프레임 시리즈로 압축하기
stacked = df2.stack()
stacked  # MultiIndex 가지고 있음

first  second   
bar    one     A   -1.130463
               B    0.491396
       two     A   -0.078867
               B    0.534167
baz    one     A    0.327464
               B    0.135092
       two     A   -2.322520
               B    1.859768
dtype: float64

In [None]:
# Undo the stack
stacked.unstack()  # by default the last index level is unstacked

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-1.130463,0.491396
bar,two,-0.078867,0.534167
baz,one,0.327464,0.135092
baz,two,-2.32252,1.859768


In [None]:
# Unstack index level 1 ('second') instead: its labels become the columns
stacked.unstack(1)

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,-1.130463,-0.078867
bar,B,0.491396,0.534167
baz,A,0.327464,-2.32252
baz,B,0.135092,1.859768


In [None]:
# Unstack index level 0 ('first'): its labels become the columns
stacked.unstack(0)

Unnamed: 0_level_0,first,bar,baz
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-1.130463,0.327464
one,B,0.491396,0.135092
two,A,-0.078867,-2.32252
two,B,0.534167,1.859768


## pd.pivot_table()

In [None]:
# Sample frame for pivoting: three repeating label columns (A, B, C) and two random numeric columns (D, E)
df = pd.DataFrame(
    {
        "A": ["one", "one", "two", "three"] * 3,
        "B": ["A", "B", "C"] * 4,
        "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 2,
        "D": np.random.randn(12),
        "E": np.random.randn(12),
    }
)
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,1.451264,-0.496312
1,one,B,foo,1.310091,-0.443941
2,two,C,foo,0.74603,-0.021768
3,three,A,bar,0.008831,-0.257295
4,one,B,bar,0.159625,-0.783185
5,one,C,bar,-0.077481,0.34314
6,two,A,foo,-1.325068,1.165113
7,three,B,foo,-0.330153,-0.529142
8,one,C,foo,-1.804339,-1.157359
9,one,A,bar,0.318181,-1.403317


In [None]:
# Pivot table of D: (A, B) pairs as rows, the values of C as columns
pd.pivot_table(df, values='D', index=['A','B'], columns=['C'])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,0.318181,1.451264
one,B,0.159625,1.310091
one,C,-0.077481,-1.804339
three,A,0.008831,
three,B,,-0.330153
three,C,-1.35086,
two,A,,-1.325068
two,B,0.315542,
two,C,,0.74603


# 9. 시계열 (Time series)

In [None]:
# Build an index of 100 timestamps at one-second frequency.
# 's' is the current alias for seconds; the uppercase 'S' is deprecated since pandas 2.2.
rng = pd.date_range('1/1/2012', periods=100, freq='s')
# Give each timestamp a random integer drawn from [0, 500) — 500 itself is excluded
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts

2012-01-01 00:00:00    294
2012-01-01 00:00:01    213
2012-01-01 00:00:02    157
2012-01-01 00:00:03    128
2012-01-01 00:00:04    465
                      ... 
2012-01-01 00:01:35    455
2012-01-01 00:01:36    387
2012-01-01 00:01:37    229
2012-01-01 00:01:38     22
2012-01-01 00:01:39    233
Freq: S, Length: 100, dtype: int64

In [None]:
# Downsample the per-second series into 5-minute bins and sum each bin
ts.resample('5Min').sum()

2012-01-01    24909
Freq: 5T, dtype: int64