# [10 Minutes to pandas — pandas 0.20.3 documentation](https://pandas.pydata.org/pandas-docs/stable/10min.html)


10분 만에 따라 할 수 있다는 판다스 튜토리얼이지만, 실제로는 1~2시간이 걸린다.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

s = pd.Series([1,3,5,np.nan, 6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [2]:
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [3]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))

In [4]:
# Build a frame from a dict with one entry per column. Each entry uses a
# different kind of value (float, Timestamp, Series, ndarray, Categorical,
# str), so every column ends up with its own dtype.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.full(4, 3, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})


In [5]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [6]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-1.171297,0.237369,0.435772,1.3702
2013-01-02,-0.529336,0.73108,0.089469,0.587627
2013-01-03,-1.500064,-0.11046,0.632819,1.243177
2013-01-04,0.123926,-1.240683,0.47247,0.483528
2013-01-05,0.298983,-1.867886,-0.656975,0.970123


In [7]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,0.123926,-1.240683,0.47247,0.483528
2013-01-05,0.298983,-1.867886,-0.656975,0.970123
2013-01-06,0.303724,0.115404,-0.621257,-0.69175


In [8]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [9]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [10]:
df.values

array([[-1.17129689,  0.2373689 ,  0.43577236,  1.37020035],
       [-0.52933621,  0.7310802 ,  0.08946879,  0.58762749],
       [-1.50006407, -0.11046016,  0.6328194 ,  1.2431769 ],
       [ 0.12392601, -1.2406834 ,  0.47246973,  0.48352823],
       [ 0.2989826 , -1.86788562, -0.65697475,  0.97012298],
       [ 0.30372355,  0.11540391, -0.62125675, -0.6917505 ]])

In [11]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.412344,-0.355863,0.058716,0.660484
std,0.784749,0.988413,0.568943,0.748901
min,-1.500064,-1.867886,-0.656975,-0.69175
25%,-1.010807,-0.958128,-0.443575,0.509553
50%,-0.202705,0.002472,0.262621,0.778875
75%,0.255218,0.206878,0.463295,1.174913
max,0.303724,0.73108,0.632819,1.3702


In [12]:
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-1.171297,-0.529336,-1.500064,0.123926,0.298983,0.303724
B,0.237369,0.73108,-0.11046,-1.240683,-1.867886,0.115404
C,0.435772,0.089469,0.632819,0.47247,-0.656975,-0.621257
D,1.3702,0.587627,1.243177,0.483528,0.970123,-0.69175


In [13]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,1.3702,0.435772,0.237369,-1.171297
2013-01-02,0.587627,0.089469,0.73108,-0.529336
2013-01-03,1.243177,0.632819,-0.11046,-1.500064
2013-01-04,0.483528,0.47247,-1.240683,0.123926
2013-01-05,0.970123,-0.656975,-1.867886,0.298983
2013-01-06,-0.69175,-0.621257,0.115404,0.303724


In [14]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-05,0.298983,-1.867886,-0.656975,0.970123
2013-01-04,0.123926,-1.240683,0.47247,0.483528
2013-01-03,-1.500064,-0.11046,0.632819,1.243177
2013-01-06,0.303724,0.115404,-0.621257,-0.69175
2013-01-01,-1.171297,0.237369,0.435772,1.3702
2013-01-02,-0.529336,0.73108,0.089469,0.587627


# Selection

* .at, .iat, .loc, .iloc, .ix (.ix는 pandas 0.20부터 deprecated 되었으므로 .loc/.iloc 사용을 권장)

## Getting

In [15]:
df[['A']]

Unnamed: 0,A
2013-01-01,-1.171297
2013-01-02,-0.529336
2013-01-03,-1.500064
2013-01-04,0.123926
2013-01-05,0.298983
2013-01-06,0.303724


In [16]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-1.171297,0.237369,0.435772,1.3702
2013-01-02,-0.529336,0.73108,0.089469,0.587627
2013-01-03,-1.500064,-0.11046,0.632819,1.243177


In [17]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.529336,0.73108,0.089469,0.587627
2013-01-03,-1.500064,-0.11046,0.632819,1.243177
2013-01-04,0.123926,-1.240683,0.47247,0.483528


## Selection by Label

In [18]:
df.loc[dates[0]]

A   -1.171297
B    0.237369
C    0.435772
D    1.370200
Name: 2013-01-01 00:00:00, dtype: float64

In [19]:
df.loc[:, ['A','B']]

Unnamed: 0,A,B
2013-01-01,-1.171297,0.237369
2013-01-02,-0.529336,0.73108
2013-01-03,-1.500064,-0.11046
2013-01-04,0.123926,-1.240683
2013-01-05,0.298983,-1.867886
2013-01-06,0.303724,0.115404


In [20]:
df.loc['20130101':'20130105', ['A','B']]

Unnamed: 0,A,B
2013-01-01,-1.171297,0.237369
2013-01-02,-0.529336,0.73108
2013-01-03,-1.500064,-0.11046
2013-01-04,0.123926,-1.240683
2013-01-05,0.298983,-1.867886


In [21]:
df.loc['20130103', ['A','B']]

A   -1.500064
B   -0.110460
Name: 2013-01-03 00:00:00, dtype: float64

In [22]:
df.loc[dates[0], 'A']

-1.1712968881457537

In [23]:
df.at[dates[0], 'A']

-1.1712968881457537

### Selection by Position

In [24]:
df.iloc[3]

A    0.123926
B   -1.240683
C    0.472470
D    0.483528
Name: 2013-01-04 00:00:00, dtype: float64

In [25]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,0.123926,-1.240683
2013-01-05,0.298983,-1.867886


In [26]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,-0.529336,0.73108,0.089469,0.587627
2013-01-03,-1.500064,-0.11046,0.632819,1.243177


In [27]:
df.iloc[1,1]

0.73108019905756416

In [28]:
df.iat[1,1]  # Same value as the iloc[1,1] lookup above; .iat is the faster accessor when fetching a single scalar.

0.73108019905756416

### Boolean Indexing

In [29]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-04,0.123926,-1.240683,0.47247,0.483528
2013-01-05,0.298983,-1.867886,-0.656975,0.970123
2013-01-06,0.303724,0.115404,-0.621257,-0.69175


In [30]:
df[df > 0] # 음수값은 NaN으로 출력 된다.

Unnamed: 0,A,B,C,D
2013-01-01,,0.237369,0.435772,1.3702
2013-01-02,,0.73108,0.089469,0.587627
2013-01-03,,,0.632819,1.243177
2013-01-04,0.123926,,0.47247,0.483528
2013-01-05,0.298983,,,0.970123
2013-01-06,0.303724,0.115404,,


In [31]:
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.171297,0.237369,0.435772,1.3702,one
2013-01-02,-0.529336,0.73108,0.089469,0.587627,one
2013-01-03,-1.500064,-0.11046,0.632819,1.243177,two
2013-01-04,0.123926,-1.240683,0.47247,0.483528,three
2013-01-05,0.298983,-1.867886,-0.656975,0.970123,four
2013-01-06,0.303724,0.115404,-0.621257,-0.69175,three


### Setting

In [32]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
print(df.shape)
s1

(6, 4)


2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [33]:
df.at[dates[0],'A'] = 0 # setting values by label

In [34]:
df.iat[0, 1] = 0 # setting values by position
df.shape

(6, 4)

In [35]:
# setting by assigning with a numpy array
df.loc[:, 'D'] = np.array([5] * len(df)) 
print(df.shape)
df

(6, 4)


Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,0.435772,5
2013-01-02,-0.529336,0.73108,0.089469,5
2013-01-03,-1.500064,-0.11046,0.632819,5
2013-01-04,0.123926,-1.240683,0.47247,5
2013-01-05,0.298983,-1.867886,-0.656975,5
2013-01-06,0.303724,0.115404,-0.621257,5


In [36]:
# A where operation with setting
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.435772,-5
2013-01-02,-0.529336,-0.73108,-0.089469,-5
2013-01-03,-1.500064,-0.11046,-0.632819,-5
2013-01-04,-0.123926,-1.240683,-0.47247,-5
2013-01-05,-0.298983,-1.867886,-0.656975,-5
2013-01-06,-0.303724,-0.115404,-0.621257,-5


# Missing Data

판다스는 주로 np.nan으로 결측(유실) 데이터를 표현한다. 기본적으로 이 값은 계산에 포함되지 않는다.
재인덱싱(reindex)을 하면 특정 축의 인덱스를 변경/추가/삭제할 수 있으며, 결과는 데이터의 복사본으로 반환된다.

In [37]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1],'E'] = 1
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,0.435772,5,1.0
2013-01-02,-0.529336,0.73108,0.089469,5,1.0
2013-01-03,-1.500064,-0.11046,0.632819,5,
2013-01-04,0.123926,-1.240683,0.47247,5,


In [38]:
# 유실 데이터가 있는 행을 드랍시킨다.
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,0.435772,5,1.0
2013-01-02,-0.529336,0.73108,0.089469,5,1.0


In [39]:
# 유실 데이터를 채워준다.
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,0.435772,5,1.0
2013-01-02,-0.529336,0.73108,0.089469,5,1.0
2013-01-03,-1.500064,-0.11046,0.632819,5,5.0
2013-01-04,0.123926,-1.240683,0.47247,5,5.0


In [40]:
# null 값 여부를 출력한다.
pd.isnull(df1)

Unnamed: 0,A,B,C,D,E
2013-01-01,False,False,False,False,False
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,True
2013-01-04,False,False,False,False,True


# Operations
* 연산과 관련 된 정보를 더 보고 싶으면 여기를 참고 [Essential Basic Functionality — pandas 0.20.3 documentation](https://pandas.pydata.org/pandas-docs/stable/basics.html#basics-binop)


In [41]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,0.435772,5
2013-01-02,-0.529336,0.73108,0.089469,5
2013-01-03,-1.500064,-0.11046,0.632819,5
2013-01-04,0.123926,-1.240683,0.47247,5
2013-01-05,0.298983,-1.867886,-0.656975,5
2013-01-06,0.303724,0.115404,-0.621257,5


In [42]:
# 연산에서는 일반적으로 유실 데이터를 제외한다.

df.mean()

A   -0.217128
B   -0.395424
C    0.058716
D    5.000000
dtype: float64

In [43]:
df.mean(1)

2013-01-01    1.358943
2013-01-02    1.322803
2013-01-03    1.005574
2013-01-04    1.088928
2013-01-05    0.693531
2013-01-06    1.199468
Freq: D, dtype: float64

In [44]:
# shift(n)을 하면 n만큼 row의 값이 밀린다.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [45]:
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D
2013-01-01,,,,
2013-01-02,,,,
2013-01-03,-2.500064,-1.11046,-0.367181,4.0
2013-01-04,-2.876074,-4.240683,-2.52753,2.0
2013-01-05,-4.701017,-6.867886,-5.656975,0.0
2013-01-06,,,,


## Apply

In [46]:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,0.435772,5
2013-01-02,-0.529336,0.73108,0.525241,10
2013-01-03,-2.0294,0.62062,1.158061,15
2013-01-04,-1.905474,-0.620063,1.63053,20
2013-01-05,-1.606492,-2.487949,0.973556,25
2013-01-06,-1.302768,-2.372545,0.352299,30


In [47]:
df.apply(lambda x: x.max() - x.min())

A    1.803788
B    2.598966
C    1.289794
D    0.000000
dtype: float64

# Histogramming

In [48]:
s = pd.Series(np.random.randint(0, 7, size=10))
s

0    2
1    2
2    0
3    5
4    2
5    1
6    5
7    3
8    5
9    6
dtype: int64

In [49]:
s.value_counts()

5    3
2    3
6    1
3    1
1    1
0    1
dtype: int64

## String Methods

* 문자열을 다룬다. 
* 정규표현식을 사용해서 패턴을 찾을 수도 있다.
* [Working with Text Data — pandas 0.20.3 documentation](https://pandas.pydata.org/pandas-docs/stable/text.html#text-string-methods) 이 링크로 문자열을 벡터로 다루는 법을 볼 수 있다.

In [50]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'baca', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

# Merge
## concat

In [51]:
# 10개의 행과 4개의 컬럼의 랜덤 숫자를 포함하는 데이터프레임을 생성한다.
df = pd.DataFrame(np.random.randn(10,4))
df

Unnamed: 0,0,1,2,3
0,1.24667,0.088893,-0.185146,-1.155954
1,0.685273,0.3345,-0.914456,1.314468
2,-0.391976,0.506705,-2.37199,-1.298524
3,0.167631,-0.255252,-0.279182,-0.076939
4,-0.386527,-0.778959,0.577352,1.608829
5,-0.314582,0.075195,-0.011158,0.336519
6,0.190238,1.842796,-0.036964,0.161802
7,-0.925128,-0.647585,1.318061,-1.342298
8,-0.026369,1.177315,0.08348,-1.090023
9,-1.029141,-1.226509,-1.062681,-1.275883


In [52]:
# 3개의 행, 3번 행부터 7번행 전까지, 7번행 이후로 조각을 나눈다.
pieces = [df[:3], df[3:7], df[7:]]
pieces

[          0         1         2         3
 0  1.246670  0.088893 -0.185146 -1.155954
 1  0.685273  0.334500 -0.914456  1.314468
 2 -0.391976  0.506705 -2.371990 -1.298524,
           0         1         2         3
 3  0.167631 -0.255252 -0.279182 -0.076939
 4 -0.386527 -0.778959  0.577352  1.608829
 5 -0.314582  0.075195 -0.011158  0.336519
 6  0.190238  1.842796 -0.036964  0.161802,
           0         1         2         3
 7 -0.925128 -0.647585  1.318061 -1.342298
 8 -0.026369  1.177315  0.083480 -1.090023
 9 -1.029141 -1.226509 -1.062681 -1.275883]

In [53]:
# 다시 조각을 합친다.
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,1.24667,0.088893,-0.185146,-1.155954
1,0.685273,0.3345,-0.914456,1.314468
2,-0.391976,0.506705,-2.37199,-1.298524
3,0.167631,-0.255252,-0.279182,-0.076939
4,-0.386527,-0.778959,0.577352,1.608829
5,-0.314582,0.075195,-0.011158,0.336519
6,0.190238,1.842796,-0.036964,0.161802
7,-0.925128,-0.647585,1.318061,-1.342298
8,-0.026369,1.177315,0.08348,-1.090023
9,-1.029141,-1.226509,-1.062681,-1.275883


## Join
* SQL스타일로 머지하기
* 데이터 베이스 스타일로 머지하는 것은 여기를 참고 [Merge, join, and concatenate — pandas 0.20.3 documentation](https://pandas.pydata.org/pandas-docs/stable/merging.html#merging-join) 

In [54]:
# Both frames repeat the same key value, so a join on 'key' has
# duplicates on each side.
left = pd.DataFrame({
    'key': ['foo'] * 2,
    'lval': [1, 2],
})
right = pd.DataFrame({
    'key': ['foo'] * 2,
    'rval': [4, 5],
})

In [55]:
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [56]:
right

Unnamed: 0,key,rval
0,foo,4
1,foo,5


In [57]:
# Every row on both sides has the key 'foo', so the merge pairs each left row
# with each right row: 2 x 2 = 4 rows, with the columns key, lval and rval.
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


In [58]:
# Same layout as before, but each key value now appears only once per frame.
left = pd.DataFrame({
    'key': ['foo', 'bar'],
    'lval': [1, 2],
})
right = pd.DataFrame({
    'key': ['foo', 'bar'],
    'rval': [4, 5],
})

In [59]:
# key, lval, rval이 컬럼이 되어 2개 행인 데이터 프레임이 생성된다.
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


## Append
* 데이터프레임에 행을 추가하기

In [60]:
# 8개의 행과 4개의 컬럼을 갖는 랜덤 숫자를 생성한다. 컬럼의 이름은 A, B, C, D로 지정한다.
df = pd.DataFrame(np.random.randn(8,4), columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,-0.48277,0.665296,0.036109,-0.400276
1,2.691549,1.291863,1.140365,0.095574
2,-0.873064,-0.366101,0.406981,1.09931
3,0.740408,0.158594,-0.019954,-0.128109
4,0.574225,-0.961075,1.169145,-0.078393
5,-0.369658,1.395994,-0.044725,0.83725
6,-1.051161,0.457346,1.12721,0.115534
7,-1.359393,-0.330404,0.887923,-0.671232


In [61]:
# index 3의 행을 가져온다.
s = df.iloc[3]
s

A    0.740408
B    0.158594
C   -0.019954
D   -0.128109
Name: 3, dtype: float64

In [62]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# Turn the row Series into a one-row frame and concatenate it instead;
# ignore_index=True renumbers the rows of the result from 0, as before.
pd.concat([df, s.to_frame().T], ignore_index=True)

Unnamed: 0,A,B,C,D
0,-0.48277,0.665296,0.036109,-0.400276
1,2.691549,1.291863,1.140365,0.095574
2,-0.873064,-0.366101,0.406981,1.09931
3,0.740408,0.158594,-0.019954,-0.128109
4,0.574225,-0.961075,1.169145,-0.078393
5,-0.369658,1.395994,-0.044725,0.83725
6,-1.051161,0.457346,1.12721,0.115534
7,-1.359393,-0.330404,0.887923,-0.671232
8,0.740408,0.158594,-0.019954,-0.128109


# Grouping
* Splitting
* Applying
* Combining

In [63]:
# Two label columns to group on (A, B) plus two columns of random
# normal values (C, D).
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
df

Unnamed: 0,A,B,C,D
0,foo,one,0.588318,-0.412007
1,bar,one,0.521847,-0.708368
2,foo,two,0.0888,-1.316128
3,bar,three,0.938517,0.246288
4,foo,two,1.682735,1.041123
5,bar,two,-0.633271,0.771563
6,foo,one,-0.591462,1.156996
7,foo,three,-0.631861,-1.329914


In [64]:
# Group the rows by the value of column A ('foo' / 'bar') and sum C and D
# within each group.
# C and D are selected explicitly: from pandas 2.0 on, sum() no longer drops
# the string column B silently and would concatenate its values instead.
df.groupby('A')[['C', 'D']].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.827093,0.309483
foo,1.136529,-0.85993


In [65]:
# Group by every (A, B) combination and sum C and D within each group;
# the result is indexed by the (A, B) pairs.
df.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.521847,-0.708368
bar,three,0.938517,0.246288
bar,two,-0.633271,0.771563
foo,one,-0.003144,0.74499
foo,three,-0.631861,-1.329914
foo,two,1.771535,-0.275005


# Reshaping

* [MultiIndex / Advanced Indexing — pandas 0.20.3 documentation](https://pandas.pydata.org/pandas-docs/stable/advanced.html#advanced-hierarchical)

* [Reshaping and Pivot Tables — pandas 0.20.3 documentation](https://pandas.pydata.org/pandas-docs/stable/reshaping.html#reshaping-stacking)

# Stack

In [66]:
# Pair each first-level label with the second-level label at the same
# position, giving a list of (first, second) tuples.
tuples = list(zip(
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
))

In [67]:
index = pd.MultiIndex.from_tuples(tuples, names = ['first', 'second'])

df = pd.DataFrame(np.random.randn(8, 2), index=index, columns = ['A', 'B'] )

df2 = df[:4]

df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.149643,-0.707112
bar,two,1.841776,1.113916
baz,one,-0.087883,0.307561
baz,two,0.112949,0.668523


In [68]:
stacked = df2.stack()
stacked

first  second   
bar    one     A    0.149643
               B   -0.707112
       two     A    1.841776
               B    1.113916
baz    one     A   -0.087883
               B    0.307561
       two     A    0.112949
               B    0.668523
dtype: float64

In [69]:
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.149643,-0.707112
bar,two,1.841776,1.113916
baz,one,-0.087883,0.307561
baz,two,0.112949,0.668523


In [70]:
stacked.unstack(1)

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,0.149643,1.841776
bar,B,-0.707112,1.113916
baz,A,-0.087883,0.112949
baz,B,0.307561,0.668523


In [71]:
stacked.unstack(0)

Unnamed: 0_level_0,first,bar,baz
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,0.149643,-0.087883
one,B,-0.707112,0.307561
two,A,1.841776,0.112949
two,B,1.113916,0.668523


# Pivot Tables

In [72]:
# Twelve rows: three categorical label columns (A, B, C) built by repeating
# short patterns, plus two columns of random normal values (D, E).
df = pd.DataFrame({
    'A': ['one', 'one', 'two', 'three'] * 3,
    'B': ['A', 'B', 'C'] * 4,
    'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
    'D': np.random.randn(12),
    'E': np.random.randn(12),
})
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,0.987488,-0.565006
1,one,B,foo,0.620155,-0.741872
2,two,C,foo,1.180848,0.646415
3,three,A,bar,0.472856,0.72226
4,one,B,bar,-0.487398,-0.813455
5,one,C,bar,0.007605,0.409667
6,two,A,foo,-2.283505,1.553218
7,three,B,foo,-1.255301,0.106101
8,one,C,foo,1.175396,-0.527508
9,one,A,bar,-0.141993,-0.262335


In [73]:
# 엑셀 피봇테이블 기능이 pd 에도 있다.
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.141993,0.987488
one,B,-0.487398,0.620155
one,C,0.007605,1.175396
three,A,0.472856,
three,B,,-1.255301
three,C,0.151362,
two,A,,-2.283505
two,B,0.709172,
two,C,,1.180848


# Time Series

판다스는 frequency conversion(빈도 변환) 중에 리샘플링 작업을 위한 간단하고, 강력하며, 효율적인 기능을 제공한다. (예를 들어 초 단위 데이터를 5분 단위 데이터로 변환하는 작업)

In [74]:
rng = pd.date_range('1/1/2012', periods=100, freq='S')
# rng
# 100 timestamps at one-second intervals: 2012-01-01 00:00:00 through 2012-01-01 00:01:39 (100 seconds, not minutes)

In [75]:
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)

ts.resample('5Min').sum()

2012-01-01    23928
Freq: 5T, dtype: int64

In [76]:
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
rng

DatetimeIndex(['2012-03-06', '2012-03-07', '2012-03-08', '2012-03-09',
               '2012-03-10'],
              dtype='datetime64[ns]', freq='D')