# [10 Minutes to pandas — pandas 0.20.3 documentation](https://pandas.pydata.org/pandas-docs/stable/10min.html)


10분간 따라 할 수 있는 판다스 튜토리얼이지만 실제로는 1~2시간이 걸린다.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

s = pd.Series([1,3,5,np.nan, 6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [2]:
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [3]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))

In [4]:
df2 = pd.DataFrame({ 'A' : 1.,
                'B' : pd.Timestamp('20130102'),
                'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                'D' : np.array([3] * 4,dtype='int32'),
                'E' : pd.Categorical(["test","train","test","train"]),
                'F' : 'foo' })


In [5]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [6]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.831346,1.131317,-0.644623,-1.083904
2013-01-02,0.023321,1.31056,-0.91367,-1.162896
2013-01-03,0.139391,0.576612,-1.071154,1.358014
2013-01-04,-0.898553,1.285884,0.224282,-0.226396
2013-01-05,-1.911396,-2.004834,-1.703097,-0.361429


In [7]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-0.898553,1.285884,0.224282,-0.226396
2013-01-05,-1.911396,-2.004834,-1.703097,-0.361429
2013-01-06,0.321052,1.161307,0.773849,-1.011588


In [8]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [9]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [10]:
df.values

array([[-0.83134633,  1.13131703, -0.64462337, -1.08390375],
       [ 0.02332144,  1.31056011, -0.91367028, -1.16289628],
       [ 0.13939097,  0.57661214, -1.07115435,  1.35801356],
       [-0.8985531 ,  1.28588421,  0.22428151, -0.22639615],
       [-1.91139624, -2.00483385, -1.70309656, -0.36142893],
       [ 0.32105217,  1.16130725,  0.77384882, -1.01158814]])

In [11]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.526255,0.576808,-0.555736,-0.4147
std,0.8501,1.292685,0.904913,0.953385
min,-1.911396,-2.004834,-1.703097,-1.162896
25%,-0.881751,0.715288,-1.031783,-1.065825
50%,-0.404012,1.146312,-0.779147,-0.686509
75%,0.110374,1.25474,0.007055,-0.260154
max,0.321052,1.31056,0.773849,1.358014


In [12]:
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-0.831346,0.023321,0.139391,-0.898553,-1.911396,0.321052
B,1.131317,1.31056,0.576612,1.285884,-2.004834,1.161307
C,-0.644623,-0.91367,-1.071154,0.224282,-1.703097,0.773849
D,-1.083904,-1.162896,1.358014,-0.226396,-0.361429,-1.011588


In [13]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-1.083904,-0.644623,1.131317,-0.831346
2013-01-02,-1.162896,-0.91367,1.31056,0.023321
2013-01-03,1.358014,-1.071154,0.576612,0.139391
2013-01-04,-0.226396,0.224282,1.285884,-0.898553
2013-01-05,-0.361429,-1.703097,-2.004834,-1.911396
2013-01-06,-1.011588,0.773849,1.161307,0.321052


In [14]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-05,-1.911396,-2.004834,-1.703097,-0.361429
2013-01-03,0.139391,0.576612,-1.071154,1.358014
2013-01-01,-0.831346,1.131317,-0.644623,-1.083904
2013-01-06,0.321052,1.161307,0.773849,-1.011588
2013-01-04,-0.898553,1.285884,0.224282,-0.226396
2013-01-02,0.023321,1.31056,-0.91367,-1.162896


# Selection

* .at, .iat, .loc, .iloc, .ix (.ix는 pandas 0.20.0부터 deprecated 되었으므로 .loc / .iloc 사용을 권장)

## Getting

In [15]:
df[['A']]

Unnamed: 0,A
2013-01-01,-0.831346
2013-01-02,0.023321
2013-01-03,0.139391
2013-01-04,-0.898553
2013-01-05,-1.911396
2013-01-06,0.321052


In [16]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.831346,1.131317,-0.644623,-1.083904
2013-01-02,0.023321,1.31056,-0.91367,-1.162896
2013-01-03,0.139391,0.576612,-1.071154,1.358014


In [17]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,0.023321,1.31056,-0.91367,-1.162896
2013-01-03,0.139391,0.576612,-1.071154,1.358014
2013-01-04,-0.898553,1.285884,0.224282,-0.226396


## Selection by Label

In [18]:
df.loc[dates[0]]

A   -0.831346
B    1.131317
C   -0.644623
D   -1.083904
Name: 2013-01-01 00:00:00, dtype: float64

In [19]:
df.loc[:, ['A','B']]

Unnamed: 0,A,B
2013-01-01,-0.831346,1.131317
2013-01-02,0.023321,1.31056
2013-01-03,0.139391,0.576612
2013-01-04,-0.898553,1.285884
2013-01-05,-1.911396,-2.004834
2013-01-06,0.321052,1.161307


In [20]:
df.loc['20130101':'20130105', ['A','B']]

Unnamed: 0,A,B
2013-01-01,-0.831346,1.131317
2013-01-02,0.023321,1.31056
2013-01-03,0.139391,0.576612
2013-01-04,-0.898553,1.285884
2013-01-05,-1.911396,-2.004834


In [21]:
df.loc['20130103', ['A','B']]

A    0.139391
B    0.576612
Name: 2013-01-03 00:00:00, dtype: float64

In [22]:
df.loc[dates[0], 'A']

-0.83134632621585325

In [23]:
df.at[dates[0], 'A']

-0.83134632621585325

### Selection by Position

In [24]:
df.iloc[3]

A   -0.898553
B    1.285884
C    0.224282
D   -0.226396
Name: 2013-01-04 00:00:00, dtype: float64

In [25]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-0.898553,1.285884
2013-01-05,-1.911396,-2.004834


In [26]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,0.023321,1.31056,-0.91367,-1.162896
2013-01-03,0.139391,0.576612,-1.071154,1.358014


In [27]:
df.iloc[1,1]

1.3105601097318695

In [28]:
df.iat[1,1]  # 위 iloc와 같은 결과를 보여준다. 스칼라 값 하나에 접근할 때는 iat이 더 빠르다.

1.3105601097318695

### Boolean Indexing

In [29]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-02,0.023321,1.31056,-0.91367,-1.162896
2013-01-03,0.139391,0.576612,-1.071154,1.358014
2013-01-06,0.321052,1.161307,0.773849,-1.011588


In [30]:
df[df > 0] # 0보다 크지 않은 값(음수와 0)은 NaN으로 출력된다.

Unnamed: 0,A,B,C,D
2013-01-01,,1.131317,,
2013-01-02,0.023321,1.31056,,
2013-01-03,0.139391,0.576612,,1.358014
2013-01-04,,1.285884,0.224282,
2013-01-05,,,,
2013-01-06,0.321052,1.161307,0.773849,


In [31]:
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.831346,1.131317,-0.644623,-1.083904,one
2013-01-02,0.023321,1.31056,-0.91367,-1.162896,one
2013-01-03,0.139391,0.576612,-1.071154,1.358014,two
2013-01-04,-0.898553,1.285884,0.224282,-0.226396,three
2013-01-05,-1.911396,-2.004834,-1.703097,-0.361429,four
2013-01-06,0.321052,1.161307,0.773849,-1.011588,three


### Setting

In [32]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
print(df.shape)
s1

(6, 4)


2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [33]:
df.at[dates[0],'A'] = 0 # setting values by label

In [34]:
df.iat[0, 1] = 0 # setting values by position
df.shape

(6, 4)

In [35]:
# setting by assigning with a numpy array
df.loc[:, 'D'] = np.array([5] * len(df)) 
print(df.shape)
df

(6, 4)


Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.644623,5
2013-01-02,0.023321,1.31056,-0.91367,5
2013-01-03,0.139391,0.576612,-1.071154,5
2013-01-04,-0.898553,1.285884,0.224282,5
2013-01-05,-1.911396,-2.004834,-1.703097,5
2013-01-06,0.321052,1.161307,0.773849,5


In [36]:
# A where operation with setting
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.644623,-5
2013-01-02,-0.023321,-1.31056,-0.91367,-5
2013-01-03,-0.139391,-0.576612,-1.071154,-5
2013-01-04,-0.898553,-1.285884,-0.224282,-5
2013-01-05,-1.911396,-2.004834,-1.703097,-5
2013-01-06,-0.321052,-1.161307,-0.773849,-5


# Missing Data

판다스는 유실된 데이터를 주로 np.nan으로 표현한다. 이 값은 기본적으로 계산에 포함되지 않는다.
재인덱싱(reindex)은 지정한 축의 인덱스를 변경/추가/삭제할 수 있으며, 원본을 수정하지 않고 복사본을 반환한다.

In [37]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1],'E'] = 1
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,-0.644623,5,1.0
2013-01-02,0.023321,1.31056,-0.91367,5,1.0
2013-01-03,0.139391,0.576612,-1.071154,5,
2013-01-04,-0.898553,1.285884,0.224282,5,


In [38]:
# 유실 데이터가 있는 행을 드랍시킨다.
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,-0.644623,5,1.0
2013-01-02,0.023321,1.31056,-0.91367,5,1.0


In [39]:
# 유실 데이터를 채워준다.
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,-0.644623,5,1.0
2013-01-02,0.023321,1.31056,-0.91367,5,1.0
2013-01-03,0.139391,0.576612,-1.071154,5,5.0
2013-01-04,-0.898553,1.285884,0.224282,5,5.0


In [40]:
# null 값 여부를 출력한다.
pd.isnull(df1)

Unnamed: 0,A,B,C,D,E
2013-01-01,False,False,False,False,False
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,True
2013-01-04,False,False,False,False,True


# Operations
* 연산과 관련된 정보를 더 보고 싶으면 여기를 참고: [Essential Basic Functionality — pandas 0.20.3 documentation](https://pandas.pydata.org/pandas-docs/stable/basics.html#basics-binop)


In [41]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.644623,5
2013-01-02,0.023321,1.31056,-0.91367,5
2013-01-03,0.139391,0.576612,-1.071154,5
2013-01-04,-0.898553,1.285884,0.224282,5
2013-01-05,-1.911396,-2.004834,-1.703097,5
2013-01-06,0.321052,1.161307,0.773849,5


In [42]:
# 연산에서는 일반적으로 유실 데이터를 제외한다.

df.mean()

A   -0.387697
B    0.388255
C   -0.555736
D    5.000000
dtype: float64

In [43]:
df.mean(1)

2013-01-01    1.088844
2013-01-02    1.355053
2013-01-03    1.161212
2013-01-04    1.402903
2013-01-05   -0.154832
2013-01-06    1.814052
Freq: D, dtype: float64

In [44]:
# shift(n)을 하면 값이 n행만큼 아래로 밀리고, 비게 된 앞쪽 n개 행은 NaN으로 채워진다.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [45]:
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D
2013-01-01,,,,
2013-01-02,,,,
2013-01-03,-0.860609,-0.423388,-2.071154,4.0
2013-01-04,-3.898553,-1.714116,-2.775718,2.0
2013-01-05,-6.911396,-7.004834,-6.703097,0.0
2013-01-06,,,,


## Apply

In [46]:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.644623,5
2013-01-02,0.023321,1.31056,-1.558294,10
2013-01-03,0.162712,1.887172,-2.629448,15
2013-01-04,-0.735841,3.173056,-2.405166,20
2013-01-05,-2.647237,1.168223,-4.108263,25
2013-01-06,-2.326185,2.32953,-3.334414,30


In [47]:
df.apply(lambda x: x.max() - x.min())

A    2.232448
B    3.315394
C    2.476945
D    0.000000
dtype: float64

## Histogramming

In [48]:
s = pd.Series(np.random.randint(0, 7, size=10))
s

0    6
1    0
2    2
3    4
4    2
5    3
6    0
7    4
8    1
9    0
dtype: int64

In [49]:
s.value_counts()

0    3
4    2
2    2
6    1
3    1
1    1
dtype: int64

## String Methods

* 문자열을 다룬다. 
* 정규표현식을 사용해서 패턴을 찾을 수도 있다.
* [Working with Text Data — pandas 0.20.3 documentation](https://pandas.pydata.org/pandas-docs/stable/text.html#text-string-methods) 이 링크에서 벡터화된 문자열 메서드(.str)로 문자열을 다루는 법을 볼 수 있다.

In [50]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'baca', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

# Merge
## concat

In [53]:
# 10개의 행과 4개의 컬럼의 랜덤 숫자를 포함하는 데이터프레임을 생성한다.
df = pd.DataFrame(np.random.randn(10,4))
df

Unnamed: 0,0,1,2,3
0,-0.154392,-0.892224,-1.709195,-0.073984
1,-0.58767,1.278156,0.819199,-1.273558
2,-2.09053,-0.194598,1.058697,-1.211327
3,1.431896,0.991882,1.460855,0.395421
4,0.37447,0.062973,-0.310915,-0.743565
5,0.084787,-0.2755,3.030355,0.727613
6,0.059163,-0.971908,0.317296,1.055281
7,1.532477,1.123355,-0.013156,-0.824159
8,1.033985,0.101129,0.131801,0.331895
9,-1.870278,0.950404,-1.358001,1.703264


In [57]:
# 처음 3개 행(0~2번), 3번 행부터 6번 행까지, 7번 행부터 끝까지 세 조각으로 나눈다.
pieces = [df[:3], df[3:7], df[7:]]
pieces

[          0         1         2         3
 0 -0.154392 -0.892224 -1.709195 -0.073984
 1 -0.587670  1.278156  0.819199 -1.273558
 2 -2.090530 -0.194598  1.058697 -1.211327,
           0         1         2         3
 3  1.431896  0.991882  1.460855  0.395421
 4  0.374470  0.062973 -0.310915 -0.743565
 5  0.084787 -0.275500  3.030355  0.727613
 6  0.059163 -0.971908  0.317296  1.055281,
           0         1         2         3
 7  1.532477  1.123355 -0.013156 -0.824159
 8  1.033985  0.101129  0.131801  0.331895
 9 -1.870278  0.950404 -1.358001  1.703264]

In [59]:
# 다시 조각을 합친다.
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,-0.154392,-0.892224,-1.709195,-0.073984
1,-0.58767,1.278156,0.819199,-1.273558
2,-2.09053,-0.194598,1.058697,-1.211327
3,1.431896,0.991882,1.460855,0.395421
4,0.37447,0.062973,-0.310915,-0.743565
5,0.084787,-0.2755,3.030355,0.727613
6,0.059163,-0.971908,0.317296,1.055281
7,1.532477,1.123355,-0.013156,-0.824159
8,1.033985,0.101129,0.131801,0.331895
9,-1.870278,0.950404,-1.358001,1.703264


## Join
* SQL스타일로 머지하기
* 데이터 베이스 스타일로 머지하는 것은 여기를 참고 [Merge, join, and concatenate — pandas 0.20.3 documentation](https://pandas.pydata.org/pandas-docs/stable/merging.html#merging-join) 

In [73]:
left = pd.DataFrame({'key' : ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key' : ['foo', 'foo'], 'rval': [4, 5]})

In [74]:
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [75]:
right

Unnamed: 0,key,rval
0,foo,4
1,foo,5


In [76]:
# 키가 모두 foo 인 4행 3열(key, lval, rval)의 데이터프레임이 생성된다. (왼쪽 2행 x 오른쪽 2행의 모든 조합)
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


In [77]:
left = pd.DataFrame({'key' : ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key' : ['foo', 'bar'], 'rval': [4, 5]})

In [78]:
# key, lval, rval이 컬럼이 되어 2개 행인 데이터 프레임이 생성된다.
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


## Append