# [10 Minutes to pandas — pandas 0.20.3 documentation](https://pandas.pydata.org/pandas-docs/stable/10min.html)


10분간 따라 할 수 있는 판다스 튜토리얼이지만 실제로는 1~2시간이 걸린다.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

s = pd.Series([1,3,5,np.nan, 6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [2]:
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [3]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))

In [4]:
df2 = pd.DataFrame({ 'A' : 1.,
                'B' : pd.Timestamp('20130102'),
                'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                'D' : np.array([3] * 4,dtype='int32'),
                'E' : pd.Categorical(["test","train","test","train"]),
                'F' : 'foo' })


In [5]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [6]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,0.34656,-0.166006,0.949853,-0.898606
2013-01-02,3.240153,2.308688,-0.13555,1.315562
2013-01-03,-0.420369,-0.435625,-0.399588,-1.177538
2013-01-04,-0.263811,1.501158,-0.935076,0.535883
2013-01-05,-0.133102,1.470813,-0.040805,-1.364109


In [7]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-0.263811,1.501158,-0.935076,0.535883
2013-01-05,-0.133102,1.470813,-0.040805,-1.364109
2013-01-06,1.339214,1.625696,-0.877307,0.938339


In [8]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [9]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [10]:
df.values

array([[ 0.34655985, -0.1660057 ,  0.94985306, -0.89860617],
       [ 3.24015311,  2.3086885 , -0.13555004,  1.31556221],
       [-0.42036908, -0.43562459, -0.39958826, -1.1775376 ],
       [-0.2638105 ,  1.50115836, -0.93507611,  0.53588267],
       [-0.13310195,  1.47081282, -0.04080516, -1.36410875],
       [ 1.33921383,  1.62569644, -0.87730736,  0.93833938]])

In [11]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.684774,1.050788,-0.239746,-0.108411
std,1.404473,1.09381,0.690087,1.173262
min,-0.420369,-0.435625,-0.935076,-1.364109
25%,-0.231133,0.243199,-0.757878,-1.107805
50%,0.106729,1.485986,-0.267569,-0.181362
75%,1.09105,1.594562,-0.064491,0.837725
max,3.240153,2.308688,0.949853,1.315562


In [12]:
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,0.34656,3.240153,-0.420369,-0.263811,-0.133102,1.339214
B,-0.166006,2.308688,-0.435625,1.501158,1.470813,1.625696
C,0.949853,-0.13555,-0.399588,-0.935076,-0.040805,-0.877307
D,-0.898606,1.315562,-1.177538,0.535883,-1.364109,0.938339


In [13]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.898606,0.949853,-0.166006,0.34656
2013-01-02,1.315562,-0.13555,2.308688,3.240153
2013-01-03,-1.177538,-0.399588,-0.435625,-0.420369
2013-01-04,0.535883,-0.935076,1.501158,-0.263811
2013-01-05,-1.364109,-0.040805,1.470813,-0.133102
2013-01-06,0.938339,-0.877307,1.625696,1.339214


In [14]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-03,-0.420369,-0.435625,-0.399588,-1.177538
2013-01-01,0.34656,-0.166006,0.949853,-0.898606
2013-01-05,-0.133102,1.470813,-0.040805,-1.364109
2013-01-04,-0.263811,1.501158,-0.935076,0.535883
2013-01-06,1.339214,1.625696,-0.877307,0.938339
2013-01-02,3.240153,2.308688,-0.13555,1.315562


# Selection

* .at, .iat, .loc, .iloc, .ix (.ix는 pandas 0.20부터 deprecated 되었으므로 .loc/.iloc 사용을 권장)

## Getting

In [15]:
df[['A']]

Unnamed: 0,A
2013-01-01,-0.869409
2013-01-02,0.284539
2013-01-03,0.444197
2013-01-04,-2.143197
2013-01-05,-0.777732
2013-01-06,0.413829


In [16]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.869409,1.905556,-0.484491,-0.08267
2013-01-02,0.284539,-1.414238,0.640839,-1.576282
2013-01-03,0.444197,1.420273,0.371024,-0.208128


In [17]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,0.284539,-1.414238,0.640839,-1.576282
2013-01-03,0.444197,1.420273,0.371024,-0.208128
2013-01-04,-2.143197,-0.665095,1.156739,0.706142


## Selection by Label

In [18]:
df.loc[dates[0]]

A   -0.869409
B    1.905556
C   -0.484491
D   -0.082670
Name: 2013-01-01 00:00:00, dtype: float64

In [19]:
df.loc[:, ['A','B']]

Unnamed: 0,A,B
2013-01-01,-0.869409,1.905556
2013-01-02,0.284539,-1.414238
2013-01-03,0.444197,1.420273
2013-01-04,-2.143197,-0.665095
2013-01-05,-0.777732,-0.989496
2013-01-06,0.413829,0.916455


In [20]:
df.loc['20130101':'20130105', ['A','B']]

Unnamed: 0,A,B
2013-01-01,-0.869409,1.905556
2013-01-02,0.284539,-1.414238
2013-01-03,0.444197,1.420273
2013-01-04,-2.143197,-0.665095
2013-01-05,-0.777732,-0.989496


In [21]:
df.loc['20130103', ['A','B']]

A    0.444197
B    1.420273
Name: 2013-01-03 00:00:00, dtype: float64

In [22]:
df.loc[dates[0], 'A']

-0.86940880971663104

In [23]:
df.at[dates[0], 'A']

-0.86940880971663104

## Selection by Position

In [24]:
df.iloc[3]

A   -2.143197
B   -0.665095
C    1.156739
D    0.706142
Name: 2013-01-04 00:00:00, dtype: float64

In [25]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-2.143197,-0.665095
2013-01-05,-0.777732,-0.989496


In [26]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,0.284539,-1.414238,0.640839,-1.576282
2013-01-03,0.444197,1.420273,0.371024,-0.208128


In [27]:
df.iloc[1,1]

-1.4142384203519189

In [28]:
df.iat[1,1]  # 위 iloc와 같은 결과를 보여준다. 스칼라 값 하나에 접근할 때는 iat이 더 빠르다.

-1.4142384203519189

## Boolean Indexing

In [29]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-02,0.284539,-1.414238,0.640839,-1.576282
2013-01-03,0.444197,1.420273,0.371024,-0.208128
2013-01-06,0.413829,0.916455,-0.148217,0.453023


In [30]:
df[df > 0] # 0보다 크지 않은 값(0 이하)은 NaN으로 출력된다.

Unnamed: 0,A,B,C,D
2013-01-01,,1.905556,,
2013-01-02,0.284539,,0.640839,
2013-01-03,0.444197,1.420273,0.371024,
2013-01-04,,,1.156739,0.706142
2013-01-05,,,,
2013-01-06,0.413829,0.916455,,0.453023


In [31]:
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.869409,1.905556,-0.484491,-0.08267,one
2013-01-02,0.284539,-1.414238,0.640839,-1.576282,one
2013-01-03,0.444197,1.420273,0.371024,-0.208128,two
2013-01-04,-2.143197,-0.665095,1.156739,0.706142,three
2013-01-05,-0.777732,-0.989496,-1.568378,-0.134177,four
2013-01-06,0.413829,0.916455,-0.148217,0.453023,three


## Setting

In [32]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
print(df.shape)
s1

(6, 4)


2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [33]:
df.at[dates[0],'A'] = 0 # setting values by label

In [34]:
df.iat[0, 1] = 0 # setting values by position
df.shape

(6, 4)

In [35]:
# setting by assigning with a numpy array
df.loc[:, 'D'] = np.array([5] * len(df)) 
print(df.shape)
df

(6, 4)


Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.484491,5
2013-01-02,0.284539,-1.414238,0.640839,5
2013-01-03,0.444197,1.420273,0.371024,5
2013-01-04,-2.143197,-0.665095,1.156739,5
2013-01-05,-0.777732,-0.989496,-1.568378,5
2013-01-06,0.413829,0.916455,-0.148217,5


In [36]:
# A where operation with setting
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.484491,-5
2013-01-02,-0.284539,-1.414238,-0.640839,-5
2013-01-03,-0.444197,-1.420273,-0.371024,-5
2013-01-04,-2.143197,-0.665095,-1.156739,-5
2013-01-05,-0.777732,-0.989496,-1.568378,-5
2013-01-06,-0.413829,-0.916455,-0.148217,-5


# Missing Data

판다스는 유실된 데이터를 주로 np.nan으로 표현한다. 이 값은 기본적으로 계산에 포함되지 않는다.
재인덱싱(reindex)을 하면 특정 축의 인덱스를 변경/추가/삭제할 수 있으며, 결과로 데이터의 복사본이 반환된다.

In [37]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1],'E'] = 1
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,-0.484491,5,1.0
2013-01-02,0.284539,-1.414238,0.640839,5,1.0
2013-01-03,0.444197,1.420273,0.371024,5,
2013-01-04,-2.143197,-0.665095,1.156739,5,


In [38]:
# 유실 데이터가 있는 행을 드랍시킨다.
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,-0.484491,5,1.0
2013-01-02,0.284539,-1.414238,0.640839,5,1.0


In [39]:
# 유실 데이터를 채워준다.
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,-0.484491,5,1.0
2013-01-02,0.284539,-1.414238,0.640839,5,1.0
2013-01-03,0.444197,1.420273,0.371024,5,5.0
2013-01-04,-2.143197,-0.665095,1.156739,5,5.0


In [40]:
# null 값 여부를 출력한다.
pd.isnull(df1)

Unnamed: 0,A,B,C,D,E
2013-01-01,False,False,False,False,False
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,True
2013-01-04,False,False,False,False,True


# Operations
* 연산과 관련된 정보를 더 보고 싶으면 여기를 참고: [Essential Basic Functionality — pandas 0.20.3 documentation](https://pandas.pydata.org/pandas-docs/stable/basics.html#basics-binop)


In [41]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.484491,5
2013-01-02,0.284539,-1.414238,0.640839,5
2013-01-03,0.444197,1.420273,0.371024,5
2013-01-04,-2.143197,-0.665095,1.156739,5
2013-01-05,-0.777732,-0.989496,-1.568378,5
2013-01-06,0.413829,0.916455,-0.148217,5


In [42]:
# 연산에서는 일반적으로 유실 데이터를 제외한다.

df.mean()

A   -0.296394
B   -0.122017
C   -0.005414
D    5.000000
dtype: float64

In [43]:
df.mean(1)

2013-01-01    1.128877
2013-01-02    1.127785
2013-01-03    1.808874
2013-01-04    0.837112
2013-01-05    0.416098
2013-01-06    1.545517
Freq: D, dtype: float64

In [44]:
# shift(n)을 하면 n만큼 row의 값이 밀린다.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [45]:
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D
2013-01-01,,,,
2013-01-02,,,,
2013-01-03,-0.555803,0.420273,-0.628976,4.0
2013-01-04,-5.143197,-3.665095,-1.843261,2.0
2013-01-05,-5.777732,-5.989496,-6.568378,0.0
2013-01-06,,,,


## Apply

In [46]:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.484491,5
2013-01-02,0.284539,-1.414238,0.156349,10
2013-01-03,0.728736,0.006035,0.527372,15
2013-01-04,-1.414461,-0.65906,1.684111,20
2013-01-05,-2.192193,-1.648556,0.115733,25
2013-01-06,-1.778363,-0.732101,-0.032485,30


In [47]:
df.apply(lambda x: x.max() - x.min())

A    2.587394
B    2.834512
C    2.725117
D    0.000000
dtype: float64

## Histogramming

In [48]:
s = pd.Series(np.random.randint(0, 7, size=10))
s

0    3
1    4
2    5
3    2
4    0
5    4
6    0
7    5
8    2
9    6
dtype: int64

In [49]:
s.value_counts()

5    2
4    2
2    2
0    2
6    1
3    1
dtype: int64

## String Methods

* 문자열을 다룬다. 
* 정규표현식을 사용해서 패턴을 찾을 수도 있다.
* 문자열을 벡터화된 방식으로 다루는 방법은 이 링크를 참고: [Working with Text Data — pandas 0.20.3 documentation](https://pandas.pydata.org/pandas-docs/stable/text.html#text-string-methods)

In [50]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'baca', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

# Merge
## concat

In [51]:
# 10개의 행과 4개의 컬럼의 랜덤 숫자를 포함하는 데이터프레임을 생성한다.
df = pd.DataFrame(np.random.randn(10,4))
df

Unnamed: 0,0,1,2,3
0,-0.611347,-0.420618,0.026158,0.453153
1,1.38753,-0.794787,0.283406,0.452612
2,-1.652419,0.826979,-0.461445,-0.155285
3,-0.225156,0.877108,0.501588,0.775555
4,0.505388,0.428207,-0.692463,-1.055663
5,-0.027698,-0.277274,0.84589,0.165745
6,-0.245062,0.660141,0.504209,-0.226992
7,-0.404829,-0.489419,-0.20147,0.788845
8,-0.421549,-0.055822,-0.321679,-0.210199
9,0.431284,0.268595,2.113373,0.824637


In [52]:
# 처음 3개 행(0~2번), 3번 행부터 6번 행까지, 7번 행부터 끝까지의 세 조각으로 나눈다.
pieces = [df[:3], df[3:7], df[7:]]
pieces

[          0         1         2         3
 0 -0.611347 -0.420618  0.026158  0.453153
 1  1.387530 -0.794787  0.283406  0.452612
 2 -1.652419  0.826979 -0.461445 -0.155285,
           0         1         2         3
 3 -0.225156  0.877108  0.501588  0.775555
 4  0.505388  0.428207 -0.692463 -1.055663
 5 -0.027698 -0.277274  0.845890  0.165745
 6 -0.245062  0.660141  0.504209 -0.226992,
           0         1         2         3
 7 -0.404829 -0.489419 -0.201470  0.788845
 8 -0.421549 -0.055822 -0.321679 -0.210199
 9  0.431284  0.268595  2.113373  0.824637]

In [53]:
# 다시 조각을 합친다.
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,-0.611347,-0.420618,0.026158,0.453153
1,1.38753,-0.794787,0.283406,0.452612
2,-1.652419,0.826979,-0.461445,-0.155285
3,-0.225156,0.877108,0.501588,0.775555
4,0.505388,0.428207,-0.692463,-1.055663
5,-0.027698,-0.277274,0.84589,0.165745
6,-0.245062,0.660141,0.504209,-0.226992
7,-0.404829,-0.489419,-0.20147,0.788845
8,-0.421549,-0.055822,-0.321679,-0.210199
9,0.431284,0.268595,2.113373,0.824637


## Join
* SQL스타일로 머지하기
* 데이터베이스 스타일로 머지하는 방법은 여기를 참고: [Merge, join, and concatenate — pandas 0.20.3 documentation](https://pandas.pydata.org/pandas-docs/stable/merging.html#merging-join)

In [54]:
left = pd.DataFrame({'key' : ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key' : ['foo', 'foo'], 'rval': [4, 5]})

In [55]:
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [56]:
right

Unnamed: 0,key,rval
0,foo,4
1,foo,5


In [57]:
# 키가 모두 foo 인 4행 3열(key, lval, rval)의 데이터프레임이 생성된다. (왼쪽 2행 x 오른쪽 2행의 모든 조합)
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


In [58]:
left = pd.DataFrame({'key' : ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key' : ['foo', 'bar'], 'rval': [4, 5]})

In [59]:
# key, lval, rval이 컬럼이 되어 2개 행인 데이터 프레임이 생성된다.
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


## Append
* 데이터프레임에 행을 추가하기 (DataFrame.append는 pandas 2.0에서 제거되었으므로, 최신 버전에서는 pd.concat을 사용한다)

In [68]:
# 8개의 행과 4개의 컬럼을 갖는 랜덤 숫자를 생성한다. 컬럼의 이름은 A, B, C, D로 지정한다.
df = pd.DataFrame(np.random.randn(8,4), columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,-0.532992,0.655026,-0.324259,0.305255
1,-0.799331,-0.287096,-1.91358,0.09346
2,-1.915304,0.548429,-0.502718,1.235289
3,0.590066,-1.199935,0.945414,-0.029748
4,1.473446,-0.070274,-0.099457,0.364295
5,-0.181564,-1.04441,-1.544251,1.030215
6,0.441197,0.250389,-0.544008,0.358661
7,-0.167879,0.977562,-1.320787,-1.473966


In [65]:
# 위치(position) 3의 행을 가져온다. (아래 출력은 위 In [68] 셀보다 먼저 실행된 결과라 위에 표시된 df의 값과 다르다.)
s = df.iloc[3]
s

A    0.794787
B    0.016004
C    0.676495
D    0.668149
Name: 3, dtype: float64

In [64]:
df.append(s, ignore_index=True)

Unnamed: 0,A,B,C,D
0,-0.178263,-1.092542,-0.981565,0.883487
1,-0.117106,0.291135,-0.312145,1.051592
2,-0.078411,0.196373,-2.016828,-1.222743
3,0.794787,0.016004,0.676495,0.668149
4,-0.411276,0.186579,0.584244,1.312415
5,-1.047289,-2.31903,0.526137,0.847612
6,2.27066,1.016247,0.779955,-0.141058
7,-0.169808,-0.980033,0.624444,-1.726029
8,0.794787,0.016004,0.676495,0.668149


# Grouping
* Splitting
* Applying
* Combining

In [67]:
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                         'foo', 'bar', 'foo', 'foo'],
                  'B' : ['one', 'one', 'two', 'three',
                        'two', 'two', 'one', 'three'],
                  'C' : np.random.randn(8),
                  'D' : np.random.randn(8)})
df

Unnamed: 0,A,B,C,D
0,foo,one,0.973719,0.900188
1,bar,one,0.771703,-0.730657
2,foo,two,0.969273,-0.193927
3,bar,three,-0.427066,-0.051004
4,foo,two,1.424381,-0.819084
5,bar,two,1.108615,1.529486
6,foo,one,-0.122249,0.532039
7,foo,three,-1.105946,1.306115


In [71]:
# 아래 출력은 원래 문서의 groupby 결과와 다르다.
# 원인: In [67]에서 만든 df가, 그 뒤에 실행된 In [68] 셀(Append 절)의 난수 데이터프레임으로 덮어써진 상태에서 groupby가 실행되었기 때문이다.
# In [67] 셀을 다시 실행한 뒤 이 셀을 실행하면 A 컬럼의 'foo', 'bar'로 그룹화된 합계가 나온다.
df.groupby('A').sum()

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1.915304,0.548429,-0.502718,1.235289
-0.799331,-0.287096,-1.91358,0.09346
-0.532992,0.655026,-0.324259,0.305255
-0.181564,-1.04441,-1.544251,1.030215
-0.167879,0.977562,-1.320787,-1.473966
0.441197,0.250389,-0.544008,0.358661
0.590066,-1.199935,0.945414,-0.029748
1.473446,-0.070274,-0.099457,0.364295


In [73]:
# 아래 출력 역시 In [68]에서 덮어써진 난수 df에 대해 실행된 결과라 원래 문서와 다르다.
# In [67] 셀을 다시 실행한 뒤 실행하면 A('foo', 'bar')와 B('one', 'two', 'three')의 조합별로 그룹화된 합계가 나온다.

df.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
-1.915304,0.548429,-0.502718,1.235289
-0.799331,-0.287096,-1.91358,0.09346
-0.532992,0.655026,-0.324259,0.305255
-0.181564,-1.04441,-1.544251,1.030215
-0.167879,0.977562,-1.320787,-1.473966
0.441197,0.250389,-0.544008,0.358661
0.590066,-1.199935,0.945414,-0.029748
1.473446,-0.070274,-0.099457,0.364295
