# [10 Minutes to pandas — pandas 0.20.3 documentation](https://pandas.pydata.org/pandas-docs/stable/10min.html)


10분 만에 따라 할 수 있다는 판다스 튜토리얼이지만, 실제로는 1~2시간이 걸린다.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

s = pd.Series([1,3,5,np.nan, 6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [2]:
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [3]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))

In [4]:
# Build a frame from a dict of column specs. Scalar entries ('A', 'B', 'F')
# are broadcast to the length of the array-like columns (4 rows here), and
# each column keeps its own dtype.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    'D': np.full(4, 3, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})


In [5]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [6]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-1.155555,0.114845,-0.008823,-1.952056
2013-01-02,-0.948317,0.584972,1.075063,0.117811
2013-01-03,-0.599491,0.033804,0.866711,1.239697
2013-01-04,-1.42117,-1.560034,-0.834652,0.443411
2013-01-05,0.81881,-0.617009,0.588357,-0.636878


In [7]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-1.42117,-1.560034,-0.834652,0.443411
2013-01-05,0.81881,-0.617009,0.588357,-0.636878
2013-01-06,0.71765,-0.21464,1.341061,-2.686679


In [8]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [9]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [10]:
df.values

array([[-1.15555521,  0.11484483, -0.00882342, -1.95205618],
       [-0.94831674,  0.58497175,  1.07506344,  0.11781057],
       [-0.59949089,  0.03380418,  0.86671102,  1.23969742],
       [-1.42116966, -1.56003413, -0.83465223,  0.44341082],
       [ 0.81880967, -0.61700904,  0.5883575 , -0.63687781],
       [ 0.71764954, -0.21464036,  1.34106073, -2.68667948]])

In [11]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.431346,-0.276344,0.50462,-0.579116
std,0.967765,0.742806,0.80255,1.494622
min,-1.42117,-1.560034,-0.834652,-2.686679
25%,-1.103746,-0.516417,0.140472,-1.623262
50%,-0.773904,-0.090418,0.727534,-0.259534
75%,0.388364,0.094585,1.022975,0.362011
max,0.81881,0.584972,1.341061,1.239697


In [12]:
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-1.155555,-0.948317,-0.599491,-1.42117,0.81881,0.71765
B,0.114845,0.584972,0.033804,-1.560034,-0.617009,-0.21464
C,-0.008823,1.075063,0.866711,-0.834652,0.588357,1.341061
D,-1.952056,0.117811,1.239697,0.443411,-0.636878,-2.686679


In [13]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-1.952056,-0.008823,0.114845,-1.155555
2013-01-02,0.117811,1.075063,0.584972,-0.948317
2013-01-03,1.239697,0.866711,0.033804,-0.599491
2013-01-04,0.443411,-0.834652,-1.560034,-1.42117
2013-01-05,-0.636878,0.588357,-0.617009,0.81881
2013-01-06,-2.686679,1.341061,-0.21464,0.71765


In [14]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-04,-1.42117,-1.560034,-0.834652,0.443411
2013-01-05,0.81881,-0.617009,0.588357,-0.636878
2013-01-06,0.71765,-0.21464,1.341061,-2.686679
2013-01-03,-0.599491,0.033804,0.866711,1.239697
2013-01-01,-1.155555,0.114845,-0.008823,-1.952056
2013-01-02,-0.948317,0.584972,1.075063,0.117811


# Selection

* .at, .iat, .loc, .iloc, .ix (.ix는 pandas 0.20부터 deprecated)

## Getting

In [15]:
df[['A']]

Unnamed: 0,A
2013-01-01,-1.155555
2013-01-02,-0.948317
2013-01-03,-0.599491
2013-01-04,-1.42117
2013-01-05,0.81881
2013-01-06,0.71765


In [16]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-1.155555,0.114845,-0.008823,-1.952056
2013-01-02,-0.948317,0.584972,1.075063,0.117811
2013-01-03,-0.599491,0.033804,0.866711,1.239697


In [17]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.948317,0.584972,1.075063,0.117811
2013-01-03,-0.599491,0.033804,0.866711,1.239697
2013-01-04,-1.42117,-1.560034,-0.834652,0.443411


## Selection by Label

In [18]:
df.loc[dates[0]]

A   -1.155555
B    0.114845
C   -0.008823
D   -1.952056
Name: 2013-01-01 00:00:00, dtype: float64

In [19]:
df.loc[:, ['A','B']]

Unnamed: 0,A,B
2013-01-01,-1.155555,0.114845
2013-01-02,-0.948317,0.584972
2013-01-03,-0.599491,0.033804
2013-01-04,-1.42117,-1.560034
2013-01-05,0.81881,-0.617009
2013-01-06,0.71765,-0.21464


In [20]:
df.loc['20130101':'20130105', ['A','B']]

Unnamed: 0,A,B
2013-01-01,-1.155555,0.114845
2013-01-02,-0.948317,0.584972
2013-01-03,-0.599491,0.033804
2013-01-04,-1.42117,-1.560034
2013-01-05,0.81881,-0.617009


In [21]:
df.loc['20130103', ['A','B']]

A   -0.599491
B    0.033804
Name: 2013-01-03 00:00:00, dtype: float64

In [22]:
df.loc[dates[0], 'A']

-1.1555552136011324

In [23]:
df.at[dates[0], 'A']

-1.1555552136011324

## Selection by Position

In [24]:
df.iloc[3]

A   -1.421170
B   -1.560034
C   -0.834652
D    0.443411
Name: 2013-01-04 00:00:00, dtype: float64

In [25]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-1.42117,-1.560034
2013-01-05,0.81881,-0.617009


In [26]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,-0.948317,0.584972,1.075063,0.117811
2013-01-03,-0.599491,0.033804,0.866711,1.239697


In [27]:
df.iloc[1,1]

0.58497175380349864

In [28]:
df.iat[1,1]  # Same result as the iloc call above; .iat is intended for faster access to a single scalar.

0.58497175380349864

## Boolean Indexing

In [29]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-05,0.81881,-0.617009,0.588357,-0.636878
2013-01-06,0.71765,-0.21464,1.341061,-2.686679


In [30]:
df[df > 0] # Values that are not greater than 0 are shown as NaN.

Unnamed: 0,A,B,C,D
2013-01-01,,0.114845,,
2013-01-02,,0.584972,1.075063,0.117811
2013-01-03,,0.033804,0.866711,1.239697
2013-01-04,,,,0.443411
2013-01-05,0.81881,,0.588357,
2013-01-06,0.71765,,1.341061,


In [31]:
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.155555,0.114845,-0.008823,-1.952056,one
2013-01-02,-0.948317,0.584972,1.075063,0.117811,one
2013-01-03,-0.599491,0.033804,0.866711,1.239697,two
2013-01-04,-1.42117,-1.560034,-0.834652,0.443411,three
2013-01-05,0.81881,-0.617009,0.588357,-0.636878,four
2013-01-06,0.71765,-0.21464,1.341061,-2.686679,three


## Setting

In [32]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
print(df.shape)
s1

(6, 4)


2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [33]:
df.at[dates[0],'A'] = 0 # setting values by label

In [34]:
df.iat[0, 1] = 0 # setting values by position
df.shape

(6, 4)

In [35]:
# setting by assigning with a numpy array
df.loc[:, 'D'] = np.array([5] * len(df)) 
print(df.shape)
df

(6, 4)


Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.008823,5
2013-01-02,-0.948317,0.584972,1.075063,5
2013-01-03,-0.599491,0.033804,0.866711,5
2013-01-04,-1.42117,-1.560034,-0.834652,5
2013-01-05,0.81881,-0.617009,0.588357,5
2013-01-06,0.71765,-0.21464,1.341061,5


In [36]:
# A where operation with setting
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.008823,-5
2013-01-02,-0.948317,-0.584972,-1.075063,-5
2013-01-03,-0.599491,-0.033804,-0.866711,-5
2013-01-04,-1.42117,-1.560034,-0.834652,-5
2013-01-05,-0.81881,-0.617009,-0.588357,-5
2013-01-06,-0.71765,-0.21464,-1.341061,-5


# Missing Data

판다스는 누락된(missing) 데이터를 주로 np.nan으로 표현하며, 이 값은 기본적으로 계산에 포함되지 않는다.
재인덱싱(reindex)을 사용하면 지정한 축의 인덱스를 변경/추가/삭제할 수 있으며, 결과는 원본이 아닌 복사본으로 반환된다.

In [37]:
# Reindex to the first four dates and add a new, initially empty column 'E'.
# reindex returns a new frame, so df itself is left untouched.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, 'E'])
# Label slices with .loc include both endpoints, so the first two rows get E = 1.
df1.loc[dates[0]:dates[1], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,-0.008823,5,1.0
2013-01-02,-0.948317,0.584972,1.075063,5,1.0
2013-01-03,-0.599491,0.033804,0.866711,5,
2013-01-04,-1.42117,-1.560034,-0.834652,5,


In [38]:
# Drop any row that contains missing data.
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,-0.008823,5,1.0
2013-01-02,-0.948317,0.584972,1.075063,5,1.0


In [39]:
# Fill missing data with the value 5.
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,-0.008823,5,1.0
2013-01-02,-0.948317,0.584972,1.075063,5,1.0
2013-01-03,-0.599491,0.033804,0.866711,5,5.0
2013-01-04,-1.42117,-1.560034,-0.834652,5,5.0


In [40]:
# Get a boolean mask that is True wherever a value is null (missing).
pd.isnull(df1)

Unnamed: 0,A,B,C,D,E
2013-01-01,False,False,False,False,False
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,True
2013-01-04,False,False,False,False,True


# Operations
* 연산과 관련된 정보를 더 보고 싶으면 다음 문서를 참고: [Essential Basic Functionality — pandas 0.20.3 documentation](https://pandas.pydata.org/pandas-docs/stable/basics.html#basics-binop)


In [41]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.008823,5
2013-01-02,-0.948317,0.584972,1.075063,5
2013-01-03,-0.599491,0.033804,0.866711,5
2013-01-04,-1.42117,-1.560034,-0.834652,5
2013-01-05,0.81881,-0.617009,0.588357,5
2013-01-06,0.71765,-0.21464,1.341061,5


In [42]:
# Operations in general exclude missing data.

df.mean()

A   -0.238753
B   -0.295485
C    0.504620
D    5.000000
dtype: float64

In [45]:
# Mean across the columns of each row (axis=1); the result is indexed by date.
df.mean(axis=1)

2013-01-01    1.247794
2013-01-02    1.427930
2013-01-03    1.325256
2013-01-04    0.296036
2013-01-05    1.447540
2013-01-06    1.711017
Freq: D, dtype: float64

In [47]:
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64