# Pandas

**Pandas**는 **Python** 프로그래밍 언어를 위한 고성능의 사용하기 쉬운 데이터 구조 및 데이터 분석 도구를 제공하는 오픈 소스, BSD 라이선스 라이브러리입니다.

라이브러리 설명서 : http://pandas.pydata.org/

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# for plotting directly
%matplotlib inline

In [7]:
# create a series
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [10]:
# create a data frame
dates = pd.date_range('2019-01-01',periods=6)
df = pd.DataFrame(data=np.random.randn(6,4),index=dates,columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2019-01-01,0.151453,-2.918426,0.929502,-0.285072
2019-01-02,0.973595,-1.119139,-0.95973,1.653844
2019-01-03,-1.309014,2.7474,2.377735,-0.713987
2019-01-04,2.651046,2.339453,-0.617426,0.438569
2019-01-05,0.399248,1.215407,-0.650163,-0.124817
2019-01-06,0.43085,0.288257,0.227314,0.188698


In [12]:
# another way to create a data frame
df2 = pd.DataFrame(
    { 'A' : 1.,
      'B' : pd.Timestamp('20130102'),
      'C' : pd.Series(1,index=range(4), dtype='float32'),
      'D' : np.array([3] * 4,dtype='int32'),
      'E' : 'foo' })
df2

Unnamed: 0,A,B,C,D,E
0,1.0,2013-01-02,1.0,3,foo
1,1.0,2013-01-02,1.0,3,foo
2,1.0,2013-01-02,1.0,3,foo
3,1.0,2013-01-02,1.0,3,foo


In [13]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E            object
dtype: object

In [14]:
df.head(3)

Unnamed: 0,A,B,C,D
2019-01-01,0.151453,-2.918426,0.929502,-0.285072
2019-01-02,0.973595,-1.119139,-0.95973,1.653844
2019-01-03,-1.309014,2.7474,2.377735,-0.713987


In [15]:
df.index

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06'],
              dtype='datetime64[ns]', freq='D')

In [16]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [17]:
df.values

array([[ 0.15145313, -2.91842608,  0.92950154, -0.28507182],
       [ 0.97359501, -1.11913882, -0.95972956,  1.65384372],
       [-1.30901426,  2.74740015,  2.3777346 , -0.71398745],
       [ 2.65104551,  2.33945265, -0.61742561,  0.43856944],
       [ 0.3992476 ,  1.215407  , -0.6501632 , -0.12481665],
       [ 0.43084993,  0.28825673,  0.22731384,  0.18869758]])

In [18]:
df.describe()    # quick data summary

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.549529,0.425492,0.217872,0.192872
std,1.284174,2.15708,1.265659,0.818117
min,-1.309014,-2.918426,-0.95973,-0.713987
25%,0.213402,-0.76729,-0.641979,-0.245008
50%,0.415049,0.751832,-0.195056,0.03194
75%,0.837909,2.058441,0.753955,0.376101
max,2.651046,2.7474,2.377735,1.653844


In [19]:
df.T

Unnamed: 0,2019-01-01 00:00:00,2019-01-02 00:00:00,2019-01-03 00:00:00,2019-01-04 00:00:00,2019-01-05 00:00:00,2019-01-06 00:00:00
A,0.151453,0.973595,-1.309014,2.651046,0.399248,0.43085
B,-2.918426,-1.119139,2.7474,2.339453,1.215407,0.288257
C,0.929502,-0.95973,2.377735,-0.617426,-0.650163,0.227314
D,-0.285072,1.653844,-0.713987,0.438569,-0.124817,0.188698


In [20]:
df.sort_index(axis=1, ascending=False)   # axis 0 is index, axis 1 is column

Unnamed: 0,D,C,B,A
2019-01-01,-0.285072,0.929502,-2.918426,0.151453
2019-01-02,1.653844,-0.95973,-1.119139,0.973595
2019-01-03,-0.713987,2.377735,2.7474,-1.309014
2019-01-04,0.438569,-0.617426,2.339453,2.651046
2019-01-05,-0.124817,-0.650163,1.215407,0.399248
2019-01-06,0.188698,0.227314,0.288257,0.43085


In [21]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2019-01-01,0.151453,-2.918426,0.929502,-0.285072
2019-01-02,0.973595,-1.119139,-0.95973,1.653844
2019-01-06,0.43085,0.288257,0.227314,0.188698
2019-01-05,0.399248,1.215407,-0.650163,-0.124817
2019-01-04,2.651046,2.339453,-0.617426,0.438569
2019-01-03,-1.309014,2.7474,2.377735,-0.713987


## Selection

In [22]:
df['A']   # select a column(yields a series)

2019-01-01    0.151453
2019-01-02    0.973595
2019-01-03   -1.309014
2019-01-04    2.651046
2019-01-05    0.399248
2019-01-06    0.430850
Freq: D, Name: A, dtype: float64

In [23]:
df.A   # column names also attached to the object

2019-01-01    0.151453
2019-01-02    0.973595
2019-01-03   -1.309014
2019-01-04    2.651046
2019-01-05    0.399248
2019-01-06    0.430850
Freq: D, Name: A, dtype: float64

In [24]:
df[0:3]   # slicing works

Unnamed: 0,A,B,C,D
2019-01-01,0.151453,-2.918426,0.929502,-0.285072
2019-01-02,0.973595,-1.119139,-0.95973,1.653844
2019-01-03,-1.309014,2.7474,2.377735,-0.713987


In [23]:
df['20180102':'20180104']

Unnamed: 0,A,B,C,D
2018-01-02,-1.233733,-1.348116,0.19403,0.173812
2018-01-03,1.506024,-0.240667,0.832338,1.584231
2018-01-04,-0.684728,-0.052251,0.13703,0.741477


In [24]:
df.loc[dates[0]]   # cross-section using a label

A   -0.685629
B    0.254586
C    0.536291
D   -1.193469
Name: 2018-01-01 00:00:00, dtype: float64

In [25]:
df.loc[dates[0], 'A'] # getting a scalar value

-0.68562905803810437

In [26]:
df.iloc[3]   # seelct via position

A   -0.684728
B   -0.052251
C    0.137030
D    0.741477
Name: 2018-01-04 00:00:00, dtype: float64

In [27]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2018-01-04,-0.684728,-0.052251
2018-01-05,-0.059814,-0.582666


In [28]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2018-01-03,1.506024,-0.240667,0.832338,1.584231


In [29]:
df[df > 0]

Unnamed: 0,A,B,C,D
2018-01-01,,0.254586,0.536291,
2018-01-02,,,0.19403,0.173812
2018-01-03,1.506024,,0.832338,1.584231
2018-01-04,,,0.13703,0.741477
2018-01-05,,,,0.876118
2018-01-06,,0.293328,,


In [26]:
df3 = df.copy()
df3['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df3[df3['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2019-01-03,-1.309014,2.7474,2.377735,5,two
2019-01-05,0.399248,1.215407,-0.650163,5,four


In [25]:
# setting examples
df.at[dates[0],'A'] = 0
df.iat[0,1] = 0
df.loc[:, 'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D
2019-01-01,0.0,0.0,0.929502,5
2019-01-02,0.973595,-1.119139,-0.95973,5
2019-01-03,-1.309014,2.7474,2.377735,5
2019-01-04,2.651046,2.339453,-0.617426,5
2019-01-05,0.399248,1.215407,-0.650163,5
2019-01-06,0.43085,0.288257,0.227314,5
