# Pandas

**Pandas**는 **Python** 프로그래밍 언어를 위한 고성능의 사용하기 쉬운 데이터 구조 및 데이터 분석 도구를 제공하는 오픈 소스, BSD 라이선스 라이브러리입니다.

라이브러리 설명서: https://pandas.pydata.org/

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline   # for plotting directly

In [3]:
# Build a Series from a plain Python list; np.nan stands in for a missing value.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
# Build a DataFrame of random normal values: one row per day for six
# consecutive days starting 2018-01-01, and four columns labelled A-D.
dates = pd.date_range(start='20180101', periods=6)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2018-01-01,-0.685629,0.254586,0.536291,-1.193469
2018-01-02,-1.233733,-1.348116,0.19403,0.173812
2018-01-03,1.506024,-0.240667,0.832338,1.584231
2018-01-04,-0.684728,-0.052251,0.13703,0.741477
2018-01-05,-0.059814,-0.582666,-0.151178,0.876118
2018-01-06,-0.497567,0.293328,-0.805949,-0.285735


In [5]:
# Alternative construction: a dict mapping column name -> column values.
# Scalar entries (A, B, E) are repeated for every row of the 4-row index
# supplied by the Series in column C.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    'D': np.full(4, 3, dtype='int32'),
    'E': 'foo',
})
df2

Unnamed: 0,A,B,C,D,E
0,1.0,2013-01-02,1.0,3,foo
1,1.0,2013-01-02,1.0,3,foo
2,1.0,2013-01-02,1.0,3,foo
3,1.0,2013-01-02,1.0,3,foo


In [6]:
# each column of df2 carries its own dtype
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E            object
dtype: object

In [8]:
# first 3 rows of df
df.head(3)

Unnamed: 0,A,B,C,D
2018-01-01,-0.685629,0.254586,0.536291,-1.193469
2018-01-02,-1.233733,-1.348116,0.19403,0.173812
2018-01-03,1.506024,-0.240667,0.832338,1.584231


In [9]:
# row labels of df (the dates passed as index= when df was built)
df.index

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06'],
              dtype='datetime64[ns]', freq='D')

In [10]:
# column labels of df
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [11]:
# the underlying data as a 2-D NumPy array (no row or column labels)
df.values

array([[-0.68562906,  0.25458647,  0.53629066, -1.1934687 ],
       [-1.23373303, -1.34811626,  0.19402974,  0.17381173],
       [ 1.50602375, -0.24066687,  0.83233798,  1.58423149],
       [-0.68472835, -0.05225102,  0.13702973,  0.74147707],
       [-0.05981358, -0.58266628, -0.15117786,  0.87611824],
       [-0.4975672 ,  0.29332792, -0.80594904, -0.28573485]])

In [12]:
# quick data summary: count, mean, std, min, quartiles and max per column
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.275908,-0.279298,0.12376,0.316072
std,0.951162,0.616503,0.56874,0.976112
min,-1.233733,-1.348116,-0.805949,-1.193469
25%,-0.685404,-0.497166,-0.079126,-0.170848
50%,-0.591148,-0.146459,0.16553,0.457644
75%,-0.169252,0.177877,0.450725,0.842458
max,1.506024,0.293328,0.832338,1.584231


In [13]:
# transpose: columns A-D become the rows, the dates become the columns
df.T

Unnamed: 0,2018-01-01 00:00:00,2018-01-02 00:00:00,2018-01-03 00:00:00,2018-01-04 00:00:00,2018-01-05 00:00:00,2018-01-06 00:00:00
A,-0.685629,-1.233733,1.506024,-0.684728,-0.059814,-0.497567
B,0.254586,-1.348116,-0.240667,-0.052251,-0.582666,0.293328
C,0.536291,0.19403,0.832338,0.13703,-0.151178,-0.805949
D,-1.193469,0.173812,1.584231,0.741477,0.876118,-0.285735


In [14]:
# sort by axis labels: axis=0 is the row index, axis=1 is the columns;
# here the columns are put in descending label order (D, C, B, A)
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2018-01-01,-1.193469,0.536291,0.254586,-0.685629
2018-01-02,0.173812,0.19403,-1.348116,-1.233733
2018-01-03,1.584231,0.832338,-0.240667,1.506024
2018-01-04,0.741477,0.13703,-0.052251,-0.684728
2018-01-05,0.876118,-0.151178,-0.582666,-0.059814
2018-01-06,-0.285735,-0.805949,0.293328,-0.497567


In [19]:
# sort the rows by the values in column B (result is in ascending order of B)
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2018-01-02,-1.233733,-1.348116,0.19403,0.173812
2018-01-05,-0.059814,-0.582666,-0.151178,0.876118
2018-01-03,1.506024,-0.240667,0.832338,1.584231
2018-01-04,-0.684728,-0.052251,0.13703,0.741477
2018-01-01,-0.685629,0.254586,0.536291,-1.193469
2018-01-06,-0.497567,0.293328,-0.805949,-0.285735


## Selection

In [20]:
df['A']   # select a single column (yields a Series)

2018-01-01   -0.685629
2018-01-02   -1.233733
2018-01-03    1.506024
2018-01-04   -0.684728
2018-01-05   -0.059814
2018-01-06   -0.497567
Freq: D, Name: A, dtype: float64

In [21]:
df.A   # columns are also exposed as attributes; gives the same Series as df['A']

2018-01-01   -0.685629
2018-01-02   -1.233733
2018-01-03    1.506024
2018-01-04   -0.684728
2018-01-05   -0.059814
2018-01-06   -0.497567
Freq: D, Name: A, dtype: float64

In [22]:
df[0:3]   # slicing with [] selects rows by position (here the first three)

Unnamed: 0,A,B,C,D
2018-01-01,-0.685629,0.254586,0.536291,-1.193469
2018-01-02,-1.233733,-1.348116,0.19403,0.173812
2018-01-03,1.506024,-0.240667,0.832338,1.584231


In [23]:
# slicing with date strings selects rows by label; both endpoints are included
df['20180102':'20180104']

Unnamed: 0,A,B,C,D
2018-01-02,-1.233733,-1.348116,0.19403,0.173812
2018-01-03,1.506024,-0.240667,0.832338,1.584231
2018-01-04,-0.684728,-0.052251,0.13703,0.741477


In [24]:
df.loc[dates[0]]   # cross-section using a label: the row for dates[0], as a Series

A   -0.685629
B    0.254586
C    0.536291
D   -1.193469
Name: 2018-01-01 00:00:00, dtype: float64

In [25]:
df.loc[dates[0], 'A'] # getting a scalar value by row label and column label

-0.68562905803810437

In [26]:
df.iloc[3]   # select via position (zero-based, so this is the fourth row)

A   -0.684728
B   -0.052251
C    0.137030
D    0.741477
Name: 2018-01-04 00:00:00, dtype: float64

In [27]:
# positional slices: rows 3-4 and columns 0-1 (the end positions are excluded)
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2018-01-04,-0.684728,-0.052251
2018-01-05,-0.059814,-0.582666


In [28]:
# boolean indexing: keep only the rows where column A is positive
df[df.A > 0]

Unnamed: 0,A,B,C,D
2018-01-03,1.506024,-0.240667,0.832338,1.584231


In [29]:
# element-wise mask: the shape is kept, and values that are not positive
# come back as missing values
df[df > 0]

Unnamed: 0,A,B,C,D
2018-01-01,,0.254586,0.536291,
2018-01-02,,,0.19403,0.173812
2018-01-03,1.506024,,0.832338,1.584231
2018-01-04,,,0.13703,0.741477
2018-01-05,,,,0.876118
2018-01-06,,0.293328,,


In [32]:
# filtering with isin(): work on a copy so df itself does not gain column E
df3 = df.copy()
df3['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
# keep only the rows whose E value is one of the listed values
df3[df3['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2018-01-03,1.506024,-0.240667,0.832338,1.584231,two
2018-01-05,-0.059814,-0.582666,-0.151178,0.876118,four


In [33]:
# setting examples (these modify df in place)
df.at[dates[0],'A'] = 0   # set one value by row label and column label
df.iat[0,1] = 0   # set one value by integer position (row 0, column 1 -> column B)
df.loc[:, 'D'] = np.array([5] * len(df))   # assign a NumPy array to the whole column D
df

Unnamed: 0,A,B,C,D
2018-01-01,0.0,0.0,0.536291,5
2018-01-02,-1.233733,-1.348116,0.19403,5
2018-01-03,1.506024,-0.240667,0.832338,5
2018-01-04,-0.684728,-0.052251,0.13703,5
2018-01-05,-0.059814,-0.582666,-0.151178,5
2018-01-06,-0.497567,0.293328,-0.805949,5
