<a href="https://colab.research.google.com/github/hanna-joo/bigdata_edu/blob/master/python_edu/pandas_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

# 1. 객체 생성 (Object creation)

In [None]:
# Create a Series from a plain list; the np.nan entry makes the dtype float64.
values = [1, 3, 5, np.nan, 6, 8]
s = pd.Series(values)
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [None]:
# Create a DataFrame from a NumPy array, labelled with a DatetimeIndex.
SEED = 42  # fixed seed so Restart & Run All reproduces the same table
rng = np.random.default_rng(SEED)

dates = pd.date_range('20130101', periods=6)  # six consecutive days
print(dates)
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list('ABCD'))
df

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')


Unnamed: 0,A,B,C,D
2013-01-01,0.680374,0.517372,-1.671865,-0.974822
2013-01-02,1.016652,-1.274628,-0.977082,-0.731951
2013-01-03,0.061942,-0.762119,-1.454836,0.457252
2013-01-04,-0.717764,1.754018,-0.322983,0.132655
2013-01-05,1.127538,-0.86188,1.026014,1.636723
2013-01-06,-1.449656,2.067481,1.412776,-0.638622


In [None]:
# Create a DataFrame from a dict: scalar entries are repeated for every row,
# and each column keeps its own dtype (printed below).
column_spec = {
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    'D': np.array([3, 3, 3, 3], dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo',
}
df2 = pd.DataFrame(column_spec)
print(df2.dtypes)
df2

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


# 2. 데이터 확인 (Viewing data)


In [None]:
# Show the first 2 rows.
df.head(2)

Unnamed: 0,A,B,C,D
2013-01-01,0.680374,0.517372,-1.671865,-0.974822
2013-01-02,1.016652,-1.274628,-0.977082,-0.731951


In [None]:
# Row labels of df (a DatetimeIndex).
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [None]:
# Column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [None]:
# A NumPy array holds exactly one dtype for the whole array,
# while a pandas DataFrame holds one dtype per column.
df.to_numpy()

array([[ 0.68037358,  0.51737233, -1.67186508, -0.97482208],
       [ 1.01665214, -1.27462752, -0.97708234, -0.73195097],
       [ 0.06194247, -0.76211858, -1.45483628,  0.45725156],
       [-0.71776378,  1.75401809, -0.32298288,  0.13265534],
       [ 1.12753845, -0.86188043,  1.0260142 ,  1.63672324],
       [-1.44965578,  2.06748096,  1.41277608, -0.63862213]])

In [None]:
# df2 mixes several dtypes, so converting it picks one dtype that can hold
# every value (object here).
# DataFrame.to_numpy() does not include the index or the column labels.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [None]:
# Summary statistics of df (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.119848,0.240041,-0.331329,-0.019794
std,1.029774,1.42978,1.293109,0.979495
min,-1.449656,-1.274628,-1.671865,-0.974822
25%,-0.522837,-0.83694,-1.335398,-0.708619
50%,0.371158,-0.122373,-0.650033,-0.252983
75%,0.932583,1.444857,0.688765,0.376103
max,1.127538,2.067481,1.412776,1.636723


In [None]:
# Transpose df: swap its rows and columns.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.680374,1.016652,0.061942,-0.717764,1.127538,-1.449656
B,0.517372,-1.274628,-0.762119,1.754018,-0.86188,2.067481
C,-1.671865,-0.977082,-1.454836,-0.322983,1.026014,1.412776
D,-0.974822,-0.731951,0.457252,0.132655,1.636723,-0.638622


In [None]:
# Sort by axis labels (not by cell values): axis=1 orders the column labels,
# here in descending order.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.974822,-1.671865,0.517372,0.680374
2013-01-02,-0.731951,-0.977082,-1.274628,1.016652
2013-01-03,0.457252,-1.454836,-0.762119,0.061942
2013-01-04,0.132655,-0.322983,1.754018,-0.717764
2013-01-05,1.636723,1.026014,-0.86188,1.127538
2013-01-06,-0.638622,1.412776,2.067481,-1.449656


In [None]:
# Sort the rows by index label, latest date first.
df.sort_index(ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,-1.449656,2.067481,1.412776,-0.638622
2013-01-05,1.127538,-0.86188,1.026014,1.636723
2013-01-04,-0.717764,1.754018,-0.322983,0.132655
2013-01-03,0.061942,-0.762119,-1.454836,0.457252
2013-01-02,1.016652,-1.274628,-0.977082,-0.731951
2013-01-01,0.680374,0.517372,-1.671865,-0.974822


In [None]:
# Sort the rows by the values in column B (ascending).
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-02,1.016652,-1.274628,-0.977082,-0.731951
2013-01-05,1.127538,-0.86188,1.026014,1.636723
2013-01-03,0.061942,-0.762119,-1.454836,0.457252
2013-01-01,0.680374,0.517372,-1.671865,-0.974822
2013-01-04,-0.717764,1.754018,-0.322983,0.132655
2013-01-06,-1.449656,2.067481,1.412776,-0.638622


In [None]:
# Same sort on column B, largest value first.
df.sort_values(by="B", ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,-1.449656,2.067481,1.412776,-0.638622
2013-01-04,-0.717764,1.754018,-0.322983,0.132655
2013-01-01,0.680374,0.517372,-1.671865,-0.974822
2013-01-03,0.061942,-0.762119,-1.454836,0.457252
2013-01-05,1.127538,-0.86188,1.026014,1.636723
2013-01-02,1.016652,-1.274628,-0.977082,-0.731951


# 3. 데이터 선택 (Selection)
- .at / .iat
- .loc / .iloc
- .isin

In [None]:
# Select a single column; the result is a Series.
df['A']  # equivalent attribute access: df.A

2013-01-01    0.680374
2013-01-02    1.016652
2013-01-03    0.061942
2013-01-04   -0.717764
2013-01-05    1.127538
2013-01-06   -1.449656
Freq: D, Name: A, dtype: float64

In [None]:
# Select rows by slicing with []: an integer slice is positional and end-exclusive.
df[2:4]  # same rows as the label slice df['20130103':'20130104'] (end-inclusive)

Unnamed: 0,A,B,C,D
2013-01-03,0.061942,-0.762119,-1.454836,0.457252
2013-01-04,-0.717764,1.754018,-0.322983,0.132655


## 인덱스 또는 컬럼명으로 가져오기 (.loc)

In [None]:
# Select one row by its index label; the row comes back as a Series.
df.loc['20130101']

A    0.680374
B    0.517372
C   -1.671865
D   -0.974822
Name: 2013-01-01 00:00:00, dtype: float64

In [None]:
# Select by label on both axes: every row, columns A and B.
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,0.680374,0.517372
2013-01-02,1.016652,-1.274628
2013-01-03,0.061942,-0.762119
2013-01-04,-0.717764,1.754018
2013-01-05,1.127538,-0.86188
2013-01-06,-1.449656,2.067481


In [None]:
# Label slices include both endpoints: rows 2013-01-02..2013-01-04, columns A..C.
df.loc['20130102':'20130104', 'A':'C']

Unnamed: 0,A,B,C
2013-01-02,1.016652,-1.274628,-0.977082
2013-01-03,0.061942,-0.762119,-1.454836
2013-01-04,-0.717764,1.754018,-0.322983


In [None]:
# Row label slice combined with an explicit list of column labels.
df.loc['20130102':'20130104', ['A','C']]

Unnamed: 0,A,C
2013-01-02,1.016652,-0.977082
2013-01-03,0.061942,-1.454836
2013-01-04,-0.717764,-0.322983


In [None]:
# One row label plus one column label yields a scalar.
df.loc[dates[0],'A']

0.6803735791475444

In [None]:
# .at: a faster alternative to .loc when fetching a single scalar
df.at[dates[0],'A']

0.6803735791475444

## 좌표값으로 가져오기 (.iloc)

In [None]:
# Select the row at integer position 2 (the third row) as a Series.
df.iloc[2]

A    0.061942
B   -0.762119
C   -1.454836
D    0.457252
Name: 2013-01-03 00:00:00, dtype: float64

In [None]:
# Positional row slice (end-exclusive): rows at positions 3 and 4.
df.iloc[3:5]  # same rows as df[3:5] or df['20130104':'20130105']

Unnamed: 0,A,B,C,D
2013-01-04,-0.717764,1.754018,-0.322983,0.132655
2013-01-05,1.127538,-0.86188,1.026014,1.636723


In [None]:
# Positional slices on both axes: rows 3-4 and columns 0-1.
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-0.717764,1.754018
2013-01-05,1.127538,-0.86188


In [None]:
# Lists of integer positions pick arbitrary rows and columns.
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,1.016652,-0.977082
2013-01-03,0.061942,-1.454836
2013-01-05,1.127538,1.026014


In [None]:
# Slice rows by position and keep every column.
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,1.016652,-1.274628,-0.977082,-0.731951
2013-01-03,0.061942,-0.762119,-1.454836,0.457252


In [None]:
# A pair of integer positions yields a scalar.
df.iloc[1,1]

-1.2746275156187237

In [None]:
# .iat: a faster alternative to .iloc when fetching a single scalar
df.iat[1,1]

-1.2746275156187237

## 조건에 해당하는 값 가져오기 (불리언 인덱싱 / .isin)

In [None]:
# Boolean indexing: keep only the rows whose value in column A is positive.
is_positive_a = df['A'] > 0
df[is_positive_a]

Unnamed: 0,A,B,C,D
2013-01-01,0.680374,0.517372,-1.671865,-0.974822
2013-01-02,1.016652,-1.274628,-0.977082,-0.731951
2013-01-03,0.061942,-0.762119,-1.454836,0.457252
2013-01-05,1.127538,-0.86188,1.026014,1.636723


In [None]:
# Boolean mask over the whole frame: cells that fail the condition become NaN.
df[df>0]

Unnamed: 0,A,B,C,D
2013-01-01,0.680374,0.517372,,
2013-01-02,1.016652,,,
2013-01-03,0.061942,,,0.457252
2013-01-04,,1.754018,,0.132655
2013-01-05,1.127538,,1.026014,1.636723
2013-01-06,,2.067481,1.412776,


In [None]:
# Derive a new frame from df with an extra string column E; df is not modified.
labels = ['one', 'one', 'two', 'three', 'four', 'three']
df2 = df.assign(E=labels)
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,0.680374,0.517372,-1.671865,-0.974822,one
2013-01-02,1.016652,-1.274628,-0.977082,-0.731951,one
2013-01-03,0.061942,-0.762119,-1.454836,0.457252,two
2013-01-04,-0.717764,1.754018,-0.322983,0.132655,three
2013-01-05,1.127538,-0.86188,1.026014,1.636723,four
2013-01-06,-1.449656,2.067481,1.412776,-0.638622,three


In [None]:
# .isin: keep the rows whose E value is one of the listed labels.
wanted_labels = ['two', 'four']
has_wanted_label = df2['E'].isin(wanted_labels)
df2[has_wanted_label]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.061942,-0.762119,-1.454836,0.457252,two
2013-01-05,1.127538,-0.86188,1.026014,1.636723,four


## 값 설정하기

In [None]:
# A Series of 1..6 indexed by six consecutive days starting on 2013-01-02.
s1_index = pd.date_range('20130102', periods=6)
s1 = pd.Series(list(range(1, 7)), index=s1_index)
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [None]:
# Display df in its current state, before any values are set.
df

Unnamed: 0,A,B,C,D
2013-01-01,0.680374,0.517372,-1.671865,-0.974822
2013-01-02,1.016652,-1.274628,-0.977082,-0.731951
2013-01-03,0.061942,-0.762119,-1.454836,0.457252
2013-01-04,-0.717764,1.754018,-0.322983,0.132655
2013-01-05,1.127538,-0.86188,1.026014,1.636723
2013-01-06,-1.449656,2.067481,1.412776,-0.638622


In [None]:
# Add a new column from a Series: values are aligned on the index, so only
# labels present in df are taken (2013-01-01 has no match in s1 and becomes NaN).
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.680374,0.517372,-1.671865,-0.974822,
2013-01-02,1.016652,-1.274628,-0.977082,-0.731951,1.0
2013-01-03,0.061942,-0.762119,-1.454836,0.457252,2.0
2013-01-04,-0.717764,1.754018,-0.322983,0.132655,3.0
2013-01-05,1.127538,-0.86188,1.026014,1.636723,4.0
2013-01-06,-1.449656,2.067481,1.412776,-0.638622,5.0


In [None]:
# Overwrite existing values: by label (.at), by position (.iat), and a whole column.
df.at[dates[0], 'A'] = 0
df.iat[0, 1] = 0
# Replace column D with a fresh integer array using plain column assignment.
# `df.loc[:, 'D'] = ...` writes into the existing float64 column on pandas >= 2.0
# (the values would display as 5.0), whereas `df['D'] = ...` swaps in the new
# array and keeps its integer dtype on every pandas version.
df['D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.671865,5,
2013-01-02,1.016652,-1.274628,-0.977082,5,1.0
2013-01-03,0.061942,-0.762119,-1.454836,5,2.0
2013-01-04,-0.717764,1.754018,-0.322983,5,3.0
2013-01-05,1.127538,-0.86188,1.026014,5,4.0
2013-01-06,-1.449656,2.067481,1.412776,5,5.0


In [None]:
# Work on a copy so df stays untouched, then flip the sign of every positive
# entry; cells that fail the comparison (including NaN) are left as they are.
df2 = df.copy()
positive_mask = df2 > 0
df2[positive_mask] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-1.671865,-5,
2013-01-02,-1.016652,-1.274628,-0.977082,-5,-1.0
2013-01-03,-0.061942,-0.762119,-1.454836,-5,-2.0
2013-01-04,-0.717764,-1.754018,-0.322983,-5,-3.0
2013-01-05,-1.127538,-0.86188,-1.026014,-5,-4.0
2013-01-06,-1.449656,-2.067481,-1.412776,-5,-5.0
