<a href="https://colab.research.google.com/github/hanna-joo/bigdata_edu/blob/master/python_edu/pandas_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

# 1. 객체 생성 (Object creation)

In [5]:
# Create a Series from a plain list; the index defaults to 0..5 and the
# values are stored as float64 (the NaN entry is a float).
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [7]:
# Create a DataFrame from a NumPy array, indexed by six consecutive days
# and with columns labelled A-D.
dates = pd.date_range('20130101', periods=6)
print(dates)
# Use a seeded generator so that "Restart & Run All" always produces the same
# table. Outputs saved before this seed was introduced will differ until the
# notebook is re-executed.
rng = np.random.default_rng(seed=42)
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list('ABCD'))
df

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')


Unnamed: 0,A,B,C,D
2013-01-01,1.358469,0.216869,0.278032,-0.318681
2013-01-02,-2.764424,1.71949,0.216518,-0.373867
2013-01-03,0.978068,0.620083,1.750997,-1.770599
2013-01-04,-0.216042,-0.495686,2.036863,0.189982
2013-01-05,0.021969,0.153268,0.160588,1.686423
2013-01-06,-0.003046,-0.469175,-1.170794,0.523317


In [11]:
# Create a DataFrame from a dict: each key becomes a column, and the scalar
# entries (A, B, F) are repeated for every row of the 4-element columns.
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(['test', 'train', 'test', 'train']),
        'F': 'foo',
    }
)
# Each column keeps its own dtype.
print(df2.dtypes)
df2

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


# 2. 데이터 확인 (Viewing data)


In [13]:
# Preview the first two rows of df.
df.head(2)

Unnamed: 0,A,B,C,D
2013-01-01,1.358469,0.216869,0.278032,-0.318681
2013-01-02,-2.764424,1.71949,0.216518,-0.373867


In [14]:
# Row labels of df (the DatetimeIndex of six daily dates).
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [15]:
# Column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [17]:
# A NumPy array holds a single dtype for the whole array,
# whereas a pandas DataFrame holds one dtype per column.
df.to_numpy()

array([[ 1.35846864,  0.21686859,  0.27803227, -0.31868095],
       [-2.76442368,  1.71949047,  0.21651831, -0.37386673],
       [ 0.97806835,  0.62008295,  1.75099665, -1.77059919],
       [-0.21604154, -0.49568584,  2.03686259,  0.18998239],
       [ 0.02196877,  0.15326825,  0.16058841,  1.68642326],
       [-0.00304588, -0.46917508, -1.17079435,  0.52331701]])

In [18]:
# Converting df2, whose columns have several dtypes, to NumPy picks one dtype
# that can hold every value (object, as the output shows).
# DataFrame.to_numpy() does not include the index or the column labels.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [20]:
# Summary statistics of df per column: count, mean, std, min, quartiles, max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.104168,0.290808,0.545367,-0.010571
std,1.444033,0.820986,1.179065,1.142774
min,-2.764424,-0.495686,-1.170794,-1.770599
25%,-0.162793,-0.313564,0.174571,-0.36007
50%,0.009461,0.185068,0.247275,-0.064349
75%,0.739043,0.519279,1.382756,0.439983
max,1.358469,1.71949,2.036863,1.686423


In [21]:
# Transpose df: rows become columns and columns become rows.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,1.358469,-2.764424,0.978068,-0.216042,0.021969,-0.003046
B,0.216869,1.71949,0.620083,-0.495686,0.153268,-0.469175
C,0.278032,0.216518,1.750997,2.036863,0.160588,-1.170794
D,-0.318681,-0.373867,-1.770599,0.189982,1.686423,0.523317


In [22]:
# Sort by axis labels (not by cell values): axis=1 orders the columns by
# name, here in descending order (D, C, B, A).
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.318681,0.278032,0.216869,1.358469
2013-01-02,-0.373867,0.216518,1.71949,-2.764424
2013-01-03,-1.770599,1.750997,0.620083,0.978068
2013-01-04,0.189982,2.036863,-0.495686,-0.216042
2013-01-05,1.686423,0.160588,0.153268,0.021969
2013-01-06,0.523317,-1.170794,-0.469175,-0.003046


In [25]:
# Sort the rows by index label in descending order (latest date first).
df.sort_index(ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,-0.003046,-0.469175,-1.170794,0.523317
2013-01-05,0.021969,0.153268,0.160588,1.686423
2013-01-04,-0.216042,-0.495686,2.036863,0.189982
2013-01-03,0.978068,0.620083,1.750997,-1.770599
2013-01-02,-2.764424,1.71949,0.216518,-0.373867
2013-01-01,1.358469,0.216869,0.278032,-0.318681


In [27]:
# Sort the rows by the values in column B (ascending by default).
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-04,-0.216042,-0.495686,2.036863,0.189982
2013-01-06,-0.003046,-0.469175,-1.170794,0.523317
2013-01-05,0.021969,0.153268,0.160588,1.686423
2013-01-01,1.358469,0.216869,0.278032,-0.318681
2013-01-03,0.978068,0.620083,1.750997,-1.770599
2013-01-02,-2.764424,1.71949,0.216518,-0.373867


In [28]:
# Sort the rows by the values in column B, largest first.
df.sort_values(by="B", ascending=False)

Unnamed: 0,A,B,C,D
2013-01-02,-2.764424,1.71949,0.216518,-0.373867
2013-01-03,0.978068,0.620083,1.750997,-1.770599
2013-01-01,1.358469,0.216869,0.278032,-0.318681
2013-01-05,0.021969,0.153268,0.160588,1.686423
2013-01-06,-0.003046,-0.469175,-1.170794,0.523317
2013-01-04,-0.216042,-0.495686,2.036863,0.189982


# 3. 데이터 선택 (Selection)
- .at / .iat
- .loc / .iloc
- .isin

In [29]:
# Select a single column as a Series.
df['A']  # attribute-style equivalent: df.A

2013-01-01    1.358469
2013-01-02   -2.764424
2013-01-03    0.978068
2013-01-04   -0.216042
2013-01-05    0.021969
2013-01-06   -0.003046
Freq: D, Name: A, dtype: float64

In [31]:
# Select rows with a slice inside []: integer slices are positional
# (positions 2 and 3; the end position is excluded).
df[2:4]  # same rows as the label slice df['20130103':'20130104']

Unnamed: 0,A,B,C,D
2013-01-03,0.978068,0.620083,1.750997,-1.770599
2013-01-04,-0.216042,-0.495686,2.036863,0.189982


## 인덱스 또는 컬럼명으로 가져오기

In [33]:
# Select one row by its index label; the result is a Series keyed by column.
df.loc['20130101']

A    1.358469
B    0.216869
C    0.278032
D   -0.318681
Name: 2013-01-01 00:00:00, dtype: float64

In [35]:
# Select values by row and column labels: every row (:) and columns A and B.
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,1.358469,0.216869
2013-01-02,-2.764424,1.71949
2013-01-03,0.978068,0.620083
2013-01-04,-0.216042,-0.495686
2013-01-05,0.021969,0.153268
2013-01-06,-0.003046,-0.469175


In [40]:
# Label slices on both axes; with .loc both endpoints are included
# (rows through 2013-01-04, columns through C).
df.loc['20130102':'20130104', 'A':'C']

Unnamed: 0,A,B,C
2013-01-02,-2.764424,1.71949,0.216518
2013-01-03,0.978068,0.620083,1.750997
2013-01-04,-0.216042,-0.495686,2.036863


In [41]:
# Label slice for the rows combined with an explicit list of column labels.
df.loc['20130102':'20130104', ['A','C']]

Unnamed: 0,A,C
2013-01-02,-2.764424,0.216518
2013-01-03,0.978068,1.750997
2013-01-04,-0.216042,2.036863


In [46]:
# A single row label plus a single column label returns one scalar.
df.loc[dates[0],'A']

1.3584686410474351

In [47]:
# .at: a faster alternative to .loc when fetching a single scalar by label.
df.at[dates[0],'A']

1.3584686410474351

## 좌표값으로 가져오기

In [50]:
# Select the row at integer position 2 (the third row) as a Series.
df.iloc[2]

A    0.978068
B    0.620083
C    1.750997
D   -1.770599
Name: 2013-01-03 00:00:00, dtype: float64

In [51]:
# Positional row slice: positions 3 and 4 (the end position is excluded).
df.iloc[3:5]  # same rows as df[3:5] / df['20130104':'20130105']

Unnamed: 0,A,B,C,D
2013-01-04,-0.216042,-0.495686,2.036863,0.189982
2013-01-05,0.021969,0.153268,0.160588,1.686423


In [54]:
# Positional slices on both axes: rows 3-4 and columns 0-1.
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-0.216042,-0.495686
2013-01-05,0.021969,0.153268


In [55]:
# Lists of integer positions pick arbitrary rows (1, 2, 4) and columns (0, 2).
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,-2.764424,0.216518
2013-01-03,0.978068,1.750997
2013-01-05,0.021969,0.160588


In [56]:
# Rows at positions 1-2 with every column (:).
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,-2.764424,1.71949,0.216518,-0.373867
2013-01-03,0.978068,0.620083,1.750997,-1.770599


In [57]:
# Row position 1 and column position 1 return one scalar.
df.iloc[1,1]

1.7194904672215248

In [58]:
# .iat: a faster alternative to .iloc when fetching a single scalar by position.
df.iat[1,1]

1.7194904672215248

## 조건에 해당하는 값 가져오기