<a href="https://colab.research.google.com/github/jeawonlll/KOHI2022_tutorials/blob/main/20220903/%5Bopen%5D_00_basic_pandas_ipynb%EC%9D%98_%EC%82%AC%EB%B3%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preliminaries I : Pandas DataFrame
- Date : Sep. 03, 2022
- Author : **Hyun-Lim Yang, Ph.D.**<br>
Research Assistant Professor @
Seoul National University Hospital <br>
Department of Anesthesiology and Pain Medicine
- E-mail : hlyang{_at_}snu{_dot_}ac{_dot_}kr
***


## Import packages

In [None]:
import pandas as pd
import numpy as np

## Pandas Data structure
### DataFrame
> - R의 `data.frame` 데이터 타입과 유사
> - Tabular data type 을 지원하여 정형 데이터를 분석하기 쉬움
> - Columns / Row (data) / Index 로 구성


In [None]:
# Build a DataFrame from a dict: each key becomes a column label and each
# list supplies that column's values, one entry per row.
df_dic = pd.DataFrame(
    {
        "A": [1, 4, 7],
        "B": [2, 5, 8],
        "C": [3, 6, 9],
    }
)
df_dic

Unnamed: 0,A,B,C
0,1,2,3
1,4,5,6
2,7,8,9


In [None]:
# Declaring a DataFrame from a NumPy array: each inner list is one row.
# No column names are given, so the columns are labelled 0, 1, 2.
df_np = pd.DataFrame(
    np.array(
        [
            [1, 2, 3],
            [4, 5, 6],
            [7, 8, 9],
        ]
    )
)
df_np

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


## DataFrame indexing
### Index by columns
> - 컬럼 이름으로 바로 인덱싱 가능

In [None]:
# Select a single column by its label; the result is one-dimensional and
# carries the column name (shown as "Name: A" in the output).
df_dic['A']

0    1
1    4
2    7
Name: A, dtype: int64

In [None]:
# Pass a list of labels (note the double brackets) to select several columns
# at once; the result is a table containing only columns A and B.
df_dic[['A', 'B']]

Unnamed: 0,A,B
0,1,2
1,4,5
2,7,8


### Index by condition
> - 조건을 통해 row(data) 를 indexing 가능

In [None]:
# Boolean mask with one entry per row of df_dic.
row_index = np.array([True, False, True])
# Indexing with the mask keeps the rows marked True (rows 0 and 2 here).
df_dic[row_index]

Unnamed: 0,A,B,C
0,1,2,3
2,7,8,9


In [None]:
# Display the row index of df_dic (a RangeIndex, as the output shows).
df_dic.index
#df_dic[df_dic.index]

RangeIndex(start=0, stop=3, step=1)

In [None]:
# Derive the boolean mask from the index itself: keep rows whose index
# label is even (0 and 2).
df_dic[df_dic.index % 2 == 0]

Unnamed: 0,A,B,C
0,1,2,3
2,7,8,9


## DataFrame attributes
### Get index of DataFrame
> - `.index` : DataFrame의 index를 반환함

In [None]:
# The .index attribute holds the row labels (a RangeIndex for df_np).
df_np.index

RangeIndex(start=0, stop=3, step=1)

In [None]:
# Convert the index to a plain Python list of row labels.
list(df_np.index)

[0, 1, 2]

### Get value of DataFrame
> - `.values` : DataFrame의 값을 array 형태로 반환함

In [None]:
# The .values attribute exposes the table contents as a 2-D array,
# one inner row per DataFrame row.
df_np.values

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [None]:
# For a single column, .values yields a 1-D array of that column's entries.
df_dic['A'].values

array([1, 4, 7])

In [None]:
# Select columns A and C first, then take their contents as a 2-D array
# (one row per DataFrame row, one column per selected label).
df_dic[['A', 'C']].values

array([[1, 3],
       [4, 6],
       [7, 9]])

## DataFrame functions
### notnull()
> - 각 값이 `na` 혹은 `NaN`(결측값)인지 검사하여 boolean을 반환함 (결측값이 아니면 `True`, 결측값이면 `False`)

In [None]:
# Sample frame with missing values: A is missing in row 1, B is missing in
# row 2, and C is complete.
df_na = pd.DataFrame(
    {
        "A": [1, np.nan, 7],
        "B": [2, 5, np.nan],
        "C": [3, 6, 9],
    }
)
df_na

Unnamed: 0,A,B,C
0,1.0,2.0,3
1,,5.0,6
2,7.0,,9


In [None]:
# Element-wise check: True where a value is present, False where it is NaN.
df_na.notnull()

Unnamed: 0,A,B,C
0,True,True,True
1,False,True,True
2,True,False,True


In [None]:
# Indexing with the full boolean frame keeps the original shape: cells marked
# False stay NaN, so the displayed result looks the same as df_na. No rows
# are dropped by this form.
df_na[df_na.notnull()]

Unnamed: 0,A,B,C
0,1.0,2.0,3
1,,5.0,6
2,7.0,,9


In [None]:
# Combine two per-column masks with & to keep only the rows where both A and
# B are present; in this data only row 0 qualifies.
df_na[df_na['A'].notnull() & df_na['B'].notnull()]

Unnamed: 0,A,B,C
0,1.0,2.0,3


### fillna()
> - `na` 혹은 `NaN` 값 위치에 모두 특정 값을 대입함

In [None]:
# Sample frame for the fillna() examples: A is missing in row 1, B is
# missing in row 2, and C is complete.
df_fna = pd.DataFrame(
    {
        "A": [1, np.nan, 7],
        "B": [2, 5, np.nan],
        "C": [3, 6, 9],
    }
)
df_fna

Unnamed: 0,A,B,C
0,1.0,2.0,3
1,,5.0,6
2,7.0,,9


In [None]:
# Replace every NaN with 0. The result is only displayed, not assigned,
# so the name df_fna is not rebound by this cell.
df_fna.fillna(0)

Unnamed: 0,A,B,C
0,1.0,2.0,3
1,0.0,5.0,6
2,7.0,0.0,9


In [None]:
# The fill value does not have to be numeric: here every NaN cell is
# replaced by the string 'not available'.
df_fna.fillna('not available')

Unnamed: 0,A,B,C
0,1.0,2.0,3
1,not available,5.0,6
2,7.0,not available,9
