In [1]:
import numpy as np
import pandas as pd


# Location of the raw COVID infection CSV
DATA_PATH = './COVID_infection.csv'

# Load and prepare the raw data in one chain:
#   1. read the CSV, using its first column as the index
#   2. drop the columns that are not needed
#   3. parse the `stateDt` values (format %Y%m%d) into datetimes
#   4. rename the count columns with an `acc` prefix and `stateDt` to `date`
#   5. sort by date in ascending order and renumber the index from 0
df = (
    pd.read_csv(DATA_PATH, index_col=0)
    .drop(columns=['seq', 'createDt', 'stateTime', 'updateDt'])
    .assign(stateDt=lambda raw: pd.to_datetime(raw['stateDt'], format='%Y%m%d'))
    .rename(columns={
        'deathCnt': 'accDeathCnt',
        'decideCnt': 'accDecideCnt',
        'clearCnt': 'accClearCnt',
        'stateDt': 'date',
    })
    .sort_values(by='date')
    .reset_index(drop=True)
)

df

Unnamed: 0,accDefRate,accExamCnt,accExamCompCnt,careCnt,accClearCnt,accDeathCnt,accDecideCnt,examCnt,resutlNegCnt,date
0,25.000000,4,4,1,0,0,1,0,3,2020-01-20
1,6.666667,15,15,1,0,0,1,0,14,2020-01-21
2,4.761905,21,21,1,0,0,1,0,20,2020-01-22
3,4.000000,25,25,1,0,0,1,0,24,2020-01-23
4,6.896552,29,29,2,0,0,2,0,27,2020-01-24
...,...,...,...,...,...,...,...,...,...,...
646,2.508290,15678186,14289056,24750,330853,2808,358411,1389130,13930645,2021-10-28
647,2.508577,15730785,14372093,24723,332995,2817,360535,1358692,14011558,2021-10-29
648,2.516707,15775700,14409266,25228,334581,2830,362639,1366434,14046627,2021-10-30
649,2.529634,15804065,14417103,25303,336548,2849,364700,1386962,14052403,2021-10-31


In [2]:
# Add daily confirmed / death / recovered counts derived from the cumulative columns.
# Each daily column maps to the cumulative (`acc*`) column it is computed from.
daily_from_acc = {
    'decideCnt': 'accDecideCnt',
    'deathCnt': 'accDeathCnt',
    'clearCnt': 'accClearCnt',
}

# Daily value = cumulative value minus the previous row's cumulative value.
# shift(fill_value=0) makes the first row equal to its own cumulative value.
# This is vectorized and positional, so it does not depend on the index labels
# being exactly 0..n-1 (the row-by-row df.loc[i, ...] assignment did).
for daily_col, acc_col in daily_from_acc.items():
    acc = df[acc_col]
    df[daily_col] = (acc - acc.shift(fill_value=0)).astype(int)

# Reorder columns: date, daily figures, then cumulative figures
df = df[['date', 'decideCnt', 'deathCnt', 'careCnt', 'clearCnt', 'examCnt', 'resutlNegCnt',
         'accDecideCnt', 'accDeathCnt', 'accClearCnt', 'accExamCnt', 'accExamCompCnt', 'accDefRate']]
df

Unnamed: 0,date,decideCnt,deathCnt,careCnt,clearCnt,examCnt,resutlNegCnt,accDecideCnt,accDeathCnt,accClearCnt,accExamCnt,accExamCompCnt,accDefRate
0,2020-01-20,1,0,1,0,0,3,1,0,0,4,4,25.000000
1,2020-01-21,0,0,1,0,0,14,1,0,0,15,15,6.666667
2,2020-01-22,0,0,1,0,0,20,1,0,0,21,21,4.761905
3,2020-01-23,0,0,1,0,0,24,1,0,0,25,25,4.000000
4,2020-01-24,1,0,2,0,0,27,2,0,0,29,29,6.896552
...,...,...,...,...,...,...,...,...,...,...,...,...,...
646,2021-10-28,2111,11,24750,1195,1389130,13930645,358411,2808,330853,15678186,14289056,2.508290
647,2021-10-29,2124,9,24723,2142,1358692,14011558,360535,2817,332995,15730785,14372093,2.508577
648,2021-10-30,2104,13,25228,1586,1366434,14046627,362639,2830,334581,15775700,14409266,2.516707
649,2021-10-31,2061,19,25303,1967,1386962,14052403,364700,2849,336548,15804065,14417103,2.529634


In [3]:
# Persist the processed frame to disk (the index is written as the first CSV column)
df.to_csv(path_or_buf='COVID_infection_prc.csv')

In [4]:
# Inspect summary statistics of the data
# NOTE(review): the saved output of this cell reports count 651, while the frames
# displayed by the earlier cells end at index 649 — the stored outputs look stale;
# confirm with Restart Kernel -> Run All.
df.describe()

Unnamed: 0,decideCnt,deathCnt,careCnt,clearCnt,examCnt,resutlNegCnt,accDecideCnt,accDeathCnt,accClearCnt,accExamCnt,accExamCompCnt,accDefRate
count,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0,651.0
mean,562.803379,4.390169,8677.371736,518.207373,190441.2,5157950.0,87799.399386,1068.284178,78053.743472,5436190.0,5245749.0,1.683283
std,630.553148,5.299477,9056.226134,624.90959,335670.5,4462807.0,96027.162951,876.009163,87091.728375,4812631.0,4554836.0,1.239769
min,0.0,0.0,1.0,0.0,0.0,3.0,1.0,0.0,0.0,4.0,4.0,0.378833
25%,62.0,1.0,1554.0,65.5,20309.5,1258066.0,12877.0,282.0,11646.0,1290596.0,1270942.0,1.071951
50%,390.0,3.0,6565.0,331.0,66267.0,3192745.0,40778.0,572.0,31147.0,3311211.0,3233523.0,1.418678
75%,700.5,6.0,9627.5,688.0,136896.5,9273100.0,135634.5,1928.5,125427.5,9528632.0,9408735.0,1.865157
max,3270.0,40.0,36738.0,3822.0,1424100.0,14058820.0,366385.0,2858.0,337353.0,15849300.0,14425200.0,25.0
