# 시계열 데이터 전처리 방법
본문예제11-01

In [2]:
# 라이브러리 로드
import pandas as pd
import numpy as np
import pendulum

In [39]:
# Build the sample frame: one string date and one name per row.
sample_rows = [
    ('2019-01-03', 'J'),
    ('2021-11-22', 'Y'),
    ('2023-01-05', 'O'),
]
df_example01 = pd.DataFrame(sample_rows, columns=['date', 'name'])
# Print the column summary, then show the frame itself as the cell result.
df_example01.info()
df_example01

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    3 non-null      object
 1   name    3 non-null      object
dtypes: object(2)
memory usage: 176.0+ bytes


Unnamed: 0,date,name
0,2019-01-03,J
1,2021-11-22,Y
2,2023-01-05,O


본문예제11-02

In [50]:
# Parse the string dates twice: once with pandas, once with pendulum.
df_example01['date_pandas'] = pd.to_datetime(df_example01['date'], format='%Y-%m-%d')


def _parse_with_pendulum(row):
    """Return the row's 'date' string parsed by pendulum.parse."""
    return pendulum.parse(row['date'])


# Row-wise apply: each row's 'date' string is handed to pendulum.
df_example01['date_pendulum'] = df_example01.apply(_parse_with_pendulum, axis=1)
# The cell result is a tuple: info() prints its summary and returns None.
df_example01.info(), df_example01

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype                          
---  ------         --------------  -----                          
 0   date           3 non-null      object                         
 1   name           3 non-null      object                         
 2   date_pandas    3 non-null      datetime64[ns]                 
 3   date_pendulum  3 non-null      datetime64[ns, Timezone('UTC')]
dtypes: datetime64[ns, Timezone('UTC')](1), datetime64[ns](1), object(2)
memory usage: 224.0+ bytes


(None,
          date name date_pandas             date_pendulum
 0  2019-01-03    J  2019-01-03 2019-01-03 00:00:00+00:00
 1  2021-11-22    Y  2021-11-22 2021-11-22 00:00:00+00:00
 2  2023-01-05    O  2023-01-05 2023-01-05 00:00:00+00:00)

본문예제11-03

In [5]:
# Use the date as the index.
# The raw 'date' column holds strings, so it is converted to datetime first;
# indexing on the strings would give a plain object Index, not a DatetimeIndex.
# Only 'date' and 'name' are selected so the result does not depend on any
# helper columns that other cells may have added to df_example01.
df_example01_01 = (
    df_example01[['date', 'name']]
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m-%d'))
    .set_index('date')
)
df_example01_01.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3 entries, 2019-01-03 to 2023-01-05
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    3 non-null      object
dtypes: object(1)
memory usage: 48.0+ bytes


본문예제11-04

In [6]:
# Build a frame that contains one missing value, then count the missing values.
df_example02 = pd.DataFrame({
    'date': ['2019-01-03', '2021-11-22', '2021-12-01', '2023-01-05'],
    'x1': [0.1, 2.0, np.nan, 1.2],
})
# Convert the string dates to datetime before using them as the index.
df_example02['date'] = pd.to_datetime(df_example02['date'], format='%Y-%m-%d')
df_example02_01 = df_example02.set_index('date')
# Cell result: (missing-value count per column, the indexed frame).
df_example02_01.isna().sum(), df_example02_01

(x1    1
 dtype: int64,
              x1
 date           
 2019-01-03  0.1
 2021-11-22  2.0
 2021-12-01  NaN
 2023-01-05  1.2)

본문예제11-05

In [7]:
# Fill each missing value with the most recent preceding observation (forward fill).
# DataFrame.ffill() is used because fillna(method='ffill') is deprecated in
# newer pandas releases; the result is the same.
df_example02_ffill = df_example02_01.ffill()
print('=='*10)
# Show the original, indexed and forward-filled frames side by side.
df_example02, df_example02_01, df_example02_ffill



(        date   x1
 0 2019-01-03  0.1
 1 2021-11-22  2.0
 2 2021-12-01  NaN
 3 2023-01-05  1.2,
              x1
 date           
 2019-01-03  0.1
 2021-11-22  2.0
 2021-12-01  NaN
 2023-01-05  1.2,
              x1
 date           
 2019-01-03  0.1
 2021-11-22  2.0
 2021-12-01  2.0
 2023-01-05  1.2)

본문예제11-06

In [8]:
# Remove every row that contains a missing value (the defaults, spelled out).
df_example02_drop = df_example02_01.dropna(axis=0, how='any')
# Show all variants built so far for comparison.
(df_example02, df_example02_01, df_example02_ffill, df_example02_drop)

(        date   x1
 0 2019-01-03  0.1
 1 2021-11-22  2.0
 2 2021-12-01  NaN
 3 2023-01-05  1.2,
              x1
 date           
 2019-01-03  0.1
 2021-11-22  2.0
 2021-12-01  NaN
 2023-01-05  1.2,
              x1
 date           
 2019-01-03  0.1
 2021-11-22  2.0
 2021-12-01  2.0
 2023-01-05  1.2,
              x1
 date           
 2019-01-03  0.1
 2021-11-22  2.0
 2023-01-05  1.2)

본문예제11-07

In [9]:
# Fill the gap by linear interpolation (the default method, spelled out here).
# Linear interpolation treats rows as equally spaced, so a single missing value
# becomes the mean of the values immediately before and after it.
df_example02_interpolate = df_example02_01.interpolate(method='linear')
# Show all variants built so far for comparison.
(
    df_example02,
    df_example02_01,
    df_example02_ffill,
    df_example02_drop,
    df_example02_interpolate,
)

(        date   x1
 0 2019-01-03  0.1
 1 2021-11-22  2.0
 2 2021-12-01  NaN
 3 2023-01-05  1.2,
              x1
 date           
 2019-01-03  0.1
 2021-11-22  2.0
 2021-12-01  NaN
 2023-01-05  1.2,
              x1
 date           
 2019-01-03  0.1
 2021-11-22  2.0
 2021-12-01  2.0
 2023-01-05  1.2,
              x1
 date           
 2019-01-03  0.1
 2021-11-22  2.0
 2023-01-05  1.2,
              x1
 date           
 2019-01-03  0.1
 2021-11-22  2.0
 2021-12-01  1.6
 2023-01-05  1.2)

본문예제11-08

In [14]:
# Inspect the sampling frequency through the index attribute.
observation_dates = ['2019-01-03', '2021-11-22', '2021-12-01', '2023-01-05']
df_example03 = pd.DataFrame({'date': observation_dates, 'x1': [0.1, 2.0, 1.6, 1.2]})
df_example03['date'] = pd.to_datetime(df_example03['date'], format='%Y-%m-%d')
df_example03_01 = df_example03.set_index('date')
# Print the index so its dtype, name and freq are visible.
print(df_example03_01.index)
df_example03_01

DatetimeIndex(['2019-01-03', '2021-11-22', '2021-12-01', '2023-01-05'], dtype='datetime64[ns]', name='date', freq=None)


Unnamed: 0_level_0,x1
date,Unnamed: 1_level_1
2019-01-03,0.1
2021-11-22,2.0
2021-12-01,1.6
2023-01-05,1.2


본문예제11-09

In [12]:
# Re-index the series onto a year-end grid (the last day of every year).
# method='ffill' makes each year-end row take the most recent earlier observation.
# pd.offsets.YearEnd() is the offset behind the 'Y' string alias; the object is
# used because newer pandas releases deprecate that alias.
df_example03_02 = df_example03_01.asfreq(pd.offsets.YearEnd(), method='ffill')
print(df_example03_02)  # data generated at the requested frequency


             x1
date           
2019-12-31  0.1
2020-12-31  0.1
2021-12-31  1.6
2022-12-31  1.6


본문예제11-10

In [28]:
# Window-based operations (moving average and moving sum) on a weekly series.
df_example04 = pd.DataFrame({
    'date': ['2021-01-06', '2021-01-13', '2021-01-20', '2021-01-27', '2021-02-03'],
    'x1': [5, 4, 3, 2, 7],
})
df_example04['date'] = pd.to_datetime(df_example04['date'])
df_example04_01 = df_example04.set_index('date')

# Rolling views over the last 2 and the last 3 observations.
roll_two = df_example04_01.rolling(window=2)
roll_three = df_example04_01.rolling(window=3)

# Original values next to the 2-wide and 3-wide moving averages.
df_rolling_mean = pd.concat([df_example04_01, roll_two.mean(), roll_three.mean()], axis='columns')
df_rolling_mean.columns = ['original', 'window_1', 'window_2']

# Original values next to the 2-wide and 3-wide moving sums.
df_rolling_sum = pd.concat([df_example04_01, roll_two.sum(), roll_three.sum()], axis='columns')
df_rolling_sum.columns = ['original', 'window_1', 'window_2']


In [29]:
# Show the moving-average frame and the moving-sum frame together.
(df_rolling_mean, df_rolling_sum)

(            original  window_1  window_2
 date                                    
 2021-01-06         5       NaN       NaN
 2021-01-13         4       4.5       NaN
 2021-01-20         3       3.5       4.0
 2021-01-27         2       2.5       3.0
 2021-02-03         7       4.5       4.0,
             original  window_1  window_2
 date                                    
 2021-01-06         5       NaN       NaN
 2021-01-13         4       9.0       NaN
 2021-01-20         3       7.0      12.0
 2021-01-27         2       5.0       9.0
 2021-02-03         7       9.0      12.0)

본문예제11-11

In [27]:
# Differencing: attach the row-to-row change of every column to the frame.
df_example05 = pd.DataFrame({'date':['2021-01-06', '2021-01-13', '2021-01-20', '2021-01-27', '2021-02-03'],
                             'x1':[5, 4, 3, 2, 7],
                             'x2':[3, 4, 5, 1, 9]})
df_example05['date'] = pd.to_datetime(df_example05.date)
df_example05_01 = df_example05.set_index(keys=['date'])


# diff() computes (current row - previous row); the first row has no
# predecessor and is therefore NaN.
# add_suffix derives the new names from the source columns ('x1' -> 'x1_diff'),
# so the labels cannot drift out of sync with the data.
df_example05_01_diff = df_example05_01.diff().add_suffix('_diff')
# Place the original columns and their differences side by side.
df_example05_02 = pd.concat([df_example05_01, df_example05_01_diff], axis=1)
df_example05_02

Unnamed: 0_level_0,x1,x2,x1_diff,x2_diff
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-06,5,3,,
2021-01-13,4,4,-1.0,1.0
2021-01-20,3,5,-1.0,1.0
2021-01-27,2,1,-1.0,-4.0
2021-02-03,7,9,5.0,8.0


본문예제11-12

In [32]:
# Lag feature: shift x1 down by two rows, then back-fill the gap that opens at the top.
df_example06 = pd.DataFrame({'date':['2021-01-06', '2021-01-13', '2021-01-20', '2021-01-27', '2021-02-03'],
                             'x1':[5, 4, 3, 2, 7],
                             'x2':[3, 4, 5, 1, 9]})
df_example06['date'] = pd.to_datetime(df_example06.date)
df_example06_01 = df_example06.set_index(keys=['date'])
# shift(2) moves each value two rows later; the first two rows become NaN.
df_example06_01['shift'] = df_example06_01['x1'].shift(2)
# Back-fill: each NaN takes the next valid value below it.
# DataFrame.bfill() is used because fillna(method='bfill') is deprecated in
# newer pandas releases; the result is the same.
df_example06_02 = df_example06_01.bfill()
df_example06_01, df_example06_02

(            x1  x2  shift
 date                     
 2021-01-06   5   3    NaN
 2021-01-13   4   4    NaN
 2021-01-20   3   5    5.0
 2021-01-27   2   1    4.0
 2021-02-03   7   9    3.0,
             x1  x2  shift
 date                     
 2021-01-06   5   3    5.0
 2021-01-13   4   4    5.0
 2021-01-20   3   5    5.0
 2021-01-27   2   1    4.0
 2021-02-03   7   9    3.0)

본문예제11-13

In [33]:
# One-hot encoding: turn the categorical '과목' column into 0/1 indicator columns.
df_example07 = pd.DataFrame({'date':['2021-01-06', '2021-01-13', '2021-01-20', '2021-01-27', '2021-02-03'],
                              'x1':[5, 4, 3, 2, 7],
                              '과목':['a', 'b', 'c', 'd', 'e']})
df_example07['date'] = pd.to_datetime(df_example07.date)
df_example07_01 = df_example07.set_index(keys=['date'])
# dtype=int pins the indicators to 0/1 integers; without it, pandas 2.x and later
# return True/False columns instead.
df_example07_dummy = pd.get_dummies(df_example07_01['과목'], dtype=int)
# Attach the indicator columns to the original frame.
df_example07_02 = pd.concat([df_example07_01,df_example07_dummy], axis=1)
df_example07_02

Unnamed: 0_level_0,x1,과목,a,b,c,d,e
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-06,5,a,1,0,0,0,0
2021-01-13,4,b,0,1,0,0,0
2021-01-20,3,c,0,0,1,0,0
2021-01-27,2,d,0,0,0,1,0
2021-02-03,7,e,0,0,0,0,1
