# Time Series의 Feature Engineering

In [17]:
import pandas as pd
import numpy as np
from datetime import datetime

# Build a small daily birth-count frame (2021-12-01 .. 2021-12-05) in one constructor call.
df = pd.DataFrame({
    'date': [datetime(2021, 12, day) for day in range(1, 6)],
    'birth': [35, 32, 30, 31, 44],
})
df

Unnamed: 0,date,birth
0,2021-12-01,35
1,2021-12-02,32
2,2021-12-03,30
3,2021-12-04,31
4,2021-12-05,44


## date time feature  

- 각 observation 자체의 time step 정보

In [18]:
# Derive calendar components from each observation's own timestamp.
# assign() returns a new frame, so `df` itself is left untouched.
date_parts = df['date'].dt
features = df.assign(
    year=date_parts.year,
    month=date_parts.month,
    day=date_parts.day,
)
features

Unnamed: 0,date,birth,year,month,day
0,2021-12-01,35,2021,12,1
1,2021-12-02,32,2021,12,2
2,2021-12-03,30,2021,12,3
3,2021-12-04,31,2021,12,4
4,2021-12-05,44,2021,12,5


## Lag features (지연 특성)

- 이전 time step의 values

In [19]:
# Lag features: earlier observations of `birth` aligned to the current row.
# NOTE(review): 'lag2' is built from a 3-step shift, not 2 — confirm whether
# the column name or the shift distance is the intended one.
for column, steps in {'lag1': 1, 'lag2': 3}.items():
    features[column] = df['birth'].shift(steps)
features

Unnamed: 0,date,birth,year,month,day,lag1,lag2
0,2021-12-01,35,2021,12,1,,
1,2021-12-02,32,2021,12,2,35.0,
2,2021-12-03,30,2021,12,3,32.0,
3,2021-12-04,31,2021,12,4,30.0,35.0
4,2021-12-05,44,2021,12,5,31.0,32.0


## Window features

### Rolling Window vs. Expanding Window

rolling window에서는 창 크기가 일정하게 유지되는 반면 expanding window에서는 변경됩니다.  

예) 100일 간의 데이터가 있다고 가정  

rolling window : 창 크기가 10이라고 가정할 경우, 첫 번째 예측은 (이전) 10일의 데이터를 사용하여 11일의 데이터를 예측.  다음 예측의 경우 2일째(데이터 포인트)에서 11일째의 데이터를 사용.

expanding window : 첫 번째 예측의 경우 10일 간의 데이터를 사용. 그러나 두 번째 예측의 경우 10 + 1일의 데이터를 사용. 

In [20]:
# Fixed-size (rolling) window statistics over `birth`.
births = df['birth']
pair_window = births.rolling(window=2)
triple_window = births.rolling(window=3)
features['Roll_mean'] = pair_window.mean()   # mean over a window of 2 rows
features['Roll_max'] = triple_window.max()   # max over a window of 3 rows
features

Unnamed: 0,date,birth,year,month,day,lag1,lag2,Roll_mean,Roll_max
0,2021-12-01,35,2021,12,1,,,,
1,2021-12-02,32,2021,12,2,35.0,,33.5,
2,2021-12-03,30,2021,12,3,32.0,,31.0,35.0
3,2021-12-04,31,2021,12,4,30.0,35.0,30.5,32.0
4,2021-12-05,44,2021,12,5,31.0,32.0,37.5,44.0


In [24]:
# Show the Expanding window object itself.
# min_periods: minimum number of observations required before an aggregate is produced.
df['birth'].expanding(min_periods=1)

Expanding [min_periods=1,center=False,axis=0,method=single]

In [25]:
# Expanding window: running maximum over every observation up to the current date.
running_max = df['birth'].expanding().max()
features['Expand_max'] = running_max
features

Unnamed: 0,date,birth,year,month,day,lag1,lag2,Roll_mean,Roll_max,Expand_max
0,2021-12-01,35,2021,12,1,,,,,35.0
1,2021-12-02,32,2021,12,2,35.0,,33.5,,35.0
2,2021-12-03,30,2021,12,3,32.0,,31.0,35.0,35.0
3,2021-12-04,31,2021,12,4,30.0,35.0,30.5,32.0,35.0
4,2021-12-05,44,2021,12,5,31.0,32.0,37.5,44.0,44.0


## Downsampling and Upsampling

time series data를 분석할 때, 데이터가 불규칙적으로 수집되어 주기가 일정하지 않은 경우가 많으므로 주기를 일정하게 변경해야 할 필요가 있다. 불규칙적인 time series data의 주기를 일정하게 변경하는 방법에는 upsampling과 downsampling 두 가지가 있다.  

(1) 다운샘플링 : 데이터의 빈도를 줄이는 것   

    - 원본 데이터의 시간 단위가 실용적이지 않은 경우  
    - 특정 주기에 집중하는 경우  
    - 더 낮은 빈도의 데이터에 맞추는 경우  
    
수집된 데이터의 시간 단위가 다루기에 적당하지 않은 경우 초 -> 분, 분 -> 시 등 더 큰 시간 단위로 resampling 한다고 생각하면 된다. 

(2) 업샘플링 : 실제 데이터를 바탕으로 더 조밀한 시간의 데이터를 얻기 위해 데이터를 생성하는 것

    - 시계열이 불규칙적인 상황
    - 입력이 서로 다른 빈도로 샘플링 된 상황

업샘플링은 다운샘플링과 반대로 분 -> 초, 5초 -> 1초와 같이 더 큰 시간 단위의 데이터에서 더 조밀한 시간 단위의 데이터를 얻는 것이다.

In [28]:
np.random.seed(0)   # fixed seed so the random values are reproducible
# Ten timestamps at one-minute spacing. 'min' is the supported alias for
# minutely frequency; the older 'T' spelling is deprecated in recent pandas.
rng = pd.date_range('2015-02-24', periods=10, freq='min')
df = pd.DataFrame({'Val': np.random.randn(len(rng))}, index=rng)
df

Unnamed: 0,Val
2015-02-24 00:00:00,1.764052
2015-02-24 00:01:00,0.400157
2015-02-24 00:02:00,0.978738
2015-02-24 00:03:00,2.240893
2015-02-24 00:04:00,1.867558
2015-02-24 00:05:00,-0.977278
2015-02-24 00:06:00,0.950088
2015-02-24 00:07:00,-0.151357
2015-02-24 00:08:00,-0.103219
2015-02-24 00:09:00,0.410599


### Downsampling

pd.date_range(start, end, periods, freq)  
[frequency alias](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases) 참조

In [29]:
# Create a sample series: the integers 0..8 at one-minute spacing.
# 'min' replaces the deprecated 'T' alias for minutely frequency.
index = pd.date_range('2000-1-1', periods=9, freq='min')
series = pd.Series(range(9), index=index)
series

2000-01-01 00:00:00    0
2000-01-01 00:01:00    1
2000-01-01 00:02:00    2
2000-01-01 00:03:00    3
2000-01-01 00:04:00    4
2000-01-01 00:05:00    5
2000-01-01 00:06:00    6
2000-01-01 00:07:00    7
2000-01-01 00:08:00    8
Freq: T, dtype: int64

In [30]:
# Downsample to 3-minute bins and sum the values that fall in each bin.
# '3min' replaces the deprecated '3T' alias.
series.resample('3min').sum()

2000-01-01 00:00:00     3
2000-01-01 00:03:00    12
2000-01-01 00:06:00    21
Freq: 3T, dtype: int64

### Upsampling

In [32]:
# Create a sample frame: three rows at 5-second spacing.
# '5s' replaces the deprecated uppercase '5S' alias.
rng = pd.date_range('2019-12-31', periods=3, freq='5s')
ts = pd.DataFrame(np.array([0, 1, 3, 2, 10, 3]).reshape(3, 2),
                  index=rng,
                  columns=['col_1', 'col_2'])
ts

Unnamed: 0,col_1,col_2
2019-12-31 00:00:00,0,1
2019-12-31 00:00:05,3,2
2019-12-31 00:00:10,10,3


In [33]:
# Upsample to 1-second spacing; timestamps with no source row come out as NaN.
# 's' replaces the deprecated uppercase 'S' alias.
ts_upsample = ts.resample('s').mean()
ts_upsample

Unnamed: 0,col_1,col_2
2019-12-31 00:00:00,0.0,1.0
2019-12-31 00:00:01,,
2019-12-31 00:00:02,,
2019-12-31 00:00:03,,
2019-12-31 00:00:04,,
2019-12-31 00:00:05,3.0,2.0
2019-12-31 00:00:06,,
2019-12-31 00:00:07,,
2019-12-31 00:00:08,,
2019-12-31 00:00:09,,


In [34]:
# Forward fill: propagate the last observed value into the gaps that follow it.
# A single ffill() is enough; fillna(method='ffill') is the deprecated spelling
# of the same operation and only the last expression of a cell is displayed.
ts_upsample.ffill()

Unnamed: 0,col_1,col_2
2019-12-31 00:00:00,0.0,1.0
2019-12-31 00:00:01,0.0,1.0
2019-12-31 00:00:02,0.0,1.0
2019-12-31 00:00:03,0.0,1.0
2019-12-31 00:00:04,0.0,1.0
2019-12-31 00:00:05,3.0,2.0
2019-12-31 00:00:06,3.0,2.0
2019-12-31 00:00:07,3.0,2.0
2019-12-31 00:00:08,3.0,2.0
2019-12-31 00:00:09,3.0,2.0


In [35]:
# Backward fill: fill each gap with the next observed value.
# bfill() replaces the deprecated fillna(method='bfill').
ts_upsample.bfill()

Unnamed: 0,col_1,col_2
2019-12-31 00:00:00,0.0,1.0
2019-12-31 00:00:01,3.0,2.0
2019-12-31 00:00:02,3.0,2.0
2019-12-31 00:00:03,3.0,2.0
2019-12-31 00:00:04,3.0,2.0
2019-12-31 00:00:05,3.0,2.0
2019-12-31 00:00:06,10.0,3.0
2019-12-31 00:00:07,10.0,3.0
2019-12-31 00:00:08,10.0,3.0
2019-12-31 00:00:09,10.0,3.0


[선형보간법](https://ko.wikipedia.org/wiki/%EC%84%A0%ED%98%95_%EB%B3%B4%EA%B0%84%EB%B2%95)

In [14]:
# Linear interpolation of the gaps; method='values' interpolates against the
# numeric values of the index.
ts_linear = ts_upsample.interpolate(method='values')
ts_linear

Unnamed: 0,col_1,col_2
2019-12-31 00:00:00,0.0,1.0
2019-12-31 00:00:01,0.6,1.2
2019-12-31 00:00:02,1.2,1.4
2019-12-31 00:00:03,1.8,1.6
2019-12-31 00:00:04,2.4,1.8
2019-12-31 00:00:05,3.0,2.0
2019-12-31 00:00:06,4.4,2.2
2019-12-31 00:00:07,5.8,2.4
2019-12-31 00:00:08,7.2,2.6
2019-12-31 00:00:09,8.6,2.8
