## reindex of Time Series data

In [77]:
import pandas as pd
import numpy as np
import random

In [38]:
# Five consecutive daily timestamps starting 2018-01-03; used as the row index of df1.
date_idx = pd.date_range(start='2018-01-03', periods=5, freq='D')

In [39]:
date_idx

DatetimeIndex(['2018-01-03', '2018-01-04', '2018-01-05', '2018-01-06',
               '2018-01-07'],
              dtype='datetime64[ns]', freq='D')

In [40]:
# One integer column "a" holding 11..15 — five values, one per date in date_idx.
data = dict(a=np.arange(11, 16))

In [41]:
data

{'a': array([11, 12, 13, 14, 15])}

In [42]:
# Attach the dates as the row index so each value in "a" is keyed by its day.
df1 = pd.DataFrame(data=data, index=date_idx)

In [43]:
df1.head(2)

Unnamed: 0,a
2018-01-03,11
2018-01-04,12


위에서 만든 시계열 데이터 DataFrame 의 date 앞/뒤로 reindex 를 사용해서 날짜 몇 개를 새로 추가

In [55]:
# Ten daily dates from 2018-01-01: two days before df1 starts and three days after it ends.
date_idx2 = pd.date_range('2018-01-01', periods=10, freq='D')

In [56]:
# Forward fill: each new date takes the most recent earlier value, so the dates
# after 2018-01-07 repeat 15 while the two dates before 2018-01-03 stay NaN
# (nothing precedes them). The NaNs are why column "a" is shown as float.
df1.reindex(index=date_idx2, method="ffill")

Unnamed: 0,a
2018-01-01,
2018-01-02,
2018-01-03,11.0
2018-01-04,12.0
2018-01-05,13.0
2018-01-06,14.0
2018-01-07,15.0
2018-01-08,15.0
2018-01-09,15.0
2018-01-10,15.0


In [58]:
# Backward fill: each new date takes the next later value, so the two dates
# before 2018-01-03 become 11 while the dates after 2018-01-07 stay NaN
# (nothing follows them).
df1.reindex(index=date_idx2, method="bfill")

Unnamed: 0,a
2018-01-01,11.0
2018-01-02,11.0
2018-01-03,11.0
2018-01-04,12.0
2018-01-05,13.0
2018-01-06,14.0
2018-01-07,15.0
2018-01-08,
2018-01-09,
2018-01-10,


## resampling time series data

In [104]:
# 30 timestamps spaced 15 minutes apart (00:00 through 07:15 on 2017-01-01).
# NOTE: this rebinds `date_idx`, which the reindex section above also uses for
# df1; re-running that section's DataFrame cell after this one would pair its
# 5 values with this 30-entry index.
date_idx = pd.date_range(start='2017-01-01', periods=30, freq='15min')

In [105]:
date_idx

DatetimeIndex(['2017-01-01 00:00:00', '2017-01-01 00:15:00',
               '2017-01-01 00:30:00', '2017-01-01 00:45:00',
               '2017-01-01 01:00:00', '2017-01-01 01:15:00',
               '2017-01-01 01:30:00', '2017-01-01 01:45:00',
               '2017-01-01 02:00:00', '2017-01-01 02:15:00',
               '2017-01-01 02:30:00', '2017-01-01 02:45:00',
               '2017-01-01 03:00:00', '2017-01-01 03:15:00',
               '2017-01-01 03:30:00', '2017-01-01 03:45:00',
               '2017-01-01 04:00:00', '2017-01-01 04:15:00',
               '2017-01-01 04:30:00', '2017-01-01 04:45:00',
               '2017-01-01 05:00:00', '2017-01-01 05:15:00',
               '2017-01-01 05:30:00', '2017-01-01 05:45:00',
               '2017-01-01 06:00:00', '2017-01-01 06:15:00',
               '2017-01-01 06:30:00', '2017-01-01 06:45:00',
               '2017-01-01 07:00:00', '2017-01-01 07:15:00'],
              dtype='datetime64[ns]', freq='15T')

In [106]:
# Empty frame carrying only the 15-minute DatetimeIndex; columns are added in a later cell.
df = pd.DataFrame(index=date_idx)

In [107]:
df.head(3)

2017-01-01 00:00:00
2017-01-01 00:15:00
2017-01-01 00:30:00


In [139]:
# Seeded generator so a fresh "Restart & Run All" reproduces the same synthetic
# readings instead of drawing new random values on every run.
SEED = 42
rng = np.random.default_rng(SEED)

# Synthetic speed reading per 15-minute sample, integers in [0, 100).
df['speed'] = rng.integers(low=0, high=100, size=len(df.index))
# 0.25 is presumably the sample interval in hours (the index frequency is
# 15min), giving distance covered per sample — TODO confirm the intended units.
df['distance'] = df['speed'] * 0.25
# Running total of distance up to and including each sample.
df['cumulative_distance'] = df['distance'].cumsum()

In [140]:
df.head(3)

Unnamed: 0,speed,distance,cumulative distance,cumulative_distance
2017-01-01 00:00:00,79,19.75,14.25,19.75
2017-01-01 00:15:00,5,1.25,38.5,21.0
2017-01-01 00:30:00,46,11.5,52.25,32.5


주 단위로 resample 하여 speed 는 mean(), distance 는 sum(), cumulative_distance 는 last() 로 요약

In [141]:
# Aggregate every column to weekly bins in one pass, each with the statistic
# that suits it: mean speed, total distance, and the last cumulative reading.
# Building the frame directly replaces growing an empty DataFrame one column
# at a time. All 30 samples fall within 2017-01-01, so the result has a single
# weekly row.
weekly_summary = df.resample('W').agg(
    {
        'speed': 'mean',
        'distance': 'sum',
        'cumulative_distance': 'last',
    }
)

In [147]:
weekly_summary

Unnamed: 0,speed,distance,cumulative_distance
2017-01-01,48.733333,365.5,365.5


## cumprod & cumsum 차이

In [72]:
# Small 1-D example array for comparing the two cumulative operations below.
a = np.array([1, 2, 3, 4])

In [75]:
# Running product: element i is the product of a[0..i] -> 1, 1*2, 1*2*3, 1*2*3*4
a.cumprod()

array([ 1,  2,  6, 24], dtype=int32)

In [76]:
# Running sum: element i is the sum of a[0..i] -> 1, 1+2, 1+2+3, 1+2+3+4
a.cumsum()

array([ 1,  3,  6, 10], dtype=int32)