In [1]:
import pandas as pd
import numpy as np

### `Datetima` 인덱스
시계열 객체인 `Series`는 시계열 데이터를 관리하는 객체이기 때문에 데이터가 시간에 의해 관리되어져야 함  
지금까지는 단순한 인덱스로 사용하였는데 이를 시계열로 표현하기 위해서 `DatetimeIndex`를 사용해야 함  
`DatetimeIndex`는 시계열 인덱스를 사용하기 위한 인덱스이며 `판다스.to_datetime`, `판다스.date_range`메서드를 사용하여 생성할 수 있음

`to_datetime()` 메서드는 날짜 및 시간을 나타내는 문자열을 Datetime 자료형으로 변경한 후 `DateTimeIndex`를 생성함  

In [2]:
date_str = ['2018, 1, 1', '2018, 1, 4', '2018, 1, 5', '2018, 1, 6']
idx = pd.to_datetime(date_str)
idx

DatetimeIndex(['2018-01-01', '2018-01-04', '2018-01-05', '2018-01-06'], dtype='datetime64[ns]', freq=None)

In [3]:
# 왜 안되노??
# date_str = ['2018, 1, 1', '2018, 1, 4', '2018, 1, 5', ' 2018, 1, 6']
# idx = pd.to_datetime(date_str)
# idx

In [4]:
# 인덱스 타입: datetime 타입
s = pd.Series(np.random.randn(4), index=idx)
s

2018-01-01    2.352441
2018-01-04   -1.504528
2018-01-05   -0.083433
2018-01-06    1.712149
dtype: float64

In [5]:
# 인덱스 타입: string 타입
s = pd.Series(np.random.randn(4), index=date_str)
s

2018, 1, 1    0.500933
2018, 1, 4    0.666422
2018, 1, 5    1.041778
2018, 1, 6    2.625952
dtype: float64

`date_range` 메서드는 날짜 및 시간에 대해서 시작일과 종료일 / 시작일과 기간을 입력하여 범위 내의 `DatetimeIndex`를 생성

In [6]:
pd.date_range('2024-01-01', '2024-03-31')

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06', '2024-01-07', '2024-01-08',
               '2024-01-09', '2024-01-10', '2024-01-11', '2024-01-12',
               '2024-01-13', '2024-01-14', '2024-01-15', '2024-01-16',
               '2024-01-17', '2024-01-18', '2024-01-19', '2024-01-20',
               '2024-01-21', '2024-01-22', '2024-01-23', '2024-01-24',
               '2024-01-25', '2024-01-26', '2024-01-27', '2024-01-28',
               '2024-01-29', '2024-01-30', '2024-01-31', '2024-02-01',
               '2024-02-02', '2024-02-03', '2024-02-04', '2024-02-05',
               '2024-02-06', '2024-02-07', '2024-02-08', '2024-02-09',
               '2024-02-10', '2024-02-11', '2024-02-12', '2024-02-13',
               '2024-02-14', '2024-02-15', '2024-02-16', '2024-02-17',
               '2024-02-18', '2024-02-19', '2024-02-20', '2024-02-21',
               '2024-02-22', '2024-02-23', '2024-02-24', '2024-02-25',
      

In [7]:
pd.date_range('2024-01-01', periods=30)

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06', '2024-01-07', '2024-01-08',
               '2024-01-09', '2024-01-10', '2024-01-11', '2024-01-12',
               '2024-01-13', '2024-01-14', '2024-01-15', '2024-01-16',
               '2024-01-17', '2024-01-18', '2024-01-19', '2024-01-20',
               '2024-01-21', '2024-01-22', '2024-01-23', '2024-01-24',
               '2024-01-25', '2024-01-26', '2024-01-27', '2024-01-28',
               '2024-01-29', '2024-01-30'],
              dtype='datetime64[ns]', freq='D')

`freq` 인수로 특정한 날짜만 생성되도록 할 수 있음  
종류가 훨신 많음

In [8]:
# s:초
pd.date_range('2024-02-29 09:48:00', '2024-02-29 09:48:10', freq='s')

DatetimeIndex(['2024-02-29 09:48:00', '2024-02-29 09:48:01',
               '2024-02-29 09:48:02', '2024-02-29 09:48:03',
               '2024-02-29 09:48:04', '2024-02-29 09:48:05',
               '2024-02-29 09:48:06', '2024-02-29 09:48:07',
               '2024-02-29 09:48:08', '2024-02-29 09:48:09',
               '2024-02-29 09:48:10'],
              dtype='datetime64[ns]', freq='s')

In [9]:
# B: 평일
pd.date_range('2024-02-01',  '2024-02-29', freq='B')

DatetimeIndex(['2024-02-01', '2024-02-02', '2024-02-05', '2024-02-06',
               '2024-02-07', '2024-02-08', '2024-02-09', '2024-02-12',
               '2024-02-13', '2024-02-14', '2024-02-15', '2024-02-16',
               '2024-02-19', '2024-02-20', '2024-02-21', '2024-02-22',
               '2024-02-23', '2024-02-26', '2024-02-27', '2024-02-28',
               '2024-02-29'],
              dtype='datetime64[ns]', freq='B')

In [10]:
# Q-DEC: 각 분기별 마지막 날
pd.date_range('2024-02-01', '2024-12-31', freq='QE-DEC')

DatetimeIndex(['2024-03-31', '2024-06-30', '2024-09-30', '2024-12-31'], dtype='datetime64[ns]', freq='QE-DEC')

### `shift` 연산
`shift`메서드는 인덱스는 그대로 두고 데이터만 이동
`freq`인수를 지정하면 인덱슬 변경시킴

In [11]:
ts = pd.Series(np.random.randn(4), index=pd.date_range('2024-02-01', periods=4, freq='M'))
ts

  ts = pd.Series(np.random.randn(4), index=pd.date_range('2024-02-01', periods=4, freq='M'))


2024-02-29    1.554832
2024-03-31   -1.883745
2024-04-30   -1.178675
2024-05-31   -2.206498
Freq: ME, dtype: float64

In [12]:
ts.shift(1)

2024-02-29         NaN
2024-03-31    1.554832
2024-04-30   -1.883745
2024-05-31   -1.178675
Freq: ME, dtype: float64

In [13]:
ts.shift(-1)

2024-02-29   -1.883745
2024-03-31   -1.178675
2024-04-30   -2.206498
2024-05-31         NaN
Freq: ME, dtype: float64

In [14]:
ts.shift(1, freq='ME')

2024-03-31    1.554832
2024-04-30   -1.883745
2024-05-31   -1.178675
2024-06-30   -2.206498
Freq: ME, dtype: float64

### `resample` 메서드
`resample()`메서드는 `DatetimeIndex`의 시간 간격을 재조정함  
시간 간격이 좁은 단위로 변경하면 데이터 양이 증가해서 업-샘플링(up-sampling)이라 하고  넓은 단위로  
변경되면 데이터 양이 감소해서 다운-샘플링(down-sampling)이라 함

In [15]:
ts = pd.Series(np.random.randn(100), index = pd.date_range('2024-01-01', periods=100))
ts.tail(20)

2024-03-21   -0.028847
2024-03-22   -0.240537
2024-03-23    0.313806
2024-03-24   -0.712453
2024-03-25    2.271579
2024-03-26   -1.232945
2024-03-27    0.317209
2024-03-28    1.246495
2024-03-29   -0.193384
2024-03-30    0.208298
2024-03-31    1.458160
2024-04-01   -1.530031
2024-04-02    0.478561
2024-04-03    1.054848
2024-04-04    1.294086
2024-04-05    0.030287
2024-04-06    1.099892
2024-04-07   -1.195289
2024-04-08   -1.460278
2024-04-09   -0.683588
Freq: D, dtype: float64

다운-샘플링 시에는 기존 데이터가 그룹화되는 경우와 같기 때문에 대표값을 지정해야 함(집계처리를 해야함)

In [16]:
ts.resample('W')

<pandas.core.resample.DatetimeIndexResampler object at 0x000002A430F51DC0>

In [17]:
ts.resample('W').mean() # 평균값

2024-01-07   -0.017788
2024-01-14   -0.310657
2024-01-21   -0.937275
2024-01-28    0.162120
2024-02-04   -0.562524
2024-02-11    0.244615
2024-02-18   -0.379909
2024-02-25    0.084431
2024-03-03   -0.147466
2024-03-10    0.011064
2024-03-17   -0.266118
2024-03-24    0.134991
2024-03-31    0.582202
2024-04-07    0.176050
2024-04-14   -1.071933
Freq: W-SUN, dtype: float64

시간(시/분) 단위에서는 가장 빠른 값은 포함하고, 가장 늦은 값은 포함하지 않음

In [18]:
ts = pd.Series(np.arange(60), index=pd.date_range('2024-02-29', periods=60, freq='min'))
ts

2024-02-29 00:00:00     0
2024-02-29 00:01:00     1
2024-02-29 00:02:00     2
2024-02-29 00:03:00     3
2024-02-29 00:04:00     4
2024-02-29 00:05:00     5
2024-02-29 00:06:00     6
2024-02-29 00:07:00     7
2024-02-29 00:08:00     8
2024-02-29 00:09:00     9
2024-02-29 00:10:00    10
2024-02-29 00:11:00    11
2024-02-29 00:12:00    12
2024-02-29 00:13:00    13
2024-02-29 00:14:00    14
2024-02-29 00:15:00    15
2024-02-29 00:16:00    16
2024-02-29 00:17:00    17
2024-02-29 00:18:00    18
2024-02-29 00:19:00    19
2024-02-29 00:20:00    20
2024-02-29 00:21:00    21
2024-02-29 00:22:00    22
2024-02-29 00:23:00    23
2024-02-29 00:24:00    24
2024-02-29 00:25:00    25
2024-02-29 00:26:00    26
2024-02-29 00:27:00    27
2024-02-29 00:28:00    28
2024-02-29 00:29:00    29
2024-02-29 00:30:00    30
2024-02-29 00:31:00    31
2024-02-29 00:32:00    32
2024-02-29 00:33:00    33
2024-02-29 00:34:00    34
2024-02-29 00:35:00    35
2024-02-29 00:36:00    36
2024-02-29 00:37:00    37
2024-02-29 0

In [19]:
ts.resample('10min').min()

2024-02-29 00:00:00     0
2024-02-29 00:10:00    10
2024-02-29 00:20:00    20
2024-02-29 00:30:00    30
2024-02-29 00:40:00    40
2024-02-29 00:50:00    50
Freq: 10min, dtype: int32

In [20]:
ts.resample('10min').max()

2024-02-29 00:00:00     9
2024-02-29 00:10:00    19
2024-02-29 00:20:00    29
2024-02-29 00:30:00    39
2024-02-29 00:40:00    49
2024-02-29 00:50:00    59
Freq: 10min, dtype: int32

구간 한계 값을 가장 늦은 값으로 지정하고자 한다면 `closed`=`right`를 지정함

In [21]:
ts.resample('10min', closed='right').min()

2024-02-28 23:50:00     0
2024-02-29 00:00:00     1
2024-02-29 00:10:00    11
2024-02-29 00:20:00    21
2024-02-29 00:30:00    31
2024-02-29 00:40:00    41
2024-02-29 00:50:00    51
Freq: 10min, dtype: int32

In [22]:
ts.resample('10min', closed='right').max()

2024-02-28 23:50:00     0
2024-02-29 00:00:00    10
2024-02-29 00:10:00    20
2024-02-29 00:20:00    30
2024-02-29 00:30:00    40
2024-02-29 00:40:00    50
2024-02-29 00:50:00    59
Freq: 10min, dtype: int32

`ohlc()` 메서드는 구간의 시점, 고점, 저점, 종점의 값을 표현

In [23]:
ts.resample('10min').ohlc()

Unnamed: 0,open,high,low,close
2024-02-29 00:00:00,0,9,0,9
2024-02-29 00:10:00,10,19,10,19
2024-02-29 00:20:00,20,29,20,29
2024-02-29 00:30:00,30,39,30,39
2024-02-29 00:40:00,40,49,40,49
2024-02-29 00:50:00,50,59,50,59


In [24]:
ts.resample('10min', closed='right').ohlc()

Unnamed: 0,open,high,low,close
2024-02-28 23:50:00,0,0,0,0
2024-02-29 00:00:00,1,10,1,10
2024-02-29 00:10:00,11,20,11,20
2024-02-29 00:20:00,21,30,21,30
2024-02-29 00:30:00,31,40,31,40
2024-02-29 00:40:00,41,50,41,50
2024-02-29 00:50:00,51,59,51,59


업-샘플링은 데이터를 추가해야 하기 때문에 `foward filling` 방식과 `back filling` 방식을 사용할 수 있음  
`foward filling`은 이전 데이터를 가져와 사용하는 방법 - `ffill()`  
back filling은 다음 데이터를 가져와 사용하는 방법 - `bfill()`

In [25]:
ts.resample('30s').ffill()

2024-02-29 00:00:00     0
2024-02-29 00:00:30     0
2024-02-29 00:01:00     1
2024-02-29 00:01:30     1
2024-02-29 00:02:00     2
                       ..
2024-02-29 00:57:00    57
2024-02-29 00:57:30    57
2024-02-29 00:58:00    58
2024-02-29 00:58:30    58
2024-02-29 00:59:00    59
Freq: 30s, Length: 119, dtype: int32

In [26]:
# back filling은 다음 데이터를 가져와 사용하는 방법 - `bfill()`
ts.resample('30s').bfill()

2024-02-29 00:00:00     0
2024-02-29 00:00:30     1
2024-02-29 00:01:00     1
2024-02-29 00:01:30     2
2024-02-29 00:02:00     2
                       ..
2024-02-29 00:57:00    57
2024-02-29 00:57:30    58
2024-02-29 00:58:00    58
2024-02-29 00:58:30    59
2024-02-29 00:59:00    59
Freq: 30s, Length: 119, dtype: int32

### `dt` 접근자
`datetime`의 데이터 타입 시리즈는 `dt`접근자를 사용하여 `datetime`데이터 타입이 가지는 여러가지 속성 및 메서드를 사용할 수 있음

In [27]:
# 출력칸의 타입 확인하기.
s = pd.Series(pd.date_range('2024-01-01', periods=10))
s   

0   2024-01-01
1   2024-01-02
2   2024-01-03
3   2024-01-04
4   2024-01-05
5   2024-01-06
6   2024-01-07
7   2024-01-08
8   2024-01-09
9   2024-01-10
dtype: datetime64[ns]

`year`, `month`, `day` 와 같은 속성을 사용할 수 있음

In [28]:
s.dt.day

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
dtype: int32

In [29]:
# 시리즈.dt.year    타입은 정수형이다.
s.dt.year

0    2024
1    2024
2    2024
3    2024
4    2024
5    2024
6    2024
7    2024
8    2024
9    2024
dtype: int32

`strftime()`메서드를 사용하여 `datetime` 데이터 타입의 데이터를 문자열 데이터로 변경 할 수 있음

In [30]:
s.dt.strftime('%Y. %m. %d. %H:%M:%s')

0    2024-01-01 00:00:00
1    2024-01-02 00:00:00
2    2024-01-03 00:00:00
3    2024-01-04 00:00:00
4    2024-01-05 00:00:00
5    2024-01-06 00:00:00
6    2024-01-07 00:00:00
7    2024-01-08 00:00:00
8    2024-01-09 00:00:00
9    2024-01-10 00:00:00
dtype: object

<aside>
<img src="https://noticon-static.tammolo.com/dgggcrkxq/image/upload/v1574221064/noticon/xjal9z4a8h46soi6ktgo.png" alt="https://noticon-static.tammolo.com/dgggcrkxq/image/upload/v1574221064/noticon/xjal9z4a8h46soi6ktgo.png" width="40px" /> **파이썬으로 다음 연산을 수행한다.**

다음 명령으로 만들어진 데이터프레임에 대해 월별 value의 합계를 구하라. (힌트: `groupby` 메서드와 `dt` 접근자를 사용하라)

</aside>

In [31]:
np.random.seed(0)
df = pd.DataFrame({
    "date": pd.date_range("2020-12-25", periods=100, freq="D"),
    "value": np.random.randint(100, size=(100,))
})
df

Unnamed: 0,date,value
0,2020-12-25,44
1,2020-12-26,47
2,2020-12-27,64
3,2020-12-28,67
4,2020-12-29,67
...,...,...
95,2021-03-30,23
96,2021-03-31,79
97,2021-04-01,13
98,2021-04-02,85


In [32]:
df.date = df.date.dt.month
df

Unnamed: 0,date,value
0,12,44
1,12,47
2,12,64
3,12,67
4,12,67
...,...,...
95,3,23
96,3,79
97,4,13
98,4,85


In [33]:
df.date = df.date.dt.month
df

AttributeError: Can only use .dt accessor with datetimelike values

In [None]:
df.groupby(df.date).sum()

Unnamed: 0_level_0,value
date,Unnamed: 1_level_1
1,1811
2,985
3,1500
4,146
12,381
