## Метод resample. Передискретизация и преобразование частоты

Под передискретизацией понимается процесс изменения частоты временного ряда. Агрегирование с переходом от высокой частоты к низкой называется понижающей передискретизацией, а переход от низкой частоты к более высокой – повышающей передискретизацией. Не всякая передискретизация попадает в одну из этих категорий; например, преобразование частоты W-WED (еженедельно по средам) в W-FRI (еженедельно по пятницам) не повышает и не понижает частоту. Все объекты pandas имеют метод resample, отвечающий за любые преобразования частоты. API метода resample примерно такой же, как у groupby; мы сначала вызываем resample для группировки данных, а затем обращаемся к функции агрегирования:

In [1]:
import pandas as pd
import numpy as np

In [2]:
# 100 consecutive timestamps starting at 2024-01-01 (daily, per the "Freq: D" output).
dates = pd.date_range('2024-01-01', periods=100)
# One standard-normal sample per date; no seed is set, so values differ between runs.
ts = pd.Series(np.random.standard_normal(len(dates)), index=dates)
ts

2024-01-01   -0.397263
2024-01-02    0.861141
2024-01-03   -0.705830
2024-01-04   -0.878197
2024-01-05    1.160445
                ...   
2024-04-05    1.220566
2024-04-06   -0.627581
2024-04-07    0.513091
2024-04-08    1.773999
2024-04-09   -0.899180
Freq: D, Length: 100, dtype: float64

In [3]:
# Downsample the daily series into month-end ('ME') bins and average each month.
ts.resample('ME').mean()

2024-01-31   -0.090703
2024-02-29   -0.218678
2024-03-31   -0.216898
2024-04-30   -0.153874
Freq: ME, dtype: float64

In [4]:
# Nine timestamps one minute apart, starting at midnight on 2024-01-01.
index = pd.date_range('1/1/2024', periods=9, freq='min')
# Values 0..8, one per minute.
series = pd.Series(range(9), index=index)
series

2024-01-01 00:00:00    0
2024-01-01 00:01:00    1
2024-01-01 00:02:00    2
2024-01-01 00:03:00    3
2024-01-01 00:04:00    4
2024-01-01 00:05:00    5
2024-01-01 00:06:00    6
2024-01-01 00:07:00    7
2024-01-01 00:08:00    8
Freq: min, dtype: int64

In [5]:
# Downsample into 3-minute bins and sum each bin; labels are the left bin edges.
series.resample('3min').sum()

2024-01-01 00:00:00     3
2024-01-01 00:03:00    12
2024-01-01 00:06:00    21
Freq: 3min, dtype: int64

In [6]:
# Same 3-minute sums, but each bin is labelled with its right edge instead of its left.
series.resample('3min', label='right').sum()

2024-01-01 00:03:00     3
2024-01-01 00:06:00    12
2024-01-01 00:09:00    21
Freq: 3min, dtype: int64

In [7]:
# Right-labelled bins that are also closed on the right: each bin includes its
# right edge and excludes its left one, so the 00:00 sample forms a bin of its own.
series.resample('3min', label='right', closed='right').sum()

2024-01-01 00:00:00     0
2024-01-01 00:03:00     6
2024-01-01 00:06:00    15
2024-01-01 00:09:00    15
Freq: 3min, dtype: int64

In [8]:
# Upsample to a 30-second grid without filling: slots with no original sample are NaN.
series.resample('30s').asfreq()[0:5]   # keep the first 5 rows

2024-01-01 00:00:00    0.0
2024-01-01 00:00:30    NaN
2024-01-01 00:01:00    1.0
2024-01-01 00:01:30    NaN
2024-01-01 00:02:00    2.0
Freq: 30s, dtype: float64

In [9]:
# Upsample to 30 seconds, filling each new slot with the previous observed value.
series.resample('30s').ffill()[0:5]

2024-01-01 00:00:00    0
2024-01-01 00:00:30    0
2024-01-01 00:01:00    1
2024-01-01 00:01:30    1
2024-01-01 00:02:00    2
Freq: 30s, dtype: int64

In [10]:
# Upsample to 30 seconds, filling each new slot with the next observed value.
series.resample('30s').bfill()[0:5]

2024-01-01 00:00:00    0
2024-01-01 00:00:30    1
2024-01-01 00:01:00    1
2024-01-01 00:01:30    2
2024-01-01 00:02:00    2
Freq: 30s, dtype: int64

In [11]:
def custom_resampler(arraylike):
    """Collapse one resampling bin to a scalar: the sum of its values plus 5."""
    bin_total = np.sum(arraylike)
    return bin_total + 5

# Aggregate each 3-minute bin with the user-defined function instead of a built-in.
series.resample('3min').apply(custom_resampler)

2024-01-01 00:00:00     8
2024-01-01 00:03:00    17
2024-01-01 00:06:00    26
Freq: 3min, dtype: int64

In [12]:
# Eight weekly observations of price and volume.
d = {'price': [10, 11, 9, 13, 14, 18, 17, 19],
     'volume': [50, 60, 40, 100, 50, 100, 40, 50]}
df = pd.DataFrame(d)
# The timestamps live in an ordinary column; the index stays the default 0..7.
# NOTE: with freq='W' the generated dates are Sundays (first one is 2024-01-07),
# even though the column is called 'week_starting'.
df['week_starting'] = pd.date_range('01/01/2024', periods=8, freq='W')
df

Unnamed: 0,price,volume,week_starting
0,10,50,2024-01-07
1,11,60,2024-01-14
2,9,40,2024-01-21
3,13,100,2024-01-28
4,14,50,2024-02-04
5,18,100,2024-02-11
6,17,40,2024-02-18
7,19,50,2024-02-25


In [13]:
# Resample on the 'week_starting' column rather than the index (on=...),
# averaging price and volume within each month-end bin.
df.resample('ME', on='week_starting').mean()

Unnamed: 0_level_0,price,volume
week_starting,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-01-31,10.75,62.5
2024-02-29,17.0,60.0


In [14]:
# Four consecutive days starting at 2024-01-01.
days = pd.date_range('1/1/2024', periods=4, freq='D')
d2 = {'price': [10, 11, 9, 13, 14, 18, 17, 19],
      'volume': [50, 60, 40, 100, 50, 100, 40, 50]}
# Two-level index: every day paired with 'morning' and 'afternoon' (4 x 2 = 8 rows).
df2 = pd.DataFrame(d2, index=pd.MultiIndex.from_product([days, ['morning', 'afternoon']]))
df2

Unnamed: 0,Unnamed: 1,price,volume
2024-01-01,morning,10,50
2024-01-01,afternoon,11,60
2024-01-02,morning,9,40
2024-01-02,afternoon,13,100
2024-01-03,morning,14,50
2024-01-03,afternoon,18,100
2024-01-04,morning,17,40
2024-01-04,afternoon,19,50


In [15]:
# Resample along level 0 of the MultiIndex (the dates): the morning and
# afternoon rows of each day are summed into one daily row.
df2.resample('D', level=0).sum()

Unnamed: 0,price,volume
2024-01-01,21,110
2024-01-02,22,140
2024-01-03,32,150
2024-01-04,36,90


In [16]:
# A one-hour window that crosses midnight.
start, end = '2024-10-01 23:30:00', '2024-10-02 00:30:00'
# Timestamps every 7 minutes inside the window (the last one is 00:26).
rng = pd.date_range(start, end, freq='7min')
# Values 0, 3, 6, ...: position in the range multiplied by 3.
ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
ts

2024-10-01 23:30:00     0
2024-10-01 23:37:00     3
2024-10-01 23:44:00     6
2024-10-01 23:51:00     9
2024-10-01 23:58:00    12
2024-10-02 00:05:00    15
2024-10-02 00:12:00    18
2024-10-02 00:19:00    21
2024-10-02 00:26:00    24
Freq: 7min, dtype: int32

In [17]:
# Default origin ('start_day'): the 17-minute grid is anchored at midnight of the
# first day, so the first bin starts at 23:14 rather than at the first timestamp.
ts.resample('17min').sum()

2024-10-01 23:14:00     0
2024-10-01 23:31:00     9
2024-10-01 23:48:00    21
2024-10-02 00:05:00    54
2024-10-02 00:22:00    24
Freq: 17min, dtype: int32

In [18]:
# Anchor the 17-minute grid at the Unix epoch (1970-01-01) instead of the first day.
ts.resample('17min', origin='epoch').sum()

2024-10-01 23:22:00     3
2024-10-01 23:39:00    15
2024-10-01 23:56:00    45
2024-10-02 00:13:00    45
Freq: 17min, dtype: int32

In [19]:
# Anchor the 17-minute grid at an explicit timestamp (midnight of 2024-01-01).
ts.resample('17min', origin='2024-01-01').sum()

2024-10-01 23:24:00     3
2024-10-01 23:41:00    15
2024-10-01 23:58:00    45
2024-10-02 00:15:00    45
Freq: 17min, dtype: int32

In [20]:
# Anchor the grid at the first timestamp of the series, so the first bin starts at 23:30.
ts.resample('17min', origin='start').sum()

2024-10-01 23:30:00     9
2024-10-01 23:47:00    21
2024-10-02 00:04:00    54
2024-10-02 00:21:00    24
Freq: 17min, dtype: int32

In [21]:
# Keep the default origin but shift the grid by 23h30min; for this series that
# yields the same bins as origin='start'.
ts.resample('17min', offset='23h30min').sum()

2024-10-01 23:30:00     9
2024-10-01 23:47:00    21
2024-10-02 00:04:00    54
2024-10-02 00:21:00    24
Freq: 17min, dtype: int32

In [22]:
# Anchor the grid at the last timestamp and count bins backwards from it;
# the labels in the output are the right bin edges (the last label is 00:26).
ts.resample('17min', origin='end').sum()

2024-10-01 23:35:00     0
2024-10-01 23:52:00    18
2024-10-02 00:09:00    27
2024-10-02 00:26:00    63
Freq: 17min, dtype: int32

In [23]:
# Anchor the grid at the midnight that follows the last timestamp
# (2024-10-03 00:00) and count 17-minute bins backwards from there.
ts.resample('17min', origin='end_day').sum()

2024-10-01 23:38:00     3
2024-10-01 23:55:00    15
2024-10-02 00:12:00    45
2024-10-02 00:29:00    45
Freq: 17min, dtype: int32

## Объект pd.Grouper. Группировка и агрегирование данных временных рядов

Помимо метода resample, мы можем использовать специальный объект pd.Grouper для группировки и агрегирования данных временных рядов.

Рассмотрим пример данных о продажах и некоторые простые операции для получения общих продаж по месяцам, дням, годам и т.д.

In [24]:
# Load the sample sales workbook (path is relative to the working directory).
df = pd.read_excel('files/sample-salesv3.xlsx')
# Convert the 'date' column to datetime dtype
df['date'] = pd.to_datetime(df['date'])
df.head(10)

Unnamed: 0,account number,name,sku,quantity,unit price,ext price,date
0,740150,Barton LLC,B1-20000,39,86.69,3380.91,2014-01-01 07:21:51
1,714466,Trantow-Barrows,S2-77896,-1,63.16,-63.16,2014-01-01 10:00:47
2,218895,Kulas Inc,B1-69924,23,90.7,2086.1,2014-01-01 13:24:58
3,307599,"Kassulke, Ondricka and Metz",S1-65481,41,21.05,863.05,2014-01-01 15:05:22
4,412290,Jerde-Hilpert,S2-34077,6,83.21,499.26,2014-01-01 23:26:55
5,714466,Trantow-Barrows,S2-77896,17,87.63,1489.71,2014-01-02 10:07:15
6,218895,Kulas Inc,B1-65551,2,31.1,62.2,2014-01-02 10:57:23
7,729833,Koepp Ltd,S1-30248,8,33.25,266.0,2014-01-03 06:32:11
8,714466,Trantow-Barrows,S1-50961,22,84.09,1849.98,2014-01-03 11:29:02
9,737550,"Fritsch, Russel and Anderson",S2-82423,14,81.92,1146.88,2014-01-03 19:07:37


In [25]:
# Make 'date' the index, bin by month end, and total 'ext price' for each month.
df.set_index('date').resample('ME')['ext price'].sum()

date
2014-01-31    185361.66
2014-02-28    146211.62
2014-03-31    203921.38
2014-04-30    174574.11
2014-05-31    165418.55
2014-06-30    174089.33
2014-07-31    191662.11
2014-08-31    153778.59
2014-09-30    168443.17
2014-10-31    171495.32
2014-11-30    119961.22
2014-12-31    163867.26
Freq: ME, Name: ext price, dtype: float64

In [26]:
# Monthly 'ext price' totals per customer: group by 'name' first, then
# resample each group's date index to month end.
df.set_index('date').groupby('name')['ext price'].resample('ME').sum()

name        date      
Barton LLC  2014-01-31     6177.57
            2014-02-28    12218.03
            2014-03-31     3513.53
            2014-04-30    11474.20
            2014-05-31    10220.17
                            ...   
Will LLC    2014-08-31     1439.82
            2014-09-30     4345.99
            2014-10-31     7085.33
            2014-11-30     3210.44
            2014-12-31    12561.21
Name: ext price, Length: 240, dtype: float64

In [27]:
# Same monthly per-customer totals in a single groupby: pd.Grouper bins the
# 'date' column by month end, so no set_index/resample chain is needed.
df.groupby(['name', pd.Grouper(key='date', freq='ME')])['ext price'].sum()

name        date      
Barton LLC  2014-01-31     6177.57
            2014-02-28    12218.03
            2014-03-31     3513.53
            2014-04-30    11474.20
            2014-05-31    10220.17
                            ...   
Will LLC    2014-08-31     1439.82
            2014-09-30     4345.99
            2014-10-31     7085.33
            2014-11-30     3210.44
            2014-12-31    12561.21
Name: ext price, Length: 240, dtype: float64

In [28]:
# Yearly per-customer totals: 'YE-DEC' bins the 'date' column by years ending in December.
df.groupby(['name', pd.Grouper(key='date', freq='YE-DEC')])['ext price'].sum()

name                             date      
Barton LLC                       2014-12-31    109438.50
Cronin, Oberbrunner and Spencer  2014-12-31     89734.55
Frami, Hills and Schmidt         2014-12-31    103569.59
Fritsch, Russel and Anderson     2014-12-31    112214.71
Halvorson, Crona and Champlin    2014-12-31     70004.36
Herman LLC                       2014-12-31     82865.00
Jerde-Hilpert                    2014-12-31    112591.43
Kassulke, Ondricka and Metz      2014-12-31     86451.07
Keeling LLC                      2014-12-31    100934.30
Kiehn-Spinka                     2014-12-31     99608.77
Koepp Ltd                        2014-12-31    103660.54
Kuhn-Gusikowski                  2014-12-31     91094.28
Kulas Inc                        2014-12-31    137351.96
Pollich LLC                      2014-12-31     87347.18
Purdy-Kunde                      2014-12-31     77898.21
Sanford and Sons                 2014-12-31     98822.98
Stokes LLC                       2014-12-31 