In [None]:
#| include: false
%load_ext autoreload
%autoreload 2

This library is intended to be used as an alternative to `pd.Series.rolling` and `pd.Series.expanding`, gaining a speedup from numba-optimized functions that operate on numpy arrays. There are also online classes for more efficient updates of window statistics.

## Install

`pip install window-ops`

## How to use

### Transformations

For transformations that map `n_samples` -> `n_samples`, you can use `[seasonal_](rolling|expanding)_(mean|max|min|std)` on an array.

#### Benchmarks

In [None]:
#| include: false
import random
import time
from functools import partial

import numpy as np
import pandas as pd

import window_ops
from window_ops.ewm import *
from window_ops.expanding import *
from window_ops.online import *
from window_ops.rolling import *

In [None]:
# Display the installed pandas version for reference alongside the benchmark numbers.
pd.__version__

'1.3.5'

In [None]:
# Benchmark configuration read by the data-setup and timing cells.
n_samples = 10_000  # array size
window_size = 8  # for rolling operations
season_length = 7  # for seasonal operations
execute_times = 10  # number of times each function will be executed

In [None]:
#| include: false
# Seed both random number generators so the benchmark input is reproducible.
random.seed(0)
np.random.seed(0)

# `y` is the raw numpy array; `ys` wraps the same values in a pandas Series.
y = np.random.rand(n_samples)
ys = pd.Series(y)

# Label every position with its slot inside the season (position modulo
# season_length) and group the Series by that label.
groups = np.mod(np.arange(n_samples), season_length)
grouped_y = ys.groupby(groups)

In [None]:
#| include: false
def _benchmark_pair(kind, op):
    """Build both implementations of the `{kind}_{op}` transformation.

    Returns a dict whose 'window_ops' entry is a callable taking the numpy
    array and whose 'pandas' entry is a callable taking the Series (or the
    grouped Series for the seasonal kinds).

    Raises ValueError when `kind` is not one of the four supported kinds.
    """
    name = f'{kind}_{op}'
    if kind == 'rolling':
        fast = partial(getattr(window_ops.rolling, name), window_size=window_size)

        def slow(series):
            return series.rolling(window_size, min_periods=window_size).agg(op)
    elif kind == 'expanding':
        fast = getattr(window_ops.expanding, name)

        def slow(series):
            return series.expanding().agg(op)
    elif kind == 'seasonal_rolling':
        fast = partial(
            getattr(window_ops.rolling, name),
            season_length=season_length,
            window_size=window_size,
        )

        def slow(grouped):
            return grouped.transform(lambda x: x.rolling(window_size).agg(op))
    elif kind == 'seasonal_expanding':
        fast = partial(getattr(window_ops.expanding, name), season_length=season_length)

        def slow(grouped):
            return grouped.transform(lambda x: x.expanding().agg(op))
    else:
        raise ValueError(kind)
    return {'window_ops': fast, 'pandas': slow}


def _elapsed(func, data):
    """Return the total seconds spent on `execute_times` calls of `func(data)`."""
    start = time.perf_counter()
    for _ in range(execute_times):
        func(data)
    return time.perf_counter() - start


ops = ('mean', 'max', 'min', 'std')
times = {'window_ops': {}, 'pandas': {}}
for kind in ('rolling', 'expanding', 'seasonal_rolling', 'seasonal_expanding'):
    # window_ops always receives the raw array; pandas receives the Series,
    # already grouped by season for the seasonal kinds.
    inputs = {
        'window_ops': y,
        'pandas': grouped_y if kind.startswith('seasonal') else ys,
    }
    for op in ops:
        results = {}
        for impl, func in _benchmark_pair(kind, op).items():
            data = inputs[impl]
            # Untimed first call: its output feeds the equality check below.
            results[impl] = func(data)
            times[impl][f'{kind}_{op}'] = _elapsed(func, data)
        # Both implementations must agree (NaNs in matching positions included).
        assert np.allclose(results['window_ops'], results['pandas'], equal_nan=True)

# Convert total seconds into the average milliseconds per call.
times = pd.DataFrame(times) * 1_000 / execute_times

Average times in milliseconds.

In [None]:
# Render every timing with two decimal places.
times.applymap(lambda value: f'{value:.2f}')

Unnamed: 0,window_ops,pandas
rolling_mean,0.03,0.43
rolling_max,0.14,0.57
rolling_min,0.14,0.58
rolling_std,0.06,0.54
expanding_mean,0.03,0.31
expanding_max,0.05,0.76
expanding_min,0.05,0.47
expanding_std,0.09,0.41
seasonal_rolling_mean,0.05,3.89
seasonal_rolling_max,0.18,4.27


In [None]:
# How many times faster window_ops is than pandas for each transformation.
speedups = (times['pandas'] / times['window_ops']).to_frame('times faster')
speedups.applymap(lambda ratio: f'{ratio:.0f}')

Unnamed: 0,times faster
rolling_mean,15
rolling_max,4
rolling_min,4
rolling_std,9
expanding_mean,12
expanding_max,15
expanding_min,9
expanding_std,4
seasonal_rolling_mean,77
seasonal_rolling_max,23


### Online

If you have an array for which you want to compute a window statistic and then keep updating it as more samples come in, you can use the classes in the `window_ops.online` module. They all have a `fit_transform` method, which takes the array and returns the transformations defined above, and an `update` method, which takes a single value and returns the new statistic.

In [None]:
#| include: false
ops = ('mean', 'max', 'min', 'std')
# Constructor arguments for each family of online classes.
init_kwargs = {
    'Rolling': dict(window_size=window_size),
    'Expanding': dict(),
    'SeasonalRolling': dict(season_length=season_length, window_size=window_size),
    'SeasonalExpanding': dict(season_length=season_length),
}

times = {}
for kind, kwargs in init_kwargs.items():
    for op in ops:
        class_name = f'{kind}{op.title()}'
        online_stat = getattr(window_ops.online, class_name)(**kwargs)

        # compile: untimed first fit so it is not part of the measurement
        online_stat.fit_transform(y)

        # Each timed run fits on the full array and then applies 100 updates.
        start = time.perf_counter()
        for _ in range(execute_times):
            online_stat.fit_transform(y)
            for value in range(100):
                online_stat.update(value)
        times[class_name] = time.perf_counter() - start

# Convert total seconds into the average milliseconds per run.
times = pd.Series(times, name='average time (ms)') * 1_000 / execute_times

#### Benchmarks

Average time in milliseconds it takes to transform the array and perform 100 updates.

In [None]:
# Render the average times with two decimal places.
times.to_frame().applymap(lambda value: f'{value:.2f}')

Unnamed: 0,average time (ms)
RollingMean,0.12
RollingMax,0.23
RollingMin,0.22
RollingStd,0.32
ExpandingMean,0.1
ExpandingMax,0.07
ExpandingMin,0.07
ExpandingStd,0.17
SeasonalRollingMean,0.28
SeasonalRollingMax,0.35
