In [6]:
%reset -f

import numpy as np
import numba as  nb
import pandas as pd

# Load the tick data from tmp.parq and order it by the time column 't'.
# NOTE(review): the name `all` shadows Python's builtin all() for the rest of
# the kernel session; later cells read this name, so renaming it has to be
# done across the whole notebook in one go.
all = pd.read_parquet('tmp.parq').sort_values(by=['t'])
# Mid price = arithmetic mean of bid and ask, computed on the raw numpy arrays.
all['mid'] = (all['bid'].values + all['ask'].values)/2

# Disabled plotting scratch code. It refers to names that are not defined in
# the cells shown here (bid_ask, Figure, go, Scatter), so it would not run
# as-is if uncommented.
# df_plt = bid_ask.copy()[bid_ask.t > np.datetime64('2023-06-22 10:00:00')]
# m = 15
# fig = Figure(layout=go.Layout(height=500, margin={'t':m,'b':m,'l':m,'r':m}))
# fig.add_traces(Scatter(x=df_plt['t'], y=df_plt['ask'], name='ask', mode="markers", marker={'size':2},showlegend=False))
# fig.add_traces(Scatter(x=df_plt['t'], y=df_plt['bid'], name='bid', mode="markers", marker={'size':2},showlegend=False))
# fig.add_traces(Scatter(x=df_plt['t'], y=(df_plt['bid'] + df_plt['ask'])/2, name='mid', mode="markers", marker={'size':2, 'color':'black'},showlegend=False))
# fig.show()
# Keep only the last 100000 rows (i.e. the latest ticks after the sort by 't')
# as a smaller working set.
subset = all[-100000:]



In [7]:
@nb.jit(nopython=True)
def insert(a:np.ndarray, idx:int, v) -> np.ndarray:
    """
    Return a new 1-D array equal to `a` with the scalar `v` inserted.

    Parameters
    ----------
    a   : 1-D array to copy from; it is not modified.
    idx : position of `v` in the *result*. Non-negative values must be in
          [0, len(a)]; negative values count from the end of the result, so
          -1 appends and -(len(a) + 1) prepends. Note this differs from
          list.insert, where -1 inserts before the last element.
    v   : scalar to insert; it is cast to the dtype of `a`.

    Returns
    -------
    A new array of length len(a) + 1 with the same dtype as `a`.

    Raises
    ------
    Exception if `idx` is outside [-(len(a) + 1), len(a)].
    """
    N = len(a)
    M = N + 1

    if idx < -M or idx > N:
        raise Exception(f"Insertion index out of range {idx}.")

    # Python modulo maps a negative idx onto its position in the result.
    insertion_point = idx % M

    # Allocate with the dtype of `a`, not of `v`: taking the dtype from `v`
    # would silently truncate a float array when an integer is inserted.
    tgt = np.empty(M, dtype=a.dtype)
    tgt[:insertion_point] = a[:insertion_point]
    tgt[insertion_point] = v
    tgt[insertion_point + 1:] = a[insertion_point:]

    return tgt


In [8]:

@nb.jit(nopython=True)
def numba_binner(t:np.ndarray, val:np.ndarray, bin_start:np.ndarray, bin_end:np.ndarray) -> tuple:
    """
    Aggregate ticks into one row of statistics per time bin.

    Consecutive ticks that share the same `bin_start` value form one bin, so
    the inputs are expected to be ordered by time.

    Parameters
    ----------
    t         : tick timestamps as integers.
    val       : tick values, same length as `t`.
    bin_start : per-tick start of the bin the tick belongs to, in the same
                integer time unit as `t`.
    bin_end   : per-tick end of that bin, same unit.

    Returns
    -------
    Tuple of arrays with one entry per bin:
    (bin_start_index, bin_end_index, t0, t1, n_samples, o, h, l, c, twap)
      - bin_start_index / bin_end_index : bin boundaries
      - t0 / t1   : timestamp of the first / last tick in the bin
      - n_samples : number of ticks in the bin
      - o, h, l, c: first, max, min and last tick value in the bin
      - twap      : time-weighted average; every tick is held until the next
                    tick or the bin end. If the previous bin starts exactly
                    one bin size earlier, its last tick is carried forward
                    to cover the time from the bin start to the first tick.

    Raises
    ------
    ValueError if there are no ticks.
    """
    n_ticks = len(t)
    if n_ticks == 0:
        # Everything below indexes element 0; fail loudly instead of reading
        # out of bounds (nopython mode does not bounds-check by default).
        raise ValueError("numba_binner requires at least one tick.")

    # One output row per run of equal bin_start values. This is exactly what
    # the main loop produces, so the output arrays can never be overrun, and
    # it matches the number of unique bins when the input is time-ordered.
    n_bins = 1
    for k in range(1, n_ticks):
        if bin_start[k] != bin_start[k - 1]:
            n_bins += 1
    bin_size = bin_end[0] - bin_start[0]

    #
    # create target arrays
    #
    bin_start_index = np.full(n_bins, t[0])
    bin_end_index = np.full(n_bins, t[0])
    n_samples = np.full(n_bins, 0)
    o = np.full(n_bins, 0.0)
    h = np.full(n_bins, 0.0)
    l = np.full(n_bins, 0.0)
    c = np.full(n_bins, 0.0)
    twap = np.full(n_bins, 0.0)
    t0 = np.full(n_bins, t[0])
    t1 = np.full(n_bins, t[0])

    # do the binning. j is leading index, i is trailing index, bi is the bin index
    i, j, bi = 0, 0, 0
    while j < n_ticks:
        # `and` short-circuits, so bin_start[j] is never read once j == n_ticks
        # (a bitwise `&` would evaluate both sides and read past the end).
        while j < n_ticks and bin_start[i] == bin_start[j]:
            j += 1

        # j has overshot the bin, so the bin index is on the prev bin
        bin_start_index[bi] = bin_start[j - 1]
        bin_end_index[bi]   = bin_end[j - 1]

        n_samples[bi] = j - i

        times_bin = t[i:j]
        ticks_bin = val[i:j]

        o[bi] = ticks_bin[0]
        h[bi] = np.max(ticks_bin)
        l[bi] = np.min(ticks_bin)
        c[bi] = ticks_bin[-1]
        t0[bi] = times_bin[0]
        t1[bi] = times_bin[-1]

        # #####
        # for some calcs, forward fill from prev bin and to end of bin
        # #####
        if bi > 0:
            # if there is a prev bin and it immediately precedes this one in
            # time, carry its last tick forward to this bin's start boundary
            if bin_start_index[bi] - bin_start_index[bi - 1] == bin_size:
                times_bin = insert(times_bin, 0, bin_start_index[bi])
                ticks_bin = insert(ticks_bin, 0, val[i-1])

        # forward fill last tick to bin boundary (idx -1 appends)
        times_bin = insert(times_bin, -1, bin_end_index[bi])

        # dt[k] is how long ticks_bin[k] was the prevailing value
        dt = np.diff(times_bin)
        twap[bi] = np.sum(dt * ticks_bin)/np.sum(dt)

        bi += 1
        i = j

    return bin_start_index, bin_end_index, t0, t1, n_samples, o, h, l, c, twap

def bin_values(ticks:pd.DataFrame,
        bin_size_sec:int,
        value_col:str,
        time_col:str = "t",
        ) -> pd.DataFrame:
    '''
    Bin ticks into fixed-width time bins and compute per-bin statistics.

    Prepares the data for numba_binner:
     - truncates every tick time to a multiple of `bin_size_sec` seconds to
       get the start of its bin, and adds the bin size to get the end
     - converts all timestamps to int64 nanoseconds for the numba kernel
     - wraps the returned arrays in a DataFrame, converting times back to
       datetime64[ns]

    Parameters
    ----------
    ticks        : tick data; expected to be ordered by `time_col`, since
                   numba_binner groups consecutive ticks.
    bin_size_sec : bin width in seconds.
    value_col    : name of the column to aggregate.
    time_col     : name of the datetime column (default "t").

    Returns
    -------
    DataFrame with one row per bin and columns
    bin_start, bin_end, t0, t1, n_samples, o, h, l, c, twap.
    An empty `ticks` frame yields an empty frame with the same columns.
    '''
    # Normalise to nanosecond resolution first so the integer timestamps below
    # are in the same unit as bin_start / bin_end whatever the column's
    # stored resolution is.
    tick_times = ticks[time_col].values.astype('datetime64[ns]')
    values = ticks[value_col].values

    if len(tick_times) == 0:
        # The numba kernel cannot handle zero ticks; return a typed empty result.
        no_time = np.empty(0, dtype='datetime64[ns]')
        no_float = np.empty(0, dtype=np.float64)
        return pd.DataFrame({'bin_start':no_time,
                             'bin_end':no_time,
                             't0':no_time,
                             't1':no_time,
                             'n_samples':np.empty(0, dtype=np.int64),
                             'o':no_float, 'h':no_float, 'l':no_float, 'c':no_float, 'twap':no_float})

    # Casting to a datetime64 unit of `bin_size_sec` seconds truncates each
    # tick time to the start of its bin.
    bin_start = tick_times.astype(f'datetime64[{bin_size_sec}s]').astype('datetime64[ns]')
    bin_end = (bin_start + np.timedelta64(bin_size_sec, 's')).astype(np.int64)
    bin_start  = bin_start.astype(np.int64)

    # Use the caller-selected time column (not a hard-coded `t`).
    tt = tick_times.astype(np.int64)
    bin_start_index, bin_end_index, t0, t1, n_samples, o, h, l, c, twap =  \
        numba_binner(tt, values, bin_start, bin_end)
    return pd.DataFrame({'bin_start':bin_start_index.astype('datetime64[ns]'),
                         'bin_end':bin_end_index.astype('datetime64[ns]'),
                         't0':t0.astype('datetime64[ns]'),
                         't1':t1.astype('datetime64[ns]'),
                         'n_samples':n_samples,
                         'o':o, 'h':h, 'l':l, 'c':c, 'twap':twap})

# Bin the mid price of the working subset into 60-second bins and display the
# resulting frame.
res = bin_values(subset, 60, time_col='t', value_col='mid')
res

Unnamed: 0,bin_start,bin_end,t0,t1,n_samples,o,h,l,c,twap
0,2023-07-03 11:01:00,2023-07-03 11:02:00,2023-07-03 11:01:45.339,2023-07-03 11:01:57.551,11,1.163545,1.163665,1.163545,1.163625,1.163624
1,2023-07-03 11:02:00,2023-07-03 11:03:00,2023-07-03 11:02:00.076,2023-07-03 11:02:59.717,37,1.163615,1.163615,1.163505,1.163505,1.163549
2,2023-07-03 11:03:00,2023-07-03 11:04:00,2023-07-03 11:03:00.659,2023-07-03 11:03:58.459,38,1.163495,1.163525,1.163435,1.163505,1.163487
3,2023-07-03 11:04:00,2023-07-03 11:05:00,2023-07-03 11:04:01.039,2023-07-03 11:04:56.726,30,1.163485,1.163515,1.163415,1.163415,1.163460
4,2023-07-03 11:05:00,2023-07-03 11:06:00,2023-07-03 11:05:01.720,2023-07-03 11:05:57.879,39,1.163415,1.163415,1.163275,1.163275,1.163316
...,...,...,...,...,...,...,...,...,...,...
3926,2023-07-06 06:31:00,2023-07-06 06:32:00,2023-07-06 06:31:00.462,2023-07-06 06:31:59.236,38,1.169975,1.170115,1.169975,1.170045,1.170047
3927,2023-07-06 06:32:00,2023-07-06 06:33:00,2023-07-06 06:32:00.204,2023-07-06 06:32:59.047,44,1.170075,1.170185,1.170045,1.170165,1.170115
3928,2023-07-06 06:33:00,2023-07-06 06:34:00,2023-07-06 06:33:00.042,2023-07-06 06:33:58.482,46,1.170175,1.170195,1.170065,1.170145,1.170125
3929,2023-07-06 06:34:00,2023-07-06 06:35:00,2023-07-06 06:34:01.194,2023-07-06 06:34:58.902,37,1.170155,1.170285,1.170145,1.170285,1.170201


In [9]:
# Re-bin the mid price over the full tick set; this overwrites `res` from the
# previous cell.
res = bin_values(all, 60, value_col='mid')

# NOTE(review): `subset` is rebound here from a slice of raw ticks to a slice
# of binned rows (bin_start after 2023-07-06 06:00:00). Re-running the earlier
# cell that calls bin_values(subset, ...) after this one would receive a frame
# without the 't' / 'mid' columns - consider a distinct name.
subset = res.copy()[res.bin_start > np.datetime64('2023-07-06 06:00:00')]


In [10]:
# Display the binned rows selected in the previous cell.
subset

Unnamed: 0,bin_start,bin_end,t0,t1,n_samples,o,h,l,c,twap
20553,2023-07-06 06:01:00,2023-07-06 06:02:00,2023-07-06 06:01:00.674,2023-07-06 06:01:59.250,38,1.171205,1.171315,1.171195,1.171275,1.171244
20554,2023-07-06 06:02:00,2023-07-06 06:03:00,2023-07-06 06:02:00.108,2023-07-06 06:02:59.542,50,1.171275,1.171275,1.171105,1.171165,1.17121
20555,2023-07-06 06:03:00,2023-07-06 06:04:00,2023-07-06 06:03:01.402,2023-07-06 06:03:59.233,47,1.171175,1.171175,1.170785,1.170875,1.170934
20556,2023-07-06 06:04:00,2023-07-06 06:05:00,2023-07-06 06:04:02.214,2023-07-06 06:04:59.771,46,1.170895,1.171045,1.170885,1.170915,1.170954
20557,2023-07-06 06:05:00,2023-07-06 06:06:00,2023-07-06 06:05:01.041,2023-07-06 06:05:47.518,35,1.170925,1.170965,1.170545,1.170545,1.170717
20558,2023-07-06 06:07:00,2023-07-06 06:08:00,2023-07-06 06:07:05.556,2023-07-06 06:07:59.751,47,1.170555,1.170595,1.170455,1.170595,1.170517
20559,2023-07-06 06:08:00,2023-07-06 06:09:00,2023-07-06 06:08:01.088,2023-07-06 06:08:58.622,34,1.170575,1.170615,1.170355,1.170355,1.170473
20560,2023-07-06 06:09:00,2023-07-06 06:10:00,2023-07-06 06:09:00.639,2023-07-06 06:09:58.396,38,1.170365,1.170485,1.170365,1.170435,1.17044
20561,2023-07-06 06:10:00,2023-07-06 06:11:00,2023-07-06 06:10:01.042,2023-07-06 06:10:58.762,41,1.170455,1.170575,1.170375,1.170555,1.170461
20562,2023-07-06 06:11:00,2023-07-06 06:12:00,2023-07-06 06:11:01.038,2023-07-06 06:11:58.515,32,1.170535,1.170655,1.170535,1.170645,1.170565


In [23]:
# Look up the single bin that starts at 2023-06-12 18:35:00 in the binned result.
res[res.bin_start == np.datetime64('2023-06-12 18:35:00')]

Unnamed: 0,bin_start,bin_end,t0,t1,n_samples,o,h,l,c,twap
89,2023-06-12 18:35:00,2023-06-12 18:36:00,2023-06-12 18:35:01.416,2023-06-12 18:35:56.716,26,1.162775,1.162835,1.162775,1.162815,1.16281
