### 2.2 On a series of E-mini S&P 500 futures tick data, compute dollar bars and dollar imbalance bars. What bar type exhibits greater serial autocorrelation? Why?

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
# testing tick data i found - IVE (S&P 500 Value Index)
df = pd.read_csv("../../Data/IVE_tickbidask.txt", header=None, names=['date', 'time', 'price', 'bid', 'ask', 'volume'])

df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'])
df.drop(['date', 'time'], axis=1).set_index('datetime')

print(df.describe())

              price           bid           ask        volume  \
count  1.208708e+07  1.208708e+07  1.208708e+07  1.208708e+07   
mean   1.126140e+02  1.126031e+02  1.126235e+02  2.432015e+02   
min    1.100000e-01  0.000000e+00  0.000000e+00  0.000000e+00   
25%    8.919000e+01  8.918000e+01  8.920000e+01  1.000000e+02   
50%    1.076450e+02  1.076400e+02  1.076500e+02  1.000000e+02   
75%    1.413300e+02  1.413200e+02  1.413400e+02  2.000000e+02   
max    2.118300e+02  2.116600e+02  6.048500e+02  5.157948e+06   
std    3.625035e+01  3.624854e+01  3.625447e+01  4.544144e+03   

                            datetime  
count                       12087076  
mean   2018-05-09 15:25:33.436868608  
min              2009-09-28 09:30:00  
25%              2015-08-21 15:57:40  
50%       2018-10-11 14:46:11.500000  
75%              2021-07-06 12:18:37  
max              2025-10-29 16:41:00  
std                              NaN  


In [4]:
def dollar_bar(threshold, df):
    agg_dict = {
        'price': ['first', 'max', 'min', 'last'],
        'volume': 'sum',
        'datetime': 'last'
    }

    dollar_bar = df.groupby((df['price'] * df['volume']).cumsum() // threshold).agg(agg_dict)
    dollar_bar.columns = ['open', 'high', 'low', 'close', 'volume', 'datetime']
    dollar_bar.set_index('datetime', inplace=True)

    return dollar_bar

dollar_bars = dollar_bar(1000000, df)
dollar_bars

Unnamed: 0_level_0,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-09-28 09:46:34,50.790,51.070,50.710,51.0700,19144
2009-09-28 09:52:41,51.070,51.148,51.050,51.1246,19924
2009-09-28 09:54:36,51.130,51.150,51.129,51.1290,19651
2009-09-28 09:54:43,51.129,51.140,51.129,51.1400,18657
2009-09-28 10:01:06,51.140,51.260,51.080,51.2600,20434
...,...,...,...,...,...
2025-10-29 15:59:59,209.310,209.320,209.300,209.3200,4652
2025-10-29 15:59:59,209.320,209.320,209.320,209.3200,4900
2025-10-29 15:59:59,209.320,209.340,209.310,209.3400,4800
2025-10-29 15:59:59,209.340,209.340,209.340,209.3400,1900


In [12]:
def dollar_imbalance_bar(threshold, df):

    df['price_change'] = df['price'].diff()
    df['tick_imbalance'] = np.sign(df['price_change'])
    df.dropna(inplace=True)
    df['tick_imbalance'] = df['tick_imbalance'].replace(0, np.nan)
    df['tick_imbalance'] = df['tick_imbalance'].ffill()
    df['dollar_imbalance'] = df['tick_imbalance'] * df['price'] * df['volume']
    print(df.head())

    # Przygotowanie zmiennych
    current_imbalance = 0
    bar_indices = []
    bar_prices = []
    bar_volumes = []
    final_bars = []

    # Lecimy przez każdy wiersz (tqdm doda pasek postępu, jeśli masz, jak nie to olej)
    for i, imb, c_pr, vol in zip(df.index, df['dollar_imbalance'].values, df['price'].values, df['volume'].values):
    
        # KROK 1: Dodaj aktualną nierównowagę do licznika
        current_imbalance += imb

        bar_prices.append(c_pr)
        bar_volumes.append(vol)

        # KROK 2: Sprawdź, czy przekroczyliśmy próg
        # Wewnątrz pętli for, w warunku zamknięcia świecy:
        if abs(current_imbalance) > threshold:
    
            # Krok 1: Konwersja list na tablice NumPy dla szybkiej agregacji
            np_prices = np.array(bar_prices)
            np_volumes = np.array(bar_volumes)
    
            # Krok 2: Zapisanie wyniku do słownika i listy finalnej
            final_bars.append({
                'datetime': i,
                'Open': bar_prices[0],
                'High': np.max(np_prices),
                'Low': np.min(np_prices),
                'Close': bar_prices[-1],
                'Volume': np.sum(np_volumes) # Uzupełnij funkcje NumPy
            })
    
            # Krok 3: Reset liczników i list (już masz)
            current_imbalance = 0
            bar_prices = []
            bar_volumes = []

    dollar_imb_bar = pd.DataFrame(final_bars)
    dollar_imb_bar.set_index('datetime', inplace=True)

    return dollar_imb_bar

dollar_imb_bars = dollar_imbalance_bar(10_000_000, df)
dollar_imb_bars

         date      time  price    bid    ask  volume            datetime  \
1  09/28/2009  09:30:00  50.71  50.70  50.79     638 2009-09-28 09:30:00   
2  09/28/2009  09:31:32  50.75  50.75  50.76     100 2009-09-28 09:31:32   
3  09/28/2009  09:31:32  50.75  50.75  50.76     100 2009-09-28 09:31:32   
4  09/28/2009  09:31:33  50.75  50.75  50.76     100 2009-09-28 09:31:33   
5  09/28/2009  09:31:33  50.75  50.75  50.76     100 2009-09-28 09:31:33   

   price_change  tick_imbalance  dollar_imbalance  
1         -0.08            -1.0         -32352.98  
2          0.04             1.0           5075.00  
3          0.00             1.0           5075.00  
4          0.00             1.0           5075.00  
5          0.00             1.0           5075.00  


Unnamed: 0_level_0,Open,High,Low,Close,Volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
27387,50.7100,52.7440,49.1900,52.7218,6485321
59794,52.7100,53.0822,49.1800,49.8100,7189396
64146,49.8110,50.0800,48.9300,49.4200,1385930
75735,49.4200,52.3500,49.2300,52.3300,2988676
98959,52.2600,53.2100,51.4200,53.0580,5767149
...,...,...,...,...,...
12073739,206.1100,208.3200,205.9400,208.2999,235094
12080120,208.2800,209.5700,207.1823,208.9800,1286414
12082297,208.9800,211.1900,208.5100,210.9123,545154
12083896,210.9215,211.8300,210.6910,211.0700,399240


In [35]:
def create_dollar_imbalance_bars(data, expected_T=100, ewma_window=100, warmup_bars=20):
    bars = []
    
    use_index = isinstance(data.index, pd.DatetimeIndex)
    
    # Akumulatory
    theta = 0
    prev_price = None
    prev_b = 1
    tick_count = 0
    
    # EWMA tracking z warm-up
    ewma_theta = []
    bars_created = 0  # licznik barów
    
    # Bar accumulators
    open_price = None
    high_price = float('-inf')
    low_price = float('inf')
    close_price = None
    volume_sum = 0
    end_time = None
    
    # Oblicz initial E_theta z pierwszych N ticków
    initial_sample = min(1000, len(data) // 10)
    initial_bvs = []
    temp_price = None
    temp_b = 1
    
    for i in range(initial_sample):
        row = data.iloc[i]
        price = float(row['Close'])
        volume = float(row['Volume'])
        
        if temp_price is None:
            b_t = 1
        elif price > temp_price:
            b_t = 1
        elif price < temp_price:
            b_t = -1
        else:
            b_t = temp_b
        
        initial_bvs.append(abs(b_t * volume * price))
        temp_price = price
        temp_b = b_t
    
    # Initial E_theta z sample
    E_theta_init = np.mean(initial_bvs) if initial_bvs else 100000
    
    for i in range(len(data)):
        row = data.iloc[i]
        timestamp = row.name if use_index else row['Datetime']
        
        price = float(row['Close'])
        volume = float(row['Volume'])
        
        if tick_count == 0:
            open_price = float(row['Open'])
        
        high_price = max(high_price, float(row['High']))
        low_price = min(low_price, float(row['Low']))
        close_price = price
        volume_sum += volume
        end_time = timestamp
        
        # Tick rule
        if prev_price is None:
            b_t = 1
        elif price > prev_price:
            b_t = 1
        elif price < prev_price:
            b_t = -1
        else:
            b_t = prev_b
        
        signed_dollar = b_t * volume * price
        theta += signed_dollar
        tick_count += 1
        
        # Adaptive threshold z warm-up protection
        if bars_created < warmup_bars:
            # Podczas warm-up używaj initial estimate
            E_theta = E_theta_init
        else:
            # Po warm-up używaj EWMA
            E_theta = np.mean(np.abs(ewma_theta[-ewma_window:])) if len(ewma_theta) > ewma_window//2 else E_theta_init
        
        threshold = expected_T * E_theta
        
        if abs(theta) >= threshold:
            bar = {
                'datetime': end_time,
                'open': open_price,
                'high': high_price,
                'low': low_price,
                'close': close_price,
                'volume': volume_sum,
                'imbalance': theta,
                'tick_count': tick_count
            }
            bars.append(bar)
            
            # Store dla EWMA
            ewma_theta.append(theta)
            bars_created += 1
            
            # Reset
            theta = 0
            tick_count = 0
            open_price = None
            high_price = float('-inf')
            low_price = float('inf')
            volume_sum = 0
            
        prev_price = price
        prev_b = b_t
    
    # Last partial bar
    if tick_count > 0:
        bar = {
            'datetime': end_time,
            'open': open_price,
            'high': high_price,
            'low': low_price,
            'close': close_price,
            'volume': volume_sum,
            'imbalance': theta,
            'tick_count': tick_count
        }
        bars.append(bar)
    
    return pd.DataFrame(bars)




dollar_imb_bars = create_dollar_imbalance_bars(data, expected_T=50, ewma_window=100, warmup_bars=20)  

print(dollar_imb_bars)
print(dollar_imb_bars.describe())


                    datetime     open     high      low    close     volume  \
0  2025-10-20 07:11:00+00:00  6718.75  6732.75  6694.00  6732.50    79020.0   
1  2025-10-20 13:00:00+00:00  6732.50  6737.50  6711.50  6734.25    81553.0   
2  2025-10-20 13:30:00+00:00  6734.50  6742.50  6732.00  6742.25    39237.0   
3  2025-10-20 13:31:00+00:00  6742.25  6744.50  6741.75  6744.00     8758.0   
4  2025-10-20 13:33:00+00:00  6744.25  6748.50  6743.00  6745.25    12034.0   
5  2025-10-20 13:35:00+00:00  6745.25  6748.25  6743.50  6748.00     9520.0   
6  2025-10-20 13:43:00+00:00  6748.00  6751.50  6743.75  6749.75    35899.0   
7  2025-10-20 13:47:00+00:00  6749.75  6753.25  6746.00  6753.00    15893.0   
8  2025-10-20 14:09:00+00:00  6753.00  6757.50  6744.75  6757.50    77353.0   
9  2025-10-20 14:20:00+00:00  6757.50  6758.25  6750.50  6752.50    30553.0   
10 2025-10-20 14:24:00+00:00  6752.75  6758.25  6751.50  6757.75     9161.0   
11 2025-10-20 14:27:00+00:00  6757.75  6763.50  6757

In [37]:
def analyze_serial_correlation(bars, bar_name):

    bars["returns"] = np.log(bars["close"] / bars["close"].shift(1))

    bars = bars.dropna(subset=['returns'])

    lag1_corr = bars['returns'].autocorr(lag=1)

    lags = range(1, 21)
    autocorrs = [bars['returns'].autocorr(lag=i) for i in lags]

    print(f"\n{bar_name}:")
    print(f"  Lag-1 autocorrelation: {lag1_corr:.6f}")
    print(f"  Mean return: {bars['returns'].mean():.6f}")
    print(f"  Std return: {bars['returns'].std():.6f}")
    
    return lag1_corr, autocorrs

dollar_corr, dollar_acf = analyze_serial_correlation(dollar_bars.copy(), "Dollar Bars")
dollar_imb_corr, dollar_imb_acf = analyze_serial_correlation(dollar_imb_bars.copy(), "Dollar Imbalance Bars")

print(f"\nDollar bars: |autocorr| = {abs(dollar_corr):.6f}")
print(f"Dollar imbalance bars: |autocorr| = {abs(dollar_imb_corr):.6f}")
print(f"Improvement: {(abs(dollar_corr) - abs(dollar_imb_corr)):.6f}")



Dollar Bars:
  Lag-1 autocorrelation: -0.003928
  Mean return: 0.000313
  Std return: 0.001793

Dollar Imbalance Bars:
  Lag-1 autocorrelation: -0.096981
  Mean return: 0.000273
  Std return: 0.001331

Dollar bars: |autocorr| = 0.003928
Dollar imbalance bars: |autocorr| = 0.096981
Improvement: -0.093053


In [38]:
# next step - try with tick data because 1 min olhc data is bad for theese excercises