### 2.2 On a series of E-mini S&P 500 futures tick data, compute dollar bars and dollar imbalance bars. What bar type exhibits greater serial autocorrelation? Why?

In [6]:
import pandas as pd
import yfinance as yf
import numpy as np
from datetime import datetime, timedelta

In [7]:
data = yf.download("ES=F", start=datetime.now()-timedelta(days=7), 
                   end=datetime.now(), interval="1m", progress=False, multi_level_index=False, auto_adjust=True)

In [8]:
data

Unnamed: 0_level_0,Close,High,Low,Open,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-10-19 22:06:00+00:00,6718.00,6719.00,6716.50,6718.75,0
2025-10-19 22:07:00+00:00,6717.75,6718.00,6716.75,6718.00,67
2025-10-19 22:08:00+00:00,6717.25,6718.25,6716.75,6718.00,223
2025-10-19 22:09:00+00:00,6718.75,6719.50,6717.50,6717.75,353
2025-10-19 22:10:00+00:00,6719.00,6719.00,6718.00,6718.75,247
...,...,...,...,...,...
2025-10-24 20:55:00+00:00,6825.75,6825.75,6825.50,6825.75,184
2025-10-24 20:56:00+00:00,6825.75,6826.00,6825.50,6825.50,190
2025-10-24 20:57:00+00:00,6825.25,6825.75,6825.25,6825.75,75
2025-10-24 20:58:00+00:00,6826.00,6826.00,6825.25,6825.50,264


In [31]:
def create_dollar_bars(data, dollar_threshold=10000000):
    """
    Create dollar bars according to AFML methodology
    Each bar contains fixed amount of dollar value (volume × price)
    """
    dollar_bars = []
    
    # Check if Datetime is index or column
    use_index = isinstance(data.index, pd.DatetimeIndex)
    
    # Initialize accumulators
    dollar_sum = 0 # dollar instead of volume
    open_price = None
    high_price = float('-inf')
    low_price = float('inf')
    close_price = None
    volume_sum = 0
    end_time = None
    
    for i in range(len(data)):
        row = data.iloc[i]
        
        # Get timestamp
        timestamp = row.name if use_index else row['Datetime']
        
        # First tick in new bar
        if dollar_sum == 0:
            open_price = float(row['Open'])
        
        # Accumulate OHLC
        high_price = max(high_price, float(row['High']))
        low_price = min(low_price, float(row['Low']))
        close_price = float(row['Close'])
        end_time = timestamp
        
        # Accumulate volume
        volume_sum += float(row['Volume'])
        
        # Dollar value = Volume × Price 
        dollar_value = float(row['Volume']) * float(row['Close'])
        dollar_sum += dollar_value
        
        # Check if threshold reached
        if dollar_sum >= dollar_threshold:
            dollar_bar = {
                'datetime': end_time,
                'open': open_price,
                'high': high_price,
                'low': low_price,
                'close': close_price,
                'volume': volume_sum,
                'dollar_value': dollar_sum 
            }
            dollar_bars.append(dollar_bar)
            
            # Reset for next bar
            dollar_sum = 0
            volume_sum = 0
            open_price = None
            high_price = float('-inf')
            low_price = float('inf')
    
    # Handle last partial bar
    if dollar_sum > 0:
        dollar_bar = {
            'datetime': end_time,
            'open': open_price,
            'high': high_price,
            'low': low_price,
            'close': close_price,
            'volume': volume_sum,
            'dollar_value': dollar_sum
        }
        dollar_bars.append(dollar_bar)
    
    return pd.DataFrame(dollar_bars)

dollar_bars = create_dollar_bars(data, dollar_threshold=1_000_000_000)  # $10M per bar

print(dollar_bars)
print(dollar_bars.describe())


                    datetime     open     high      low    close    volume  \
0  2025-10-20 12:46:00+00:00  6718.00  6737.50  6694.00  6731.50  150166.0   
1  2025-10-20 13:52:00+00:00  6731.25  6754.75  6729.75  6752.00  148801.0   
2  2025-10-20 14:40:00+00:00  6752.00  6767.25  6744.75  6766.00  150399.0   
3  2025-10-20 15:44:00+00:00  6766.25  6775.50  6763.25  6768.75  147960.0   
4  2025-10-20 16:54:00+00:00  6768.75  6774.00  6762.25  6769.75  147933.0   
5  2025-10-20 18:46:00+00:00  6769.75  6783.00  6769.00  6773.25  150233.0   
6  2025-10-20 19:58:00+00:00  6773.25  6782.50  6772.50  6773.50  152268.0   
7  2025-10-21 07:28:00+00:00  6773.50  6785.00  6764.25  6768.25  147659.0   
8  2025-10-21 13:43:00+00:00  6768.25  6777.75  6763.25  6770.25  149017.0   
9  2025-10-21 14:19:00+00:00  6770.25  6775.25  6759.25  6774.50  152677.0   
10 2025-10-21 14:57:00+00:00  6774.50  6784.50  6762.00  6783.50  148289.0   
11 2025-10-21 15:48:00+00:00  6783.50  6787.25  6773.75  6783.50

In [35]:
def create_dollar_imbalance_bars(data, expected_T=100, ewma_window=100, warmup_bars=20):
    bars = []
    
    use_index = isinstance(data.index, pd.DatetimeIndex)
    
    # Akumulatory
    theta = 0
    prev_price = None
    prev_b = 1
    tick_count = 0
    
    # EWMA tracking z warm-up
    ewma_theta = []
    bars_created = 0  # licznik barów
    
    # Bar accumulators
    open_price = None
    high_price = float('-inf')
    low_price = float('inf')
    close_price = None
    volume_sum = 0
    end_time = None
    
    # Oblicz initial E_theta z pierwszych N ticków
    initial_sample = min(1000, len(data) // 10)
    initial_bvs = []
    temp_price = None
    temp_b = 1
    
    for i in range(initial_sample):
        row = data.iloc[i]
        price = float(row['Close'])
        volume = float(row['Volume'])
        
        if temp_price is None:
            b_t = 1
        elif price > temp_price:
            b_t = 1
        elif price < temp_price:
            b_t = -1
        else:
            b_t = temp_b
        
        initial_bvs.append(abs(b_t * volume * price))
        temp_price = price
        temp_b = b_t
    
    # Initial E_theta z sample
    E_theta_init = np.mean(initial_bvs) if initial_bvs else 100000
    
    for i in range(len(data)):
        row = data.iloc[i]
        timestamp = row.name if use_index else row['Datetime']
        
        price = float(row['Close'])
        volume = float(row['Volume'])
        
        if tick_count == 0:
            open_price = float(row['Open'])
        
        high_price = max(high_price, float(row['High']))
        low_price = min(low_price, float(row['Low']))
        close_price = price
        volume_sum += volume
        end_time = timestamp
        
        # Tick rule
        if prev_price is None:
            b_t = 1
        elif price > prev_price:
            b_t = 1
        elif price < prev_price:
            b_t = -1
        else:
            b_t = prev_b
        
        signed_dollar = b_t * volume * price
        theta += signed_dollar
        tick_count += 1
        
        # Adaptive threshold z warm-up protection
        if bars_created < warmup_bars:
            # Podczas warm-up używaj initial estimate
            E_theta = E_theta_init
        else:
            # Po warm-up używaj EWMA
            E_theta = np.mean(np.abs(ewma_theta[-ewma_window:])) if len(ewma_theta) > ewma_window//2 else E_theta_init
        
        threshold = expected_T * E_theta
        
        if abs(theta) >= threshold:
            bar = {
                'datetime': end_time,
                'open': open_price,
                'high': high_price,
                'low': low_price,
                'close': close_price,
                'volume': volume_sum,
                'imbalance': theta,
                'tick_count': tick_count
            }
            bars.append(bar)
            
            # Store dla EWMA
            ewma_theta.append(theta)
            bars_created += 1
            
            # Reset
            theta = 0
            tick_count = 0
            open_price = None
            high_price = float('-inf')
            low_price = float('inf')
            volume_sum = 0
            
        prev_price = price
        prev_b = b_t
    
    # Last partial bar
    if tick_count > 0:
        bar = {
            'datetime': end_time,
            'open': open_price,
            'high': high_price,
            'low': low_price,
            'close': close_price,
            'volume': volume_sum,
            'imbalance': theta,
            'tick_count': tick_count
        }
        bars.append(bar)
    
    return pd.DataFrame(bars)




dollar_imb_bars = create_dollar_imbalance_bars(data, expected_T=50, ewma_window=100, warmup_bars=20)  

print(dollar_imb_bars)
print(dollar_imb_bars.describe())


                    datetime     open     high      low    close     volume  \
0  2025-10-20 07:11:00+00:00  6718.75  6732.75  6694.00  6732.50    79020.0   
1  2025-10-20 13:00:00+00:00  6732.50  6737.50  6711.50  6734.25    81553.0   
2  2025-10-20 13:30:00+00:00  6734.50  6742.50  6732.00  6742.25    39237.0   
3  2025-10-20 13:31:00+00:00  6742.25  6744.50  6741.75  6744.00     8758.0   
4  2025-10-20 13:33:00+00:00  6744.25  6748.50  6743.00  6745.25    12034.0   
5  2025-10-20 13:35:00+00:00  6745.25  6748.25  6743.50  6748.00     9520.0   
6  2025-10-20 13:43:00+00:00  6748.00  6751.50  6743.75  6749.75    35899.0   
7  2025-10-20 13:47:00+00:00  6749.75  6753.25  6746.00  6753.00    15893.0   
8  2025-10-20 14:09:00+00:00  6753.00  6757.50  6744.75  6757.50    77353.0   
9  2025-10-20 14:20:00+00:00  6757.50  6758.25  6750.50  6752.50    30553.0   
10 2025-10-20 14:24:00+00:00  6752.75  6758.25  6751.50  6757.75     9161.0   
11 2025-10-20 14:27:00+00:00  6757.75  6763.50  6757

In [37]:
def analyze_serial_correlation(bars, bar_name):

    bars["returns"] = np.log(bars["close"] / bars["close"].shift(1))

    bars = bars.dropna(subset=['returns'])

    lag1_corr = bars['returns'].autocorr(lag=1)

    lags = range(1, 21)
    autocorrs = [bars['returns'].autocorr(lag=i) for i in lags]

    print(f"\n{bar_name}:")
    print(f"  Lag-1 autocorrelation: {lag1_corr:.6f}")
    print(f"  Mean return: {bars['returns'].mean():.6f}")
    print(f"  Std return: {bars['returns'].std():.6f}")
    
    return lag1_corr, autocorrs

dollar_corr, dollar_acf = analyze_serial_correlation(dollar_bars.copy(), "Dollar Bars")
dollar_imb_corr, dollar_imb_acf = analyze_serial_correlation(dollar_imb_bars.copy(), "Dollar Imbalance Bars")

print(f"\nDollar bars: |autocorr| = {abs(dollar_corr):.6f}")
print(f"Dollar imbalance bars: |autocorr| = {abs(dollar_imb_corr):.6f}")
print(f"Improvement: {(abs(dollar_corr) - abs(dollar_imb_corr)):.6f}")



Dollar Bars:
  Lag-1 autocorrelation: -0.003928
  Mean return: 0.000313
  Std return: 0.001793

Dollar Imbalance Bars:
  Lag-1 autocorrelation: -0.096981
  Mean return: 0.000273
  Std return: 0.001331

Dollar bars: |autocorr| = 0.003928
Dollar imbalance bars: |autocorr| = 0.096981
Improvement: -0.093053


In [38]:
# next step - try with tick data because 1 min olhc data is bad for theese excercises