In [3]:
import yfinance as yf
from datetime import datetime, timedelta
import pandas as pd
import numpy as np

### Excercise 2.2: 
On a series of E-mini S&P 500 futures tick data:

**(a)** Form tick, volume, and dollar bars. Use the ETF Trick to deal with the roll.

#### The Kaggle/Finnhub tick dataset was rejected due to missing data during core NYSE trading hours (9:30-16:00 ET), with volume concentrated in overnight sessions instead. Yahoo Finance 1-minute data provides complete coverage of regular trading hours with proper volume distribution.

In [61]:
data = yf.download("ES=F", start=datetime.now()-timedelta(days=7), 
                   end=datetime.now(), interval="1m", progress=False, multi_level_index=False)

  data = yf.download("ES=F", start=datetime.now()-timedelta(days=7),


In [62]:
data

Unnamed: 0_level_0,Close,High,Low,Open,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-10-19 22:00:00+00:00,6720.00,6726.00,6717.50,6721.00,0
2025-10-19 22:01:00+00:00,6718.50,6721.50,6718.50,6720.75,269
2025-10-19 22:02:00+00:00,6717.50,6721.00,6716.50,6718.25,435
2025-10-19 22:03:00+00:00,6716.50,6719.00,6715.50,6717.75,291
2025-10-19 22:04:00+00:00,6715.75,6718.00,6715.50,6716.50,281
...,...,...,...,...,...
2025-10-24 14:48:00+00:00,6830.25,6831.25,6829.75,6831.00,2337
2025-10-24 14:49:00+00:00,6830.25,6830.75,6828.50,6830.25,2161
2025-10-24 14:50:00+00:00,6829.50,6830.75,6828.25,6830.50,2908
2025-10-24 14:51:00+00:00,6828.00,6829.75,6827.00,6829.50,3408


#### ------------------Tick Bar------------------

In [89]:

print("Index name:", data.index.name)
print("Columns:", data.columns.tolist())
print("\nFirst row:")
print(data.head(1))

def create_tick_bars(data, tick_threshold=100):
    """
    Create tick bars
    data: standard data in yfinance format
    tick_treshold: how much "ticks" should equal one tick bar

    """
    tick_bars = []
    

    if 'Datetime' in data.columns:
        time_col = 'Datetime'
        use_index = False
    elif data.index.name == 'Datetime' or isinstance(data.index, pd.DatetimeIndex):
        use_index = True
    else:
        data = data.reset_index()
        time_col = data.columns[0]
        use_index = False
    
    for i in range(0, len(data), tick_threshold):
        bar_data = data.iloc[i:i+tick_threshold]
        
        if len(bar_data) == 0:
            continue
        
        tick_bar = {
            'datetime': bar_data.index[-1] if use_index else bar_data[time_col].iloc[-1],
            'open': bar_data['Open'].iloc[0],
            'high': bar_data['High'].max(),
            'low': bar_data['Low'].min(),
            'close': bar_data['Close'].iloc[-1],
            'volume': bar_data['Volume'].sum(),
            'num_ticks': len(bar_data)
        }
        tick_bars.append(tick_bar)
    
    return pd.DataFrame(tick_bars)

tick_bars = create_tick_bars(data, tick_threshold=100)

print(f"\nCreated {len(tick_bars)} tick bars")
print(f"Each tick bar = 100 1-min bars\n")
print(tick_bars.head())


Index name: Datetime
Columns: ['Close', 'High', 'Low', 'Open', 'Volume']

First row:
                            Close    High     Low    Open  Volume
Datetime                                                         
2025-10-19 22:00:00+00:00  6720.0  6726.0  6717.5  6721.0       0

Created 65 tick bars
Each tick bar = 100 1-min bars

                   datetime     open     high      low    close  volume  \
0 2025-10-19 23:39:00+00:00  6721.00  6726.00  6702.00  6704.25   16419   
1 2025-10-20 01:19:00+00:00  6704.75  6719.00  6694.00  6718.75   23652   
2 2025-10-20 02:59:00+00:00  6718.75  6720.75  6709.25  6715.00   12889   
3 2025-10-20 04:40:00+00:00  6715.00  6728.50  6715.00  6722.75    9103   
4 2025-10-20 06:20:00+00:00  6722.75  6732.00  6722.25  6725.75    9925   

   num_ticks  
0        100  
1        100  
2        100  
3        100  
4        100  


#### ------------------Volume Bar------------------

In [90]:
def create_volume_bars(data, volume_threshold=100000):
    """
    Create volume bars according to AFML methodology
    Each bar contains fixed amount of volume
    """
    volume_bars = []
    

    use_index = isinstance(data.index, pd.DatetimeIndex)
    
    # Initialize accumulators
    volume_sum = 0
    open_price = None
    high_price = float('-inf')
    low_price = float('inf')
    close_price = None
    start_time = None
    end_time = None
    
    for i in range(len(data)):
        row = data.iloc[i]
        
        # Get timestamp
        timestamp = row.name if use_index else row['Datetime']
        
        # First tick in new bar
        if volume_sum == 0:
            open_price = float(row['Open'])
            start_time = timestamp
        
        # Accumulate OHLC
        high_price = max(high_price, float(row['High']))
        low_price = min(low_price, float(row['Low']))
        close_price = float(row['Close'])
        end_time = timestamp
        
      
        volume_sum += float(row['Volume'])
        
        # Check if threshold reached
        if volume_sum >= volume_threshold:
            volume_bar = {
                'datetime': end_time,
                'open': open_price,
                'high': high_price,
                'low': low_price,
                'close': close_price,
                'volume': volume_sum
            }
            volume_bars.append(volume_bar)
            
            # Reset for next bar
            volume_sum = 0
            open_price = None
            high_price = float('-inf')
            low_price = float('inf')
            close_price = None
    
    # Handle last partial bar (optional)
    if volume_sum > 0:
        volume_bar = {
            'datetime': end_time,
            'open': open_price,
            'high': high_price,
            'low': low_price,
            'close': close_price,
            'volume': volume_sum
        }
        volume_bars.append(volume_bar)
    
    return pd.DataFrame(volume_bars)

volume_bars = create_volume_bars(data, volume_threshold=5000)

print(f"Created {len(volume_bars)} volume bars")
print(f"Each bar has ~{50000:,} volume\n")
print(volume_bars.head(10))
print("\nVolume stats:")
print(volume_bars['volume'].describe())


Created 886 volume bars
Each bar has ~50,000 volume

                   datetime     open     high      low    close  volume
0 2025-10-19 22:20:00+00:00  6720.75  6726.00  6702.50  6716.50  5043.0
1 2025-10-19 23:00:00+00:00  6716.50  6717.75  6712.75  6715.25  5101.0
2 2025-10-19 23:28:00+00:00  6715.25  6715.75  6702.00  6702.25  5163.0
3 2025-10-20 00:03:00+00:00  6702.75  6706.75  6701.50  6703.25  5060.0
4 2025-10-20 00:24:00+00:00  6703.50  6704.00  6694.00  6700.25  5845.0
5 2025-10-20 00:40:00+00:00  6700.25  6706.25  6696.75  6704.75  5365.0
6 2025-10-20 00:58:00+00:00  6704.75  6716.25  6703.25  6715.75  5077.0
7 2025-10-20 01:30:00+00:00  6716.00  6720.75  6713.75  6716.75  5095.0
8 2025-10-20 01:54:00+00:00  6716.50  6718.00  6710.00  6711.00  5049.0
9 2025-10-20 02:49:00+00:00  6711.00  6715.75  6709.25  6715.50  5129.0

Volume stats:
count      886.000000
mean      6866.381490
std       3137.043773
min       3408.000000
25%       5350.750000
50%       6144.500000
75%     

In [91]:
data.describe()

Unnamed: 0,Close,High,Low,Open,Volume
count,6460.0,6460.0,6460.0,6460.0,6460.0
mean,6761.861571,6762.584172,6761.097175,6761.85743,941.735913
std,26.338885,26.211969,26.449403,26.323001,1935.024522
min,6692.0,6695.75,6690.75,6692.25,0.0
25%,6739.75,6740.5,6738.9375,6739.75,78.0
50%,6771.0,6771.5,6770.25,6771.0,172.0
75%,6780.5,6781.0,6780.0,6780.5,1107.25
max,6835.25,6837.25,6833.75,6835.25,48685.0


#### ------------------Dollar Bar------------------

In [93]:
def create_dollar_bars(data, dollar_threshold=10000000):
    """
    Create dollar bars according to AFML methodology
    Each bar contains fixed amount of dollar value (volume × price)
    """
    dollar_bars = []
    
    # Check if Datetime is index or column
    use_index = isinstance(data.index, pd.DatetimeIndex)
    
    # Initialize accumulators
    dollar_sum = 0 # dollar instead of volume
    open_price = None
    high_price = float('-inf')
    low_price = float('inf')
    close_price = None
    volume_sum = 0
    end_time = None
    
    for i in range(len(data)):
        row = data.iloc[i]
        
        # Get timestamp
        timestamp = row.name if use_index else row['Datetime']
        
        # First tick in new bar
        if dollar_sum == 0:
            open_price = float(row['Open'])
        
        # Accumulate OHLC
        high_price = max(high_price, float(row['High']))
        low_price = min(low_price, float(row['Low']))
        close_price = float(row['Close'])
        end_time = timestamp
        
        # Accumulate volume
        volume_sum += float(row['Volume'])
        
        # Dollar value = Volume × Price 
        dollar_value = float(row['Volume']) * float(row['Close'])
        dollar_sum += dollar_value
        
        # Check if threshold reached
        if dollar_sum >= dollar_threshold:
            dollar_bar = {
                'datetime': end_time,
                'open': open_price,
                'high': high_price,
                'low': low_price,
                'close': close_price,
                'volume': volume_sum,
                'dollar_value': dollar_sum 
            }
            dollar_bars.append(dollar_bar)
            
            # Reset for next bar
            dollar_sum = 0
            volume_sum = 0
            open_price = None
            high_price = float('-inf')
            low_price = float('inf')
    
    # Handle last partial bar
    if dollar_sum > 0:
        dollar_bar = {
            'datetime': end_time,
            'open': open_price,
            'high': high_price,
            'low': low_price,
            'close': close_price,
            'volume': volume_sum,
            'dollar_value': dollar_sum
        }
        dollar_bars.append(dollar_bar)
    
    return pd.DataFrame(dollar_bars)

# Użycie
dollar_bars = create_dollar_bars(data, dollar_threshold=100_000_000)  # $10M per bar

print(f"Created {len(dollar_bars)} dollar bars")
print(f"Each bar has ~${10_000_000:,} traded\n")
print(dollar_bars.head(10))
print("\nDollar value stats:")
print(dollar_bars['dollar_value'].describe())


Created 358 dollar bars
Each bar has ~$10,000,000 traded

                   datetime     open     high      low    close   volume  \
0 2025-10-19 23:27:00+00:00  6720.75  6726.00  6702.25  6704.00  15012.0   
1 2025-10-20 00:37:00+00:00  6704.25  6706.75  6694.00  6700.50  15137.0   
2 2025-10-20 01:44:00+00:00  6700.25  6720.75  6700.25  6717.75  15016.0   
3 2025-10-20 04:17:00+00:00  6717.75  6727.25  6709.25  6724.75  14896.0   
4 2025-10-20 06:45:00+00:00  6724.75  6732.00  6722.00  6726.25  15201.0   
5 2025-10-20 07:54:00+00:00  6726.50  6737.50  6724.75  6731.00  14931.0   
6 2025-10-20 09:10:00+00:00  6731.00  6732.50  6717.25  6721.25  15004.0   
7 2025-10-20 10:45:00+00:00  6721.75  6727.00  6718.25  6718.50  15094.0   
8 2025-10-20 11:46:00+00:00  6718.50  6722.25  6711.50  6719.25  14943.0   
9 2025-10-20 12:46:00+00:00  6719.25  6731.75  6718.50  6731.50  16778.0   

   dollar_value  
0  1.007975e+08  
1  1.014327e+08  
2  1.008085e+08  
3  1.000538e+08  
4  1.022600e+08

**(b)** Count the number of bars produced by tick, volume, and dollar bars on a weekly basis. Plot a time series of that bar count. What bar type produces the most stable weekly count? Why?

**(c)** Compute the serial correlation of returns for the three bar types. What bar method has the lowest serial correlation?