In [6]:
import pandas as pd
import datetime
from databento import DBNStore
import databento as db
import zstandard as zstd
import pandas as pd
import io
import pickle as pkl
import numpy as np
import time

In [8]:
# --- Configuration ----------------------------------------------------------
# Machine-specific locations and tunables live here so the cell can be
# re-pointed at another dataset without touching the logic below.
DATA_DIR = '/Users/jakesharadin/Desktop/blockhouse/data'
MERGED_PICKLE_PATH = '/Users/jakesharadin/Desktop/xnas_itch_merged_dataframe.pkl'
TRADING_DAYS = ['20241202', '20241203', '20241204', '20241205']
MAX_RETRIES = 3
RETRY_DELAY_SECONDS = 5

# Define file paths (one MBP-10 DBN file per trading day)
file_paths = [f'{DATA_DIR}/xnas-itch-{day}.mbp-10.dbn.zst' for day in TRADING_DAYS]


def _elapsed(started):
    """Return the time elapsed since `started` (a time.perf_counter() reading).

    The result is a datetime.timedelta, so it prints as H:MM:SS.ffffff.
    perf_counter is monotonic, unlike wall-clock datetime.utcnow() (which is
    also deprecated since Python 3.12).
    """
    return datetime.timedelta(seconds=time.perf_counter() - started)


def load_dbn_frame(file_path, max_retries=MAX_RETRIES, retry_delay=RETRY_DELAY_SECONDS):
    """Read one DBN file into a DataFrame, retrying when a TimeoutError occurs.

    Parameters
    ----------
    file_path : str
        Path of the .dbn.zst file to read.
    max_retries : int
        Maximum number of attempts that may end in TimeoutError.
    retry_delay : float
        Seconds to wait between attempts.

    Returns
    -------
    pandas.DataFrame or None
        The decoded frame, or None when the file could not be loaded. Every
        failure is printed; nothing is raised, so one bad file does not stop
        the remaining files from loading (best-effort, as before).
    """
    for attempt in range(1, max_retries + 1):
        started = time.perf_counter()
        try:
            frame = DBNStore.from_file(path=file_path).to_df()
        except TimeoutError:
            print(f"Timeout occurred. Attempt {attempt} of {max_retries}")
            if attempt < max_retries:
                time.sleep(retry_delay)  # no point sleeping after the final attempt
        except Exception as e:
            # Non-timeout errors are not retried.
            print(f"Error processing {file_path}: {str(e)}")
            return None
        else:
            print(f"Shape: {frame.shape}")
            print(f"Time taken: {_elapsed(started)}")
            return frame

    # Previously a file that timed out on every attempt was dropped silently.
    print(f"Giving up on {file_path} after {max_retries} timeouts; "
          "it will be missing from the merged data.")
    return None


def merge_frames(frames):
    """Concatenate `frames` row-wise and order the rows by 'ts_event'.

    The sort is stable, so rows that share the same 'ts_event' keep the
    relative order they had in the input frames. The result has a fresh
    0..n-1 integer index.

    NOTE(review): ignore_index=True discards each input frame's own index.
    If DBNStore.to_df() stores a timestamp (e.g. ts_recv) in the index, that
    information is lost here -- confirm this is intended.
    """
    merged = pd.concat(frames, axis=0, ignore_index=True)
    return merged.sort_values('ts_event', kind='stable').reset_index(drop=True)


def add_date_column(frame):
    """Add a 'date' column holding the calendar date of 'ts_event' (in place).

    Returns the same frame for convenience.
    """
    frame['date'] = pd.to_datetime(frame['ts_event']).dt.date
    return frame


# Load each file; frames that failed to load are skipped.
dfs = []
for file_path in file_paths:
    print(f"\nProcessing {file_path.split('/')[-1]}...")
    loaded = load_dbn_frame(file_path)
    if loaded is not None:
        dfs.append(loaded)

if dfs:
    if len(dfs) < len(file_paths):
        print(f"\nWarning: only {len(dfs)} of {len(file_paths)} files loaded; "
              "the merged data is incomplete.")

    # Concatenate all dataframes and sort by timestamp
    print("\nMerging dataframes...")
    merge_started = time.perf_counter()
    df = merge_frames(dfs)

    print(f"\nFinal merged dataframe shape: {df.shape}")
    print(f"Time taken for merge: {_elapsed(merge_started)}")

    # Save merged dataframe. This happens before 'date' is added, so the
    # file on disk keeps the raw merged schema.
    print("\nSaving merged dataframe...")
    df.to_pickle(MERGED_PICKLE_PATH)
    print("Done!")

    # Print some basic info about the merged data
    print("\nMerged data summary:")
    print(f"Date range: {df['ts_event'].min()} to {df['ts_event'].max()}")
    print("\nSample counts by day:")
    add_date_column(df)
    print(df['date'].value_counts().sort_index())
else:
    print("No data was successfully loaded!")

    # Fall back to the previously saved merge. Keying this on `dfs` rather
    # than on whether a `df` variable exists means a stale `df` left in the
    # kernel by an earlier run cannot mask the failed load.
    # NOTE: unpickling executes code embedded in the file; only load pickles
    # that this notebook wrote.
    try:
        print("\nAttempting to load existing merged pickle file...")
        df = pd.read_pickle(MERGED_PICKLE_PATH)
    except FileNotFoundError:
        print("No existing merged pickle file found!")
    else:
        # The pickle is written without 'date'; add it so the frame has the
        # same columns as one produced by a fresh load.
        if 'date' not in df.columns:
            add_date_column(df)
        print("Successfully loaded merged dataframe from pickle file!")
        print(f"Shape: {df.shape}")


Processing xnas-itch-20241202.mbp-10.dbn.zst...
Timeout occurred. Attempt 1 of 3
Shape: (7361265, 73)
Time taken: 0:05:58.421583

Processing xnas-itch-20241203.mbp-10.dbn.zst...
Shape: (6555034, 73)
Time taken: 0:04:14.510654

Processing xnas-itch-20241204.mbp-10.dbn.zst...
Shape: (7113749, 73)
Time taken: 0:04:43.655144

Processing xnas-itch-20241205.mbp-10.dbn.zst...
Shape: (7390170, 73)
Time taken: 0:04:30.874042

Merging dataframes...

Final merged dataframe shape: (28420218, 73)
Time taken for merge: 0:04:14.998675

Saving merged dataframe...
Done!

Merged data summary:
Date range: 2024-12-02 09:00:00.006056124+00:00 to 2024-12-05 23:59:53.483826051+00:00

Sample counts by day:
date
2024-12-02    7361265
2024-12-03    6555034
2024-12-04    7113749
2024-12-05    7390170
Name: count, dtype: int64


In [9]:
# Preview the first five rows of the merged frame (rendered as the cell's rich output).
df.head(n=5)

Unnamed: 0,ts_event,rtype,publisher_id,instrument_id,action,side,depth,price,size,flags,...,bid_ct_08,ask_ct_08,bid_px_09,ask_px_09,bid_sz_09,ask_sz_09,bid_ct_09,ask_ct_09,symbol,date
0,2024-12-02 09:00:00.006056124+00:00,10,2,12463,A,N,0,164.97,100,130,...,0,0,,,0,0,0,0,PEP,2024-12-02
1,2024-12-02 09:00:00.009936778+00:00,10,2,17709,A,N,0,117.18,100,130,...,0,0,,,0,0,0,0,XOM,2024-12-02
2,2024-12-02 09:00:00.009975632+00:00,10,2,17709,A,A,0,118.62,100,128,...,0,0,,,0,0,0,0,XOM,2024-12-02
3,2024-12-02 09:00:00.073883926+00:00,10,2,16244,A,N,0,960.0,25,128,...,0,0,,,0,0,0,0,TSLA,2024-12-02
4,2024-12-02 09:00:00.074013500+00:00,10,2,11667,A,N,0,132.11,500,130,...,0,0,,,0,0,0,0,NVDA,2024-12-02


In [10]:
# Show (rows, columns) of the merged frame. The loading cell printed 73 columns
# right after the merge and then added the 'date' column, which accounts for
# the 74 columns in the recorded output.
df.shape

(28420218, 74)

In [14]:
# Snapshot `df`, in whatever state it has at this point in the notebook, to
# 'df.pkl' in the current working directory using the default pickle protocol.
with open('df.pkl', mode='wb') as pickle_sink:
    pkl.Pickler(pickle_sink).dump(df)