In [2]:
from pathlib import Path
import pyarrow.parquet as pq

# -----------------------------------------------------------------------------
# Count the rows of one contract's "primary" tick file:
#   {base_dir}/{instr}/{contract}_primary.parquet
# -----------------------------------------------------------------------------
instr = "US10"
base_dir = Path("/mnt/nas/parquet_tick_data")
instr_dir = base_dir / instr

contract = "20250600"

path = instr_dir / f"{contract}_primary.parquet"

# The total row count is stored in the Parquet footer, so only the metadata is
# read here and no column data is loaded.  read_metadata() is used instead of
# binding a ParquetFile to a notebook global: that object holds the file open
# for as long as the name stays alive in the kernel.
num_rows = pq.read_metadata(str(path)).num_rows
print(f"Total rows in {path.name!r}: {num_rows}")


Total rows in '20250600_primary.parquet': 1657551


In [6]:
import datetime
from datetime import timezone, timedelta
from pathlib import Path

import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.compute as pc
import pandas as pd

# -----------------------------------------------------------------------------
# PARAMETERS: adjust these to match your environment
# -----------------------------------------------------------------------------
PARQUET_PATH = "/mnt/nas/parquet_tick_data"  # root of your storage
instr        = "US10"
contract     = "20250900"

# -----------------------------------------------------------------------------
# Compute the "last hour" window [one_hour_ago, now_utc) in UTC
# -----------------------------------------------------------------------------
now_utc      = datetime.datetime.now(timezone.utc)
one_hour_ago = now_utc - timedelta(hours=1)

# Convert to Arrow timestamp scalars (ns, UTC).
# The tz-aware datetimes are handed to Arrow directly.  Going through
# `int(dt.timestamp() * 1e9)` would push the value through a float64 that
# cannot represent epoch-nanoseconds exactly (they exceed 2**53), shifting the
# window bounds by up to a few hundred nanoseconds.
ts_type  = pa.timestamp("ns", tz="UTC")
start_ts = pa.scalar(one_hour_ago, type=ts_type)
end_ts   = pa.scalar(now_utc, type=ts_type)

# -----------------------------------------------------------------------------
# Build the path(s) to the daily single-file Parquet(s) covering the window:
#   PARQUET_PATH/{instr}/contract={contract}/{YYYYMMDD}.parquet
# -----------------------------------------------------------------------------
contract_dir     = Path(PARQUET_PATH) / instr / f"contract={contract}"
trading_date_str = now_utc.strftime("%Y%m%d")
daily_file       = contract_dir / f"{trading_date_str}.parquet"

# During the first hour after midnight UTC the window starts on the previous
# date, so that day's file has to be read as well; otherwise those ticks would
# be silently missing from the result.
window_dates    = sorted({one_hour_ago.date(), now_utc.date()})
candidate_files = [contract_dir / f"{d:%Y%m%d}.parquet" for d in window_dates]
window_files    = [f for f in candidate_files if f.exists()]

if not window_files:
    searched = ", ".join(str(f) for f in candidate_files)
    raise FileNotFoundError(f"No Parquet file found for the last-hour window: {searched}")

# -----------------------------------------------------------------------------
# 1) Create a Dataset over the existing daily file(s) (no hive partitions needed)
# -----------------------------------------------------------------------------
dataset = ds.dataset([str(f) for f in window_files], format="parquet")

# -----------------------------------------------------------------------------
# 2) Build a filter on the "timestamp" column (half-open interval)
# -----------------------------------------------------------------------------
time_filter = (
    (pc.field("timestamp") >= start_ts) &
    (pc.field("timestamp") <  end_ts)
)

# -----------------------------------------------------------------------------
# 3) Read only rows in [one_hour_ago, now)
# -----------------------------------------------------------------------------
filtered_table = dataset.to_table(filter=time_filter)

# -----------------------------------------------------------------------------
# 4) Convert to pandas and print all rows and the count
# -----------------------------------------------------------------------------
df_last_hour = filtered_table.to_pandas()

print(f"\n=== Ticks for {instr}/{contract} on {trading_date_str} from {one_hour_ago.isoformat()} to {now_utc.isoformat()} ===\n")
if df_last_hour.empty:
    print("No data in the last hour.")
else:
    # option_context applies the wide-open display settings only while the
    # frame is printed and then restores whatever values were active before,
    # even if printing raises.  Calling set_option/reset_option by hand would
    # instead leave the options changed on error and discard any settings the
    # user had made earlier in the session.
    with pd.option_context(
        "display.max_rows", None,
        "display.max_columns", None,
        "display.width", 0,
        "display.expand_frame_repr", False,
    ):
        print(df_last_hour)
    print(f"\nTotal rows in last hour: {len(df_last_hour)}")



=== Ticks for US10/20250900 on 20250606 from 2025-06-06T18:37:33.349874+00:00 to 2025-06-06T19:37:33.349874+00:00 ===

                           timestamp   bid_price   ask_price  bid_size  ask_size
0   2025-06-06 19:34:03.320617+00:00  109.906250  109.921875      3900      2127
1   2025-06-06 19:34:03.369443+00:00  109.906250  109.921875      3491      2813
2   2025-06-06 19:34:03.413444+00:00  109.906250  109.921875      3619      3310
3   2025-06-06 19:34:03.543109+00:00  109.921875  109.937500       672      5269
4   2025-06-06 19:34:03.971947+00:00  109.921875  109.937500      1886      4851
5   2025-06-06 19:34:04.122326+00:00  109.921875  109.937500      1886      4851
6   2025-06-06 19:34:04.997424+00:00  109.921875  109.937500      1851      4877
7   2025-06-06 19:34:05.071598+00:00  109.921875  109.937500      1851      4877
8   2025-06-06 19:34:05.071961+00:00  109.921875  109.937500      1851      4877
9   2025-06-06 19:34:05.555030+00:00  109.921875  109.937500      1851

In [3]:
import pandas as pd
import pyarrow.parquet as pq

# Daily tick file to inspect.  Other inputs that can be swapped in:
#   /mnt/nas/price_data/futures_contract_prices/raw_data/US3/daily/20250900.parquet
#   /mnt/nas/parquet_tick_data/US10/contract=20250900/20250618.parquet
path = "/mnt/nas/parquet_tick_data/US3/contract=20250900/20250618.parquet"

# Load the whole file into a DataFrame.  It is left as the cell's final bare
# expression so the notebook displays the frame.
pd.read_parquet(path)

Unnamed: 0,timestamp,bid_price,ask_price,bid_size,ask_size
0,2025-06-18 22:05:10.189367+00:00,105.597656,105.613281,5,3
1,2025-06-18 22:05:11.117303+00:00,105.597656,105.609375,5,2
2,2025-06-18 22:05:24.908273+00:00,105.597656,105.609375,1,2
3,2025-06-18 22:15:50.475978+00:00,105.605469,105.617188,5,1
4,2025-06-18 22:16:26.631977+00:00,105.605469,105.617188,6,1
...,...,...,...,...,...
505,2025-06-18 23:56:19.534196+00:00,105.601562,105.609375,2,5
506,2025-06-18 23:56:20.614300+00:00,105.601562,105.609375,3,5
507,2025-06-18 23:58:23.854375+00:00,105.601562,105.609375,1,5
508,2025-06-18 23:58:37.194848+00:00,105.601562,105.609375,2,5
