In [2]:
from pathlib import Path
import pyarrow.parquet as pq

# Locate the "primary" parquet file of a single contract for one instrument:
#   <base_dir>/<instr>/<contract>_primary.parquet
base_dir = Path("/mnt/nas/parquet_tick_data")
instr = "US10"
contract = "20250600"

instr_dir = base_dir / instr
path = instr_dir / f"{contract}_primary.parquet"

# The row count is taken from the parquet file metadata exposed by ParquetFile.
pf = pq.ParquetFile(str(path))
num_rows = pf.metadata.num_rows

print(f"Total rows in {path.name!r}: {num_rows}")


Total rows in '20250600_primary.parquet': 1657551


In [None]:
import datetime
from datetime import timezone, timedelta
from pathlib import Path

import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.compute as pc
import pandas as pd

# -----------------------------------------------------------------------------
# PARAMETERS: adjust these to match your environment
# -----------------------------------------------------------------------------
PARQUET_PATH = "/mnt/nas/parquet_tick_data"  # root of your new storage
instr        = "US10"
contract     = "20250900"

# -----------------------------------------------------------------------------
# Compute the "last hour" window [one_hour_ago, now_utc) in UTC
# -----------------------------------------------------------------------------
now_utc      = datetime.datetime.now(timezone.utc)
one_hour_ago = now_utc - timedelta(hours=1)

# Convert to Arrow timestamp scalars (ns, UTC).
# The tz-aware datetimes are handed to Arrow directly instead of going through
# `int(dt.timestamp() * 1e9)`: epoch nanoseconds (~1.7e18) exceed the 2**53
# range in which float64 represents integers exactly, so the float round-trip
# rounded the window bounds by up to a few hundred nanoseconds.
ts_type  = pa.timestamp("ns", tz="UTC")
start_ts = pa.scalar(one_hour_ago, type=ts_type)
end_ts   = pa.scalar(now_utc, type=ts_type)

# -----------------------------------------------------------------------------
# Contract folder under the new layout:
#   PARQUET_PATH/{instrument_code}/contract={contract}/
# Fail early with a clear message if it is missing.
# -----------------------------------------------------------------------------
base_dir = Path(PARQUET_PATH, instr, f"contract={contract}")
if not base_dir.exists():
    raise FileNotFoundError(f"Contract folder not found: {base_dir}")

# -----------------------------------------------------------------------------
# 1) Dataset rooted at the contract directory. Hive partitioning lets Arrow
#    pick up "trading_date=YYYYMMDD" subfolders as a partition column.
# -----------------------------------------------------------------------------
dataset = ds.dataset(str(base_dir), format="parquet", partitioning="hive")

# -----------------------------------------------------------------------------
# 2) Half-open window on the "timestamp" column: start_ts <= timestamp < end_ts
# -----------------------------------------------------------------------------
time_filter = (pc.field("timestamp") >= start_ts) & (pc.field("timestamp") < end_ts)

# -----------------------------------------------------------------------------
# 3) Materialise only the rows that satisfy the time filter.
# -----------------------------------------------------------------------------
filtered_table = dataset.to_table(filter=time_filter)

# -----------------------------------------------------------------------------
# 4) Convert to pandas DataFrame and print all rows + row count
# -----------------------------------------------------------------------------
df_last_hour = filtered_table.to_pandas()

print(f"\n=== Ticks for {instr}/{contract} from {one_hour_ago.isoformat()} to {now_utc.isoformat()} ===\n")
if df_last_hour.empty:
    print("No data in the last hour.")
else:
    # Lift the display limits only for this one print. option_context restores
    # whatever values were active before the block (not the pandas defaults),
    # and does so even if printing raises, so no display settings leak into
    # later cells.
    with pd.option_context(
        "display.max_rows", None,
        "display.max_columns", None,
        "display.width", 0,
        "display.expand_frame_repr", False,
    ):
        print(df_last_hour)
    print(f"\nTotal rows in last hour: {len(df_last_hour)}")


                            timestamp   bid_price   ask_price  bid_size  ask_size  trading_date
0    2025-06-06 18:08:51.122116+00:00  109.984375  110.000000      2413      4122      20250606
1    2025-06-06 18:08:51.192545+00:00  109.984375  110.000000      3910      3119      20250606
2    2025-06-06 18:08:51.246076+00:00  109.984375  110.000000      3258      3390      20250606
3    2025-06-06 18:08:51.300734+00:00  109.984375  110.000000      1597      4466      20250606
4    2025-06-06 18:08:51.309298+00:00  109.968750  109.984375      4324      1473      20250606
5    2025-06-06 18:08:51.361911+00:00  109.968750  109.984375      4317      1532      20250606
6    2025-06-06 18:08:51.779647+00:00  109.968750  109.984375      4318      1531      20250606
7    2025-06-06 18:08:52.188090+00:00  109.968750  109.984375      4318      1531      20250606
8    2025-06-06 18:08:52.188585+00:00  109.968750  109.984375      4318      1531      20250606
9    2025-06-06 18:08:52.858162+00:00  1