### Set Up

In [6]:
# Add the src directory to Python path so we can import from src modules
import sys
sys.path.append('../src')

import logging
logger = logging.getLogger("transit_data_pipeline")
logger.propagate = False        # stop forwarding to the root handler
logger.setLevel(logging.INFO)   # pick the level you want

In [2]:
import os
import json
import time
from ingest.main import run

In [3]:
# Run existing ingest code to populate dataframe

os.environ["PROJECT_ID"] = "regal-dynamo-470908-v9"
os.environ["BQ_DATASET"] = "auckland_data_dev"
os.environ["BUCKET"] = "auckland-data-dev"

# Encode headers as base64 (matching Terraform pattern)
import base64
_headers_dict = {
  'Ocp-Apim-Subscription-Key': '1159c79486524360b17501ad888ee7d6'
}
os.environ["HEADERS"] = base64.b64encode(json.dumps(_headers_dict).encode("utf-8")).decode("utf-8")

os.environ["URL"] = 'https://api.at.govt.nz/realtime/legacy/vehiclelocations'
os.environ["DATASET"] = 'vehicle-positions'
os.environ["SPEC"] = 'rt'
os.environ["RESPONSE_TYPE"] = 'json'

project_id = "regal-dynamo-470908-v9"
dataset = "auckland_data_dev"
table_name = 'rt_vehicle_positions'
table_id = f"{project_id}.{dataset}.{table_name}"

from ingest.main import run
df = run({})

INFO - Starting data ingestion for vehicle-positions...
INFO - Fetched and processed data for vehicle-positions, size: 562152 bytes.
INFO - Fetched and processed data for vehicle-positions, size: 562152 bytes.
INFO - Uploaded real-time data to GCS: rt-vehicle-positions/year=2025/month=10/day=20/hour=21/vehicle-positions-20251020T211641Z.json.gz
INFO - Uploaded real-time data to GCS: rt-vehicle-positions/year=2025/month=10/day=20/hour=21/vehicle-positions-20251020T211641Z.json.gz
INFO - Added 2 missing schema fields: ['created_at', 'updated_at'] for schema VehiclePositions
INFO - Added 2 missing schema fields: ['created_at', 'updated_at'] for schema VehiclePositions
INFO - Transformed data into DataFrame with 1671 records.
INFO - Using deduplication mode 'skip_duplicates' for dataset 'vehicle-positions'
INFO - Stopped prior to BigQuery upload
INFO - Transformed data into DataFrame with 1671 records.
INFO - Using deduplication mode 'skip_duplicates' for dataset 'vehicle-positions'
INFO -

### Test

In [9]:
from big_query.methods.batch_method import upload as batch_upload
#from big_query.methods.streaming_method import upload as streaming_upload
from big_query.methods.method_treaming_redis_snapshot import (
    upload as redis_snapshot_upload,
)


In [10]:
import time

_start = time.perf_counter()

batch_upload(df, project_id, dataset, table_name)

elapsed = time.perf_counter() - _start
print(f"Batch upload took {elapsed:.2f} seconds")


Batch upload took 7.56 seconds


In [11]:
import time

_start = time.perf_counter()

streaming_upload(
    df,
    project_id,
    dataset,
    table_name,
    use_snapshot=True,
    snapshot_bucket="auckland-data-dev",
    snapshot_object="snapshot/rt_vehicle_position.csv",
)

elapsed = time.perf_counter() - _start
print(f"Streaming upload took {elapsed:.2f} seconds")

NameError: name 'streaming_upload' is not defined

### Redis Snapshot Streaming
Use the Redis-backed snapshot flow to test deduplication without touching the legacy helper. Populate the connection details from your Memorystore instance or the outputs generated during `terraform apply`.

In [12]:
import os
import json
from pathlib import Path
from urllib.parse import quote

redis_host = os.getenv("REDIS_HOST")
redis_port = int(os.getenv("REDIS_PORT", "6379"))
redis_auth = os.getenv("REDIS_AUTH")
redis_use_tls = os.getenv("REDIS_USE_TLS", "false").lower() in {"true", "1", "yes"}

# Allow notebook-local files to supply connection info when env vars are absent
base_dir = Path.cwd()
details_path = base_dir / "redis_details.json"
auth_path = base_dir / "redis_auth_string.txt"

if (not redis_host or redis_host == "") and details_path.exists():
    with details_path.open("r", encoding="utf-8") as fh:
        details = json.load(fh)
    redis_host = details.get("host", redis_host)
    redis_port = int(details.get("port", redis_port))
    # Memorystore defaults to TLS disabled unless configured
    redis_use_tls = bool(details.get("use_tls", redis_use_tls))

if (not redis_auth or redis_auth == "") and auth_path.exists():
    raw_auth = auth_path.read_text(encoding="utf-8").strip()
    if raw_auth:
        try:
            redis_auth = json.loads(raw_auth)
        except json.JSONDecodeError:
            redis_auth = raw_auth

if not redis_host:
    raise RuntimeError(
        "Set REDIS_HOST (or ensure redis_details.json is available) before running the Redis snapshot test."
    )

auth_fragment = f":{quote(redis_auth)}@" if redis_auth else ""
scheme = "rediss" if redis_use_tls else "redis"
redis_url = f"{scheme}://{auth_fragment}{redis_host}:{redis_port}"

redis_snapshot_key = os.getenv(
    "REDIS_SNAPSHOT_KEY", f"{project_id}:{dataset}:{table_name}:snapshot"
)
ttl_env = os.getenv("REDIS_SNAPSHOT_TTL_SECONDS")
snapshot_ttl_seconds = int(ttl_env) if ttl_env else None

redis_config = {
    "redis_url": redis_url,
    "redis_snapshot_key": redis_snapshot_key,
    "snapshot_ttl_seconds": snapshot_ttl_seconds,
}
redis_config


{'redis_url': 'redis://:cdcad05e-27a1-49c3-bdfb-5d32418bda08@10.146.65.252:6378',
 'redis_snapshot_key': 'regal-dynamo-470908-v9:auckland_data_dev:rt_vehicle_positions:snapshot',
 'snapshot_ttl_seconds': None}

In [13]:
import time
from big_query.methods.method_treaming_redis_snapshot import (
    upload as redis_snapshot_upload,
)

_start = time.perf_counter()

redis_result = redis_snapshot_upload(
    df,
    project_id,
    dataset,
    table_name,
    use_snapshot=True,
    redis_url=redis_config["redis_url"],
    redis_key=redis_config["redis_snapshot_key"],
    snapshot_ttl_seconds=redis_config["snapshot_ttl_seconds"],
)

elapsed = time.perf_counter() - _start
print(f"Redis snapshot streaming took {elapsed:.2f} seconds")
redis_result


ConnectionError: Error 65 connecting to 10.146.65.252:6378. No route to host.

### Snapshot Profiling
Below helper runs the snapshot-enabled streaming pipeline step by step on a sample batch and records how long each stage takes. Adjust the sample slice before rerunning if you want to limit duplicate inserts.

In [12]:
import time
from collections import OrderedDict
import pandas as pd

from big_query.methods import streaming_method
from google.cloud import bigquery

SNAPSHOT_BUCKET = "auckland-data-dev"
SNAPSHOT_OBJECT = "snapshot/rt_vehicle_position.csv"


def profile_snapshot_stream(
    batch_df: pd.DataFrame,
    project: str,
    dataset_name: str,
    table: str,
    *,
    snapshot_bucket: str,
    snapshot_object: str,
    key_columns=None,
):
    """Return per-stage timings for the snapshot-enabled streaming upload."""
    timings = OrderedDict()
    stats = {}

    key_columns = list(key_columns or streaming_method.DEFAULT_SNAPSHOT_KEYS)
    table_id = f"{project}.{dataset_name}.{table}"
    snapshot_uri = f"gs://{snapshot_bucket}/{snapshot_object}"

    client = bigquery.Client(project=project)
    bq_table = client.get_table(table_id)

    start = time.perf_counter()
    schema_map = {field.name: field for field in bq_table.schema}
    timings["prepare_snapshot"] = time.perf_counter() - start

    start = time.perf_counter()
    snapshot_keys = streaming_method._load_snapshot_keys(
        snapshot_bucket, snapshot_object, key_columns, schema_map
    )
    timings["load_snapshot_keys"] = time.perf_counter() - start
    stats["snapshot_key_count"] = len(snapshot_keys)

    working_df = batch_df.copy()

    start = time.perf_counter()
    deduped_df, skipped = streaming_method._drop_seen_rows(
        working_df, snapshot_keys, key_columns
    )
    timings["drop_seen_rows"] = time.perf_counter() - start
    stats["skipped_duplicates"] = skipped
    stats["rows_after_dedupe"] = len(deduped_df)

    start = time.perf_counter()
    rows_payload = streaming_method._dataframe_to_rows(deduped_df)
    timings["serialize_rows"] = time.perf_counter() - start

    start = time.perf_counter()
    insert_errors = []
    if rows_payload:
        insert_errors = client.insert_rows(bq_table, rows_payload)
    timings["insert_rows"] = time.perf_counter() - start
    stats["insert_error_count"] = len(insert_errors)

    start = time.perf_counter()
    streaming_method._overwrite_snapshot(
        snapshot_bucket, snapshot_object, working_df, key_columns
    )
    timings["write_snapshot"] = time.perf_counter() - start

    return {
        "timings": timings,
        "stats": stats,
        "errors": insert_errors,
        "table": table_id,
        "snapshot_uri": snapshot_uri,
    }


# Use all rows by default; adjust sample if you want to limit duplicate inserts
profiling_result = profile_snapshot_stream(
    df,
    project_id,
    dataset,
    table_name,
    snapshot_bucket=SNAPSHOT_BUCKET,
    snapshot_object=SNAPSHOT_OBJECT,
)
profiling_result

  key_frame = source_df[list(key_columns)].copy().applymap(_normalize_key_value).drop_duplicates()
  payload = key_frame.applymap(_serialize_snapshot_value)


{'timings': OrderedDict([('prepare_snapshot', 0.00018445804016664624),
              ('load_snapshot_keys', 2.604979290976189),
              ('drop_seen_rows', 0.00961237499723211),
              ('serialize_rows', 0.10402949998388067),
              ('insert_rows', 0.6582888749544509),
              ('write_snapshot', 2.467181458021514)]),
 'stats': {'snapshot_key_count': 1916,
  'skipped_duplicates': 111,
  'rows_after_dedupe': 1802,
  'insert_error_count': 0},
 'errors': [],
 'table': 'regal-dynamo-470908-v9.auckland_data_dev.rt_vehicle_positions',
 'snapshot_uri': 'gs://auckland-data-dev/snapshot/rt_vehicle_position.csv'}