# Storage Write API Streaming Notebook
This notebook rebuilds the proven `bqwapi_upload.py` workflow from scratch so the full dataset can be streamed through the BigQuery Storage Write API.

## 1. Reset Environment and Imports
Reset any cached state, configure logging, and import the minimal set of libraries needed by the working example.

In [55]:
import os
import sys
import json
import time
import base64
import logging
from pathlib import Path

import pandas as pd
import grpc_tools.protoc

from google.cloud import bigquery
from google.cloud import bigquery_storage_v1
from google.cloud.bigquery_storage_v1 import types, writer
from google.protobuf import descriptor_pb2
from google.api_core import exceptions

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("storage_write_notebook")


## 2. Implement Working Example Function
Recreate the streaming helpers from the working script, keeping TIMESTAMP columns as strings exactly like the baseline implementation.

In [61]:
        future = append_stream.send(request)
        try:
            future.result()
        except exceptions.InvalidArgument as exc:
            logger.error("AppendRows failed with InvalidArgument: %s", exc)
            if hasattr(exc, "errors"):
                logger.error("Error details: %s", exc.errors)
            logger.error("Exception attributes: %s", dir(exc))
            if hasattr(exc, "details"):
                logger.error("details(): %s", exc.details())
            if getattr(exc, "response", None) is not None:
                logger.error("Response object: %s", exc.response)
            if hasattr(exc, "_details"):
                for detail in exc._details:
                    logger.error("Detail: %s", detail)
            raise


NameError: name 'append_stream' is not defined

## 3. Execute Verification Tests
Run lightweight assertions that confirm the helpers mirror the working example.

In [53]:
class DummyField:
    def __init__(self, name: str, field_type: str, mode: str = "NULLABLE"):
        self.name = name
        self.field_type = field_type
        self.mode = mode


test_schema = [
    DummyField("vehicle_id", "STRING", "REQUIRED"),
    DummyField("timestamp", "TIMESTAMP", "NULLABLE"),
]

# Ensure TIMESTAMP is mapped to string exactly like the working script
assert DATA_TYPE_MAPPING["TIMESTAMP"] == "string"

# Proto file generation should succeed without raising
PROTO_FILE.write_text("")
generate_proto_file(test_schema, PROTO_FILE)
compile_proto(PROTO_FILE)

print("Verification checks passed. Proto generated and compiled successfully.")


INFO:storage_write_notebook:Proto definition written to /tmp/schema.proto
INFO:storage_write_notebook:Proto compiled into /tmp
INFO:storage_write_notebook:Proto compiled into /tmp


Verification checks passed. Proto generated and compiled successfully.


## 4. Inspect Example Outputs
Fetch live data, build the schema, and stream the entire dataset using the working helpers.

In [None]:
# Add the src directory to Python path so we can import from src modules
import sys
sys.path.append('../src')

from ingest.main import run

# Ensure the dynamically generated proto module in /tmp is importable
if "/tmp" not in sys.path:
    sys.path.append("/tmp")

In [12]:
os.environ["PROJECT_ID"] = "regal-dynamo-470908-v9"
os.environ["BQ_DATASET"] = "auckland_data_dev"
os.environ["BUCKET"] = "auckland-data-dev"

# Encode headers as base64 (matching Terraform pattern)
import base64
_headers_dict = {
  'Ocp-Apim-Subscription-Key': '1159c79486524360b17501ad888ee7d6'
}
os.environ["HEADERS"] = base64.b64encode(json.dumps(_headers_dict).encode("utf-8")).decode("utf-8")

os.environ["URL"] = 'https://api.at.govt.nz/realtime/legacy/vehiclelocations'
os.environ["DATASET"] = 'vehicle-positions'
os.environ["SPEC"] = 'rt'
os.environ["RESPONSE_TYPE"] = 'json'

project_id = os.getenv("PROJECT_ID", "regal-dynamo-470908-v9")
dataset_id = os.getenv("BQ_DATASET", "auckland_data_dev")
table_id = os.getenv("BQ_TABLE", "rt_vehicle_positions")
table_fqn = f"{project_id}.{dataset_id}.{table_id}"

In [None]:
df = run({})

INFO:transit_data_pipeline:Starting data ingestion for vehicle-positions...
INFO:transit_data_pipeline:Fetched and processed data for vehicle-positions, size: 608019 bytes.
INFO:transit_data_pipeline:Fetched and processed data for vehicle-positions, size: 608019 bytes.
INFO:transit_data_pipeline:Uploaded real-time data to GCS: rt-vehicle-positions/year=2025/month=10/day=19/hour=20/vehicle-positions-20251019T204009Z.json.gz
INFO:transit_data_pipeline:Uploaded real-time data to GCS: rt-vehicle-positions/year=2025/month=10/day=19/hour=20/vehicle-positions-20251019T204009Z.json.gz
INFO:transit_data_pipeline:Added 2 missing schema fields: ['created_at', 'updated_at'] for schema VehiclePositions
INFO:transit_data_pipeline:Added 2 missing schema fields: ['created_at', 'updated_at'] for schema VehiclePositions
INFO:transit_data_pipeline:Transformed data into DataFrame with 1764 records.
INFO:transit_data_pipeline:Using deduplication mode 'skip_duplicates' for dataset 'vehicle-positions'
INFO:t

In [60]:
bq_client = bigquery.Client(project=project_id)
table = bq_client.get_table(table_fqn)
schema_fields = table.schema

import uuid
_schema_token = uuid.uuid4().hex
PROTO_PACKAGE = f"schema_{_schema_token}"
PROTO_FILE = Path(f"/tmp/schema_{_schema_token}.proto")
SCHEMA_MODULE = f"{PROTO_FILE.stem}_pb2"

generate_proto_file(schema_fields, PROTO_FILE)
compile_proto(PROTO_FILE)

start = time.perf_counter()
rows_written = stream_dataframe(df, schema_fields, project_id, dataset_id, table_id)
duration = time.perf_counter() - start
print(f"Streamed {rows_written} rows to {table_fqn} in {duration:.2f} seconds")


I0000 00:00:1760907665.470437 5148852 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers
INFO:storage_write_notebook:Proto definition written to /tmp/schema_12ab618f044743b3825dda08788765ef.proto
INFO:storage_write_notebook:Proto compiled into /tmp
INFO:storage_write_notebook:Proto definition written to /tmp/schema_12ab618f044743b3825dda08788765ef.proto
INFO:storage_write_notebook:Proto compiled into /tmp
I0000 00:00:1760907678.953754 5148852 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1760907678.953754 5148852 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers
E0000 00:00:1760907679.728871 5148852 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1760907679.728871 5148852 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
INFO:storage_write_noteb

InvalidArgument: 400 Errors found while processing rows. Please refer to the row_errors field for details. The list may not be complete because of the size limitations. Entity: projects/regal-dynamo-470908-v9/datasets/auckland_data_dev/tables/rt_vehicle_positions/streams/Cic2YmViNDhlMC0wMDAwLTIzMTQtOTI1MC0zYzI4NmQzNjI3M2E6czE

In [45]:
print(Path(PROTO_FILE).read_text().splitlines()[:20])

['syntax = "proto2";', '', 'package schema;', '', 'message Schema {', '  required string record_id = 1;', '  required string entity_id = 2;', '  required string timestamp = 3;', '  optional sint64 timestamp_s = 4;', '  optional string vehicle_id = 5;', '  optional string vehicle_label = 6;', '  optional string vehicle_license_plate = 7;', '  optional double latitude = 8;', '  optional double longitude = 9;', '  optional double bearing = 10;', '  optional double speed = 11;', '  optional double odometer = 12;', '  optional string occupancy_status = 13;', '  optional string route_id = 14;', '  optional string trip_id = 15;']


In [29]:
# Coerce TIMESTAMP columns to RFC3339 strings to match Storage Write expectations
timestamp_columns = [field.name for field in schema_fields if field.field_type.upper() == "TIMESTAMP" and field.name in df.columns]
for column in timestamp_columns:
    series = pd.to_datetime(df[column], utc=True, errors="coerce")
    fallback = pd.Timestamp.now(tz="UTC")
    series = series.fillna(fallback)
    df[column] = series.dt.strftime("%Y-%m-%dT%H:%M:%S.%f%z")

In [41]:
# Fill defaults for required fields to avoid missing values during serialization
DEFAULT_REQUIRED_VALUES = {
    "odometer": 0.0,
    "occupancy_status": "EMPTY",
    "route_id": "",
    "trip_id": "",
    "direction_id": 0,
    "start_date": "",
    "start_time": "",
    "schedule_relationship": "SCHEDULED",
}

for field in schema_fields:
    if field.mode.upper() != "REQUIRED":
        continue
    default_value = DEFAULT_REQUIRED_VALUES.get(field.name)
    if field.name not in df.columns:
        if default_value is None:
            raise ValueError(f"Required field '{field.name}' missing from DataFrame with no default")
        df[field.name] = default_value
        continue
    if df[field.name].isna().any():
        if default_value is None:
            raise ValueError(f"No default specified for required field '{field.name}' with nulls")
        df[field.name] = df[field.name].fillna(default_value)

In [43]:
required_columns = [
    "odometer",
    "occupancy_status",
    "route_id",
    "trip_id",
    "direction_id",
    "start_date",
    "start_time",
    "schedule_relationship",
]
missing_summary = df[required_columns].isna().sum()
print(missing_summary)
print(df.loc[0, required_columns])


odometer                 1691
occupancy_status          933
route_id                  897
trip_id                   897
direction_id              907
start_date                897
start_time                897
schedule_relationship     897
dtype: int64
odometer                  NaN
occupancy_status          NaN
route_id                  NaN
trip_id                   NaN
direction_id             <NA>
start_date                NaN
start_time                NaN
schedule_relationship     NaN
Name: 0, dtype: object


In [57]:
print(df.head(1).to_dict(orient="records")[0])

{'created_at': '2025-10-19T20:45:10.588943+0000', 'updated_at': '2025-10-19T20:45:10.599080+0000', 'record_id': '49f1b0e5436c4505', 'entity_id': 'dc68702249c73592', 'timestamp': '2025-10-19T20:37:19.000000+0000', 'timestamp_s': 1760906239, 'vehicle_id': '518999538', 'vehicle_label': '', 'vehicle_license_plate': '', 'latitude': -36.621502, 'longitude': 174.793742, 'bearing': 199.0, 'speed': 0.0, 'odometer': nan, 'occupancy_status': nan, 'route_id': nan, 'trip_id': nan, 'direction_id': None, 'start_date': nan, 'start_time': nan, 'schedule_relationship': nan}


In [50]:
print([(field.name, field.field_type, field.mode) for field in schema_fields if field.name in required_columns])


[('odometer', 'FLOAT', 'NULLABLE'), ('occupancy_status', 'STRING', 'NULLABLE'), ('route_id', 'STRING', 'NULLABLE'), ('trip_id', 'STRING', 'NULLABLE'), ('direction_id', 'INTEGER', 'NULLABLE'), ('start_date', 'STRING', 'NULLABLE'), ('start_time', 'STRING', 'NULLABLE'), ('schedule_relationship', 'STRING', 'NULLABLE')]


In [31]:
[field.name for field in schema_fields if field.mode.upper() == "REQUIRED"]

['record_id', 'entity_id', 'timestamp']