## Project Setup

In [1]:
# Add the src directory to Python path so we can import from src modules.
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path again.
import sys

_src_dir = '../src'  # resolved relative to the kernel's working directory
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

In [2]:
import os
import pandas as pd
from google.cloud import bigquery
from google.cloud.bigquery_storage_v1 import BigQueryWriteClient
import json

In [3]:
# Configuration passed to the ingest code through environment variables.
os.environ["PROJECT_ID"] = "regal-dynamo-470908-v9"
os.environ["BQ_DATASET"] = "auckland_data_dev"
os.environ["BUCKET"] = "auckland-data-dev"

# Encode headers as base64 (matching Terraform pattern).
# The subscription key is a credential and must not live in the notebook source:
# it is read from the AT_SUBSCRIPTION_KEY environment variable instead.
# NOTE(review): the key that used to be hardcoded here should be treated as
# leaked and rotated.
import base64
import warnings

_subscription_key = os.environ.get("AT_SUBSCRIPTION_KEY", "")
if not _subscription_key:
    warnings.warn(
        "AT_SUBSCRIPTION_KEY is not set; the request headers will carry an "
        "empty subscription key."
    )

_headers_dict = {
  'Ocp-Apim-Subscription-Key': _subscription_key
}
os.environ["HEADERS"] = base64.b64encode(json.dumps(_headers_dict).encode("utf-8")).decode("utf-8")

os.environ["URL"] = 'https://api.at.govt.nz/realtime/legacy/vehiclelocations'
os.environ["DATASET"] = 'vehicle-positions'
os.environ["SPEC"] = 'rt'
os.environ["RESPONSE_TYPE"] = 'json'

In [4]:
# Components of the target BigQuery table, and the fully-qualified
# "<project>.<dataset>.<table>" identifier built from them.
project_id, dataset, table_name = (
    "regal-dynamo-470908-v9",
    "auckland_data_dev",
    'rt_vehicle_positions',
)
table_id = ".".join((project_id, dataset, table_name))

In [5]:
# Import the project's ingestion entry point; `ingest` is expected to resolve
# through the '../src' entry added to sys.path in the first cell.
from ingest.main import run
# Call it with an empty dict argument. Later cells use the result as a pandas
# DataFrame (`df.fillna`, `df.iterrows`).
# NOTE(review): presumably `run` reads the environment variables set above and
# is configured to stop before the BigQuery upload -- confirm in src/ingest/main.py.
df = run({})

INFO - Starting data ingestion for vehicle-positions...
INFO - Fetched and processed data for vehicle-positions, size: 657802 bytes.
INFO - Uploaded real-time data to GCS: rt-vehicle-positions/year=2025/month=10/day=19/hour=20/vehicle-positions-20251019T200449Z.json.gz
INFO - Added 2 missing schema fields: ['created_at', 'updated_at'] for schema VehiclePositions
INFO - Transformed data into DataFrame with 1861 records.
INFO - Using deduplication mode 'skip_duplicates' for dataset 'vehicle-positions'
INFO - Stopped prior to BigQuery upload


## Step 1 - Build Client

In [6]:
# Build the two clients used by the rest of the notebook and fetch the target
# table's schema.

# 1. Create a standard BigQuery client
bq_client = bigquery.Client(project=project_id)

# 2. Get the table object
# `schema_fields` (the table's schema) is what Step 2 converts into a protobuf
# descriptor and what Step 4 iterates over when serializing rows.
table = bq_client.get_table(table_id)
schema_fields = table.schema

# 3. Create the Write API client
# Used in Step 5 to create the write stream and append the serialized rows.
write_client = BigQueryWriteClient()
print("\nBigQueryWriteClient initialized successfully.")


BigQueryWriteClient initialized successfully.


E0000 00:00:1760904294.670156 5127053 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


## Step 2 - Dynamically Generate a Protobuf Schema

In [7]:
from google.protobuf import descriptor_pb2

def bq_schema_to_proto_schema(
    bq_schema: list[bigquery.SchemaField],
) -> descriptor_pb2.FileDescriptorProto:
    """Converts a BigQuery schema to a Protobuf FileDescriptorProto.

    The returned file descriptor ('my_schema.proto', package 'gcp_example')
    contains a single message named "RowData" with one field per BigQuery
    column, numbered 1..N in schema order.

    Args:
        bq_schema: The BigQuery table schema (e.g. ``table.schema``).

    Returns:
        A FileDescriptorProto whose ``message_type[0]`` is the "RowData"
        message descriptor.

    Raises:
        ValueError: If a column has a type with no mapping below
            (for example RECORD/STRUCT, which would need a nested message).
    """
    field_proto = descriptor_pb2.FieldDescriptorProto

    # Both the legacy names and their standard-SQL aliases are accepted.
    # Types without a native protobuf scalar are sent as strings, so the caller
    # must supply their values as string literals.
    type_map = {
        "STRING": field_proto.TYPE_STRING,
        "BYTES": field_proto.TYPE_BYTES,
        "INTEGER": field_proto.TYPE_INT64,
        "INT64": field_proto.TYPE_INT64,
        "FLOAT": field_proto.TYPE_DOUBLE,
        "FLOAT64": field_proto.TYPE_DOUBLE,
        "BOOLEAN": field_proto.TYPE_BOOL,
        "BOOL": field_proto.TYPE_BOOL,
        "TIMESTAMP": field_proto.TYPE_STRING,  # Treat Timestamp as String
        "DATE": field_proto.TYPE_STRING,
        "DATETIME": field_proto.TYPE_STRING,
        "TIME": field_proto.TYPE_STRING,
        "NUMERIC": field_proto.TYPE_STRING,
        "BIGNUMERIC": field_proto.TYPE_STRING,
        "GEOGRAPHY": field_proto.TYPE_STRING,
        "JSON": field_proto.TYPE_STRING,
    }

    # Create a FileDescriptorProto to hold the message.
    file_descriptor_proto = descriptor_pb2.FileDescriptorProto()
    file_descriptor_proto.name = 'my_schema.proto'
    file_descriptor_proto.package = 'gcp_example'

    # Create the message descriptor for the row data.
    proto_descriptor = file_descriptor_proto.message_type.add()
    proto_descriptor.name = "RowData"

    for i, field in enumerate(bq_schema):
        # Validate the type before touching the descriptor.
        field_type = field.field_type.upper()
        if field_type not in type_map:
            raise ValueError(f"Unsupported BigQuery type: {field.field_type}")

        proto_field = proto_descriptor.field.add()
        proto_field.name = field.name
        proto_field.number = i + 1  # protobuf field numbers start at 1

        # REPEATED columns must be declared as repeated protobuf fields.
        # NULLABLE and REQUIRED columns both stay optional here, so a missing
        # value can simply be left unset during serialization.
        mode = (getattr(field, "mode", None) or "NULLABLE").upper()
        if mode == "REPEATED":
            proto_field.label = field_proto.LABEL_REPEATED
        else:
            proto_field.label = field_proto.LABEL_OPTIONAL

        proto_field.type = type_map[field_type]

    # Every mapped type is a protobuf scalar, so the file needs no imports
    # (in particular no google/protobuf/timestamp.proto dependency).
    return file_descriptor_proto

# --- Action & Verification ---
# Build the protobuf descriptors for the target table's schema.
try:
    # This now returns a FileDescriptorProto
    file_descriptor_proto = bq_schema_to_proto_schema(schema_fields)

    # We still need the message descriptor for the writer_schema
    proto_schema_descriptor = file_descriptor_proto.message_type[0]

    print("Successfully generated Protobuf file and message descriptors.")
except Exception as e:
    print(f"Error generating proto schema: {e}")
    # Later cells need `proto_schema_descriptor`. Re-raise so the run stops here
    # instead of failing further down with an unrelated NameError.
    raise

Successfully generated Protobuf file and message descriptors.


## Step 3 - Fill Missing Values and Register the Protobuf Descriptor

In [8]:
from google.protobuf import message_factory, descriptor_pool
import sys

In [9]:
# Replace missing values in the listed columns with a per-column default.
# Columns that are not listed are left untouched; `df` is modified in place.
default_values = {
    'bearing': 0,
    'odometer': 0,
    'occupancy_status': 'EMPTY',
    'route_id': '',
    'trip_id': '',
    'direction_id': 0,
    'start_date': '',
    'start_time': '',
    'schedule_relationship': 'SCHEDULED',
    'vehicle_license_plate': '',
}
df.fillna(default_values, inplace=True)

In [10]:
# Register the RowData message in a descriptor pool so that a concrete message
# class can be created from it in the next step.

# 1. Create a FileDescriptorProto to hold our message descriptor.
file_descriptor_proto = descriptor_pb2.FileDescriptorProto()
file_descriptor_proto.name = 'my_schema.proto'
file_descriptor_proto.package = 'gcp_example'

# 2. Add the message descriptor (our RowData) to the file descriptor.
file_descriptor_proto.message_type.append(proto_schema_descriptor)

# 3. No file dependencies are declared: TIMESTAMP columns are mapped to plain
#    strings in Step 2, so RowData only contains scalar fields and does not
#    reference google/protobuf/timestamp.proto.

# 4. Add the fully-formed file descriptor to a pool owned by this notebook.
#    A private pool (rather than the process-wide default pool) keeps the cell
#    re-runnable: a changed table schema does not collide with a previously
#    registered 'my_schema.proto'.
pool = descriptor_pool.DescriptorPool()
pool.Add(file_descriptor_proto);  # trailing ';' suppresses the descriptor repr

<google._upb._message.FileDescriptor at 0x128f94830>

## Step 4 - Serialize Rows

In [11]:
# 1. Serialize all rows
# Concrete protobuf message class for the RowData descriptor registered above.
Message = message_factory.GetMessageClass(pool.FindMessageTypeByName("gcp_example.RowData"))
column_names = [field.name for field in schema_fields]


def _serialize_row(record):
    """Encode one DataFrame row as a serialized RowData message.

    Columns that are absent from the row or hold a missing value are left
    unset. Values exposing ``.item()`` (e.g. numpy scalars) are unwrapped to
    native Python values, and pandas Timestamps are sent as ISO-8601 strings.
    """
    message = Message()
    for column in column_names:
        if column not in record or pd.isna(record[column]):
            continue

        cell = record[column]
        if hasattr(cell, 'item'):
            cell = cell.item()
        if isinstance(cell, pd.Timestamp):
            cell = cell.isoformat()

        setattr(message, column, cell)
    return message.SerializeToString()


serialized_rows = [_serialize_row(record) for _, record in df.iterrows()]

# --- Verification ---
print(f"Successfully serialized {len(serialized_rows)} rows.")
if serialized_rows:
    print(f"Example serialized row (first 50 bytes): {serialized_rows[0][:50]}")

Successfully serialized 1861 rows.
Example serialized row (first 50 bytes): b'\n\x10308cc1f06b6854e4\x12\x1039b3621e78965d22\x1a\x192025-10-19T2'


## Step 5 - Create Stream and Append Rows

In [12]:
from google.cloud.bigquery_storage_v1 import types
import time
from google.protobuf import descriptor_pb2

In [13]:
# --- Action 1: Create a write stream ---
# A COMMITTED stream is requested on the target table.
parent = write_client.table_path(project_id, dataset, table_name)
write_stream = types.WriteStream()
write_stream.type_ = types.WriteStream.Type.COMMITTED

try:
    write_stream = write_client.create_write_stream(
        parent=parent, write_stream=write_stream
    )
    stream_name = write_stream.name
    print(f"Successfully created write stream: {stream_name}")

    # --- Action 2: Append the serialized rows ---
    request = types.AppendRowsRequest()
    request.write_stream = stream_name

    # The writer_schema carries the DescriptorProto (the message definition),
    # not the FileDescriptorProto. It has to be self-contained: the backend only
    # sees what is sent in this request, not the local descriptor pool. That
    # holds here because RowData contains scalar fields only.
    proto_schema = types.ProtoSchema()
    proto_schema.proto_descriptor = proto_schema_descriptor

    # Package the schema and the rows into a ProtoData object
    proto_data = types.AppendRowsRequest.ProtoData()
    proto_data.writer_schema = proto_schema
    proto_data.rows.serialized_rows.extend(serialized_rows)

    # Assign the ProtoData object to the request
    request.proto_rows = proto_data

    # Send the request
    print("Appending rows...")
    start_time = time.time()

    responses = write_client.append_rows(iter([request]))

    # --- Verification ---
    for response in responses:
        # `response.error` is a google.rpc.Status message object, which is truthy
        # even when no error is set, so `not response.error` can never be True.
        # The status code is the reliable signal: 0 means OK.
        if response.error.code == 0:
            print(f"Successfully appended {len(serialized_rows)} rows in {time.time() - start_time:.2f} seconds.")
        else:
            print(f"Error appending rows: {response.error.message}")

except Exception as e:
    print(f"An error occurred: {e}")

Successfully created write stream: projects/regal-dynamo-470908-v9/datasets/auckland_data_dev/tables/rt_vehicle_positions/streams/Cig2YmU2MWNmZS0wMDAwLTIzMTQtOTI1MC0zYzI4NmQzNjI3M2E6czEx
Appending rows...
Error appending rows: 
