In [6]:
import os
import json
import re
import logging
import pandas as pd
from google.cloud import firestore, bigquery, storage
from google.api_core.exceptions import GoogleAPICallError
from google.cloud import bigquery
from firebase_admin import credentials, firestore
from dotenv import load_dotenv
from google.auth.exceptions import RefreshError
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode
from google.oauth2 import service_account

# Pull settings from a local .env file (if any) into the process environment.
load_dotenv()

# Firestore source settings.
project_id = os.environ.get("GOOGLE_FIRESTORE_PROJECT_ID")
database_name = os.environ.get("GOOGLE_FIRESTORE_DATABASE_NAME")
collection_name = os.environ.get("GOOGLE_FIRESTORE_COLLECTION_NAME")

# BigQuery target settings.
bigquery_project_id = os.environ.get("GOOGLE_BIGQUERY_PROJECT_ID")
bigquery_dataset_id = os.environ.get("GOOGLE_BIGQUERY_DATASET_NAME")

# Service-account key location; each value above is None when its variable is unset.
gcp_credentials_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")

# Timestamped INFO-level logging for the rest of the notebook.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

In [7]:
# The project root is taken to be the parent of the current working directory.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Locations of the service-account key and the table-creation SQL script,
# both resolved against the project root.
# Note: this replaces whatever value gcp_credentials_path held before this cell.
gcp_credentials_path = os.path.join(parent_dir, "config/gcp_credentials.json")
query_file_path = os.path.join(parent_dir, "database/create_tables.sql")

# Echo every resolved path, one per line, so a wrong working directory is easy to spot.
print(
    f"Current working directory: {os.getcwd()}",
    f"Parent directory: {parent_dir}",
    f"Credentials path: {gcp_credentials_path}",
    f"Query file path: {query_file_path}",
    sep="\n",
)

Current working directory: /workspaces/shipment-tracking-project/notebooks
Parent directory: /workspaces/shipment-tracking-project
Credentials path: /workspaces/shipment-tracking-project/config/gcp_credentials.json
Query file path: /workspaces/shipment-tracking-project/database/create_tables.sql


In [8]:
# Fail fast if the credentials file is missing.
# An exception is raised instead of calling exit(1): raising stops execution at
# this cell and shows the cause, whereas exit() is an interactive-shell helper
# that is not a dependable way to abort a notebook run.
if not os.path.isfile(gcp_credentials_path):
    missing_message = f"Credentials file not found at {gcp_credentials_path}"
    logging.error(missing_message)
    raise FileNotFoundError(missing_message)

# Check that the file contains valid JSON.
# The parsed content is deliberately not kept in a variable: it is only needed
# for validation and holds the service-account key material.
try:
    with open(gcp_credentials_path, 'r', encoding='utf-8') as f:
        json.load(f)
except json.JSONDecodeError:
    logging.error("Credentials file is not valid JSON.")
    raise
else:
    logging.info("Credentials file contains valid JSON.")

2025-03-01 14:06:37,941 - INFO - Credentials file contains valid JSON.


In [9]:
# Configure logging. logging.basicConfig does nothing when the root logger
# already has handlers, so repeating the call here is harmless.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Ensure the credentials file exists before any client is created.
# Raise rather than call exit(1) so that the run stops at this cell with a
# clear error; exit() is not a dependable way to abort a notebook.
if not os.path.exists(gcp_credentials_path):
    not_found_message = (
        f"Failed to initialize Firestore and BigQuery: File {gcp_credentials_path} was not found."
    )
    logging.error(not_found_message)
    raise FileNotFoundError(not_found_message)

In [10]:
# Initialize the BigQuery client from the service-account key file.
# Failures are logged and then re-raised (instead of calling exit(1)) so that
# execution stops at this cell and the original traceback is preserved.
try:
    client = bigquery.Client.from_service_account_json(gcp_credentials_path)
    logging.info("BigQuery client initialized successfully!")
except RefreshError as e:
    logging.error(f"Failed to initialize BigQuery due to authentication error: {e}")
    raise
except Exception as e:
    logging.error(f"Failed to initialize BigQuery: {e}")
    raise

# List the tables of the configured dataset through INFORMATION_SCHEMA.
# The project and dataset identifiers come from the notebook configuration.
try:
    query = f"SELECT table_name FROM `{bigquery_project_id}.{bigquery_dataset_id}.INFORMATION_SCHEMA.TABLES`"
    tables = client.query(query).to_dataframe()
    table_names = tables["table_name"].tolist()
    logging.info(f"Tables in BigQuery dataset: {table_names}")
except GoogleAPICallError as e:
    logging.error(f"BigQuery API error: {e}")
    raise

2025-03-01 14:06:37,993 - INFO - BigQuery client initialized successfully!


2025-03-01 14:06:40,111 - INFO - Tables in BigQuery dataset: ['DocumentReferences', 'EquipmentEvent', 'Seals', 'TransportCall', 'EventLocation']


In [11]:
# Initialize the Firestore client with explicit service-account credentials.
# NOTE(review): `credentials` and `firestore` are also names imported at the top
# of the notebook; the assignment below rebinds `credentials` — confirm nothing
# later relies on the imported module.
# Failures are logged and re-raised (instead of calling exit(1)) so that
# execution stops at this cell with the original traceback.
try:
    credentials = service_account.Credentials.from_service_account_file(gcp_credentials_path)
    db = firestore.Client(project=project_id, database=database_name, credentials=credentials)
    logging.info("Firestore client initialized successfully!")
except RefreshError as e:
    logging.error(f"Failed to initialize Firestore due to authentication error: {e}")
    raise
except Exception as e:
    logging.error(f"Failed to initialize Firestore: {e}")
    raise

2025-03-01 14:06:40,158 - INFO - Firestore client initialized successfully!


In [12]:
# Fetch at most five documents from the Firestore collection.
# Note: the query only applies limit(5); it does not randomize the selection.
sample_query = db.collection(collection_name).limit(5)
random_documents = sample_query.get()

# Convert each document snapshot into a plain Python dict.
json_data = list(map(lambda snapshot: snapshot.to_dict(), random_documents))

In [13]:
# Obtain the Spark session used for flattening the nested documents.
spark_builder = SparkSession.builder.appName("NestedJSONProcessing")
spark = spark_builder.getOrCreate()

25/03/01 14:06:41 WARN Utils: Your hostname, codespaces-f5c61a resolves to a loopback address: 127.0.0.1; using 10.0.0.183 instead (on interface eth0)
25/03/01 14:06:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/01 14:06:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [14]:
# Create a DataFrame from the Firestore documents.
# Each document is serialized to a real JSON string first. Handing Spark the raw
# list of dicts makes it parse the Python repr of the data, which is not valid
# JSON as soon as a value is None, a boolean, or a string containing a quote.
# default=str turns values the json module cannot encode into their string form.
json_records = [json.dumps(doc, default=str) for doc in json_data]
df = spark.read.json(spark.sparkContext.parallelize(json_records), multiLine=True)
df.printSchema()

[Stage 0:>                                                        (0 + 16) / 16]

root
 |-- documentReferences: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- documentReferenceType: string (nullable = true)
 |    |    |-- documentReferenceValue: string (nullable = true)
 |-- equipmentEventTypeCode: string (nullable = true)
 |-- equipmentReference: string (nullable = true)
 |-- eventClassifierCode: string (nullable = true)
 |-- eventCreatedDateTime: string (nullable = true)
 |-- eventDateTime: string (nullable = true)
 |-- eventID: string (nullable = true)
 |-- eventLocation: struct (nullable = true)
 |    |-- address: struct (nullable = true)
 |    |    |-- city: string (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- floor: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- postCode: string (nullable = true)
 |    |    |-- stateRegion: string (nullable = true)
 |    |    |-- street: string (nullable = true)
 |    |    |-- streetNumber: string (nullable = true

                                                                                

In [15]:
# Top-level event attributes, as (source field, output column) pairs.
# The order of the pairs is the column order of the resulting DataFrame.
event_field_aliases = [
    ("eventID", "event_id"),
    ("eventCreatedDateTime", "event_created_datetime"),
    ("eventClassifierCode", "event_classifier_code"),
    ("equipmentEventTypeCode", "equipment_event_type_code"),
    ("equipmentReference", "equipment_reference"),
    ("eventDateTime", "event_datetime"),
    ("eventType", "event_type"),
]

# Project the event fields under their snake_case names.
df_events = df.select(
    *[col(source_field).alias(output_name) for source_field, output_name in event_field_aliases]
)

df_events.show(truncate=False)

+------------------------------------+--------------------------+---------------------+-------------------------+-------------------+--------------------------+----------+
|event_id                            |event_created_datetime    |event_classifier_code|equipment_event_type_code|equipment_reference|event_datetime            |event_type|
+------------------------------------+--------------------------+---------------------+-------------------------+-------------------+--------------------------+----------+
|00006ba7-4179-4b4d-a8b9-7020e4227ff4|2024-06-20T08:12:59.656226|ACT                  |UNLOAD                   |APZU2386924        |2024-06-20T08:12:59.656226|EQUIPMENT |
|00052089-881b-4774-9131-cef97a6e9e7f|2024-05-13T08:12:59.276063|EST                  |UNLOAD                   |APZU3785346        |2024-05-13T08:12:59.276063|EQUIPMENT |
|0005ee8f-8e98-4015-bd9c-5165a17c66e2|2024-04-18T08:13:00.066059|EST                  |ARRIVAL                  |APZU7792130        |2024-04

In [16]:
# Step 1: produce one row per element of the documentReferences array,
# keeping the owning event's ID next to each element.
exploded_document_references = df.select(
    col("eventID").alias("event_id"),
    explode(col("documentReferences")).alias("document_reference"),
)

# Step 2: lift the struct fields of each element into flat columns.
df_document_references = exploded_document_references.select(
    col("event_id"),
    col("document_reference.documentReferenceType").alias("document_reference_type"),
    col("document_reference.documentReferenceValue").alias("document_reference_value"),
)

df_document_references.show(truncate=False)

+------------------------------------+-----------------------+------------------------------------+
|event_id                            |document_reference_type|document_reference_value            |
+------------------------------------+-----------------------+------------------------------------+
|00006ba7-4179-4b4d-a8b9-7020e4227ff4|BL                     |b510ce35-82e2-418c-be24-23537f216701|
|00052089-881b-4774-9131-cef97a6e9e7f|BL                     |ca6d86d6-e81a-4813-8e25-8a69e93cdaba|
|0005ee8f-8e98-4015-bd9c-5165a17c66e2|BL                     |e231f39f-837c-4c5c-a55d-ebbd1092398c|
|0007d2dd-d688-4f65-bd85-1e01c8541bba|BL                     |51770229-8fb7-4874-b277-05916fbd81d4|
|0007f87a-8748-466c-bd12-d1df687bd662|BL                     |bb6d6842-690d-491f-a639-1ab7de72e4d7|
+------------------------------------+-----------------------+------------------------------------+



In [17]:

# Step 1: produce one row per element of the seals array, tagged with the event ID.
exploded_seals = df.select(
    col("eventID").alias("event_id"),
    explode(col("seals")).alias("seal"),
)

# Step 2: lift the seal struct fields into flat snake_case columns.
df_seals = exploded_seals.select(
    col("event_id"),
    col("seal.sealType").alias("seal_type"),
    col("seal.sealNumber").alias("seal_number"),
    col("seal.sealSource").alias("seal_source"),
)

df_seals.show(truncate=False)

+------------------------------------+---------+------------------------------------+-----------+
|event_id                            |seal_type|seal_number                         |seal_source|
+------------------------------------+---------+------------------------------------+-----------+
|00006ba7-4179-4b4d-a8b9-7020e4227ff4|STANDARD |a09ba28e-0391-4964-92f3-fca907667ae3|CUSTOMS    |
|00052089-881b-4774-9131-cef97a6e9e7f|STANDARD |52fcfbe8-87da-4ba8-9e35-526fc2bfdf7e|CUSTOMS    |
|0005ee8f-8e98-4015-bd9c-5165a17c66e2|STANDARD |35e8b99c-061a-4c18-8e22-f8630355935c|CARRIER    |
|0007d2dd-d688-4f65-bd85-1e01c8541bba|STANDARD |1fc4b733-c92f-431e-b4b8-5cfdb3ea2027|CUSTOMS    |
|0007f87a-8748-466c-bd12-d1df687bd662|STANDARD |2c6e9d52-b123-4de9-8be4-e5bc4f5fa4cc|CARRIER    |
+------------------------------------+---------+------------------------------------+-----------+



In [18]:
# Fields read from the eventLocation struct and its nested address struct,
# as (source path, output column) pairs in output-column order.
event_location_aliases = [
    ("eventLocation.facilityCodeListProvider", "facility_code_list_provider"),
    ("eventLocation.longitude", "longitude"),
    ("eventLocation.unlocationCode", "unlocation_code"),
    ("eventLocation.facilityCode", "facility_code"),
    ("eventLocation.latitude", "latitude"),
    ("eventLocation.locationName", "location_name"),
    ("eventLocation.address.stateRegion", "state_region"),
    ("eventLocation.address.streetNumber", "street_number"),
    ("eventLocation.address.street", "street"),
    ("eventLocation.address.country", "country"),
    ("eventLocation.address.floor", "floor"),
    ("eventLocation.address.city", "city"),
    ("eventLocation.address.postCode", "post_code"),
    ("eventLocation.address.name", "name"),
]

# One row per event: the event ID followed by the flattened location columns.
df_event_location = df.select(
    col("eventID").alias("event_id"),
    *[col(source_path).alias(output_name) for source_path, output_name in event_location_aliases],
)

df_event_location.show(truncate=False)


+------------------------------------+---------------------------+-----------+---------------+-------------+----------+-------------+------------+-------------+-----------+-------+-----+---------+---------+------+
|event_id                            |facility_code_list_provider|longitude  |unlocation_code|facility_code|latitude  |location_name|state_region|street_number|street     |country|floor|city     |post_code|name  |
+------------------------------------+---------------------------+-----------+---------------+-------------+----------+-------------+------------+-------------+-----------+-------+-----+---------+---------+------+
|00006ba7-4179-4b4d-a8b9-7020e4227ff4|SMDG                       |-93.41878  |XYZ123         |XYZ          |-25.481948|Port Terminal|N/A         |21           |Main Street|UK     |8F   |København|7462     |Henrik|
|00052089-881b-4774-9131-cef97a6e9e7f|SMDG                       |11.40495   |XYZ123         |XYZ          |71.330214 |Port Terminal|Region B   

In [19]:
# Scalar fields read from the transportCall struct, as (source path, output column)
# pairs in output-column order.
transport_call_aliases = [
    ("transportCall.facilityCodeListProvider", "facility_code_list_provider"),
    ("transportCall.carrierServiceCode", "carrier_service_code"),
    ("transportCall.transportCallID", "transport_call_id"),
    ("transportCall.modeOfTransport", "mode_of_transport"),
    ("transportCall.importVoyageNumber", "import_voyage_number"),
    ("transportCall.facilityTypeCode", "facility_type_code"),
    ("transportCall.facilityCode", "facility_code"),
    ("transportCall.exportVoyageNumber", "export_voyage_number"),
]

# One row per event: the event ID followed by the flattened transport-call columns.
df_transport_call = df.select(
    col("eventID").alias("event_id"),
    *[col(source_path).alias(output_name) for source_path, output_name in transport_call_aliases],
)

df_transport_call.show(truncate=False)

+------------------------------------+---------------------------+--------------------+------------------------------------+-----------------+--------------------+------------------+-------------+--------------------+
|event_id                            |facility_code_list_provider|carrier_service_code|transport_call_id                   |mode_of_transport|import_voyage_number|facility_type_code|facility_code|export_voyage_number|
+------------------------------------+---------------------------+--------------------+------------------------------------+-----------------+--------------------+------------------+-------------+--------------------+
|00006ba7-4179-4b4d-a8b9-7020e4227ff4|SMDG                       |FE1                 |0c582809-f5b3-4641-bf50-c5f724ed11f0|SEA              |3925N               |PORT              |XYZ          |5256S               |
|00052089-881b-4774-9131-cef97a6e9e7f|SMDG                       |FE1                 |d2ab6d03-4b1c-4733-9e5a-8d9b94095a7f|SEA 

In [20]:
# Fields read from the vessel struct nested inside transportCall,
# as (source path, output column) pairs in output-column order.
vessel_aliases = [
    ("transportCall.vessel.vesselCallSignNumber", "vessel_call_sign_number"),
    ("transportCall.vessel.vesselOperatorCarrierCode", "vessel_operator_carrier_code"),
    ("transportCall.vessel.vesselOperatorCarrierCodeListProvider", "vessel_operator_carrier_code_list_provider"),
    ("transportCall.vessel.vesselIMONumber", "vessel_imo_number"),
    ("transportCall.vessel.vesselFlag", "vessel_flag"),
    ("transportCall.vessel.vesselName", "vessel_name"),
]

# One row per event: the event ID followed by the flattened vessel columns.
df_vessel = df.select(
    col("eventID").alias("event_id"),
    *[col(source_path).alias(output_name) for source_path, output_name in vessel_aliases],
)

df_vessel.show(truncate=False)

+------------------------------------+-----------------------+----------------------------+------------------------------------------+-----------------+-----------+--------------+
|event_id                            |vessel_call_sign_number|vessel_operator_carrier_code|vessel_operator_carrier_code_list_provider|vessel_imo_number|vessel_flag|vessel_name   |
+------------------------------------+-----------------------+----------------------------+------------------------------------------+-----------------+-----------+--------------+
|00006ba7-4179-4b4d-a8b9-7020e4227ff4|NCVV                   |MAEU                        |NMFTA                                     |9323208          |DE         |Sea Guardian  |
|00052089-881b-4774-9131-cef97a6e9e7f|NCVV                   |MAEU                        |NMFTA                                     |9487386          |DE         |Ocean Explorer|
|0005ee8f-8e98-4015-bd9c-5165a17c66e2|NCVV                   |MAEU                        |NMFTA    

In [21]:
# Restore the camelCase key name on the events table.
df_events = df_events.withColumnRenamed("event_id", "eventID")

# Append one reference column per child table; each is a copy of eventID,
# added in the order listed here.
for reference_column in ("documentReferenceID", "eventLocationID", "sealID", "transportCallID"):
    df_events = df_events.withColumn(reference_column, col("eventID"))

df_events.show(truncate=False)

+------------------------------------+--------------------------+---------------------+-------------------------+-------------------+--------------------------+----------+------------------------------------+------------------------------------+------------------------------------+------------------------------------+
|eventID                             |event_created_datetime    |event_classifier_code|equipment_event_type_code|equipment_reference|event_datetime            |event_type|documentReferenceID                 |eventLocationID                     |sealID                              |transportCallID                     |
+------------------------------------+--------------------------+---------------------+-------------------------+-------------------+--------------------------+----------+------------------------------------+------------------------------------+------------------------------------+------------------------------------+
|00006ba7-4179-4b4d-a8b9-7020e4227ff4|20

In [22]:
# Rename the event ID column to documentReferenceID, the key name used for this table.
# NOTE(review): the rows come from exploding an array, so the value is unique only
# if every event carries at most one document reference — confirm before treating
# it as a primary key.
df_document_references = df_document_references.withColumnRenamed(
    existing="event_id",
    new="documentReferenceID",
)

df_document_references.show(truncate=False)

+------------------------------------+-----------------------+------------------------------------+
|documentReferenceID                 |document_reference_type|document_reference_value            |
+------------------------------------+-----------------------+------------------------------------+
|00006ba7-4179-4b4d-a8b9-7020e4227ff4|BL                     |b510ce35-82e2-418c-be24-23537f216701|
|00052089-881b-4774-9131-cef97a6e9e7f|BL                     |ca6d86d6-e81a-4813-8e25-8a69e93cdaba|
|0005ee8f-8e98-4015-bd9c-5165a17c66e2|BL                     |e231f39f-837c-4c5c-a55d-ebbd1092398c|
|0007d2dd-d688-4f65-bd85-1e01c8541bba|BL                     |51770229-8fb7-4874-b277-05916fbd81d4|
|0007f87a-8748-466c-bd12-d1df687bd662|BL                     |bb6d6842-690d-491f-a639-1ab7de72e4d7|
+------------------------------------+-----------------------+------------------------------------+



In [23]:
# Rename the event ID column to eventLocationID, the key name used for this table.
df_event_location = df_event_location.withColumnRenamed(
    existing="event_id",
    new="eventLocationID",
)

df_event_location.show(truncate=False)

+------------------------------------+---------------------------+-----------+---------------+-------------+----------+-------------+------------+-------------+-----------+-------+-----+---------+---------+------+
|eventLocationID                     |facility_code_list_provider|longitude  |unlocation_code|facility_code|latitude  |location_name|state_region|street_number|street     |country|floor|city     |post_code|name  |
+------------------------------------+---------------------------+-----------+---------------+-------------+----------+-------------+------------+-------------+-----------+-------+-----+---------+---------+------+
|00006ba7-4179-4b4d-a8b9-7020e4227ff4|SMDG                       |-93.41878  |XYZ123         |XYZ          |-25.481948|Port Terminal|N/A         |21           |Main Street|UK     |8F   |København|7462     |Henrik|
|00052089-881b-4774-9131-cef97a6e9e7f|SMDG                       |11.40495   |XYZ123         |XYZ          |71.330214 |Port Terminal|Region B   

In [24]:
# Rename the event ID column to sealID, the key name used for this table.
# NOTE(review): the rows come from exploding an array, so the value is unique only
# if every event carries at most one seal — confirm before treating it as a
# primary key.
df_seals = df_seals.withColumnRenamed(
    existing="event_id",
    new="sealID",
)

df_seals.show(truncate=False)

+------------------------------------+---------+------------------------------------+-----------+
|sealID                              |seal_type|seal_number                         |seal_source|
+------------------------------------+---------+------------------------------------+-----------+
|00006ba7-4179-4b4d-a8b9-7020e4227ff4|STANDARD |a09ba28e-0391-4964-92f3-fca907667ae3|CUSTOMS    |
|00052089-881b-4774-9131-cef97a6e9e7f|STANDARD |52fcfbe8-87da-4ba8-9e35-526fc2bfdf7e|CUSTOMS    |
|0005ee8f-8e98-4015-bd9c-5165a17c66e2|STANDARD |35e8b99c-061a-4c18-8e22-f8630355935c|CARRIER    |
|0007d2dd-d688-4f65-bd85-1e01c8541bba|STANDARD |1fc4b733-c92f-431e-b4b8-5cfdb3ea2027|CUSTOMS    |
|0007f87a-8748-466c-bd12-d1df687bd662|STANDARD |2c6e9d52-b123-4de9-8be4-e5bc4f5fa4cc|CARRIER    |
+------------------------------------+---------+------------------------------------+-----------+



In [25]:
# Rename the event ID column to transportCallID, the key name used for this table.
# The separate transport_call_id column (taken from the source document) is kept as is.
df_transport_call = df_transport_call.withColumnRenamed(
    existing="event_id",
    new="transportCallID",
)

df_transport_call.show(truncate=False)

+------------------------------------+---------------------------+--------------------+------------------------------------+-----------------+--------------------+------------------+-------------+--------------------+
|transportCallID                     |facility_code_list_provider|carrier_service_code|transport_call_id                   |mode_of_transport|import_voyage_number|facility_type_code|facility_code|export_voyage_number|
+------------------------------------+---------------------------+--------------------+------------------------------------+-----------------+--------------------+------------------+-------------+--------------------+
|00006ba7-4179-4b4d-a8b9-7020e4227ff4|SMDG                       |FE1                 |0c582809-f5b3-4641-bf50-c5f724ed11f0|SEA              |3925N               |PORT              |XYZ          |5256S               |
|00052089-881b-4774-9131-cef97a6e9e7f|SMDG                       |FE1                 |d2ab6d03-4b1c-4733-9e5a-8d9b94095a7f|SEA 

25/03/01 14:06:53 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
