In [33]:
# Import Necessary Libraries
import os
import json
import logging
import polars as pl
from google.cloud import firestore, bigquery, storage
from google.api_core.exceptions import GoogleAPICallError
from firebase_admin import credentials, firestore
from dotenv import load_dotenv
from google.auth.exceptions import RefreshError
from google.oauth2 import service_account
import hashlib

# Load environment variables
load_dotenv()

project_id = os.getenv("GOOGLE_FIRESTORE_PROJECT_ID")
database_name = os.getenv("GOOGLE_FIRESTORE_DATABASE_NAME")
collection_name = os.getenv("GOOGLE_FIRESTORE_COLLECTION_NAME")
bigquery_project_id = os.getenv("GOOGLE_BIGQUERY_PROJECT_ID")
bigquery_dataset_id = os.getenv("GOOGLE_BIGQUERY_DATASET_NAME")
gcp_credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")

# Configure Logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Ensure the Credentials file exists
print(f"Current working directory: {os.getcwd()}")

# Get the parent directory
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
print(f"Parent directory: {parent_dir}")

# Path to the credentials file
gcp_credentials_path = os.path.join(parent_dir, "config/gcp_credentials.json")
print(f"Credentials path: {gcp_credentials_path}")

# Path to query file
query_file_path = os.path.join(parent_dir, "database/create_tables.sql")
print(f"Query file path: {query_file_path}")

# Check if the credentials file exists
if not os.path.isfile(gcp_credentials_path):
    print(f"Error: Credentials file not found at {gcp_credentials_path}")
    exit(1)

# Print if the file contains valid JSON
try:
    with open(gcp_credentials_path, 'r') as f:
        json_data = json.load(f)
        logging.info("Credentials file contains valid JSON.")
except json.JSONDecodeError:
    logging.error(f"Credentials file is not valid JSON.")
    exit(1)

# Ensure credentials file exists
if not os.path.exists(gcp_credentials_path):
    logging.error(f"Failed to initialize Firestore and BigQuery: File {gcp_credentials_path} was not found.")
    exit(1)

# Initialize BigQuery Client
try:
    client = bigquery.Client.from_service_account_json(gcp_credentials_path)
    logging.info("BigQuery client initialized successfully!")
except RefreshError as e:
    logging.error(f"Failed to initialize BigQuery due to authentication error: {e}")
    exit(1)
except Exception as e:
    logging.error(f"Failed to initialize BigQuery: {e}")
    exit(1)

# Get table names from BigQuery dataset
try:
    query = f"SELECT table_name FROM `{bigquery_project_id}.{bigquery_dataset_id}.INFORMATION_SCHEMA.TABLES`"
    tables = client.query(query).to_dataframe()
    table_names = tables["table_name"].tolist()
    logging.info(f"Tables in BigQuery dataset: {table_names}")
except GoogleAPICallError as e:
    logging.error(f"BigQuery API error: {e}")
    exit(1)

# Initialize Firestore Client
try:
    credentials = service_account.Credentials.from_service_account_file(gcp_credentials_path)
    db = firestore.Client(project=project_id, database=database_name, credentials=credentials)
    logging.info("Firestore client initialized successfully!")
except RefreshError as e:
    logging.error(f"Failed to initialize Firestore due to authentication error: {e}")
    exit(1)
except Exception as e:
    logging.error(f"Failed to initialize Firestore: {e}")
    exit(1)

# Read 5 random documents from Firestore collection
random_documents = db.collection(collection_name).get()

2025-03-04 22:23:05,529 - INFO - Credentials file contains valid JSON.
2025-03-04 22:23:05,587 - INFO - BigQuery client initialized successfully!


Current working directory: c:\Users\RameezRassdeen\Documents\shipment-tracking-project\notebooks
Parent directory: c:\Users\RameezRassdeen\Documents\shipment-tracking-project
Credentials path: c:\Users\RameezRassdeen\Documents\shipment-tracking-project\config/gcp_credentials.json
Query file path: c:\Users\RameezRassdeen\Documents\shipment-tracking-project\database/create_tables.sql


2025-03-04 22:23:09,970 - INFO - Tables in BigQuery dataset: ['DocumentReferences', 'EquipmentEvent', 'Seals', 'Address', 'Vessel', 'Facility', 'TransportCall', 'EventLocation']
2025-03-04 22:23:10,027 - INFO - Firestore client initialized successfully!


In [34]:
# Convert documents to JSON format
json_data = [doc.to_dict() for doc in random_documents]

# Create a Polars DataFrame from the JSON data
df = pl.DataFrame(json_data)

# Print schema to verify structure
print(df.schema)

Schema({'eventCreatedDateTime': String, 'eventClassifierCode': String, 'equipmentEventTypeCode': String, 'documentReferences': List(Struct({'documentReferenceType': String, 'documentReferenceValue': String})), 'eventLocation': Struct({'facilityCodeListProvider': String, 'longitude': String, 'unlocationCode': String, 'address': Struct({'stateRegion': String, 'streetNumber': String, 'street': String, 'country': String, 'floor': String, 'city': String, 'postCode': String, 'name': String}), 'facilityCode': String, 'latitude': String, 'locationName': String}), 'eventID': String, 'seals': List(Struct({'sealType': String, 'sealNumber': String, 'sealSource': String})), 'equipmentReference': String, 'transportCall': Struct({'facilityCodeListProvider': String, 'vessel': Struct({'vesselCallSignNumber': String, 'vesselOperatorCarrierCode': String, 'vesselOperatorCarrierCodeListProvider': String, 'vesselIMONumber': String, 'vesselFlag': String, 'vesselName': String}), 'carrierServiceCode': String, 

In [35]:
df

eventCreatedDateTime,eventClassifierCode,equipmentEventTypeCode,documentReferences,eventLocation,eventID,seals,equipmentReference,transportCall,eventDateTime,eventType
str,str,str,list[struct[2]],struct[7],str,list[struct[3]],str,struct[9],str,str
"""2024-06-20T08:12:59.656226""","""ACT""","""UNLOAD""","[{""BL"",""b510ce35-82e2-418c-be24-23537f216701""}]","{""SMDG"",""-93.41878"",""XYZ123"",{""N/A"",""21"",""Main Street"",""UK"",""8F"",""København"",""7462"",""Henrik""},""XYZ"",""-25.481948"",""Port Terminal""}","""00006ba7-4179-4b4d-a8b9-7020e4…","[{""STANDARD"",""a09ba28e-0391-4964-92f3-fca907667ae3"",""CUSTOMS""}]","""APZU2386924""","{""SMDG"",{""NCVV"",""MAEU"",""NMFTA"",""9323208"",""DE"",""Sea Guardian""},""FE1"",""0c582809-f5b3-4641-bf50-c5f724ed11f0"",""SEA"",""3925N"",""PORT"",""XYZ"",""5256S""}","""2024-06-20T08:12:59.656226""","""EQUIPMENT"""
"""2024-05-13T08:12:59.276063""","""EST""","""UNLOAD""","[{""BL"",""ca6d86d6-e81a-4813-8e25-8a69e93cdaba""}]","{""SMDG"",""11.40495"",""XYZ123"",{""Region B"",""157"",""Main Street"",""UK"",""5F"",""København"",""1757"",""Henrik""},""XYZ"",""71.330214"",""Port Terminal""}","""00052089-881b-4774-9131-cef97a…","[{""STANDARD"",""52fcfbe8-87da-4ba8-9e35-526fc2bfdf7e"",""CUSTOMS""}]","""APZU3785346""","{""SMDG"",{""NCVV"",""MAEU"",""NMFTA"",""9487386"",""DE"",""Ocean Explorer""},""FE1"",""d2ab6d03-4b1c-4733-9e5a-8d9b94095a7f"",""SEA"",""5221N"",""PORT"",""ADT"",""6348S""}","""2024-05-13T08:12:59.276063""","""EQUIPMENT"""
"""2024-04-18T08:13:00.066059""","""EST""","""ARRIVAL""","[{""BL"",""e231f39f-837c-4c5c-a55d-ebbd1092398c""}]","{""SMDG"",""64.146556"",""XYZ123"",{""Region B"",""101"",""Main Street"",""Germany"",""10F"",""Berlin"",""6471"",""Henrik""},""ADT"",""24.834957"",""Port Terminal""}","""0005ee8f-8e98-4015-bd9c-5165a1…","[{""STANDARD"",""35e8b99c-061a-4c18-8e22-f8630355935c"",""CARRIER""}]","""APZU7792130""","{""SMDG"",{""NCVV"",""MAEU"",""NMFTA"",""9948579"",""DE"",""Ocean Explorer""},""FE1"",""3ac370c1-9a95-41a1-8e4f-f6a0e18b8fe7"",""SEA"",""4516N"",""PORT"",""PQR"",""7942S""}","""2024-04-18T08:13:00.066059""","""EQUIPMENT"""
"""2024-11-16T08:12:59.493765""","""EST""","""LOAD""","[{""BL"",""51770229-8fb7-4874-b277-05916fbd81d4""}]","{""SMDG"",""-155.880456"",""XYZ123"",{""Region A"",""13"",""Main Street"",""Norway"",""9F"",""London"",""6673"",""Henrik""},""PQR"",""68.375913"",""Port Terminal""}","""0007d2dd-d688-4f65-bd85-1e01c8…","[{""STANDARD"",""1fc4b733-c92f-431e-b4b8-5cfdb3ea2027"",""CUSTOMS""}]","""APZU3279567""","{""SMDG"",{""NCVV"",""MAEU"",""NMFTA"",""9606882"",""DE"",""Ocean Explorer""},""FE1"",""01c92dab-eb1b-4ad8-b2af-67b557af4dbf"",""SEA"",""9859N"",""PORT"",""ADT"",""5245S""}","""2024-11-16T08:12:59.493765""","""EQUIPMENT"""
"""2024-03-13T08:12:59.213024""","""EST""","""DEPARTURE""","[{""BL"",""bb6d6842-690d-491f-a639-1ab7de72e4d7""}]","{""SMDG"",""104.079924"",""XYZ123"",{""N/A"",""63"",""Main Street"",""Denmark"",""6F"",""København"",""3242"",""Henrik""},""ADT"",""-55.313004"",""Port Terminal""}","""0007f87a-8748-466c-bd12-d1df68…","[{""STANDARD"",""2c6e9d52-b123-4de9-8be4-e5bc4f5fa4cc"",""CARRIER""}]","""APZU7556078""","{""SMDG"",{""NCVV"",""MAEU"",""NMFTA"",""9423653"",""DE"",""Ocean Explorer""},""FE1"",""a6d4b08a-9757-4330-ab01-2c1e34674f28"",""SEA"",""9461N"",""PORT"",""PQR"",""2259S""}","""2024-03-13T08:12:59.213024""","""EQUIPMENT"""
…,…,…,…,…,…,…,…,…,…,…
"""2024-09-22T08:12:59.220061""","""ACT""","""ARRIVAL""","[{""BL"",""5e5bc5bd-8495-4a2f-9dce-80741e87bac1""}]","{""SMDG"",""66.478511"",""XYZ123"",{""Region B"",""100"",""Main Street"",""France"",""9F"",""Paris"",""7443"",""Henrik""},""XYZ"",""-49.476927"",""Port Terminal""}","""ffe2ac16-8f09-4c87-bd6d-5c9554…","[{""STANDARD"",""d2b06667-2b93-4fa0-8ca6-299e40b6f9bd"",""CUSTOMS""}]","""APZU5260620""","{""SMDG"",{""NCVV"",""MAEU"",""NMFTA"",""9440990"",""DE"",""Ocean Explorer""},""FE1"",""ff53ec3f-c64e-473f-8afd-a3155e69deeb"",""SEA"",""4918N"",""PORT"",""XYZ"",""6355S""}","""2024-09-22T08:12:59.220061""","""EQUIPMENT"""
"""2025-02-22T08:13:00.166560""","""ACT""","""UNLOAD""","[{""BL"",""41ade0f3-5b55-4aaa-a27e-f2fb3e30c3e0""}]","{""SMDG"",""-77.32531"",""XYZ123"",{""N/A"",""79"",""Main Street"",""Denmark"",""4F"",""London"",""4569"",""Henrik""},""XYZ"",""-50.745185"",""Port Terminal""}","""ffe4e014-f5b0-490d-ac79-eebd4d…","[{""STANDARD"",""6f08dbe5-5059-4743-9a3a-03b2df558fab"",""CARRIER""}]","""APZU5079207""","{""SMDG"",{""NCVV"",""MAEU"",""NMFTA"",""9785570"",""DE"",""Wave Rider""},""FE1"",""96784901-0d2a-4aa6-b36c-6a796056274b"",""SEA"",""4932N"",""PORT"",""XYZ"",""6311S""}","""2025-02-22T08:13:00.166560""","""EQUIPMENT"""
"""2024-09-15T08:13:00.193873""","""ACT""","""DEPARTURE""","[{""BL"",""349be168-051c-40ad-8377-c9220045d447""}]","{""SMDG"",""40.349491"",""XYZ123"",{""Region B"",""100"",""Main Street"",""Norway"",""10F"",""Berlin"",""7418"",""Henrik""},""XYZ"",""72.013067"",""Port Terminal""}","""ffe50b7c-21f1-419c-bd0b-8859cf…","[{""STANDARD"",""62043e82-8c49-4a91-9d3c-05a8293c4c99"",""CARRIER""}]","""APZU1179984""","{""SMDG"",{""NCVV"",""MAEU"",""NMFTA"",""9563652"",""DE"",""Wave Rider""},""FE1"",""48979315-7072-40fc-b982-edb2686dfab4"",""SEA"",""9569N"",""PORT"",""PQR"",""9142S""}","""2024-09-15T08:13:00.193873""","""EQUIPMENT"""
"""2025-02-18T08:13:00.112234""","""ACT""","""LOAD""","[{""BL"",""1eb4396d-a7aa-42be-b022-0c2b892dd7e2""}]","{""SMDG"",""-174.992247"",""XYZ123"",{""Region A"",""37"",""Main Street"",""UK"",""1F"",""København"",""3290"",""Henrik""},""ADT"",""47.593142"",""Port Terminal""}","""ffe62fa1-c87a-4ac0-b017-1b4755…","[{""STANDARD"",""93d105d4-246c-4ba1-921b-fb88a9802969"",""CARRIER""}]","""APZU1986863""","{""SMDG"",{""NCVV"",""MAEU"",""NMFTA"",""9361793"",""DE"",""Sea Guardian""},""FE1"",""8ebd2c4e-930d-45c9-a170-5090432cd109"",""SEA"",""9246N"",""PORT"",""PQR"",""5524S""}","""2025-02-18T08:13:00.112234""","""EQUIPMENT"""


In [36]:
# Flatten the main event fields
df_events = df.select([
    pl.col("eventID").alias("eventID"),
    pl.col("eventCreatedDateTime").alias("eventCreatedDateTime"),
    pl.col("eventClassifierCode").alias("eventClassifierCode"),
    pl.col("equipmentEventTypeCode").alias("equipmentEventTypeCode"),
    pl.col("equipmentReference").alias("equipmentReference"),
    pl.col("eventDateTime").alias("eventDateTime"),
    pl.col("eventType").alias("eventType")
])

# Parse string columns as datetime (including fractional seconds)
df_events = df_events.with_columns([
    pl.col("eventCreatedDateTime").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%.f").alias("eventCreatedDateTime"),
    pl.col("eventDateTime").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%.f").alias("eventDateTime")
])

# Convert datetime columns to ISO 8601 strings (without fractional seconds)
df_events = df_events.with_columns([
    pl.col("eventCreatedDateTime").dt.strftime("%Y-%m-%dT%H:%M:%S").alias("eventCreatedDateTime"),
    pl.col("eventDateTime").dt.strftime("%Y-%m-%dT%H:%M:%S").alias("eventDateTime")
])

# Explode document references with eventID initially
df_document_references = (
    df.select([
        pl.col("eventID"),
        pl.col("documentReferences").explode().alias("document_reference")
    ])
    .group_by("eventID", maintain_order=True)
    .agg([
        pl.col("document_reference").struct.field("documentReferenceType").alias("documentReferenceType"),
        pl.col("document_reference").struct.field("documentReferenceValue").alias("documentReferenceValue"),
        pl.col("document_reference").cum_count().cast(pl.Int64).alias("documentIndex")
    ])
    .explode(["documentReferenceType", "documentReferenceValue", "documentIndex"])
)

# Compute SHA-256 hash once using documentReferenceType and documentReferenceValue
df_document_references = df_document_references.with_columns([
    pl.concat_str([pl.col("documentReferenceType"), pl.col("documentReferenceValue")], separator="_")
    .map_elements(lambda x: hashlib.sha256(x.encode()).hexdigest(), return_dtype=pl.Utf8).alias("hashed_col")
])

# Generate documentReferenceID
df_document_references = df_document_references.with_columns([
    pl.col("hashed_col").map_elements(lambda x: f"{x[:4]}-{x[4:8]}-{x[8:12]}-{x[12:16]}", return_dtype=pl.Utf8).alias("documentReferenceID")
])

# Remove hashed_col column
df_document_references = df_document_references.drop("hashed_col")

# Join with df_events to add documentReferenceID
df_events = df_events.join(
    df_document_references.select(["eventID", "documentReferenceID"]),
    on="eventID",
    how="left"
)

# Ensure uniqueness in df_document_references (Avoid duplicates) after removing eventID
df_document_references = df_document_references.drop("eventID").unique(subset=["documentReferenceID"])

# Reorder columns to ensure documentReferenceID is the first column in df_document_references
df_document_references = df_document_references.select([
    "documentReferenceID",
    *[col for col in df_document_references.columns if col != "documentReferenceID"]
])

# Extract eventLocation
df_event_location = df.select([
    pl.col("eventID").alias("eventID"),
    pl.col("eventLocation").struct.field("latitude").cast(pl.Float64).alias("latitude"),
    pl.col("eventLocation").struct.field("longitude").cast(pl.Float64).alias("longitude"),
    pl.col("eventLocation").struct.field("unlocationCode").alias("unlocationCode")
])

# Extract eventTransportCall
df_event_transport_call = df.select([
    pl.col("eventID").alias("eventID"),
    pl.col("transportCall").struct.field("transportCallID").alias("transportCallID"),
    pl.col("transportCall").struct.field("carrierServiceCode").alias("carrierServiceCode"),
    pl.col("transportCall").struct.field("exportVoyageNumber").alias("exportVoyageNumber"),
    pl.col("transportCall").struct.field("importVoyageNumber").alias("importVoyageNumber"),
    pl.col("transportCall").struct.field("modeOfTransport").alias("modeOfTransport")
])

# Extract eventAddress
df_event_address = df.select([
    pl.col("eventID").alias("eventID"),
    pl.col("eventLocation").struct.field("address").struct.field("stateRegion").alias("stateRegion"),
    pl.col("eventLocation").struct.field("address").struct.field("streetNumber").alias("streetNumber"),
    pl.col("eventLocation").struct.field("address").struct.field("street").alias("street"),
    pl.col("eventLocation").struct.field("address").struct.field("country").alias("country"),
    pl.col("eventLocation").struct.field("address").struct.field("floor").alias("floor"),
    pl.col("eventLocation").struct.field("address").struct.field("city").alias("city"),
    pl.col("eventLocation").struct.field("address").struct.field("postCode").alias("postCode"),
    pl.col("eventLocation").struct.field("address").struct.field("name").alias("name")
])

# Compute SHA-256 hash once using stateRegion, streetNumber, street, country, floor, city, postCode
df_event_address = df_event_address.with_columns([
    pl.concat_str([
        pl.col("stateRegion"),
        pl.col("streetNumber"),
        pl.col("street"),
        pl.col("country"),
        pl.col("floor"),
        pl.col("city"),
        pl.col("postCode")
    ], separator="_")
    .map_elements(lambda x: hashlib.sha256(x.encode()).hexdigest(), return_dtype=pl.Utf8).alias("hashed_col")
])

# Generate addressID using stateRegion, streetNumber, street, country, floor, city, postCode
df_event_address = df_event_address.with_columns([
    pl.col("hashed_col").map_elements(lambda x: f"{x[:4]}-{x[4:8]}-{x[8:12]}-{x[12:16]}", return_dtype=pl.Utf8).alias("addressID")
])

# Remove hashed_col column
df_event_address = df_event_address.drop("hashed_col")


# Extract eventFacility
df_event_facility = df.select([
    pl.col("eventID").alias("eventID"),
    pl.col("eventLocation").struct.field("facilityCodeListProvider").alias("facilityCodeListProvider"),
    pl.col("eventLocation").struct.field("facilityCode").alias("facilityCode"),
    pl.col("transportCall").struct.field("facilityTypeCode").alias("facilityTypeCode")
])

# Compute SHA-256 hash once facilityID using facilityCodeListProvider, facilityCode & facilityTypeCode
df_event_facility = df_event_facility.with_columns([
    pl.concat_str([
        pl.col("facilityCodeListProvider"),
        pl.col("facilityCode"),
        pl.col("facilityTypeCode")
    ], separator="_")
    .map_elements(lambda x: hashlib.sha256(x.encode()).hexdigest(), return_dtype=pl.Utf8).alias("hashed_col")
])

# Generate facilityID using facilityCodeListProvider, facilityCode & facilityTypeCode
df_event_facility = df_event_facility.with_columns([
    pl.col("hashed_col").map_elements(lambda x: f"{x[:4]}-{x[4:8]}-{x[8:12]}-{x[12:16]}", return_dtype=pl.Utf8).alias("facilityID")
])

# Remove hashed_col column
df_event_facility = df_event_facility.drop("hashed_col")


# Extract vessel
df_vessel = df.select([
    pl.col("eventID").alias("eventID"),
    pl.col("transportCall").struct.field("vessel").struct.field("vesselCallSignNumber").alias("vesselCallSignNumber"),
    pl.col("transportCall").struct.field("vessel").struct.field("vesselFlag").alias("vesselFlag"),
    pl.col("transportCall").struct.field("vessel").struct.field("vesselIMONumber").alias("vesselIMONumber"),
    pl.col("transportCall").struct.field("vessel").struct.field("vesselName").alias("vesselName"),
    pl.col("transportCall").struct.field("vessel").struct.field("vesselOperatorCarrierCode").alias("vesselOperatorCarrierCode"),
    pl.col("transportCall").struct.field("vessel").struct.field("vesselOperatorCarrierCodeListProvider").alias("vesselOperatorCarrierCodeListProvider")
])

# Compute SHA-256 hash once using vesselCallSignNumber, vesselFlag, vesselIMONumber, vesselName, vesselOperatorCarrierCode & vesselOperatorCarrierCodeListProvider
df_vessel = df_vessel.with_columns([
    pl.concat_str([
        pl.col("vesselCallSignNumber"),
        pl.col("vesselFlag"),
        pl.col("vesselIMONumber"),
        pl.col("vesselName"),
        pl.col("vesselOperatorCarrierCode"),
        pl.col("vesselOperatorCarrierCodeListProvider")
    ], separator="_")
    .map_elements(lambda x: hashlib.sha256(x.encode()).hexdigest(), return_dtype=pl.Utf8).alias("hashed_col")
])

# Generate vesselID using vesselCallSignNumber, vesselFlag, vesselIMONumber, vesselName, vesselOperatorCarrierCode & vesselOperatorCarrierCodeListProvider
df_vessel = df_vessel.with_columns([
    pl.col("hashed_col").map_elements(lambda x: f"{x[:4]}-{x[4:8]}-{x[8:12]}-{x[12:16]}", return_dtype=pl.Utf8).alias("vesselID")
])

# Remove hashed_col column
df_vessel = df_vessel.drop("hashed_col")

# Join with df_transport_call to add vesselID
df_event_transport_call = df_event_transport_call.join(
    df_vessel.select(["eventID", "vesselID"]),
    on="eventID",
    how="left"
)

# Ensure uniqueness in df_vessel (Avoid duplicates) after removing eventID
df_vessel = df_vessel.drop("eventID").unique(subset=["vesselID"])

# Reorder columns to ensure vesselID is the first column in df_vessel
df_vessel = df_vessel.select([
    "vesselID",
    *[col for col in df_vessel.columns if col != "vesselID"]
])

# Join with df_event_transport_call to add facilityID
df_event_transport_call = df_event_transport_call.join(
    df_event_facility.select(["eventID", "facilityID"]),
    on="eventID",
    how="left"
)

# Join with df_event_location to add facilityID
df_event_location = df_event_location.join(
    df_event_facility.select(["eventID", "facilityID"]),
    on="eventID",
    how="left"
)

# Ensure uniqueness in df_event_facility (Avoid duplicates) after removing eventID
df_event_facility = df_event_facility.drop("eventID").unique(subset=["facilityID"])

# Reorder columns to ensure facilityID is the first column in df_event_facility
df_event_facility = df_event_facility.select([
    "facilityID",
    *[col for col in df_event_facility.columns if col != "facilityID"]
])

# Join with df_events to add addressID
df_event_location = df_event_location.join(
    df_event_address.select(["eventID", "addressID"]),
    on="eventID",
    how="left"
)

# Ensure uniqueness in df_event_address (Avoid duplicates) after removing eventID
df_event_address = df_event_address.drop("eventID").unique(subset=["addressID"])

# Reorder columns to ensure addressID is the first column in df_event_address
df_event_address = df_event_address.select([
    "addressID",
    *[col for col in df_event_address.columns if col != "addressID"]
])

# Join with df_events to add transportCallID (Not Generated as it already exists in the table)
df_events = df_events.join(
    df_event_transport_call.select(["eventID", "transportCallID"]),
    on="eventID",
    how="left"
)

# Ensure uniqueness in df_event_transport_call (Avoid duplicates) after removing event
df_event_transport_call = df_event_transport_call.drop("eventID").unique(subset=["transportCallID"])

# Reorder columns to ensure transportCallID is the first column in df_event_transport_call
df_event_transport_call = df_event_transport_call.select([
    "transportCallID",
    *[col for col in df_event_transport_call.columns if col != "transportCallID"]
])

# Compute SHA-256 hash once locationID using latitude, longitude, unlocationCode, facilityID & addressID
df_event_location = df_event_location.with_columns([
    pl.concat_str([
        pl.col("latitude"),
        pl.col("longitude"),
        pl.col("unlocationCode"),
        pl.col("facilityID"),
        pl.col("addressID")
    ], separator="_")
    .map_elements(lambda x: hashlib.sha256(x.encode()).hexdigest(), return_dtype=pl.Utf8).alias("hashed_col")
])

# Generate locationID using latitude, longitude, unlocationCode, facilityID & addressID
df_event_location = df_event_location.with_columns([
    pl.col("hashed_col").map_elements(lambda x: f"{x[:4]}-{x[4:8]}-{x[8:12]}-{x[12:16]}", return_dtype=pl.Utf8).alias("locationID")
])


# Remove hashed_col column
df_event_location = df_event_location.drop("hashed_col")

# Join with df_events to add locationID
df_events = df_events.join(
    df_event_location.select(["eventID", "locationID"]),
    on="eventID",
    how="left"
)

# Ensure uniqueness in df_event_location (Avoid duplicates) after removing eventID
df_event_location = df_event_location.drop("eventID").unique(subset=["locationID"])

# Reorder columns to ensure locationID is the first column in df_event_location
df_event_location = df_event_location.select([
    "locationID",
    *[col for col in df_event_location.columns if col != "locationID"]
])

# Explode seal with eventID initially
df_seals = (
    df.select([
        pl.col("eventID"),
        pl.col("seals").explode().alias("seal")
    ])
    .group_by("eventID", maintain_order=True)
    .agg([
        pl.col("seal").struct.field("sealNumber").alias("sealNumber"),
        pl.col("seal").struct.field("sealSource").alias("sealSource"),
        pl.col("seal").struct.field("sealType").alias("sealType"),
        pl.col("seal").cum_count().cast(pl.Int64).alias("sealIndex")
    ])
    .explode(["sealNumber", "sealSource", "sealType", "sealIndex"])
)


# Compute SHA-256 hash once sealID using sealNumber, sealSource, sealType with Truncate
df_seals = df_seals.with_columns([
    pl.concat_str([
        pl.col("sealNumber"),
        pl.col("sealSource"),
        pl.col("sealType")
    ], separator="_")
    .map_elements(lambda x: hashlib.sha256(x.encode()).hexdigest(), return_dtype=pl.Utf8).alias("hashed_col")
])

# Generate sealID using sealNumber, sealSource, sealType
df_seals = df_seals.with_columns([
    pl.col("hashed_col").map_elements(lambda x: f"{x[:4]}-{x[4:8]}-{x[8:12]}-{x[12:16]}", return_dtype=pl.Utf8).alias("sealID")
])

# Remove hash_col column
df_seals = df_seals.drop("hashed_col")

# Join with df_event to add sealID
df_events = df_events.join(
    df_seals.select(["eventID", "sealID"]),
    on="eventID",
    how="left"
)

# Ensure uniqueness in df_seals (Avoid duplicates) after removing eventID
df_seals = df_seals.drop("eventID").unique(subset=["sealID"])

# Reorder columns to ensure sealID is the first column in df_seals
df_seals = df_seals.select([
    "sealID",
    *[col for col in df_seals.columns if col != "sealID"]
])

##### Show results

In [37]:
# Set Polars display options
#pl.Config.set_tbl_hide_column_data_types(True)  # Hide data types

In [38]:
# Set the total width of the table display
#pl.Config.set_tbl_width_chars(400)  # Adjust this value as needed # Show full strings without truncation

In [39]:
print(df_events)

shape: (22_000, 11)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ eventID   ┆ eventCrea ┆ eventClas ┆ equipment ┆ … ┆ documentR ┆ transport ┆ locationI ┆ sealID   │
│ ---       ┆ tedDateTi ┆ sifierCod ┆ EventType ┆   ┆ eferenceI ┆ CallID    ┆ D         ┆ ---      │
│ str       ┆ me        ┆ e         ┆ Code      ┆   ┆ D         ┆ ---       ┆ ---       ┆ str      │
│           ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ str       ┆ str       ┆          │
│           ┆ str       ┆ str       ┆ str       ┆   ┆ str       ┆           ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 00006ba7- ┆ 2024-06-2 ┆ ACT       ┆ UNLOAD    ┆ … ┆ 1fc1-0abd ┆ 0c582809- ┆ d0ca-2b02 ┆ 50b9-403 │
│ 4179-4b4d ┆ 0T08:12:5 ┆           ┆           ┆   ┆ -c9ce-795 ┆ f5b3-4641 ┆ -3d89-004 ┆ 7-4cf2-d │
│ -a8b9-702 ┆ 9         ┆           ┆           ┆   ┆ 3         ┆ -bf50

In [40]:
print(df_document_references)

shape: (22_000, 4)
┌─────────────────────┬───────────────────────┬─────────────────────────────────┬───────────────┐
│ documentReferenceID ┆ documentReferenceType ┆ documentReferenceValue          ┆ documentIndex │
│ ---                 ┆ ---                   ┆ ---                             ┆ ---           │
│ str                 ┆ str                   ┆ str                             ┆ i64           │
╞═════════════════════╪═══════════════════════╪═════════════════════════════════╪═══════════════╡
│ 8a87-7ffb-45f6-033e ┆ BL                    ┆ 84aec5b3-9492-4208-b152-de6cc7… ┆ 1             │
│ 09a7-3481-9c8b-1c1b ┆ BL                    ┆ 2399f7d5-8bda-4749-944b-280af8… ┆ 1             │
│ d88b-1f3b-4bf8-0436 ┆ BL                    ┆ 1a0935f4-9e9c-4143-b654-427969… ┆ 1             │
│ 7b8f-a048-4436-1f19 ┆ BL                    ┆ 99cc28b1-0260-4a13-a685-19dbbb… ┆ 1             │
│ c77e-674d-ec6e-8850 ┆ BL                    ┆ 4b23a95b-8ee1-4cc2-84b6-a12f5d… ┆ 1             │
│

In [41]:
print(df_seals)

shape: (22_000, 5)
┌─────────────────────┬─────────────────────────────────┬────────────┬──────────┬───────────┐
│ sealID              ┆ sealNumber                      ┆ sealSource ┆ sealType ┆ sealIndex │
│ ---                 ┆ ---                             ┆ ---        ┆ ---      ┆ ---       │
│ str                 ┆ str                             ┆ str        ┆ str      ┆ i64       │
╞═════════════════════╪═════════════════════════════════╪════════════╪══════════╪═══════════╡
│ 0cd8-c993-5aca-3b60 ┆ dc1dfa3b-d227-4183-8f35-8d3b3e… ┆ CUSTOMS    ┆ STANDARD ┆ 1         │
│ 53dd-8762-4623-4818 ┆ 4397b7d3-f45d-431e-8e68-99e06d… ┆ CUSTOMS    ┆ STANDARD ┆ 1         │
│ 0f54-a158-739f-67c0 ┆ 0f60d46c-0ea9-41a3-a9c6-b68ed6… ┆ CARRIER    ┆ STANDARD ┆ 1         │
│ 3541-c68b-3066-fb7b ┆ 69cca339-ce88-41e3-8ca1-b7fed3… ┆ CUSTOMS    ┆ STANDARD ┆ 1         │
│ e7e5-315c-533e-c917 ┆ a897ee1b-cb3b-4268-b3e0-5ea57d… ┆ CUSTOMS    ┆ STANDARD ┆ 1         │
│ …                   ┆ …                

In [42]:
print(df_event_location)

shape: (22_000, 6)
┌──────────────────┬────────────┬────────────┬────────────────┬──────────────────┬─────────────────┐
│ locationID       ┆ latitude   ┆ longitude  ┆ unlocationCode ┆ facilityID       ┆ addressID       │
│ ---              ┆ ---        ┆ ---        ┆ ---            ┆ ---              ┆ ---             │
│ str              ┆ f64        ┆ f64        ┆ str            ┆ str              ┆ str             │
╞══════════════════╪════════════╪════════════╪════════════════╪══════════════════╪═════════════════╡
│ 3162-a740-6ae8-7 ┆ 40.706725  ┆ 29.623694  ┆ XYZ123         ┆ c8b1-05d3-2b7a-e ┆ 667e-68a0-3a08- │
│ 50c              ┆            ┆            ┆                ┆ e8e              ┆ fc8f            │
│ 86dd-744c-f087-f ┆ 71.125286  ┆ -21.725537 ┆ XYZ123         ┆ ebe1-7dd2-8898-0 ┆ a200-e16d-5a61- │
│ d78              ┆            ┆            ┆                ┆ f4f              ┆ 9451            │
│ 0dbf-e7e6-8ec3-b ┆ 29.34552   ┆ -77.149208 ┆ XYZ123         ┆ ebe1-7dd

In [43]:
print(df_event_facility)

shape: (4, 4)
┌─────────────────────┬──────────────────────────┬──────────────┬──────────────────┐
│ facilityID          ┆ facilityCodeListProvider ┆ facilityCode ┆ facilityTypeCode │
│ ---                 ┆ ---                      ┆ ---          ┆ ---              │
│ str                 ┆ str                      ┆ str          ┆ str              │
╞═════════════════════╪══════════════════════════╪══════════════╪══════════════════╡
│ 41d8-f6d8-11d2-cce1 ┆ SMDG                     ┆ XYZ          ┆ PORT             │
│ c8b1-05d3-2b7a-ee8e ┆ SMDG                     ┆ PQR          ┆ PORT             │
│ b29d-5107-a0c0-b7d3 ┆ SMDG                     ┆ QRS          ┆ PORT             │
│ ebe1-7dd2-8898-0f4f ┆ SMDG                     ┆ ADT          ┆ PORT             │
└─────────────────────┴──────────────────────────┴──────────────┴──────────────────┘


In [44]:
print(df_event_address)

shape: (22_000, 9)
┌───────────────┬──────────────┬──────────────┬────────┬───┬───────┬───────────┬──────────┬────────┐
│ addressID     ┆ stateRegion  ┆ streetNumber ┆ street ┆ … ┆ floor ┆ city      ┆ postCode ┆ name   │
│ ---           ┆ ---          ┆ ---          ┆ ---    ┆   ┆ ---   ┆ ---       ┆ ---      ┆ ---    │
│ str           ┆ str          ┆ str          ┆ str    ┆   ┆ str   ┆ str       ┆ str      ┆ str    │
╞═══════════════╪══════════════╪══════════════╪════════╪═══╪═══════╪═══════════╪══════════╪════════╡
│ 2f19-ad02-5c5 ┆ Region B     ┆ 28           ┆ Main   ┆ … ┆ 6F    ┆ London    ┆ 5723     ┆ Henrik │
│ c-ca79        ┆              ┆              ┆ Street ┆   ┆       ┆           ┆          ┆        │
│ cba2-4e60-2fc ┆ N/A          ┆ 175          ┆ Main   ┆ … ┆ 8F    ┆ Berlin    ┆ 9594     ┆ Henrik │
│ 3-c030        ┆              ┆              ┆ Street ┆   ┆       ┆           ┆          ┆        │
│ 5540-6946-ec5 ┆ N/A          ┆ 131          ┆ Main   ┆ … ┆ 9F    ┆ Køb

In [45]:
print(df_event_transport_call)

shape: (22_000, 7)
┌──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ transportCal ┆ carrierServ ┆ exportVoyag ┆ importVoyag ┆ modeOfTrans ┆ vesselID    ┆ facilityID  │
│ lID          ┆ iceCode     ┆ eNumber     ┆ eNumber     ┆ port        ┆ ---         ┆ ---         │
│ ---          ┆ ---         ┆ ---         ┆ ---         ┆ ---         ┆ str         ┆ str         │
│ str          ┆ str         ┆ str         ┆ str         ┆ str         ┆             ┆             │
╞══════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╡
│ 9a6d9e43-136 ┆ FE1         ┆ 8273S       ┆ 1922N       ┆ SEA         ┆ 98f9-171d-d ┆ c8b1-05d3-2 │
│ 6-4acd-9edc- ┆             ┆             ┆             ┆             ┆ 7a6-264b    ┆ b7a-ee8e    │
│ fcaeac…      ┆             ┆             ┆             ┆             ┆             ┆             │
│ c4009a11-c88 ┆ FE1         ┆ 7464S       ┆ 3664N       ┆ SEA         ┆

In [46]:
print(df_vessel)

shape: (21_933, 7)
┌──────────────┬──────────────┬────────────┬─────────────┬─────────────┬─────────────┬─────────────┐
│ vesselID     ┆ vesselCallSi ┆ vesselFlag ┆ vesselIMONu ┆ vesselName  ┆ vesselOpera ┆ vesselOpera │
│ ---          ┆ gnNumber     ┆ ---        ┆ mber        ┆ ---         ┆ torCarrierC ┆ torCarrierC │
│ str          ┆ ---          ┆ str        ┆ ---         ┆ str         ┆ ode         ┆ odeListP…   │
│              ┆ str          ┆            ┆ str         ┆             ┆ ---         ┆ ---         │
│              ┆              ┆            ┆             ┆             ┆ str         ┆ str         │
╞══════════════╪══════════════╪════════════╪═════════════╪═════════════╪═════════════╪═════════════╡
│ ba60-1275-f8 ┆ NCVV         ┆ DE         ┆ 9240834     ┆ Sea         ┆ MAEU        ┆ NMFTA       │
│ 13-3218      ┆              ┆            ┆             ┆ Guardian    ┆             ┆             │
│ 4df7-30c1-ad ┆ NCVV         ┆ DE         ┆ 9902212     ┆ Wave Rider  ┆

In [47]:
def upsert_to_bigquery(client, table_id, df, unique_key):
    """
    Upserts data into a BigQuery table.

    Args:
        client: BigQuery client.
        table_id: Full table ID in the format 'project_id.dataset_id.table_id'.
        df: Polars DataFrame containing the data to upsert.
        unique_key: The unique key column(s) to match rows for upsert.
    """
    # Convert Polars DataFrame to Pandas DataFrame (BigQuery client works with Pandas)
    pandas_df = df.to_pandas()

    # Create a temporary table ID
    temp_table_id = f"{table_id}_temp"

    # Load the data into a temporary table
    job_config = bigquery.LoadJobConfig(
        autodetect=False,  # Disable autodetect to ensure schema matches
        write_disposition="WRITE_TRUNCATE",  # Overwrite the temporary table
    )

    # Load the DataFrame into the temporary table
    job = client.load_table_from_dataframe(pandas_df, temp_table_id, job_config=job_config)
    job.result()  # Wait for the job to complete

    # Construct the MERGE SQL statement
    columns = pandas_df.columns.tolist()
    set_clause = ", ".join(
        [
            f"{col} = SAFE_CAST(temp.{col} AS TIMESTAMP)"
            if col in ["eventCreatedDateTime", "eventDateTime"]
            else f"{col} = temp.{col}"
            for col in columns
        ]
    )
    insert_columns = ", ".join(columns)
    insert_values = ", ".join(
        [
            f"SAFE_CAST(temp.{col} AS TIMESTAMP)"
            if col in ["eventCreatedDateTime", "eventDateTime"]
            else f"temp.{col}"
            for col in columns
        ]
    )

    merge_sql = f"""
    MERGE `{table_id}` AS target
    USING `{temp_table_id}` AS temp
    ON {' AND '.join([f'target.{key} = temp.{key}' for key in unique_key])}
    WHEN MATCHED THEN
        UPDATE SET {set_clause}
    WHEN NOT MATCHED THEN
        INSERT ({insert_columns}) VALUES ({insert_values})
    """

    # Execute the MERGE statement
    query_job = client.query(merge_sql)
    query_job.result()  # Wait for the job to complete

    # Delete the temporary table
    client.delete_table(temp_table_id, not_found_ok=True)

    logging.info(f"Upserted data into {table_id} successfully!")

In [48]:
# Upsert df_events into BigQuery
upsert_to_bigquery(
    client=client,
    table_id=f"{bigquery_project_id}.{bigquery_dataset_id}.EquipmentEvent",
    df=df_events,
    unique_key=["eventID"]
)

# Upsert df_document_references into BigQuery
upsert_to_bigquery(
    client=client,
    table_id=f"{bigquery_project_id}.{bigquery_dataset_id}.DocumentReferences",
    df=df_document_references,
    unique_key=["documentReferenceID"]
)

# Upsert df_event_location into BigQuery
upsert_to_bigquery(
    client=client,
    table_id=f"{bigquery_project_id}.{bigquery_dataset_id}.EventLocation",
    df=df_event_location,
    unique_key=["locationID"]
)

# Upsert df_event_transport_call into BigQuery
upsert_to_bigquery(
    client=client,
    table_id=f"{bigquery_project_id}.{bigquery_dataset_id}.TransportCall",
    df=df_event_transport_call,
    unique_key=["transportCallID"]
)

# Upsert df_event_address into BigQuery
upsert_to_bigquery(
    client=client,
    table_id=f"{bigquery_project_id}.{bigquery_dataset_id}.Address",
    df=df_event_address,
    unique_key=["addressID"]
)

# Upsert df_event_facility into BigQuery
upsert_to_bigquery(
    client=client,
    table_id=f"{bigquery_project_id}.{bigquery_dataset_id}.Facility",
    df=df_event_facility,
    unique_key=["facilityID"]
)

# Upsert df_vessel into BigQuery
upsert_to_bigquery(
    client=client,
    table_id=f"{bigquery_project_id}.{bigquery_dataset_id}.Vessel",
    df=df_vessel,
    unique_key=["vesselID"]
)

# Upsert df_seals into BigQuery
upsert_to_bigquery(
    client=client,
    table_id=f"{bigquery_project_id}.{bigquery_dataset_id}.Seals",
    df=df_seals,
    unique_key=["sealID"]
)

2025-03-04 22:23:47,657 - INFO - Upserted data into data-air-452306-e8.Logistics.EquipmentEvent successfully!
2025-03-04 22:23:56,147 - INFO - Upserted data into data-air-452306-e8.Logistics.DocumentReferences successfully!
2025-03-04 22:24:04,344 - INFO - Upserted data into data-air-452306-e8.Logistics.EventLocation successfully!
2025-03-04 22:24:14,711 - INFO - Upserted data into data-air-452306-e8.Logistics.TransportCall successfully!
2025-03-04 22:24:50,720 - INFO - Upserted data into data-air-452306-e8.Logistics.Address successfully!
2025-03-04 22:24:58,203 - INFO - Upserted data into data-air-452306-e8.Logistics.Facility successfully!
2025-03-04 22:25:07,004 - INFO - Upserted data into data-air-452306-e8.Logistics.Vessel successfully!
2025-03-04 22:25:17,554 - INFO - Upserted data into data-air-452306-e8.Logistics.Seals successfully!
