# Notebook containing the definitions of all needed helper functions

In [0]:
from azure.storage.blob import BlobServiceClient


def mounting_all_containers(storage_account, key):
    """Mount every blob container of a storage account under ``/mnt``.

    The containers of ``storage_account`` are listed through the Azure Blob
    service client and each one is mounted at
    ``/mnt/<storage_account>/<container>`` with ``dbutils.fs.mount``.
    Containers whose mount point already exists are skipped. A failed mount
    is reported with ``print`` and does not stop the remaining containers
    from being processed.

    Args:
        storage_account: Name of the Azure storage account.
        key: Credential handed to ``BlobServiceClient`` and used as the
            account key in the mount configuration.
    """
    account_host = f"{storage_account}.blob.core.windows.net"

    # Connect to Azure Blob Storage
    blob_service_client = BlobServiceClient(
        f"https://{account_host}", credential=key
    )

    # Get all containers
    container_names = [
        container.name for container in blob_service_client.list_containers()
    ]

    # The mount configuration is identical for every container, so build it once.
    config = {f"fs.azure.account.key.{account_host}": key}

    # Snapshot the existing mount points once instead of querying
    # dbutils.fs.mounts() again for every container.
    mounted_points = {mount.mountPoint for mount in dbutils.fs.mounts()}

    # Mount ADLS only if the container is not already mounted
    for container_name in container_names:
        mount_folder = f"/mnt/{storage_account}/{container_name}"
        if mount_folder in mounted_points:
            continue

        source = f"wasbs://{container_name}@{account_host}/"
        try:
            dbutils.fs.mount(
                source=source,
                mount_point=mount_folder,
                extra_configs=config,
            )
        except Exception as e:
            # Best effort: report the failure and carry on with the next container.
            print(f"Mount to {mount_folder} failed: {e}")
        else:
            mounted_points.add(mount_folder)
            print(f"Mount to {mount_folder} succeeded!")

In [0]:
from pyspark.sql.functions import date_trunc, from_utc_timestamp, current_timestamp

def add_ingestion_date(input_df):
    """Return ``input_df`` with an extra ``ingestion_date`` column.

    The column is built from ``current_timestamp()``, converted from UTC to
    the "Europe/Amsterdam" time zone and truncated to whole seconds.
    """
    amsterdam_now = from_utc_timestamp(current_timestamp(), "Europe/Amsterdam")
    ingestion_timestamp = date_trunc("second", amsterdam_now)
    return input_df.withColumn("ingestion_date", ingestion_timestamp)

In [0]:
def rearrange_partition_column(input_df, partition_column):
    """Return ``input_df`` re-selected so ``partition_column`` comes last.

    All other columns keep their relative order from ``input_df.schema.names``.
    The partition column name is always appended to the selection, whether or
    not it appears in the schema.
    """
    leading_columns = [
        name for name in input_df.schema.names if name != partition_column
    ]
    return input_df.select(leading_columns + [partition_column])



In [0]:
def overwrite_partition(input_df, db_name, table_name, partition_column):
    """Write ``input_df`` to ``db_name.table_name`` partitioned by ``partition_column``.

    If the table already exists the data is written with ``insertInto`` in
    overwrite mode; with the partition overwrite mode set to "dynamic" only
    the partitions present in the incoming data are meant to be replaced
    rather than the entire table. If the table does not exist yet it is
    created as a parquet table partitioned by ``partition_column``.

    Side effect: sets ``spark.sql.sources.partitionOverwriteMode`` to
    "dynamic" on the ``spark`` session configuration. Returns nothing.
    """
    # insertInto relies on the partition column being the last column.
    reordered_df = rearrange_partition_column(input_df, partition_column)

    # Switch to dynamic partition overwrite before any write happens.
    spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

    qualified_name = f"{db_name}.{table_name}"
    table_exists = spark._jsparkSession.catalog().tableExists(qualified_name)

    if not table_exists:
        # First load: create the partitioned parquet table.
        first_load_writer = reordered_df.write.mode("overwrite").partitionBy(partition_column)
        first_load_writer.format("parquet").saveAsTable(qualified_name)
        return

    # Subsequent loads: write into the existing table.
    reordered_df.write.mode("overwrite").insertInto(qualified_name)



In [0]:
def df_column_to_list(input_df, column_name):
    """Return the distinct values of ``column_name`` in ``input_df`` as a list.

    The column is selected, de-duplicated with ``distinct()`` and collected;
    each collected row is then indexed by ``column_name``.
    """
    single_column_df = input_df.select(column_name)
    distinct_rows = single_column_df.distinct().collect()
    return [distinct_row[column_name] for distinct_row in distinct_rows]
