In [0]:
# Load the deployment config and derive the environment identifiers from it.
# `first()` is a Spark action, so the row is fetched once and reused instead of
# triggering a separate read of the JSON file for every field.
config = spark.read.option("multiline", "true").json("dbfs:/configs/config.json")
config_row = config.first()
if config_row is None:
    # `first()` returns None for an empty DataFrame; fail with a clear message
    # instead of an opaque "'NoneType' object is not subscriptable".
    raise ValueError("dbfs:/configs/config.json contains no records")

# Both values are normalised (trimmed, lower-cased) before being used in names.
env_name = config_row["env"].strip().lower()
lz_key = config_row["lz_key"].strip().lower()

print(f"env_code: {lz_key}")  # This won't be redacted
print(f"env_name: {env_name}")  # This won't be redacted

# Built as ingest<lz_key>-meta002-<env_name>; later cells pass it as the first
# argument of dbutils.secrets.get.
KeyVault_name = f"ingest{lz_key}-meta002-{env_name}"
print(f"KeyVault_name: {KeyVault_name}")

In [0]:
# Service principal credentials, fetched via dbutils.secrets using KeyVault_name.
client_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-ID")
client_secret = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-SECRET")
tenant_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-TENANT-ID")

# Storage account names, all following ingest<lz_key><purpose><env_name>.
curated_storage = f"ingest{lz_key}curated{env_name}"
checkpoint_storage = f"ingest{lz_key}xcutting{env_name}"
raw_storage = f"ingest{lz_key}raw{env_name}"
landing_storage = f"ingest{lz_key}landing{env_name}"
external_storage = f"ingest{lz_key}external{env_name}"


def configure_oauth_for_storage(storage_account):
    """Set the session-level ABFS OAuth client-credentials options for one account.

    Uses the ``client_id``, ``client_secret`` and ``tenant_id`` values read
    above in this cell.

    Args:
        storage_account: Storage account name. Every option is keyed by
            ``<storage_account>.dfs.core.windows.net``.
    """
    host = f"{storage_account}.dfs.core.windows.net"
    spark.conf.set(f"fs.azure.account.auth.type.{host}", "OAuth")
    spark.conf.set(f"fs.azure.account.oauth.provider.type.{host}", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
    spark.conf.set(f"fs.azure.account.oauth2.client.id.{host}", client_id)
    spark.conf.set(f"fs.azure.account.oauth2.client.secret.{host}", client_secret)
    spark.conf.set(f"fs.azure.account.oauth2.client.endpoint.{host}", f"https://login.microsoftonline.com/{tenant_id}/oauth2/token")


# Apply the identical service-principal configuration to every storage account,
# in the original order: curated, checkpoint (xcutting), raw, landing, external.
for _storage_account in (
    curated_storage,
    checkpoint_storage,
    raw_storage,
    landing_storage,
    external_storage,
):
    configure_oauth_for_storage(_storage_account)

In [0]:
# Mount the ARIATD sub-directory of the "dropzone" blob container at /mnt/ariatd
# using a SAS token fetched via dbutils.secrets.
SAS_Token = dbutils.secrets.get(KeyVault_name, "ARIATD-SAS-TOKEN")
sub_dir = 'ARIATD'
storage_account_name = "a360c2x2555dz"
# Single definition of the blob endpoint host; it is reused for the account URL,
# the wasbs source URI and the SAS config key so the three cannot drift apart.
blob_host = f"{storage_account_name}.blob.core.windows.net"
account_url = f"https://{blob_host}"
container_name = "dropzone"
mnt_name = "ariatd"
mount_point = f"/mnt/{mnt_name}"

# Drop the mount if it already exists, then recreate it, so re-running the cell
# does not fail on an existing mount point.
if any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
    dbutils.fs.unmount(mount_point)

dbutils.fs.mount(
    source=f"wasbs://{container_name}@{blob_host}/{sub_dir}",
    mount_point=mount_point,
    extra_configs={f"fs.azure.sas.{container_name}.{blob_host}": SAS_Token},
)

In [0]:
from pyspark.sql.functions import from_unixtime, col

# List the mounted submission folder and turn the listing into a DataFrame.
submission_listing = dbutils.fs.ls("/mnt/ariatd/submission")

# modificationTime is divided by 1000 before from_unixtime (which takes seconds),
# i.e. it is treated as epoch milliseconds.
df = spark.createDataFrame(submission_listing).withColumn(
    "modificationTime_dt",
    from_unixtime(col("modificationTime") / 1000),
)
display(df)

In [0]:
# Number of rows in `df`; as the last expression it is shown as the cell output.
# When the notebook is run top to bottom, `df` is the submission-folder listing
# built in the preceding cell.
df.count()

In [0]:
from pyspark.sql.functions import from_unixtime, col, lower, when, max as spark_max, count

# List the mounted submission folder. modificationTime is divided by 1000 before
# from_unixtime (which takes seconds), i.e. it is treated as epoch milliseconds.
df = spark.createDataFrame(dbutils.fs.ls("/mnt/ariatd/submission"))
df = df.withColumn("modificationTime_dt", from_unixtime(col("modificationTime") / 1000))

# Count files by extension (.html, .json, .a360) and get max modification date per extension
ext_col = lower(col("name"))
ext_counts = (
    df.withColumn(
        "extension",
        # No otherwise(): names with any other suffix get NULL and are filtered out below.
        when(ext_col.endswith(".html"), ".html")
        .when(ext_col.endswith(".json"), ".json")
        .when(ext_col.endswith(".a360"), ".a360")
    )
    .filter(col("extension").isNotNull())
    .groupBy("extension")
    # The grouping key is already part of the groupBy output, so it is not
    # repeated inside agg(); repeating it yields a duplicate "extension" column.
    .agg(
        count("*").alias("count"),
        # NOTE(review): from_unixtime formats in the Spark session time zone; the
        # "_UTC" suffix is only accurate if that time zone is UTC - confirm.
        spark_max("modificationTime_dt").alias("max_modificationTime_UTC"),
    )
    .select("extension", "count", "max_modificationTime_UTC")
)

display(ext_counts)

In [0]:

from pyspark.sql.functions import *

# Explicit namespace for Spark SQL functions so this cell does not depend on the
# wildcard import shadowing Python builtins such as `max`.
from pyspark.sql import functions as F

########################################################
# Identify env and KeyVault_name
########################################################

config = spark.read.option("multiline", "true").json("dbfs:/configs/config.json")
# `first()` is a Spark action: fetch the row once and reuse it for both fields.
config_row = config.first()
if config_row is None:
    # `first()` returns None for an empty DataFrame; fail with a clear message.
    raise ValueError("dbfs:/configs/config.json contains no records")

env_name = config_row["env"].strip().lower()
lz_key = config_row["lz_key"].strip().lower()

print(f"env_code: {lz_key}")  # This won't be redacted
print(f"env_name: {env_name}")  # This won't be redacted

KeyVault_name = f"ingest{lz_key}-meta002-{env_name}"
print(f"KeyVault_name: {KeyVault_name}")

########################################################
# Record class being inspected; drives the secret name and the sub-directory
########################################################
RECORD_CLASS = 'ARIATD'

SAS_Token = dbutils.secrets.get(KeyVault_name, f"{RECORD_CLASS}-SAS-TOKEN")
storage_account_name = "a360c2x2555dz"
container_name = "dropzone"
sub_dir = f"{RECORD_CLASS}/submission"

input_path = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/{sub_dir}"

# Session-level SAS credential for direct wasbs:// access to the container.
spark.conf.set(
    f"fs.azure.sas.{container_name}.{storage_account_name}.blob.core.windows.net",
    SAS_Token
)

# binaryFile yields one row per file, including a `content` column that holds the
# raw bytes of the file.
df = spark.read.format("binaryFile").load(input_path)

# Show file metadata only: displaying `content` would read the bytes of every
# displayed file, which is not needed to inspect the listing. `df` itself is left
# unchanged for later cells.
display(df.drop("content"))

# Count files by extension (.html, .json, .a360) and get max modification date per extension
ext_col = F.lower(F.col("path"))
ext_counts = (
    df.withColumn(
        "extension",
        # No otherwise(): paths with any other suffix get NULL and are filtered out below.
        F.when(ext_col.endswith(".html"), ".html")
        .when(ext_col.endswith(".json"), ".json")
        .when(ext_col.endswith(".a360"), ".a360")
    )
    .filter(F.col("extension").isNotNull())
    .groupBy("extension")
    # The grouping key is already part of the groupBy output, so it is not
    # repeated inside agg(); repeating it yields a duplicate "extension" column.
    .agg(
        F.count("*").alias("count"),
        F.max("modificationTime").alias("max_modificationTime_UTC"),
    )
    .select("extension", "count", "max_modificationTime_UTC")
)

display(ext_counts)

In [0]:
# Count files by extension (.html, .json, .a360) and get max modification date per extension.
# Expects `df` to expose `path` and `modificationTime` columns.
ext_col = lower(col("path"))
ext_counts = (
    df.withColumn(
        "extension",
        # No otherwise(): paths with any other suffix get NULL and are filtered out below.
        when(ext_col.endswith(".html"), ".html")
        .when(ext_col.endswith(".json"), ".json")
        .when(ext_col.endswith(".a360"), ".a360")
    )
    .filter(col("extension").isNotNull())
    .groupBy("extension")
    # The grouping key is already part of the groupBy output, so it is not
    # repeated inside agg(); repeating it yields a duplicate "extension" column.
    .agg(
        count("*").alias("count"),
        spark_max("modificationTime").alias("max_modificationTime_UTC"),
    )
    .select("extension", "count", "max_modificationTime_UTC")
)

display(ext_counts)