In [1]:
import pandas as pd
import sys

# Add the parent of the working directory to the module search path.
sys.path.append('../')

# Read the captured log sample; lines=True treats every line as one JSON document.
sample_logs_path = "data/sdq_test_data_2025-02-14.jsonl"
logs_table = pd.read_json(sample_logs_path, lines=True)

In [None]:
import base64
import ast

# Take the payload of the first captured record, base64-decode it to UTF-8
# text, and parse that text as a Python literal.
first_payload_b64 = logs_table.iloc[0]["captureData"]["endpointInput"]['data']
first_payload_text = base64.b64decode(first_payload_b64).decode("utf-8")
sample = ast.literal_eval(first_payload_text)

In [None]:
# Pull the base64-encoded request payload out of every captured log record.
# The extracted values are scalars (presumably strings, since they come from
# JSON), and Spark cannot infer a row schema from bare scalars when it is only
# given a list of column names. Wrap them in a one-column pandas DataFrame so
# the column name and type are carried over unambiguously.
encoded_payloads = (
    logs_table["captureData"]
    .apply(lambda x: x["endpointInput"]["data"])
    .to_frame(name="encoded_list")
)
df_sdq = spark.createDataFrame(encoded_payloads)
display(df_sdq)

In [None]:
# Read the catalog and database names from the notebook widgets; they are
# used below to build fully qualified function and table names.
ml_catalog, ml_sdq_db = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("ml_catalog", "ml_sdq_db")
)

In [None]:
from pyspark.sql.types import StructType, StructField, StringType
import pyspark.sql.functions as F
from pyspark.sql.functions import expr

# Schema applied to every decoded JSON record; all fields are read as strings.
schema = StructType([
    StructField(field_name, StringType(), nullable=False)
    for field_name in (
        'session_id',
        'timestamp',
        'client_id',
        'associate_id',
        'trace_id',
        'dq_id',
        'params',
    )
])

# Run the catalog function over each encoded payload. Its result is exploded
# below, so it presumably returns an array of JSON strings -- TODO confirm.
literal_eval_call = expr(f"{ml_catalog}.{ml_sdq_db}.literal_eval_sdq(encoded_list)")
res = df_sdq.withColumn("encoded_list_extract_sdq", literal_eval_call)

# Produce one row per element of the extracted collection.
exploded_df = res.withColumn("decoded_list", F.explode(F.col("encoded_list_extract_sdq")))

# Parse each element into a struct using the schema defined above.
parsed_df = exploded_df.withColumn("decoded_list", F.from_json(F.col("decoded_list"), schema))

# Keep the input columns, drop the intermediate column, and flatten the struct
# fields into top-level columns.
kept_columns = [name for name in res.columns if name != "encoded_list_extract_sdq"]
final_df = parsed_df.select(*kept_columns, "decoded_list.*")

In [None]:
# Render the flattened result for visual inspection.
display(final_df)

In [None]:
# Print the column names and types of the flattened DataFrame.
final_df.printSchema()

In [None]:
from pyspark.sql import functions as F

# Column expression for the day before the current date.
yesterday = F.date_sub(F.current_date(), 1)

# Add year, month, and day columns to the DataFrame; they are used as the
# partition columns when the table is written. Each column needs its own
# extractor: using F.year for all three would store the year in month and day.
final_df = (
    final_df
    .withColumn("year", F.year(yesterday))
    .withColumn("month", F.month(yesterday))
    .withColumn("day", F.dayofmonth(yesterday))
)

In [None]:
# Enable Delta's automatic schema merge setting for this session.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled","true")

# Remove any previous copy of the sample table before writing a new one.
spark.sql(f"DROP TABLE IF EXISTS {ml_catalog}.{ml_sdq_db}.lifion_cacheable_dq_sample")

# Persist the result as a Delta table partitioned by the date columns.
(
    final_df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("year", "month", "day")
    .saveAsTable(f"{ml_catalog}.{ml_sdq_db}.lifion_cacheable_dq_sample")
)

In [None]:
# The text before the first "-" in the secret scope name is compared against
# "dit" (presumably the environment name -- confirm against the job config).
secret_scope = dbutils.widgets.get("secret_scope")
scope_prefix = secret_scope.split("-")[0]

# Stop the notebook here for DIT; the remaining cells are not run there.
if scope_prefix == "dit":
    dbutils.notebook.exit("No need to run the following code in DIT environment")

In [2]:
%md

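### Investigate the data in Production

The cells below read the `nas_raw_lifion_prod` tables and show their schemas and contents. They are skipped when the notebook runs in the DIT environment.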

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
import uuid
from datetime import datetime

In [None]:
# Load every row of the SDQ logs table as a Spark DataFrame.
# NOTE: this rebinds logs_table, which earlier in the notebook held the pandas
# DataFrame read from the JSONL sample file.
logs_table = spark.sql("select * from nas_raw_lifion_prod.lifion_logs_for_sdq")

In [None]:
# Print the column names and types of the logs table.
logs_table.printSchema()

In [None]:
# Render the logs table for visual inspection.
display(logs_table)

In [None]:
# Read the raw SageMaker table as a Spark DataFrame.
raw_sagemaker_data = spark.read.table("nas_raw_lifion_prod.lifion_sagemaker_raw")

In [None]:
# Print the column names and types of the raw SageMaker table.
raw_sagemaker_data.printSchema()

In [None]:
# Render the raw SageMaker table for visual inspection.
display(raw_sagemaker_data)

In [None]:
# Read the "_prod_use1" variant of the raw SageMaker table.
# NOTE: this rebinds raw_sagemaker_data, replacing the DataFrame loaded from
# lifion_sagemaker_raw earlier in the notebook.
raw_sagemaker_data = spark.read.table("nas_raw_lifion_prod.lifion_sagemaker_raw_prod_use1")

In [None]:
# Print the column names and types of the "_prod_use1" SageMaker table.
raw_sagemaker_data.printSchema()

In [None]:
# Render the "_prod_use1" SageMaker table for visual inspection.
display(raw_sagemaker_data)

In [None]:
# Read the SDQ params table as a Spark DataFrame.
dq_only_data_table = spark.read.table("nas_raw_lifion_prod.lifion_sdq_params")

In [None]:
# Print the column names and types of the SDQ params table.
dq_only_data_table.printSchema()

In [None]:
# Render the SDQ params table for visual inspection.
display(dq_only_data_table)

In [None]:
# Read the "_prod_use1" variant of the SDQ params table.
# NOTE: this rebinds dq_only_data_table, replacing the DataFrame loaded from
# lifion_sdq_params earlier in the notebook.
dq_only_data_table = spark.read.table("nas_raw_lifion_prod.lifion_sdq_params_prod_use1")

In [None]:
# Print the column names and types of the "_prod_use1" SDQ params table.
dq_only_data_table.printSchema()

In [None]:
# Render the "_prod_use1" SDQ params table for visual inspection.
display(dq_only_data_table)

In [None]:
# Read the cacheable DQ table as a Spark DataFrame.
cacheable_dq = spark.read.table("nas_raw_lifion_prod.lifion_cacheable_dq")

In [None]:
# Print the column names and types of the cacheable DQ table.
cacheable_dq.printSchema()

In [None]:
# Render the cacheable DQ table for visual inspection.
display(cacheable_dq)