In [0]:
# Restart this notebook's Python process through the Databricks utilities before
# any other cell runs. NOTE(review): presumably done so that freshly installed or
# updated libraries (e.g. the local `src` package) are picked up — confirm.
dbutils.library.restartPython()

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import pyspark.sql.functions as F

from src.engagement_analysis_processor import EngagementAnalysisProcessor 
from src.user_engagement_dq import UserEngagementDQ


#### Case 1: Calculate Average Duration per Page & Most Engaging Page (invalid rows are included)

In [0]:

# Explicit schema for the raw engagement CSV. Every field is declared with
# nullable=False. `timestamp` is read as a plain string rather than a timestamp
# type (presumably so the data-quality step can test whether it parses).
schema = StructType(
    [
        StructField(field_name, field_type, False)
        for field_name, field_type in (
            ("user_id", IntegerType()),
            ("timestamp", StringType()),
            ("page", StringType()),
            ("duration_seconds", IntegerType()),
        )
    ]
)

# Load the sample file (first line is the header) using the schema above
# instead of letting Spark infer column types.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .schema(schema)
    .load("dbfs:/FileStore/gore_csv/user_engagement_invalid.csv")
)

display(df)



In [0]:


# Case 1: run both metrics directly on the raw frame, without any filtering.
P = EngagementAnalysisProcessor()

avg_duration_all_rows = P.avg_duration_per_page(df)
display(avg_duration_all_rows)

most_engaging_all_rows = P.most_engaging_page(df)
display(most_engaging_all_rows)


#### Case 2: Calculate Average Duration per Page & Most Engaging Page (invalid rows are excluded)

In [0]:


# Case 2: run the data-quality checks on the raw frame, then repeat the metrics
# on the rows the DQ object reports as valid.
dq = UserEngagementDQ(df)

# The checks run in this fixed order before invalid records are logged.
# NOTE(review): what each check flags is defined in UserEngagementDQ, not here;
# the method names suggest null/range/domain and timestamp-format validation.
dq.test_required_not_null_and_ranges_and_domain()
dq.test_timestamp_parseable()
dq.log_invalid_records()

df_good = dq.get_df_with_valid_rows()

# Same two metrics as Case 1, reusing the processor instance `P`.
avg_duration_valid_rows = P.avg_duration_per_page(df_good)
display(avg_duration_valid_rows)

most_engaging_valid_rows = P.most_engaging_page(df_good)
display(most_engaging_valid_rows)


#### Validate Logs

In [0]:


# Read back everything under the log directory as Parquet. No reader options
# are set: Parquet files carry their own schema, and `header` is a CSV-only
# option that has no effect on this source.
df_logs = (
    spark.read
    .format("parquet")
    .load("dbfs:/FileStore/gore_logs/*")
)

# Show the most recently ingested log entries first.
display(df_logs.orderBy(F.col("__ingest_datetime").desc()))




In [0]:
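# Manual cleanup (destructive, intentionally left commented out): uncomment to
# delete the log directory; the second argument `True` is presumably the recurse flag.
# dbutils.fs.rm("dbfs:/FileStore/gore_logs/", True)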
