### Date Range

In [None]:
import yaml

# Load the notebook configuration from config.yaml in the working directory.
with open("config.yaml", "r") as file:
    config = yaml.safe_load(file)

# Look-back window in days. Only the weekly schedule defines a window; fail
# here with a clear message rather than with a NameError on `date_range` in
# the cell that computes start_date.
if config['injection_schedule']['is_weekly']:
    date_range = 7
else:
    raise ValueError(
        "injection_schedule.is_weekly is not enabled in config.yaml; "
        "no date_range is defined for non-weekly schedules"
    )

In [None]:
# Work out the look-back window: it ends yesterday and spans `date_range` days.
from datetime import datetime, timedelta

# Click data has issues after 2025/4/19 (inclusive); to pin the window before
# that, use instead:
# end_date = datetime(2025, 4, 18)
end_date = datetime.now() - timedelta(days=1)

# The correlation id used to join search and click is tracked from 3/18/2025,
# so the window never starts before that date.
start_date = end_date - timedelta(days=date_range)
if start_date < datetime(2025, 3, 18):
    start_date = datetime(2025, 3, 18)

# Re-derive the window length in whole days, in case the start was clamped.
date_range = (end_date - start_date).days

print(
    f"Start date: {start_date}",
    f"End date: {end_date}",
    f"Date range: {date_range}",
    sep="\n",
)

### ML Search 

In [0]:

import os

# Generate the list of file paths in the date range
ml_catalog = dbutils.widgets.get("ml_catalog")
ml_search_db = dbutils.widgets.get("ml_search_db")
ml_search_volume = dbutils.widgets.get("ml_search_volume")

base_path = f"/Volumes/{ml_catalog}/{ml_search_db}/{ml_search_volume}"

# Days to load: `date_range` consecutive days starting at start_date, followed
# by end_date itself. A single loop handles both, so the folder check is
# written only once.
partition_dates = [start_date + timedelta(n) for n in range(date_range)]
partition_dates.append(end_date)

file_paths = []
for single_date in partition_dates:
    year = single_date.strftime("%Y")
    month = single_date.strftime("%m")
    day = single_date.strftime("%d")
    file_path = f"{base_path}/year={year}/month={month}/day={day}/"
    # Check if the folder exists before appending; missing days are reported
    # and skipped.
    if os.path.exists(os.path.dirname(file_path)):
        # Parquet files sit one directory level below the day folder.
        file_paths.append(file_path+"*/*.parquet")
    else:
        print(f"Folder does not exist: {os.path.dirname(file_path)}")

In [None]:
from pyspark.sql.types import StructType, StructField, StringType

# Columns read from the ML Search parquet files, in schema order.
# Every column is declared as a nullable string.
search_columns = [
    "user_agent",
    "uuid1_time",
    "_token_associate_id",
    "label",
    "_token_client_id",
    "object_id",
    "_token_session_id",
    "context",
    "athena_tablename",
    "time_stamp",
    "_token_person_id",
    "action",
    "os",
    "browser",
    "schema_version",
    "_token_mask",
    "date_key",
    "client_id",
    "tile_id",
    "category",
    "timezone",
    "event_id",
    "details_caption",
    "technical_mini_app_version",
    "technical_mini_app",
    "request_correlation_id",
    "details_search_value",
    "details_search_results",
]

schema = StructType(
    [StructField(column, StringType(), True) for column in search_columns]
)

# Read all collected path patterns with the explicit schema, then show it.
search_reader = spark.read.format("parquet").schema(schema)
df = search_reader.load(file_paths)
df.printSchema()

In [None]:
# Display the Databricks data summary for the DataFrame loaded above.
dbutils.data.summarize(df)

In [None]:
# Recreate the ml_search Delta table from the DataFrame loaded above.
search_table = f"{ml_catalog}.{ml_search_db}.ml_search"

spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")

# Drop any existing table first, then write the data as a Delta table.
spark.sql(f"DROP TABLE IF EXISTS {search_table}")

search_writer = df.write.format("delta").mode("overwrite")
search_writer = search_writer.option("mergeSchema", "true")
search_writer.saveAsTable(search_table)

### ML Click

In [None]:
import os

# Generate the list of file paths in the date range
ml_catalog = dbutils.widgets.get("ml_catalog")
ml_search_db = dbutils.widgets.get("ml_search_db")
ml_search_click_volume = dbutils.widgets.get("ml_search_click_volume")

base_path = f"/Volumes/{ml_catalog}/{ml_search_db}/{ml_search_click_volume}"

# Days to load: `date_range` consecutive days starting at start_date, followed
# by end_date itself. A single loop handles both, so the folder check is
# written only once.
partition_dates = [start_date + timedelta(n) for n in range(date_range)]
partition_dates.append(end_date)

file_paths = []
for single_date in partition_dates:
    year = single_date.strftime("%Y")
    month = single_date.strftime("%m")
    day = single_date.strftime("%d")
    file_path = f"{base_path}/year={year}/month={month}/day={day}/"
    # Check if the folder exists before appending; missing days are reported
    # and skipped.
    if os.path.exists(os.path.dirname(file_path)):
        # Parquet files sit one directory level below the day folder.
        file_paths.append(file_path+"*/*.parquet")
    else:
        print(f"Folder does not exist: {os.path.dirname(file_path)}")

In [None]:
from pyspark.sql.types import StructType, StructField, StringType

# Columns read from the ML Click parquet files, in schema order.
# Every column is declared as a nullable string.
click_columns = [
    "user_agent",
    "uuid1_time",
    "_token_associate_id",
    "_token_client_id",
    "details_behavior_instance_id",
    "details_tile_id",
    "details_br_count",
    "details_correlation_id",
    "details_trace_ids",
    "_token_session_id",
    "athena_tablename",
    "details_block_id",
    "details_associate_id",
    "details_timestamp",
    "_token_person_id",
    "details_error",
    "details_value",
    "os",
    "browser",
    "schema_version",
    "_token_mask",
    "date_key",
    "client_id",
    "details_business_rule_id",
    "details_name",
    "category",
    "timezone",
    "event_id",
    "type_id",
    "details_object_id",
    "label",
    "details_context",
    "details_type_id",
    "object_id",
    "caption",
    "context",
    "technical_mini_app_version",
    "technical_mini_app",
    "details_technical_mini_app",
    "action",
    "path",
    "non_interaction",
    "tile_id",
    "details_caption",
    "details_technical_mini_app_version",
    "details_tab_block_id",
    "details_opened_as",
    "details_tab_block_name",
    "details_is_impure",
    "details_variable",
    "details_tile_caption",
    "details_page_load_id",
    "details_modal_caption",
    "details_modal_id",
    "time_stamp",
    "business_mini_app_versions",
    "business_mini_apps",
    "request_correlation_id",
    "details_search_value",
    "details_search_results",
]

schema = StructType(
    [StructField(column, StringType(), True) for column in click_columns]
)

# Read all collected path patterns with the explicit schema, then show it.
click_reader = spark.read.format("parquet").schema(schema)
df = click_reader.load(file_paths)
df.printSchema()

In [None]:
# Display the Databricks data summary for the DataFrame loaded above.
dbutils.data.summarize(df)

In [None]:
# Recreate the ml_search_click Delta table from the DataFrame loaded above.
click_table = f"{ml_catalog}.{ml_search_db}.ml_search_click"

spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")

# Drop any existing table first, then write the data as a Delta table.
spark.sql(f"DROP TABLE IF EXISTS {click_table}")

click_writer = df.write.format("delta").mode("overwrite")
click_writer = click_writer.option("mergeSchema", "true")
click_writer.saveAsTable(click_table)

### ML Mobile

In [None]:
# Mobile collected sparse data: pin end_date to a fixed date here, replacing
# the rolling value computed in the Date Range section. The next cell uses
# its year and month to choose which partition to load.
end_date = datetime(2025, 2, 19)
print(f"End date: {end_date}")

In [None]:
# Collect the monthly data
# Build the path pattern covering the whole month that contains end_date
ml_catalog = dbutils.widgets.get("ml_catalog")
ml_search_db = dbutils.widgets.get("ml_search_db")
ml_search_mobile_events_volume = dbutils.widgets.get("ml_search_mobile_events_volume")

base_path = f"/Volumes/{ml_catalog}/{ml_search_db}/{ml_search_mobile_events_volume}"

year = end_date.strftime("%Y")
month = end_date.strftime("%m")
# Wildcards match every folder two levels below the month folder.
file_path = f"{base_path}/year={year}/month={month}/*/*/*.parquet"

# Start a fresh list. `file_paths` still holds the patterns built for the
# click volume, and appending to it would make the mobile read load those
# click files as well.
file_paths = [file_path]

In [None]:
# Load the mobile parquet files. No explicit schema is passed here, unlike
# the search and click reads.
mobile_reader = spark.read.format("parquet")
df = mobile_reader.load(file_paths)
df.printSchema()

In [None]:
# Display the Databricks data summary for the DataFrame loaded above.
dbutils.data.summarize(df)

In [None]:
# Recreate the ml_search_mobile Delta table from the DataFrame loaded above.
mobile_table = f"{ml_catalog}.{ml_search_db}.ml_search_mobile"

spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")

# Drop any existing table first, then write the data as a Delta table.
spark.sql(f"DROP TABLE IF EXISTS {mobile_table}")

mobile_writer = df.write.format("delta").mode("overwrite")
mobile_writer = mobile_writer.option("mergeSchema", "true")
mobile_writer.saveAsTable(mobile_table)