In [0]:
# Load IPython's autoreload extension and switch it to mode 2 so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload, run %autoreload 0

In [0]:
import sys
import os
# Make the local odibi_de_v2 checkout importable. Guarded so that re-running this
# cell does not keep appending the same entry to sys.path.
# NOTE(review): this adds the package directory itself, while the imports below use
# `odibi_de_v2.<module>` -- presumably resolved via the working directory; confirm.
_odibi_path = os.path.abspath('./odibi_de_v2')
if _odibi_path not in sys.path:
    sys.path.append(_odibi_path)

# PYTHONDONTWRITEBYTECODE is only read when the interpreter starts, so assigning it
# here affects child processes only. Set the runtime flag as well so that this kernel
# stops writing .pyc files for the imports that follow.
os.environ["PYTHONDONTWRITEBYTECODE"] = "1"
sys.dont_write_bytecode = True
from odibi_de_v2.transformer import SQLTransformerFromConfig, SparkEventSplitter
from odibi_de_v2.config import ConfigUtils

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
from pyspark.sql import functions as F
from datetime import datetime

# 1. Sample events as (Event_ID, Start_Time, End_Time); timestamps are written as
#    text here and parsed into datetime objects below.
TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"
raw_events = [
    ("E1", "2023-09-01 18:00:00", "2023-09-03 06:00:00"),  # touches three calendar days
    ("E2", "2023-09-02 08:00:00", "2023-09-02 12:00:00"),  # starts and ends on the same day
    ("E3", "2023-09-03 01:00:00", "2023-09-03 07:00:00"),  # same day; spans 06:00 but not midnight
]
data = [
    (
        event_id,
        datetime.strptime(start_text, TIMESTAMP_FORMAT),
        datetime.strptime(end_text, TIMESTAMP_FORMAT),
    )
    for event_id, start_text, end_text in raw_events
]

# One nullable string id column followed by two nullable timestamp columns.
schema = StructType(
    [StructField("Event_ID", StringType(), True)]
    + [StructField(column, TimestampType(), True) for column in ("Start_Time", "End_Time")]
)

spark = SparkSession.builder.getOrCreate()

events_df = spark.createDataFrame(data, schema=schema)
display(events_df)

# 2. Configure the splitter: interval of 1 "day" with anchor_time "00:00:00".
# NOTE(review): the sample rows (E1 ends at 06:00, E3 spans 06:00) look designed for
# anchor_time="06:00:00"; the value is kept at "00:00:00" as in the code -- confirm
# which anchor is intended.
splitter_options = {
    "start_time_col": "Start_Time",
    "end_time_col": "End_Time",
    "interval_value": 1,
    "interval_unit": "day",
    "anchor_time": "00:00:00",
}
splitter = SparkEventSplitter(**splitter_options)

# 3. Run the transformation on the sample events.
result_df = splitter.transform(events_df)

# 4. Display the output ordered by event and start time so rows for one event are adjacent.
sort_columns = ("Event_ID", "Start_Time")
display(result_df.orderBy(*sort_columns))