In [1]:
import sys
import os

# Add parent directory to sys.path for module imports
parent_dir = os.path.dirname(os.getcwd())
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

print(f"Current working directory: {os.getcwd()}")
print(f"Parent directory added to path: {parent_dir}")

from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructType, StructField, LongType, ArrayType, MapType
from config import load_config
from pyspark.sql.functions import (
    col,
    from_json,
    sha2,
    concat,
    from_unixtime,
    date_format,
    when,
    lit,
    to_json
)

# Import utils with error handling
try:
    from utils import get_udfs
    print("✅ Successfully imported utils module")
except ImportError as e:
    print(f"❌ Failed to import utils: {e}")
    print("Available modules in sys.path:")
    for path in sys.path:
        print(f"  - {path}")

Current working directory: /home/sonhaile/Data-Engineering-Journey/Le_Son_LV2_Project_02/notebooks
Parent directory added to path: /home/sonhaile/Data-Engineering-Journey/Le_Son_LV2_Project_02
✅ Successfully imported utils module
✅ Successfully imported utils module


In [2]:
kafka_conf = load_config(filename='../config.ini',section = 'remote_kafka')

In [3]:
spark = SparkSession.builder \
    .appName("SimpleApp") \
    .master("local[*]") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0") \
    .getOrCreate()

25/09/27 17:23:27 WARN Utils: Your hostname, sonhaile-ubuntu resolves to a loopback address: 127.0.1.1; using 192.168.101.43 instead (on interface enp5s0)
25/09/27 17:23:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/sonhaile/miniconda3/envs/data_engineering/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/sonhaile/.ivy2/cache
The jars for the packages stored in: /home/sonhaile/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-502b0fd2-743c-4034-9715-c7de365277e9;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.0 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
:: resolution report :: resolve 209ms :: artifacts dl 6ms


25/09/27 17:23:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
def transform(df, schema):
    """Transform raw Kafka data using Spark operations for better performance."""

    # Parse JSON and extract data
    df = df.select(
        from_json(col("value").cast(StringType()), schema).alias("data")
    ).select("data.*")

    # Transform data in Spark for better performance
    # 1. Convert option array to JSON string
    df = df.withColumn("option", to_json(col("option")))

    # 2. Generate timestamp-based columns
    df = df.withColumn(
        "timestamp_dt", from_unixtime(col("time_stamp")))
    df = df.withColumn(
        "full_date", date_format(col("timestamp_dt"), "yyyy-MM-dd"))
    df = df.withColumn(
        "full_time", date_format(col("timestamp_dt"), "HH:mm:ss"))

    # 3. Generate hash keys
    df = df.withColumn(
        "sales_key",
        sha2(concat(col("id"), col("product_id")), 256)
    )
    df = df.withColumn(
        "ip_key",
        when(col("ip").isNotNull(), sha2(
            col("ip"), 256)).otherwise(lit(None))
    )
    df = df.withColumn(
        "user_agent_key",
        when(col("user_agent").isNotNull(), sha2(
            col("user_agent"), 256)).otherwise(lit(None))
    )

    # 4. Convert product_id to product_key as string
    df = df.withColumn(
        "product_key",
        when(col("product_id").isNotNull(), col(
            "product_id").cast("string")).otherwise(lit(None))
    )

    # 5. UDF transformations - with error handling
    try:
        # Clean referrer_url to extract domain using UDF
        extract_referrer_domain_udf = get_udfs(udf_name='extract_referrer_domain')
        df = df.withColumn(
            "referrer_url",
            extract_referrer_domain_udf(col("referrer_url"))
        )

        # Extract browser and OS from user_agent using UDFs
        extract_browser_udf = get_udfs(udf_name='extract_browser')
        extract_os_udf = get_udfs(udf_name='extract_os')
        df = df.withColumn(
            "browser",
            extract_browser_udf(col("user_agent"))
        )
        df = df.withColumn(
            "os",
            extract_os_udf(col("user_agent"))
        )
        print("✅ UDF transformations applied successfully")
        
    except NameError:
        print("⚠️ get_udfs function not available - skipping UDF transformations")
        # Add placeholder columns for missing UDF results
        df = df.withColumn("browser", lit("Unknown"))
        df = df.withColumn("os", lit("Unknown"))
        # Keep referrer_url as is if UDF not available

    return df

print("Transform function defined successfully")

Transform function defined successfully


In [5]:
# Alternative approach using foreachBatch for better compatibility
def process_batch(df, epoch_id):
    """Process each micro-batch of data."""
    if df.count() > 0:
        # Convert to pandas DataFrame
        pandas_df = df.toPandas()
        
        # Print transformed data (for testing)
        print(f"Batch {epoch_id} - Transformed Data Sample:")
        print(pandas_df.head(3))
        print(f"Total records in batch: {len(pandas_df)}")
        print("---")

# Read from Kafka
df = spark.readStream \
    .format("kafka") \
    .options(**kafka_conf) \
    .load()

schema = StructType([
    StructField("_id", StringType()),
    StructField("time_stamp", LongType()),
    StructField("ip", StringType()),
    StructField("user_agent", StringType()),
    StructField("resolution", StringType()),
    StructField("user_id_db", StringType()),
    StructField("device_id", StringType()),
    StructField("api_version", StringType()),
    StructField("store_id", StringType()),
    StructField("local_time", StringType()),
    StructField("show_recommendation", StringType()),
    StructField("current_url", StringType()),
    StructField("referrer_url", StringType()),
    StructField("email_address", StringType()),
    StructField("recommendation", StringType()),
    StructField("utm_source", StringType()),
    StructField("utm_medium", StringType()),
    StructField("collection", StringType()),
    StructField("product_id", StringType()),
    StructField("option", ArrayType(MapType(StringType(), StringType()))),
    StructField("id", StringType())
])

parsed_df = transform(df, schema)

# Apply transform function via foreachBatch (more compatible)
query = parsed_df.writeStream \
    .foreachBatch(process_batch) \
    .option("checkpointLocation", "/tmp/spark_checkpoints/test_notebook1") \
    .trigger(processingTime="2 seconds") \
    .start() # processingTime="2 seconds" OR once = True 

print("Streaming pipeline with transform started. Press Ctrl+C to stop.")
query.awaitTermination()

✅ UDF transformations applied successfully
25/09/27 17:23:43 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
Streaming pipeline with transform started. Press Ctrl+C to stop.
25/09/27 17:23:44 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/09/27 17:23:44 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/09/27 17:23:45 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/09/27 17:23:45 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/09/27 17:23:45 WA



25/09/27 17:24:14 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


                                                                                

25/09/27 17:24:15 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/09/27 17:24:15 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


[Stage 2:>                                                          (0 + 2) / 2]

25/09/27 17:24:28 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894


                                                                                

Batch 0 - Transformed Data Sample:
                        _id  time_stamp              ip  \
0  5e9348b18000343760ae1df0  1758967982  86.130.129.180   
1  5e9348b187725937120910ee  1758967982  188.146.96.166   
2  5e9348b12c94b7370cbf05bc  1758967982    5.173.40.106   

                                          user_agent resolution user_id_db  \
0  Mozilla/5.0 (Linux; Android 10; MAR-LX1A) Appl...    360x771     482410   
1  Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...   1280x720              
2  Mozilla/5.0 (iPhone; CPU iPhone OS 12_4_5 like...    375x667              

                              device_id api_version store_id  \
0  675efe1e-93d9-461f-a019-0127831b5ea2         1.0        7   
1  aae7a5ff-a378-43cb-8e5a-4b27e7e8fc52         1.0       50   
2  3a6fed92-b0c0-46fd-be30-33535eab061f         1.0       50   

            local_time  ...                                    id  \
0  2025-09-27 17:13:02  ...  24a6a81e-53ed-427d-a17d-b4f29bcaec21   
1  2025-09-27 17:13:

[Stage 3:>                                                          (0 + 3) / 3]

25/09/27 17:24:56 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/09/27 17:24:56 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/09/27 17:24:56 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/09/27 17:24:56 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/09/27 17:24:56 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/09/27 17:24:56 WARN KafkaDataConsumer: KafkaDataConsumer is not running in Un

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/sonhaile/miniconda3/envs/data_engineering/lib/python3.9/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/sonhaile/miniconda3/envs/data_engineering/lib/python3.9/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/home/sonhaile/miniconda3/envs/data_engineering/lib/python3.9/socket.py", line 716, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


25/09/27 17:24:56 ERROR Executor: Exception in task 0.0 in stage 3.0 (TID 5)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/sonhaile/miniconda3/envs/data_engineering/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 686, in main
    process()
  File "/home/sonhaile/miniconda3/envs/data_engineering/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 678, in process
    serializer.dump_stream(out_iter, outfile)
  File "/home/sonhaile/miniconda3/envs/data_engineering/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 224, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/home/sonhaile/miniconda3/envs/data_engineering/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 145, in dump_stream
    for obj in iterator:
  File "/home/sonhaile/miniconda3/envs/data_engineerin

KeyboardInterrupt: 