In [None]:
import sys
sys.path.append("..")  # Add parent directory to sys.path for module imports
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StringType, StructType, StructField, LongType, ArrayType, MapType
from config import load_config
from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType, ArrayType, MapType
import pandas as pd
import hashlib
from utils import extract_browser, extract_os

In [None]:
# Kafka reader options, loaded from the 'remote_kafka' section of the shared config file
kafka_conf = load_config(
    section='remote_kafka',
    filename='../config.ini',
)

In [None]:
# Local Spark session; spark.jars.packages pulls in the Kafka source connector
spark = (
    SparkSession.builder
    .appName("SimpleApp")
    .master("local[*]")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0")
    .getOrCreate()
)

In [None]:
def _sha256_hex(value):
    """Return the SHA-256 hex digest of a string, or None when the value is missing."""
    if pd.isna(value):
        return None
    return hashlib.sha256(value.encode()).hexdigest()


def transform(df):
    """Return a copy of the raw event frame with derived columns added.

    The input frame is not modified; all new columns are written to a copy so
    the function can be re-run on the same input with the same result.

    Args:
        df: pandas DataFrame containing at least the columns ``time_stamp``,
            ``id``, ``product_id``, ``ip``, ``user_agent`` and ``option``.
            An empty frame is accepted.

    Returns:
        A new DataFrame with the original columns plus ``timestamp_dt``,
        ``full_date``, ``full_time``, ``sales_key``, ``ip_key``,
        ``user_agent_key``, ``product_key``, ``browser`` and ``os``; the
        ``option`` column is replaced by its string representation.
    """
    # Work on a copy so the caller's frame is left untouched
    result = df.copy()

    # time_stamp is interpreted as seconds since the Unix epoch
    event_time = pd.to_datetime(result['time_stamp'], unit='s')
    result['timestamp_dt'] = event_time

    # Date and time parts as formatted strings
    result['full_date'] = event_time.dt.strftime('%Y-%m-%d')
    result['full_time'] = event_time.dt.strftime('%H:%M:%S')

    # Hash keys. A comprehension is used instead of DataFrame.apply(axis=1)
    # because apply returns a DataFrame (not a Series) for an empty frame,
    # which makes the column assignment fail.
    # NOTE(review): id and product_id are concatenated without a separator, so
    # ('1', '23') and ('12', '3') produce the same key; kept as-is because
    # changing it would alter already-generated keys.
    result['sales_key'] = [
        hashlib.sha256(f"{record_id}{product_id}".encode()).hexdigest()
        for record_id, product_id in zip(result['id'], result['product_id'])
    ]
    result['ip_key'] = result['ip'].apply(_sha256_hex)
    result['user_agent_key'] = result['user_agent'].apply(_sha256_hex)
    result['product_key'] = result['product_id']

    # Browser and OS are derived from the user agent by the project helpers
    result['browser'] = result['user_agent'].apply(extract_browser)
    result['os'] = result['user_agent'].apply(extract_os)

    # Store the option array as its string representation; None stays None
    result['option'] = result['option'].apply(
        lambda value: str(value) if value is not None else None
    )

    return result

In [None]:
from pyspark.sql.streaming import StreamingQuery

# foreachBatch handler: lets the pandas-based transform run on each micro-batch
def process_batch(df, epoch_id):
    """Transform one micro-batch and print a short sample of the result.

    Args:
        df: the micro-batch DataFrame passed in by ``foreachBatch``; only its
            ``toPandas()`` method is used.
        epoch_id: batch identifier passed in by ``foreachBatch``; used only in
            the printed header.

    Returns:
        None. Empty batches are skipped silently.
    """
    # Materialize the batch once. Calling count() and then toPandas() would
    # evaluate the same micro-batch twice.
    pandas_df = df.toPandas()
    if len(pandas_df) == 0:
        return

    # Apply transform function
    transformed_df = transform(pandas_df)

    # Print transformed data (for testing)
    print(f"Batch {epoch_id} - Transformed Data Sample:")
    print(transformed_df.head(3))
    print(f"Total records in batch: {len(transformed_df)}")
    print("---")

# Streaming source: Kafka reader configured entirely from kafka_conf
df = (
    spark.readStream
    .format("kafka")
    .options(**kafka_conf)
    .load()
)

# Schema used to parse each JSON message; field order sets the column order after "data.*"
schema = (
    StructType()
    .add("_id", StringType())
    .add("time_stamp", LongType())
    .add("ip", StringType())
    .add("user_agent", StringType())
    .add("resolution", StringType())
    .add("user_id_db", StringType())
    .add("device_id", StringType())
    .add("api_version", StringType())
    .add("store_id", StringType())
    .add("local_time", StringType())
    .add("show_recommendation", StringType())
    .add("current_url", StringType())
    .add("referrer_url", StringType())
    .add("email_address", StringType())
    .add("recommendation", StringType())
    .add("utm_source", StringType())
    .add("utm_medium", StringType())
    .add("collection", StringType())
    .add("product_id", StringType())
    .add("option", ArrayType(MapType(StringType(), StringType())))
    .add("id", StringType())
)

# Cast the record value to a string, parse it as JSON with `schema`,
# then expand the parsed struct into top-level columns
parsed_df = (
    df.selectExpr("CAST(value AS STRING) as json_value")
    .select(from_json(col("json_value"), schema).alias("data"))
    .select("data.*")
)

# Run process_batch on every micro-batch via foreachBatch
query = (
    parsed_df.writeStream
    .foreachBatch(process_batch)
    .option("checkpointLocation", "/tmp/spark_checkpoints/test_notebook")
    .trigger(processingTime="2 seconds")
    .start()
)

print("Streaming pipeline with transform started. Press Ctrl+C to stop.")
try:
    query.awaitTermination()
except KeyboardInterrupt:
    # Interrupting the cell only unblocks awaitTermination(); the query itself
    # keeps running in the background unless it is stopped explicitly.
    print("Interrupt received; stopping the streaming query.")
finally:
    # Always stop the query so it does not outlive this cell
    query.stop()