In [0]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import (StructType, StructField, StringType, DoubleType,
                               IntegerType, BooleanType, TimestampType)

# Schema mirroring the JSON payload.
# Every field is declared as a nullable string; nothing is typed at parse time.
_TRANSACTION_FIELDS = [
    "transaction_id",
    "timestamp",
    "sender_account",
    "receiver_account",
    "amount",
    "transaction_type",
    "merchant_category",
    "location",
    "device_used",
    "is_fraud",
    "fraud_type",
    "time_since_last_transaction",
    "spending_deviation_score",
    "velocity_score",
    "geo_anomaly_score",
    "payment_channel",
    "ip_address",
    "device_hash",
]

schema = StructType(
    [StructField(field_name, StringType(), True) for field_name in _TRANSACTION_FIELDS]
)

# Session-wide default location for streaming checkpoint data; it applies to
# streaming queries that do not pass their own "checkpointLocation" option.
checkpoint_root = "/tmp/ckpt"
spark.conf.set("spark.sql.streaming.checkpointLocation", checkpoint_root)




In [0]:
from pyspark.sql.functions import col, to_timestamp

casted_df = parsed_df \
    .withColumn("timestamp", to_timestamp("timestamp")) \
    .withColumn("amount", col("amount").cast("double")) \
    .withColumn("time_since_last_transaction", col("time_since_last_transaction").cast("double")) \
    .withColumn("spending_deviation_score", col("spending_deviation_score").cast("double")) \
    .withColumn("velocity_score", col("velocity_score").cast("double")) \
    .withColumn("geo_anomaly_score", col("geo_anomaly_score").cast("double")) \
    .withColumn("is_fraud", col("is_fraud").cast("boolean"))


In [0]:
# Read from Kafka
raw_df = (spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafkainterface.servebeer.com:9092")
    .option("subscribe", "streaming")
    .load())

In [0]:
# Parse JSON → struct → columns
parsed_df = (raw_df
    .select(from_json(col("value").cast("string"), schema).alias("data"))
    .select("data.*"))          # flatten so each key is a top-level column


In [0]:
# Delete whatever is stored at this checkpoint path; recurse=True removes the
# directory together with its contents.
stale_checkpoint_dir = "dbfs:/tmp/ckpt/parsed_data"
dbutils.fs.rm(stale_checkpoint_dir, recurse=True)

Out[49]: True

In [0]:
# Stream to console (tabular like CSV)
# The query is registered under the name "parsed_data" and runs in append mode.
# NOTE(review): the Delta sink cell further down also assigns to `query`, so
# this handle is replaced once that cell runs.
console_writer = (
    parsed_df.writeStream
    .format("console")
    .queryName("parsed_data")
    .outputMode("append")
)
query = console_writer.start()



In [0]:
# Alias the type-cast stream under the name that the Delta sink cell writes out.
final_df = casted_df

In [0]:
# Append the typed stream to a Delta table at /mnt/datalake/final_df_output,
# using a checkpoint directory dedicated to this query.
delta_writer = (
    final_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/tmp/ckpt/final_df")
)
query = delta_writer.start("/mnt/datalake/final_df_output")


In [0]:
# Batch-read the Delta output location and print a preview of its rows.
delta_output_df = spark.read.format("delta").load("/mnt/datalake/final_df_output")
delta_output_df.show()


+--------------+--------------------+--------------+----------------+-------+----------------+-----------------+---------+-----------+--------+----------+---------------------------+------------------------+--------------+-----------------+---------------+---------------+-----------+
|transaction_id|           timestamp|sender_account|receiver_account| amount|transaction_type|merchant_category| location|device_used|is_fraud|fraud_type|time_since_last_transaction|spending_deviation_score|velocity_score|geo_anomaly_score|payment_channel|     ip_address|device_hash|
+--------------+--------------------+--------------+----------------+-------+----------------+-----------------+---------+-----------+--------+----------+---------------------------+------------------------+--------------+-----------------+---------------+---------------+-----------+
|       T100002|2023-05-12 11:39:...|     ACC733052|       ACC377370|2773.86|         deposit|            other|   London|        pos|   false|  