In [0]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import (StructType, StructField, StringType, DoubleType,
                               IntegerType, BooleanType, TimestampType)

# Schema for the JSON payload. Every key is declared as a nullable string,
# so the StructFields are generated from the ordered key names rather than
# being written out one by one. Field order matches the original layout.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "transaction_id",
        "timestamp",
        "sender_account",
        "receiver_account",
        "amount",
        "transaction_type",
        "merchant_category",
        "location",
        "device_used",
        "is_fraud",
        "fraud_type",
        "time_since_last_transaction",
        "spending_deviation_score",
        "velocity_score",
        "geo_anomaly_score",
        "payment_channel",
        "ip_address",
        "device_hash",
    )
])

# Session-wide default checkpoint root used by the streaming queries below.
spark.conf.set(
    key="spark.sql.streaming.checkpointLocation",
    value="/tmp/ckpt",
)




In [0]:
# Open a streaming DataFrame over the Kafka topic "streaming".
raw_df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "kafkainterface.servebeer.com:9092",
        "subscribe": "streaming",
    })
    .load()
)

In [0]:
# Decode the Kafka `value` column as text, parse it against `schema`, then
# promote every parsed field to its own top-level column.
json_text = col("value").cast("string")
parsed_struct = from_json(json_text, schema).alias("data")
parsed_df = raw_df.select(parsed_struct).select("data.*")


In [0]:
# Reset the "parsed_data" stream so the next cell can start it cleanly.
#
# Stop any still-active query with that name first. Deleting the checkpoint
# directory underneath a running query leaves it in a broken state, and
# starting a second query under an already-active name raises an error, so
# without this step the notebook cannot be re-run without a restart.
for active_query in spark.streams.active:
    if active_query.name == "parsed_data":
        active_query.stop()

# Remove the query's checkpoint directory (under the /tmp/ckpt root configured
# earlier). Kept as the last expression so the cell still displays the result.
dbutils.fs.rm("dbfs:/tmp/ckpt/parsed_data", recurse=True)

Out[29]: True

In [0]:
# Start the stream into an in-memory table named "parsed_data" (append mode)
# so that later cells can query the collected rows with SQL.
query = (
    parsed_df.writeStream
    .outputMode("append")
    .format("memory")
    .queryName("parsed_data")
    .start()
)



In [0]:
%sql
-- Show the rows collected so far in the in-memory table that the
-- "parsed_data" streaming query writes to.
SELECT * FROM parsed_data


transaction_id,timestamp,sender_account,receiver_account,amount,transaction_type,merchant_category,location,device_used,is_fraud,fraud_type,time_since_last_transaction,spending_deviation_score,velocity_score,geo_anomaly_score,payment_channel,ip_address,device_hash
T100000,2023-08-22T09:22:43.516,ACC877572,ACC388389,343.78,withdrawal,utilities,Tokyo,mobile,False,,,-0.21,3,0.22,card,13.101.214.112,D8536477
T100001,2023-08-04T01:58:02.607,ACC895667,ACC944962,419.65,withdrawal,online,Toronto,atm,False,,,-0.14,7,0.96,ACH,172.52.47.194,D2622631
T100002,2023-05-12T11:39:33.743,ACC733052,ACC377370,2773.86,deposit,other,London,pos,False,,,-1.78,20,0.89,card,185.98.35.23,D4823498
T100003,2023-10-10T06:04:43.195,ACC996865,ACC344098,1666.22,deposit,online,Sydney,pos,False,,,-0.6,6,0.37,wire_transfer,107.136.36.87,D9961380
T100004,2023-09-24T08:09:02.700,ACC584714,ACC497887,24.43,transfer,utilities,Toronto,mobile,False,,,0.79,13,0.27,ACH,108.161.108.255,D7637601
T100005,2023-11-20T17:49:27.941,ACC581141,ACC249811,58.77,transfer,entertainment,Berlin,atm,False,,,-1.63,19,0.4,UPI,112.70.252.46,D1790481
T100006,2023-11-11T11:15:41.359,ACC757924,ACC267753,59.51,payment,travel,Dubai,pos,False,,,-2.62,1,0.68,UPI,58.136.174.57,D5924115
T100007,2023-06-09T06:32:36.945,ACC103402,ACC857168,29.79,transfer,entertainment,London,atm,False,,,-0.48,5,0.37,ACH,82.0.165.250,D1326765
T100008,2023-02-11T06:57:40.586,ACC972064,ACC945964,16.0,transfer,utilities,New York,web,False,,,0.99,3,0.98,card,64.65.243.84,D5446912
T100009,2023-07-13T09:25:20.560,ACC543692,ACC322086,203.05,withdrawal,grocery,Dubai,atm,False,,,0.59,17,0.45,UPI,126.115.32.173,D1352896
