In [None]:
# Set up Spark to read from MinIO (S3A) and to load the PostgreSQL JDBC driver
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType,FloatType
from pyspark.sql.functions import col,when
import os
from pyspark.sql import SparkSession
# Connection settings come from the process environment; any variable that is
# not set resolves to None.

# --- MinIO / S3A ---
S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY")
S3_ENDPOINT = os.environ.get("S3_ENDPOINT")
BUCKET_NAME = os.environ.get("BUCKET_NAME")

# --- PostgreSQL ---
POSTGRES_USER = os.environ.get("POSTGRES_USER")
POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD")
# NOTE(review): POSTGRES_ENDPOINT is not referenced by the code visible in this
# notebook — confirm whether it is still needed.
POSTGRES_ENDPOINT = os.environ.get("POSTGRES_ENDPOINT")
# Passed as the JDBC "url" option by the write functions further down.
# NOTE(review): the name is spelled "DATA_ENPOINT" (sic); it must match the
# variable exported in the environment, so it is left unchanged here.
DATA_ENPOINT = os.environ.get("DATA_ENPOINT")
# Location of the PostgreSQL JDBC driver jar. Set POSTGRES_JDBC_JAR to override
# it on machines where the default path below does not exist.
POSTGRES_JDBC_JAR = os.getenv(
    "POSTGRES_JDBC_JAR", "/home/harshithts/jars/postgresql-42.7.3.jar"
)

# Build (or reuse) the session with S3A access to MinIO and the JDBC driver on
# the classpath.
spark = (
    SparkSession.builder
    .appName("MinIOReader")
    # hadoop-aws provides the S3A filesystem; resolved through Ivy at startup.
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4")
    .config("spark.jars", POSTGRES_JDBC_JAR)
    # Hadoop-level options carry the "spark.hadoop." prefix, which is the
    # documented way to forward them into the Hadoop configuration.
    .config("spark.hadoop.fs.s3a.endpoint", S3_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", S3_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", S3_SECRET_KEY)
    # Address buckets as <endpoint>/<bucket> rather than <bucket>.<endpoint>.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)

# NOTE(review): the reason for disabling adaptive query execution is not
# evident from this notebook — confirm it is still required.
spark.conf.set("spark.sql.adaptive.enabled", "false")
# Use the legacy (pre-Spark-3) datetime pattern parser.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")



25/11/26 19:30:50 WARN Utils: Your hostname, harshithts-HP-Pavilion-Gaming-Laptop-15-ec2xxx resolves to a loopback address: 127.0.1.1; using 192.168.1.4 instead (on interface wlo1)
25/11/26 19:30:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/harshithts/RealTimeData-Pipeline/env/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/harshithts/.ivy2/cache
The jars for the packages stored in: /home/harshithts/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-bad1be75-454f-4da6-8b0d-c2972fb763ae;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 244ms :: artifacts dl 11ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	------------------------

: 

In [None]:

# Explicit schema for the bronze-layer transaction JSON records (streaming
# reads need the schema up front). Every field is nullable, which is the
# default for StructType.add.
transaction_schema = (
    StructType()
    .add("txn_id", StringType())
    .add("user_id", StringType())
    .add("amount", FloatType())
    .add("merchant_id", StringType())
    .add("channel", StringType())
    .add("location", StringType())
    # Kept as a raw string here; it is not parsed in this cell.
    .add("timestamp", StringType())
)

# Streaming source: JSON files under the bronze transactions prefix of the bucket.
bronze_transactions_path = f"s3a://{BUCKET_NAME}/bronze/transactions-folder/"

transaction_df = (
    spark.readStream
    .schema(transaction_schema)
    .format("json")
    .load(bronze_transactions_path)
)

# ================================
#           FRAUD RULES
# ================================
# Merchants whose transactions are always flagged.
SuspiciousMerchant = ["M0008", "M0009"]

# Named predicates used by the rule chain below.
is_large_amount = col("amount") > 30000
is_online = col("channel") == "Online"
from_suspicious_merchant = col("merchant_id").isin(SuspiciousMerchant)
in_risk_location = col("location").isin("Port Shannon", "New Pamela")

# Rules are evaluated top to bottom and the first match wins, so each
# transaction receives exactly one label; unmatched rows are "Normal".
fraud_label = (
    when(is_large_amount & is_online, "HighOnlineAmount")
    .when(from_suspicious_merchant, "SuspiciousMerchant")
    .when(is_online & in_risk_location, "RiskLocation")
    .when(is_large_amount, "HugeAmount")
    .otherwise("Normal")
)

fraud_df = transaction_df.withColumn("frauds", fraud_label)

# Split the labelled stream in two: flagged transactions and normal ones.
frauds_label = col("frauds")
fraud_illegal = fraud_df.where(frauds_label != "Normal")  # flagged by any rule
genuine_df = fraud_df.where(frauds_label == "Normal")     # matched no rule


# ================================
#       WRITE FUNCTIONS
# ================================
def _write_batch_to_postgres(batch_df, table_name):
    """Append one micro-batch to ``table_name`` in PostgreSQL over JDBC.

    Args:
        batch_df: The micro-batch DataFrame handed over by ``foreachBatch``.
        table_name: Target table, passed verbatim as the JDBC ``dbtable`` option.

    The JDBC URL and credentials come from the module-level ``DATA_ENPOINT``,
    ``POSTGRES_USER`` and ``POSTGRES_PASSWORD`` settings.
    """
    (
        batch_df.write
        .format("jdbc")
        .option("url", DATA_ENPOINT)
        .option("dbtable", table_name)
        .option("user", POSTGRES_USER)
        .option("password", POSTGRES_PASSWORD)
        .option("driver", "org.postgresql.Driver")
        .mode("append")
        .save()
    )


def writeFraud(batch_df, batch_id):
    """``foreachBatch`` sink: append flagged transactions to ``public.FraudTransaction``.

    Args:
        batch_df: Micro-batch of rows to persist.
        batch_id: Micro-batch id supplied by Spark; not used.

    NOTE(review): rows are appended without using ``batch_id`` for
    de-duplication, so a re-executed batch would presumably be written again —
    confirm whether that is acceptable for this table.
    """
    _write_batch_to_postgres(batch_df, "public.FraudTransaction")


def writeGenuine(batch_df, batch_id):
    """``foreachBatch`` sink: append normal transactions to ``public.GenuineTransaction``.

    Args:
        batch_df: Micro-batch of rows to persist.
        batch_id: Micro-batch id supplied by Spark; not used.

    NOTE(review): rows are appended without using ``batch_id`` for
    de-duplication, so a re-executed batch would presumably be written again —
    confirm whether that is acceptable for this table.
    """
    _write_batch_to_postgres(batch_df, "public.GenuineTransaction")


25/11/26 19:30:57 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


In [None]:
# Each sink has its own checkpoint directory so the three queries track their
# progress independently.

# FRAUD → POSTGRES
fraud_query = (
    fraud_illegal.writeStream
    .foreachBatch(writeFraud)
    .outputMode("append")
    .option("checkpointLocation", f"s3a://{BUCKET_NAME}/checkpoint/fraud_write")
    .start()
)

# GENUINE → POSTGRES
genuine_query = (
    genuine_df.writeStream
    .foreachBatch(writeGenuine)
    .outputMode("append")
    .option("checkpointLocation", f"s3a://{BUCKET_NAME}/checkpoint/genuine_write")
    .start()
)

# FRAUD → CONSOLE
console_query = (
    fraud_illegal.writeStream
    .format("console")
    .outputMode("append")
    .option("checkpointLocation", f"s3a://{BUCKET_NAME}/checkpoint/fraud_console")
    .start()
)

# Block until any one query stops or fails. Whatever ends the wait (normal
# termination, a query error, or a kernel interrupt), stop the queries that are
# still active so none keeps writing in the background after this cell exits.
try:
    spark.streams.awaitAnyTermination()
finally:
    for query in (fraud_query, genuine_query, console_query):
        if query.isActive:
            query.stop()


