##Old working code

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when, current_timestamp, concat_ws, sha2, sum as _sum, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType


# Helper that checks whether a DataFrame is streaming
def is_streaming(df):
    """Report whether ``df`` is a streaming DataFrame.

    Returns the object's ``isStreaming`` attribute, or ``False`` when the
    object does not expose that attribute.
    """
    return getattr(df, "isStreaming", False)

# UDF that generates a flag_id, for example 'F001'
@udf(StringType())
def generate_flag_id(transaction_id):
    """Build a flag identifier from a transaction id, e.g. 1 -> 'F001'.

    The id is rendered with ``str`` and left-padded with zeros to at least
    three characters; longer ids are kept in full.

    Args:
        transaction_id: Transaction identifier of the current row.

    Returns:
        The identifier prefixed with 'F', or None when ``transaction_id``
        is None.
    """
    # A missing id used to be stringified into the bogus value 'FNone';
    # propagate the null instead.
    if transaction_id is None:
        return None
    return "F" + str(transaction_id).zfill(3)

# Transformation logic for the fraud_flags table
def create_fraud_flags(transactions_df):
    """Derive fraud-flag rows from a transactions DataFrame.

    A row is flagged by the first rule that matches, checked in this order:
    amount > 50000 -> 'unusual_amount' (score 0.9), channel == 'mobile' ->
    'velocity_check' (score 0.8), transaction_type == 'transfer' ->
    'pattern_anomaly' (score 0.7). Rows matching no rule are dropped.

    Args:
        transactions_df: DataFrame providing the columns amount, channel,
            transaction_type, transaction_id and timestamp.

    Returns:
        DataFrame with the columns flag_id, transaction_id, flag_type,
        confidence_score and timestamp.
    """
    # First matching rule wins; unmatched rows get a null flag_type.
    flag_type_expr = (
        when(col("amount") > 50000, "unusual_amount")
        .when(col("channel") == "mobile", "velocity_check")
        .when(col("transaction_type") == "transfer", "pattern_anomaly")
        .otherwise(lit(None))
    )

    # The 0.5 fallback only applies to unflagged rows, which are filtered
    # out further down.
    score_expr = (
        when(col("flag_type") == "unusual_amount", 0.9)
        .when(col("flag_type") == "velocity_check", 0.8)
        .when(col("flag_type") == "pattern_anomaly", 0.7)
        .otherwise(lit(0.5))
    )

    enriched = (
        transactions_df
        .withColumn("flag_type", flag_type_expr)
        .withColumn("confidence_score", score_expr)
        .withColumn("flag_id", generate_flag_id(col("transaction_id")))
    )

    flagged_only = enriched.filter(col("flag_type").isNotNull())
    return flagged_only.select(
        "flag_id", "transaction_id", "flag_type", "confidence_score", "timestamp"
    )

# Transformation logic for the customer_segments table
def create_customer_segments(transactions_df, customers_df):
    """Build the customer_segments DataFrame from transactions and customers.

    Two segments are produced and unioned:
      * 'High_Value' - customers whose SUM(amount) over transactions_view
        exceeds 100000.
      * 'New_User'   - customers whose join_date is later than
        current_date() minus 30 days.

    Side effects: (re)registers the temp views ``transactions_view`` and
    ``customers_view``. The module-level ``spark`` session runs the SQL.

    Args:
        transactions_df: Transactions with customer_id, amount and timestamp.
        customers_df: Customers with customer_id and join_date.

    Returns:
        DataFrame with customer_id, segment_name, segment_description,
        segment_id (SHA-256 of "<customer_id>_<segment_name>") and
        last_updated (current timestamp).
    """
    # Parse the event time and declare a 1-day watermark on it.
    # NOTE(review): the aggregation below groups by customer_id only, so the
    # watermark may not bound any state - confirm it is needed.
    watermarked_txns = transactions_df.withColumn(
        "timestamp", to_timestamp(col("timestamp"))
    ).withWatermark("timestamp", "1 day")

    # Expose both inputs under the view names the SQL below refers to.
    watermarked_txns.createOrReplaceTempView("transactions_view")
    customers_df.createOrReplaceTempView("customers_view")

    high_value_sql = """
        SELECT customer_id, 'High_Value' AS segment_name, 'Customers with high transaction volume' AS segment_description
        FROM (
            SELECT customer_id, SUM(amount) AS total_amount
            FROM transactions_view
            GROUP BY customer_id
            HAVING SUM(amount) > 100000
        )
    """

    new_user_sql = """
        SELECT customer_id, 'New_User' AS segment_name, 'Customers who joined in last 30 days' AS segment_description
        FROM customers_view
        WHERE join_date > current_date() - INTERVAL 30 DAYS
    """

    high_value_segment = spark.sql(high_value_sql)
    new_user_segment = spark.sql(new_user_sql)

    # union() is positional; both queries select the same three columns in
    # the same order.
    all_segments = high_value_segment.union(new_user_segment)

    # Deterministic id per (customer, segment) pair.
    segment_key = sha2(concat_ws("_", col("customer_id"), col("segment_name")), 256)

    return (
        all_segments
        .withColumn("segment_id", segment_key)
        .withColumn("last_updated", current_timestamp())
    )

# function to ingest streaming data for fraud flags
def ingest_fraud_flags():
    """Start the streaming write of fraud flags into silver.fraud_flags.

    Reads silver.transactions as a stream, derives the fraud flags with
    ``create_fraud_flags`` and appends them to the Delta table
    silver.fraud_flags. Prints whether the input and output DataFrames are
    streaming.

    Returns:
        The object returned by ``toTable`` (the started streaming query).
    """
    source_stream = spark.readStream.table("silver.transactions")
    print("transactions df to fraud flags streaming:", is_streaming(source_stream))

    flagged_stream = create_fraud_flags(source_stream)
    print("Fraud Flags DataFrame is streaming:", is_streaming(flagged_stream))

    # NOTE(review): the checkpoint location is a dotted name rather than an
    # obvious path - confirm where it resolves on the target file system.
    writer = (
        flagged_stream.writeStream
        .format("delta")
        .option("checkpointLocation", "silver.checkpoint.fraud_flag")
        .outputMode("append")
    )
    return writer.toTable("silver.fraud_flags")

# function to ingest streaming data for customer segments
def ingest_customer_segments():
    """Start the streaming write of customer segments into silver.customer_segments.

    Reads silver.transactions and silver.customer as streams, builds the
    segments with ``create_customer_segments`` and writes them to the Delta
    table silver.customer_segments in 'complete' output mode. Prints whether
    each DataFrame involved is streaming.

    Returns:
        The object returned by ``toTable`` (the started streaming query).
    """
    txn_stream = spark.readStream.table("silver.transactions")
    customer_stream = spark.readStream.table("silver.customer")

    print("transactions df for customer segments streaming:", is_streaming(txn_stream))
    print("customers df is streaming:", is_streaming(customer_stream))

    segments_stream = create_customer_segments(txn_stream, customer_stream)
    print("Customer Segments DataFrame is streaming:", is_streaming(segments_stream))

    # The result contains a streaming aggregation, hence 'complete' mode.
    # NOTE(review): the checkpoint location is a dotted name rather than an
    # obvious path - confirm where it resolves on the target file system.
    writer = (
        segments_stream.writeStream
        .format("delta")
        .option("checkpointLocation", "silver.checkpoint.customer_segments")
        .outputMode("complete")
    )
    return writer.toTable("silver.customer_segments")

# Forget queries that terminated earlier in this session so the wait below
# only reacts to terminations that happen from here on.
spark.streams.resetTerminated()

# Start both streaming queries.
query_fraud_flags = ingest_fraud_flags()
query_customer_segments = ingest_customer_segments()

# Block until either query stops or fails. Awaiting the two queries one after
# the other would sit on the first one indefinitely and never surface a
# failure of the second; awaitAnyTermination re-raises the failure of any
# query in the session.
spark.streams.awaitAnyTermination()


transactions df to fraud flags streaming: True
Fraud Flags DataFrame is streaming: True
transactions df for customer segments streaming: True
customers df is streaming: True
Customer Segments DataFrame is streaming: True


##New code without trigger 

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when, current_timestamp, to_timestamp
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

# Initialize Spark session
spark = SparkSession.builder.appName("Real-time Data Pipeline").getOrCreate()

# Function for checking if DataFrame is streaming
def is_streaming(df):
    """Report whether ``df`` is a streaming DataFrame.

    Returns the object's ``isStreaming`` attribute, or ``False`` when the
    object does not expose that attribute.
    """
    return getattr(df, "isStreaming", False)

# UDF that generates a flag_id, for example 'F001'
@udf(StringType())
def generate_flag_id(transaction_id):
    """Build a flag identifier from a transaction id, e.g. 1 -> 'F001'.

    The id is rendered with ``str`` and left-padded with zeros to at least
    three characters; longer ids are kept in full.

    Args:
        transaction_id: Transaction identifier of the current row.

    Returns:
        The identifier prefixed with 'F', or None when ``transaction_id``
        is None.
    """
    # A missing id used to be stringified into the bogus value 'FNone';
    # propagate the null instead.
    if transaction_id is None:
        return None
    return "F" + str(transaction_id).zfill(3)

# UDF that generates a segment_id, for example 'S001'
@udf(StringType())
def generate_segment_id(customer_id):
    """Build a segment identifier from a customer id, e.g. 1 -> 'S001'.

    The id is rendered with ``str`` and left-padded with zeros to at least
    three characters; longer ids are kept in full.

    Args:
        customer_id: Customer identifier of the current row.

    Returns:
        The identifier prefixed with 'S', or None when ``customer_id`` is
        None.
    """
    # A missing id used to be stringified into the bogus value 'SNone';
    # propagate the null instead.
    if customer_id is None:
        return None
    return "S" + str(customer_id).zfill(3)

# Transformation logic for the fraud_flags table
def create_fraud_flags(transactions_df):
    """Derive fraud-flag rows from a transactions DataFrame.

    A row is flagged by the first rule that matches, checked in this order:
    amount > 50000 -> 'unusual_amount' (score 0.9), channel == 'mobile' ->
    'velocity_check' (score 0.8), transaction_type == 'transfer' ->
    'pattern_anomaly' (score 0.7). Rows matching no rule are dropped.

    Args:
        transactions_df: DataFrame providing the columns amount, channel,
            transaction_type, transaction_id and timestamp.

    Returns:
        DataFrame with the columns flag_id, transaction_id, flag_type,
        confidence_score and timestamp.
    """
    # First matching rule wins; unmatched rows get a null flag_type.
    flag_type_expr = (
        when(col("amount") > 50000, "unusual_amount")
        .when(col("channel") == "mobile", "velocity_check")
        .when(col("transaction_type") == "transfer", "pattern_anomaly")
        .otherwise(lit(None))
    )

    # The 0.5 fallback only applies to unflagged rows, which are filtered
    # out further down.
    score_expr = (
        when(col("flag_type") == "unusual_amount", 0.9)
        .when(col("flag_type") == "velocity_check", 0.8)
        .when(col("flag_type") == "pattern_anomaly", 0.7)
        .otherwise(lit(0.5))
    )

    enriched = (
        transactions_df
        .withColumn("flag_type", flag_type_expr)
        .withColumn("confidence_score", score_expr)
        .withColumn("flag_id", generate_flag_id(col("transaction_id")))
    )

    flagged_only = enriched.filter(col("flag_type").isNotNull())
    return flagged_only.select(
        "flag_id", "transaction_id", "flag_type", "confidence_score", "timestamp"
    )

# Transformation logic for the customer_segments table
def create_customer_segments(transactions_df, customers_df):
    """Build the customer_segments DataFrame from transactions joined to customers.

    The transactions are joined to the customers on customer_id and exposed
    as the temp view ``joined_view``; five segment queries run against it and
    are unioned in this order:
      * 'High Value'  - SUM(amount) per customer > 100000
      * 'New User'    - join_date > current_date() - 30 days
      * 'Inactive'    - last_update < current_date() - 90 days
      * 'Credit Risk' - credit_score < 600
      * 'Loyal'       - DATEDIFF(current_date(), join_date) > 5 * 365

    Side effects: (re)registers the temp view ``joined_view``. The
    module-level ``spark`` session runs the SQL.

    NOTE(review): every segment except 'High Value' selects straight from the
    joined rows, so a customer may appear once per matching transaction -
    confirm this is intended.
    NOTE(review): segment_id is derived from customer_id only, so one
    customer gets the same segment_id in every segment - confirm.

    Args:
        transactions_df: Transactions with customer_id, amount and timestamp.
        customers_df: Customers with customer_id plus the columns the
            queries filter on (join_date, last_update, credit_score).

    Returns:
        DataFrame with customer_id, segment_name, segment_description,
        segment_id and last_updated (current timestamp).
    """
    # Parse the event time, declare a 1-day watermark, then attach the
    # customer attributes.
    event_time = to_timestamp(col("timestamp"))
    enriched = (
        transactions_df
        .withColumn("timestamp", event_time)
        .withWatermark("timestamp", "1 day")
        .join(customers_df, "customer_id")
    )

    # The SQL below refers to the joined data by this view name.
    enriched.createOrReplaceTempView("joined_view")

    high_value_sql = """
        SELECT customer_id, 'High Value' AS segment_name, 'Customers with high transaction volume' AS segment_description
        FROM (
            SELECT customer_id, SUM(amount) AS total_amount
            FROM joined_view
            GROUP BY customer_id
            HAVING SUM(amount) > 100000
        )
    """

    new_user_sql = """
        SELECT customer_id, 'New User' AS segment_name, 'Customers who joined in last 30 days' AS segment_description
        FROM joined_view
        WHERE join_date > current_date() - INTERVAL 30 DAYS
    """

    inactive_sql = """
        SELECT customer_id, 'Inactive' AS segment_name, 'No transactions in last 90 days' AS segment_description
        FROM joined_view
        WHERE last_update < current_date() - INTERVAL 90 DAYS
    """

    credit_risk_sql = """
        SELECT customer_id, 'Credit Risk' AS segment_name, 'Customers with low credit scores' AS segment_description
        FROM joined_view
        WHERE credit_score < 600
    """

    loyal_sql = """
        SELECT customer_id, 'Loyal' AS segment_name, 'Consistent activity for over 5 years' AS segment_description
        FROM joined_view
        WHERE DATEDIFF(current_date(), join_date) > 5 * 365
    """

    # Run every query first, then fold them together in the same order;
    # union() is positional and all queries select the same three columns.
    segment_sqls = (high_value_sql, new_user_sql, inactive_sql, credit_risk_sql, loyal_sql)
    segment_parts = [spark.sql(sql_text) for sql_text in segment_sqls]

    combined = segment_parts[0]
    for part in segment_parts[1:]:
        combined = combined.union(part)

    return (
        combined
        .withColumn("segment_id", generate_segment_id(col("customer_id")))
        .withColumn("last_updated", current_timestamp())
    )

# Function to ingest streaming data for fraud flags
def ingest_fraud_flags():
    """Start the streaming write of fraud flags into silver.fraud_flags.

    Reads silver.transactions as a stream, derives the fraud flags with
    ``create_fraud_flags`` and appends them to the Delta table
    silver.fraud_flags, triggering a micro-batch every 10 seconds. Prints
    whether the input and output DataFrames are streaming.

    Returns:
        The object returned by ``toTable`` (the started streaming query).
    """
    transactions_stream_df = spark.readStream.table("silver.transactions")

    # Print to check if df is streaming
    print("transactions df to fraud flags streaming:", is_streaming(transactions_stream_df))

    fraud_flags_stream_df = create_fraud_flags(transactions_stream_df)

    # Print whether the df is streaming
    print("Fraud Flags DataFrame is streaming:", is_streaming(fraud_flags_stream_df))

    # The chain is parenthesised on purpose: the previous version ended the
    # .trigger(...) line with a comment instead of a line continuation, which
    # cut the statement short and left .toTable(...) as a syntax error.
    # NOTE(review): the checkpoint location is a dotted name rather than an
    # obvious path - confirm where it resolves on the target file system.
    query = (
        fraud_flags_stream_df.writeStream
        .format("delta")
        .option("checkpointLocation", "silver.checkpoint.fraud_flag")
        .outputMode("append")
        .trigger(processingTime="10 seconds")  # micro-batch interval
        .toTable("silver.fraud_flags")
    )

    return query

# Function to ingest streaming data for customer segments
def ingest_customer_segments():
    """Start the streaming write of customer segments into silver.customer_segments.

    Reads silver.transactions as a stream and silver.customer as a static
    table, builds the segments with ``create_customer_segments`` and writes
    them to the Delta table silver.customer_segments in 'complete' output
    mode, triggering a micro-batch every 10 seconds. Prints whether each
    DataFrame involved is streaming.

    Returns:
        The object returned by ``toTable`` (the started streaming query).
    """
    transactions_stream_df = spark.readStream.table("silver.transactions")
    customers_df = spark.read.table("silver.customer")  # Static table

    # Print whether the df's are streaming
    print("transactions df for customer segments streaming:", is_streaming(transactions_stream_df))
    print("customers df is streaming:", is_streaming(customers_df))

    customer_segments_stream_df = create_customer_segments(transactions_stream_df, customers_df)

    # Print whether the df is streaming
    print("Customer Segments DataFrame is streaming:", is_streaming(customer_segments_stream_df))

    # For streaming aggregations, using 'Complete' mode.
    # The chain is parenthesised on purpose: the previous version ended the
    # .trigger(...) line with a comment instead of a line continuation, which
    # cut the statement short and left .toTable(...) as a syntax error.
    # NOTE(review): the checkpoint location is a dotted name rather than an
    # obvious path - confirm where it resolves on the target file system.
    query = (
        customer_segments_stream_df.writeStream
        .format("delta")
        .option("checkpointLocation", "silver.checkpoint.customer_segments")
        .outputMode("complete")
        .trigger(processingTime="10 seconds")  # micro-batch interval
        .toTable("silver.customer_segments")
    )

    return query

# Forget queries that terminated earlier in this session so the wait below
# only reacts to terminations that happen from here on.
spark.streams.resetTerminated()

# Start both streaming queries.
query_fraud_flags = ingest_fraud_flags()
query_customer_segments = ingest_customer_segments()

# Block until either query stops or fails. Awaiting the two queries one after
# the other would sit on the first one indefinitely and never surface a
# failure of the second; awaitAnyTermination re-raises the failure of any
# query in the session.
spark.streams.awaitAnyTermination()


transactions df to fraud flags streaming: True
Fraud Flags DataFrame is streaming: True
transactions df for customer segments streaming: True
customers df is streaming: False
Customer Segments DataFrame is streaming: True


##New working code with trigger

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when, current_timestamp, to_timestamp
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf


# Function for checking if DataFrame is streaming
def is_streaming(df):
    """Report whether ``df`` is a streaming DataFrame.

    Returns the object's ``isStreaming`` attribute, or ``False`` when the
    object does not expose that attribute.
    """
    return getattr(df, "isStreaming", False)

# UDF that generates a flag_id, for example 'F001'
@udf(StringType())
def generate_flag_id(transaction_id):
    """Build a flag identifier from a transaction id, e.g. 1 -> 'F001'.

    The id is rendered with ``str`` and left-padded with zeros to at least
    three characters; longer ids are kept in full.

    Args:
        transaction_id: Transaction identifier of the current row.

    Returns:
        The identifier prefixed with 'F', or None when ``transaction_id``
        is None.
    """
    # A missing id used to be stringified into the bogus value 'FNone';
    # propagate the null instead.
    if transaction_id is None:
        return None
    return "F" + str(transaction_id).zfill(3)

# UDF that generates a segment_id, for example 'S001'
@udf(StringType())
def generate_segment_id(customer_id):
    """Build a segment identifier from a customer id, e.g. 1 -> 'S001'.

    The id is rendered with ``str`` and left-padded with zeros to at least
    three characters; longer ids are kept in full.

    Args:
        customer_id: Customer identifier of the current row.

    Returns:
        The identifier prefixed with 'S', or None when ``customer_id`` is
        None.
    """
    # A missing id used to be stringified into the bogus value 'SNone';
    # propagate the null instead.
    if customer_id is None:
        return None
    return "S" + str(customer_id).zfill(3)

# Transformation logic for the fraud_flags table
def create_fraud_flags(transactions_df):
    """Derive fraud-flag rows from a transactions DataFrame.

    A row is flagged by the first rule that matches, checked in this order:
    amount > 50000 -> 'unusual_amount' (score 0.9), channel == 'mobile' ->
    'velocity_check' (score 0.8), transaction_type == 'transfer' ->
    'pattern_anomaly' (score 0.7). Rows matching no rule are dropped.

    Args:
        transactions_df: DataFrame providing the columns amount, channel,
            transaction_type, transaction_id and timestamp.

    Returns:
        DataFrame with the columns flag_id, transaction_id, flag_type,
        confidence_score and timestamp.
    """
    # First matching rule wins; unmatched rows get a null flag_type.
    flag_type_expr = (
        when(col("amount") > 50000, "unusual_amount")
        .when(col("channel") == "mobile", "velocity_check")
        .when(col("transaction_type") == "transfer", "pattern_anomaly")
        .otherwise(lit(None))
    )

    # The 0.5 fallback only applies to unflagged rows, which are filtered
    # out further down.
    score_expr = (
        when(col("flag_type") == "unusual_amount", 0.9)
        .when(col("flag_type") == "velocity_check", 0.8)
        .when(col("flag_type") == "pattern_anomaly", 0.7)
        .otherwise(lit(0.5))
    )

    enriched = (
        transactions_df
        .withColumn("flag_type", flag_type_expr)
        .withColumn("confidence_score", score_expr)
        .withColumn("flag_id", generate_flag_id(col("transaction_id")))
    )

    flagged_only = enriched.filter(col("flag_type").isNotNull())
    return flagged_only.select(
        "flag_id", "transaction_id", "flag_type", "confidence_score", "timestamp"
    )

# Transformation logic for the customer_segments table
def create_customer_segments(transactions_df, customers_df):
    """Build the customer_segments DataFrame from transactions joined to customers.

    The transactions are joined to the customers on customer_id and exposed
    as the temp view ``joined_view``; five segment queries run against it and
    are unioned in this order:
      * 'High Value'  - SUM(amount) per customer > 100000
      * 'New User'    - join_date > current_date() - 30 days
      * 'Inactive'    - last_update < current_date() - 90 days
      * 'Credit Risk' - credit_score < 600
      * 'Loyal'       - DATEDIFF(current_date(), join_date) > 5 * 365

    Side effects: (re)registers the temp view ``joined_view``. The
    module-level ``spark`` session runs the SQL.

    NOTE(review): every segment except 'High Value' selects straight from the
    joined rows, so a customer may appear once per matching transaction -
    confirm this is intended.
    NOTE(review): segment_id is derived from customer_id only, so one
    customer gets the same segment_id in every segment - confirm.

    Args:
        transactions_df: Transactions with customer_id, amount and timestamp.
        customers_df: Customers with customer_id plus the columns the
            queries filter on (join_date, last_update, credit_score).

    Returns:
        DataFrame with customer_id, segment_name, segment_description,
        segment_id and last_updated (current timestamp).
    """
    # Parse the event time, declare a 1-day watermark, then attach the
    # customer attributes.
    event_time = to_timestamp(col("timestamp"))
    enriched = (
        transactions_df
        .withColumn("timestamp", event_time)
        .withWatermark("timestamp", "1 day")
        .join(customers_df, "customer_id")
    )

    # The SQL below refers to the joined data by this view name.
    enriched.createOrReplaceTempView("joined_view")

    high_value_sql = """
        SELECT customer_id, 'High Value' AS segment_name, 'Customers with high transaction volume' AS segment_description
        FROM (
            SELECT customer_id, SUM(amount) AS total_amount
            FROM joined_view
            GROUP BY customer_id
            HAVING SUM(amount) > 100000
        )
    """

    new_user_sql = """
        SELECT customer_id, 'New User' AS segment_name, 'Customers who joined in last 30 days' AS segment_description
        FROM joined_view
        WHERE join_date > current_date() - INTERVAL 30 DAYS
    """

    inactive_sql = """
        SELECT customer_id, 'Inactive' AS segment_name, 'No transactions in last 90 days' AS segment_description
        FROM joined_view
        WHERE last_update < current_date() - INTERVAL 90 DAYS
    """

    credit_risk_sql = """
        SELECT customer_id, 'Credit Risk' AS segment_name, 'Customers with low credit scores' AS segment_description
        FROM joined_view
        WHERE credit_score < 600
    """

    loyal_sql = """
        SELECT customer_id, 'Loyal' AS segment_name, 'Consistent activity for over 5 years' AS segment_description
        FROM joined_view
        WHERE DATEDIFF(current_date(), join_date) > 5 * 365
    """

    # Run every query first, then fold them together in the same order;
    # union() is positional and all queries select the same three columns.
    segment_sqls = (high_value_sql, new_user_sql, inactive_sql, credit_risk_sql, loyal_sql)
    segment_parts = [spark.sql(sql_text) for sql_text in segment_sqls]

    combined = segment_parts[0]
    for part in segment_parts[1:]:
        combined = combined.union(part)

    return (
        combined
        .withColumn("segment_id", generate_segment_id(col("customer_id")))
        .withColumn("last_updated", current_timestamp())
    )

# Function to ingest streaming data for fraud flags
def ingest_fraud_flags():
    """Start the streaming write of fraud flags into silver.fraud_flags.

    Reads silver.transactions as a stream, derives the fraud flags with
    ``create_fraud_flags`` and appends them to the Delta table
    silver.fraud_flags, triggering a micro-batch every 10 seconds. Prints
    whether the input and output DataFrames are streaming.

    Returns:
        The object returned by ``toTable`` (the started streaming query).
    """
    source_stream = spark.readStream.table("silver.transactions")
    print("transactions df to fraud flags streaming:", is_streaming(source_stream))

    flagged_stream = create_fraud_flags(source_stream)
    print("Fraud Flags DataFrame is streaming:", is_streaming(flagged_stream))

    # NOTE(review): the checkpoint location is a dotted name rather than an
    # obvious path - confirm where it resolves on the target file system.
    writer = (
        flagged_stream.writeStream
        .format("delta")
        .option("checkpointLocation", "silver.checkpoint.fraud_flag")
        .outputMode("append")
        .trigger(processingTime="10 seconds")
    )
    return writer.toTable("silver.fraud_flags")

# Function to ingest streaming data for customer segments
def ingest_customer_segments():
    """Start the streaming write of customer segments into silver.customer_segments.

    Reads silver.transactions as a stream and silver.customer as a static
    table, builds the segments with ``create_customer_segments`` and writes
    them to the Delta table silver.customer_segments in 'complete' output
    mode, triggering a micro-batch every 10 seconds. Prints whether each
    DataFrame involved is streaming.

    Returns:
        The object returned by ``toTable`` (the started streaming query).
    """
    txn_stream = spark.readStream.table("silver.transactions")
    # Customers are read once as a static table, not as a stream.
    customer_table = spark.read.table("silver.customer")

    print("transactions df for customer segments streaming:", is_streaming(txn_stream))
    print("customers df is streaming:", is_streaming(customer_table))

    segments_stream = create_customer_segments(txn_stream, customer_table)
    print("Customer Segments DataFrame is streaming:", is_streaming(segments_stream))

    # The result contains a streaming aggregation, hence 'complete' mode.
    # NOTE(review): the checkpoint location is a dotted name rather than an
    # obvious path - confirm where it resolves on the target file system.
    writer = (
        segments_stream.writeStream
        .format("delta")
        .option("checkpointLocation", "silver.checkpoint.customer_segments")
        .outputMode("complete")
        .trigger(processingTime="10 seconds")
    )
    return writer.toTable("silver.customer_segments")

# Forget queries that terminated earlier in this session so the wait below
# only reacts to terminations that happen from here on.
spark.streams.resetTerminated()

# Start both streaming queries.
query_fraud_flags = ingest_fraud_flags()
query_customer_segments = ingest_customer_segments()

# Block until either query stops or fails. Awaiting the two queries one after
# the other would sit on the first one indefinitely and never surface a
# failure of the second; awaitAnyTermination re-raises the failure of any
# query in the session.
spark.streams.awaitAnyTermination()


transactions df to fraud flags streaming: True
Fraud Flags DataFrame is streaming: True
transactions df for customer segments streaming: True
customers df is streaming: False
Customer Segments DataFrame is streaming: True
