In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Log a banner so the run output shows where the Silver transform stage begins.
print("Starting Silver Layer ETL (Transform Phase)...")

In [0]:
# Load the Bronze-layer table as a (lazy) DataFrame; this is the sole input
# to the Silver transform.
bronze_df = spark.read.table("bronze_credit_transactions")

print("Bronze data loaded successfully")
# count() is an action: it runs a job over the Bronze table to get the row total.
print("Bronze record count:", bronze_df.count())

In [0]:
# Peek at the first Bronze row (a single Row, or None if the table is empty).
bronze_df.first()

In [0]:
# Build the Silver DataFrame from Bronze: de-duplicate, fill categorical
# NULLs, derive a DATE column, normalise the amount type, and stamp the run.
# Every step is a lazy transformation; nothing executes until an action
# (show / count / write) is called on silver_df.
silver_df = (
    bronze_df
    # Keep one row per transaction_id. Which of several duplicates survives
    # is not deterministic, so this assumes duplicates are exact copies.
    # NOTE(review): confirm "transaction_id" is the Bronze key column name.
    .dropDuplicates(["transaction_id"])

    # Replace NULLs in the categorical columns with a sentinel value
    # (a string fill only applies to string-typed columns).
    .fillna({
        "merchant": "UNKNOWN",
        "category": "UNKNOWN"
    })

    # Safe date handling: try_cast yields NULL instead of raising when the
    # source value cannot be converted. The original column is kept.
    .withColumn(
        "transaction_date",
        expr("try_cast(trans_date_trans_time as date)")
    )

    # Safe amount handling, consistent with the date column: a plain
    # cast() raises under ANSI mode on a malformed value and would abort
    # the whole run; try_cast turns that value into NULL instead.
    .withColumn(
        "amt",
        expr("try_cast(amt as double)")
    )

    # Record when this row was processed by the Silver transform.
    .withColumn(
        "processed_timestamp",
        current_timestamp()
    )
)


In [0]:
# List the column names of silver_df (bare last expression, so the notebook
# renders the resulting list).
silver_df.columns

In [0]:
# Preview ten rows, without truncating long column values.
print("Silver Data Sample")
silver_df.show(n=10, truncate=False)

# Show the resulting column names and types.
print("Silver Schema")
silver_df.printSchema()

# Count rows whose transaction_date is NULL, i.e. the source value was
# missing or could not be converted to a date.
print(f"Invalid date records: {silver_df.filter(silver_df['transaction_date'].isNull()).count()}")


In [0]:
# Peek at the first Silver row (a single Row, or None if there are no rows).
silver_df.first()

In [0]:
# Persist the Silver DataFrame as a Delta table; "overwrite" mode replaces
# whatever the table currently contains.
(
    silver_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("silver_credit_transactions")
)

print("Silver table written successfully!")


In [0]:
%sql
-- Inspect the contents of the Silver table.
SELECT *
FROM silver_credit_transactions;