In [0]:
import pandas as pd
import datetime
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import *
from pyspark.testing import assertDataFrameEqual, assertSchemaEqual

# --- THE TRANSFORMATION FUNCTION ---
def transform_paymentmethods_silver(df_bronze):
    """
    Turn a Bronze payment-methods DataFrame into its Silver form.

    Steps, in order:
      1. Standardize column names: lower-case, strip surrounding whitespace,
         replace spaces and hyphens with underscores.
      2. Quality gate: drop rows whose ``method_id`` is NULL.
      3. Parse ``load_dt`` to a timestamp.
      4. Deduplicate: keep one row per ``method_id``, the one with the most
         recent ``load_dt``.
      5. Title-case ``method_name`` and project the Silver columns.

    Args:
        df_bronze: Spark DataFrame that, after name standardization, exposes
            the columns ``category``, ``method_id``, ``method_name``,
            ``load_dt`` and ``source``.

    Returns:
        Spark DataFrame with columns ``category``, ``method_id``,
        ``method_name``, ``load_dt`` (timestamp) and ``source``, holding at
        most one row per ``method_id``.
    """
    # 1. Standardize column names (snake_case)
    standardized_cols = [
        name.lower().strip().replace(" ", "_").replace("-", "_")
        for name in df_bronze.columns
    ]
    df_standardized = df_bronze.toDF(*standardized_cols)

    # 2. Quality Gate: filter records with missing method_id
    df_clean = df_standardized.filter(F.col("method_id").isNotNull())

    # 3. Parse load_dt BEFORE ranking. Ordering the raw string column would
    #    compare lexicographically, so values such as "2026-1-9 ..." and
    #    "2026-1-10 ..." would rank in the wrong order, and an unparseable
    #    string could outrank a valid timestamp only to become NULL afterwards.
    df_typed = df_clean.withColumn("load_dt", F.to_timestamp(F.col("load_dt")))

    # 4. Deduplication: keep the most recent record per method_id.
    #    Rows whose load_dt failed to parse (NULL) are ranked last.
    #    NOTE: rows sharing the same method_id and load_dt are tie-broken
    #    arbitrarily by row_number().
    window_spec = Window.partitionBy("method_id").orderBy(
        F.col("load_dt").desc_nulls_last()
    )

    df_silver_final = (
        df_typed
        .withColumn("row_rank", F.row_number().over(window_spec))
        .filter(F.col("row_rank") == 1)
        .drop("row_rank")
        .withColumn("method_name", F.initcap(F.col("method_name")))
        .select("category", "method_id", "method_name", "load_dt", "source")
    )

    return df_silver_final

In [0]:
import unittest
import io

class TestPaymentMethods(unittest.TestCase):
    """Unit tests for ``transform_paymentmethods_silver``.

    Relies on a ``spark`` session already being available in the notebook's
    global scope.
    """

    def setUp(self):
        """Define the Bronze input schema; ``load_dt`` is still a string here."""
        self.input_schema = StructType([
            StructField("category", StringType(), True),
            StructField("method_id", LongType(), True),
            StructField("method_name", StringType(), True),
            StructField("load_dt", StringType(), True),
            StructField("source", StringType(), True)
        ])

    def test_logic_correctness(self):
        """Check NULL-id filtering, deduplication, title-casing and timestamp typing."""
        # 1. CREATE MOCK DATA
        # Row 1 & 2 are duplicates; Row 3 has a NULL ID
        data = [
            ("card", 2, "credit card", "2026-01-01 10:00:00", "src"),
            ("card", 2, "credit card", "2026-01-02 10:00:00", "src"),
            ("cash", None, "cash", "2026-01-01 10:00:00", "src")
        ]
        df_input = spark.createDataFrame(data, self.input_schema)

        # 2. DEFINE EXPECTED OUTPUT
        expected_schema = StructType([
            StructField("category", StringType(), True),
            StructField("method_id", LongType(), True),
            StructField("method_name", StringType(), True),
            StructField("load_dt", TimestampType(), True),
            StructField("source", StringType(), True)
        ])

        # The expected load_dt is created as a string and parsed by Spark, so it
        # is interpreted in the session time zone exactly like the value inside
        # the transformation. A Python-side timestamp pinned to UTC would only
        # match when the Spark session itself runs in UTC.
        expected_data = [
            ("card", 2, "Credit Card", "2026-01-02 10:00:00", "src")
        ]
        df_expected = (
            spark.createDataFrame(expected_data, self.input_schema)
            .withColumn("load_dt", F.to_timestamp(F.col("load_dt")))
        )

        # 3. RUN TRANSFORMATION
        df_actual = transform_paymentmethods_silver(df_input)

        # 4. ASSERTIONS
        assertSchemaEqual(df_actual.schema, expected_schema)
        assertDataFrameEqual(df_actual, df_expected)

# --- EXECUTION AND REPORT GENERATION ---
# The runner writes its verbose log into an in-memory buffer so the log can be
# embedded in the summary printed below.
stream = io.StringIO()
suite = unittest.TestLoader().loadTestsFromTestCase(TestPaymentMethods)
runner = unittest.TextTestRunner(stream=stream, verbosity=2)
result = runner.run(suite)

# Print Final Report
# The leading and trailing empty entries reproduce the blank line before the
# banner and the newline after the closing rule.
status_label = "SUCCESS" if result.wasSuccessful() else "FAILED"
report_lines = [
    "",
    "=========================================",
    "UNIT TEST REPORT SUMMARY",
    "=========================================",
    f"Status: {status_label}",
    f"Tests Run: {result.testsRun}",
    f"Errors: {len(result.errors)}",
    f"Failures: {len(result.failures)}",
    "-----------------------------------------",
    "Detailed Logs:",
    stream.getvalue(),
    "=========================================",
    "",
]
print("\n".join(report_lines))