In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

def transform_users_silver(df_bronze):
    """
    Transform raw (bronze) user records into the deduplicated silver shape.

    Steps, in order:
      1. Rename every column to lower case with spaces replaced by underscores.
      2. Keep only rows whose ``user_id`` is non-null and whose ``email``
         contains an ``@``. Rows with a null email are dropped as well,
         because the predicate evaluates to null for them.
      3. Trim and lower-case ``email``; parse ``load_dt`` with ``to_timestamp``.
      4. Keep one row per ``user_id``: the one with the latest ``load_dt``.

    Args:
        df_bronze: Spark DataFrame which, after the renaming in step 1,
            exposes the columns ``user_id``, ``user_name``, ``email`` and
            ``load_dt``.

    Returns:
        DataFrame with exactly the columns ``user_id``, ``user_name``,
        ``email`` and ``load_dt``, holding at most one row per ``user_id``.
    """
    # 1. Standardize column headers to snake_case
    standardized_cols = [name.lower().replace(" ", "_") for name in df_bronze.columns]
    df_standardized = df_bronze.toDF(*standardized_cols)

    # 2. Quality gates: user_id must exist and email must contain an "@".
    #    This is a presence check only, not full e-mail validation.
    id_valid = F.col("user_id").isNotNull()
    email_valid = F.col("email").contains("@")

    # 3. Filter clean data
    df_clean = df_standardized.filter(id_valid & email_valid)

    # 4. Normalization: trim whitespace and force lower case so lookups on
    #    email are consistent; parse load_dt so it sorts chronologically.
    df_normalized = (
        df_clean
        .withColumn("email", F.lower(F.trim(F.col("email"))))
        .withColumn("load_dt", F.to_timestamp(F.col("load_dt")))
    )

    # 5. Deduplication: keep only the latest record per user_id.
    #    Ordering by load_dt alone leaves the winner among rows sharing the
    #    same load_dt up to the engine, so results could change between runs.
    #    The remaining output columns are added as tie-breakers: rows that are
    #    still tied after them project to the same output row, so the result
    #    is deterministic.
    window_spec = Window.partitionBy("user_id").orderBy(
        F.col("load_dt").desc(),
        F.col("user_name").asc(),
        F.col("email").asc(),
    )

    # The final select already discards the helper column, so no explicit
    # drop is needed.
    df_silver_final = (
        df_normalized
        .withColumn("row_rank", F.row_number().over(window_spec))
        .filter(F.col("row_rank") == 1)
        .select("user_id", "user_name", "email", "load_dt")
    )

    return df_silver_final

In [0]:
import unittest
import pandas as pd
import io
from pyspark.sql.types import *
from pyspark.testing import assertDataFrameEqual

class TestUsersSilver(unittest.TestCase):
    """Checks for ``transform_users_silver``, run against the notebook's ``spark`` session."""

    def test_email_normalization_and_deduplication(self):
        """Email is trimmed and lower-cased, and only the newest row per user_id survives."""
        # MOCK INPUT: two versions of the same user. The newer one has messy
        # casing and extra spaces; the older one must be removed by the
        # deduplication step.
        input_data = [
            (201, "Alice", "old.alice@work.com", "2026-01-01 09:00:00"),
            (201, "Alice", " ALICE@work.com ", "2026-01-02 10:00:00"),
        ]
        input_schema = StructType([
            StructField("user_id", LongType(), True),
            StructField("user_name", StringType(), True),
            StructField("email", StringType(), True),
            StructField("load_dt", StringType(), True)
        ])
        df_input = spark.createDataFrame(input_data, input_schema)

        # EXPECTED OUTPUT: only the newest row, with a cleaned email.
        # The timestamp is parsed by Spark from the same string the input
        # uses, so both sides are interpreted in the session time zone.
        # A tz-aware UTC value built on the Python side would only match
        # when the session time zone happens to be UTC.
        expected_data = [(201, "Alice", "alice@work.com", "2026-01-02 10:00:00")]
        df_expected = spark.createDataFrame(expected_data, input_schema).selectExpr(
            "user_id",
            "user_name",
            "email",
            "to_timestamp(load_dt) AS load_dt",
        )

        # ACT: Run the function
        df_actual = transform_users_silver(df_input)

        # ASSERT: fails if normalization or deduplication is not applied
        assertDataFrameEqual(df_actual, df_expected)

# Collect every test method of TestUsersSilver and run them with verbose,
# per-test output. The run() call stays last so the result object is the
# value of the cell.
runner = unittest.TextTestRunner(verbosity=2)
suite = unittest.TestLoader().loadTestsFromTestCase(TestUsersSilver)
runner.run(suite)