In [0]:
import pandas as pd
import datetime
import io
import unittest
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import *
from pyspark.testing import assertDataFrameEqual, assertSchemaEqual

def transform_menuitems_silver(df_bronze):
    """
    Turn the raw (bronze) menu-items DataFrame into its silver form.

    Steps, in order:
      1. Rename columns to lower case with spaces replaced by underscores.
      2. Cast ``price`` to double and ``load_dt`` to timestamp.
      3. Drop rows whose ``item_id`` is null or whose ``price`` is not > 0.
      4. Keep, per ``item_id``, the row with the latest ``load_dt``.
      5. Round ``price`` to 2 decimals and project the silver columns.

    Args:
        df_bronze: Spark DataFrame whose standardized column names include
            category, is_seasonal, item_id, item_name, price, load_dt, source.

    Returns:
        Spark DataFrame with columns (category, is_seasonal, item_id,
        item_name, price, load_dt, source) and at most one row per item_id.
    """
    # 1. Standardization
    standardized_cols = [col.lower().replace(" ", "_") for col in df_bronze.columns]
    df_standardized = df_bronze.toDF(*standardized_cols)

    # 2. Cast before validating or ordering. Ordering a string load_dt is
    #    lexicographic, so e.g. "2024-1-9 ..." would outrank "2024-01-10 ..."
    #    and the wrong (older) row would survive deduplication.
    df_typed = (
        df_standardized
        .withColumn("price", F.col("price").cast("double"))
        .withColumn("load_dt", F.to_timestamp(F.col("load_dt")))
    )

    # 3. Validation Mask (evaluated on the typed price column)
    valid_mask = F.col("item_id").isNotNull() & (F.col("price") > 0)

    # 4. Dedupe: newest load_dt first; rows whose load_dt is null sort last
    #    so they are only kept when no dated row exists for that item_id.
    window_spec = Window.partitionBy("item_id").orderBy(
        F.col("load_dt").desc_nulls_last()
    )

    df_silver_final = (
        df_typed.filter(valid_mask)
        .withColumn("row_rank", F.row_number().over(window_spec))
        .filter("row_rank == 1")
        .drop("row_rank")
        .withColumn("price", F.round(F.col("price"), 2))
        .select("category", "is_seasonal", "item_id", "item_name", "price", "load_dt", "source")
    )

    return df_silver_final

In [0]:
import pandas as pd
import datetime
import io
import unittest
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import *
from pyspark.testing import assertDataFrameEqual, assertSchemaEqual

# NOTE(review): this cell repeats the imports and the function definition of
# the preceding cell; being the later definition, this is the one in effect
# after a top-to-bottom run.
def transform_menuitems_silver(df_bronze):
    """
    Turn the raw (bronze) menu-items DataFrame into its silver form.

    Steps, in order:
      1. Rename columns to lower case with spaces replaced by underscores.
      2. Cast ``price`` to double and ``load_dt`` to timestamp.
      3. Drop rows whose ``item_id`` is null or whose ``price`` is not > 0.
      4. Keep, per ``item_id``, the row with the latest ``load_dt``.
      5. Round ``price`` to 2 decimals and project the silver columns.

    Args:
        df_bronze: Spark DataFrame whose standardized column names include
            category, is_seasonal, item_id, item_name, price, load_dt, source.

    Returns:
        Spark DataFrame with columns (category, is_seasonal, item_id,
        item_name, price, load_dt, source) and at most one row per item_id.
    """
    # 1. Standardization
    standardized_cols = [col.lower().replace(" ", "_") for col in df_bronze.columns]
    df_standardized = df_bronze.toDF(*standardized_cols)

    # 2. Cast before validating or ordering. Ordering a string load_dt is
    #    lexicographic, so e.g. "2024-1-9 ..." would outrank "2024-01-10 ..."
    #    and the wrong (older) row would survive deduplication.
    df_typed = (
        df_standardized
        .withColumn("price", F.col("price").cast("double"))
        .withColumn("load_dt", F.to_timestamp(F.col("load_dt")))
    )

    # 3. Validation Mask (evaluated on the typed price column)
    valid_mask = F.col("item_id").isNotNull() & (F.col("price") > 0)

    # 4. Dedupe: newest load_dt first; rows whose load_dt is null sort last
    #    so they are only kept when no dated row exists for that item_id.
    window_spec = Window.partitionBy("item_id").orderBy(
        F.col("load_dt").desc_nulls_last()
    )

    df_silver_final = (
        df_typed.filter(valid_mask)
        .withColumn("row_rank", F.row_number().over(window_spec))
        .filter("row_rank == 1")
        .drop("row_rank")
        .withColumn("price", F.round(F.col("price"), 2))
        .select("category", "is_seasonal", "item_id", "item_name", "price", "load_dt", "source")
    )

    return df_silver_final

In [0]:
class TestMenuitemsSilver(unittest.TestCase):
    """Unit tests for ``transform_menuitems_silver`` on small in-memory fixtures.

    Uses the ``spark`` session that the notebook environment provides as a
    global; it is not created here.
    """

    def setUp(self):
        """Pin the session time zone to UTC and build the mock-data input schema."""
        # The transformation parses zone-less load_dt strings in the Spark
        # session time zone, while the expected timestamp below is expressed
        # in UTC. Pinning the session to UTC keeps the comparison independent
        # of cluster configuration; the previous value is restored afterwards.
        session_tz_key = "spark.sql.session.timeZone"
        previous_tz = spark.conf.get(session_tz_key)
        spark.conf.set(session_tz_key, "UTC")
        self.addCleanup(spark.conf.set, session_tz_key, previous_tz)

        self.schema = StructType([
            StructField("category", StringType(), True),
            StructField("is_seasonal", BooleanType(), True),
            StructField("item_id", LongType(), True),
            StructField("item_name", StringType(), True),
            StructField("price", DoubleType(), True),
            StructField("load_dt", StringType(), True),
            StructField("source", StringType(), True)
        ])

    def test_deduplication_and_validation(self):
        """Only the newest valid row per item_id survives, with load_dt as a timestamp."""
        # 1. CREATE MOCK BRONZE DATA
        # Row 1: Valid
        # Row 2: Duplicate (older, should be dropped)
        # Row 3: Invalid price (should be filtered)
        # Row 4: Null ID (should be filtered)
        data = [
            ("Pizza", False, 101, "Margherita", 12.50, "2024-01-02 10:00:00", "POS"),
            ("Pizza", False, 101, "Margherita", 12.00, "2024-01-01 10:00:00", "POS"),
            ("Sides", True, 102, "Fries", -5.00, "2024-01-02 10:00:00", "Web"),
            ("Sides", True, None, "Invalid", 5.00, "2024-01-02 10:00:00", "Web")
        ]
        df_input = spark.createDataFrame(data, self.schema)

        # 2. DEFINE EXPECTED OUTPUT
        # Same columns as the input schema, except load_dt is a timestamp.
        expected_schema = StructType([
            StructField("category", StringType(), True),
            StructField("is_seasonal", BooleanType(), True),
            StructField("item_id", LongType(), True),
            StructField("item_name", StringType(), True),
            StructField("price", DoubleType(), True),
            StructField("load_dt", TimestampType(), True),
            StructField("source", StringType(), True)
        ])

        # Time-zone-aware UTC value; matches the parsed input string because
        # setUp pins the session time zone to UTC.
        expected_ts = pd.Timestamp("2024-01-02 10:00:00").tz_localize('UTC')

        expected_data = [
            ("Pizza", False, 101, "Margherita", 12.50, expected_ts, "POS")
        ]
        df_expected = spark.createDataFrame(expected_data, expected_schema)

        # 3. EXECUTE TRANSFORMATION
        df_actual = transform_menuitems_silver(df_input)

        # 4. ASSERTIONS
        # Test Schema: Checks if load_dt was successfully converted to TimestampType
        assertSchemaEqual(df_actual.schema, expected_schema)

        # Test Data: Checks if duplicates, invalid prices and null IDs were removed
        assertDataFrameEqual(df_actual, df_expected)

# --- EXECUTION AND REPORT GENERATION ---
# The runner writes its verbose per-test output into an in-memory buffer so
# that it can be embedded in the summary report printed below.
stream = io.StringIO()
suite = unittest.TestLoader().loadTestsFromTestCase(TestMenuitemsSilver)
runner = unittest.TextTestRunner(stream=stream, verbosity=2)
result = runner.run(suite)

# Assemble the report line by line; the leading and trailing empty entries
# produce the blank line before and the newline after the report body.
report_lines = [
    "",
    "=========================================",
    "UNIT TEST REPORT - " + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    "=========================================",
    "Tests Run: " + str(result.testsRun),
    "Errors: " + str(len(result.errors)),
    "Failures: " + str(len(result.failures)),
    "Outcome: " + ('SUCCESS' if result.wasSuccessful() else 'FAILED'),
    "-----------------------------------------",
    "Details:",
    stream.getvalue(),
    "=========================================",
    "",
]
print("\n".join(report_lines))