In [1]:
%run nb_03_dim_date_wrangler

StatementMeta(, 4c11202f-d9a5-4c15-a382-be1ca2aec3fa, 3, Finished, Available, Finished)

In [2]:
import unittest
import datetime

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, DateType, ShortType
from pyspark.testing.utils import assertDataFrameEqual, assertSchemaEqual
from delta.tables import DeltaTable

class TestDimDateWrangler(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        
        cls.spark = SparkSession.builder.appName('dim_date_test').getOrCreate()
        cls.delta_table_name = 'dim_date_test'
        DimDateWrangler.create_delta_table(cls.spark, cls.delta_table_name)

        cls.spark.sql(f'DELETE FROM {cls.delta_table_name}')


    def expected_schema(self):

        return StructType([
            StructField('ReportedDate', DateType()),
            StructField('Month', ShortType()),
            StructField('Quarter', ShortType()),
            StructField('Year', ShortType()),
        ])


    def test_extract_silver_df(self):

        from pyspark.sql import Row

        sample_data = [
            Row(ReportedDate=datetime.date(2023, 12, 5)),
            Row(ReportedDate=datetime.date(2023, 12, 5)),  # duplicate
        ]

        schema = StructType([StructField('ReportedDate', DateType())])
        df = spark.createDataFrame(sample_data, schema)
        result_df = DimDateWrangler.extract_silver_df(df)

        assert result_df.count() == 1, "One record should be inserted"

        row = result_df.first()
        assert row['ReportedDate'] == datetime.date(2023, 12, 5)
        assert row['Month'] == 12
        assert row['Quarter'] == 4
        assert row['Year'] == 2023


    def test_create_delta_table_schema(self):

        table_schema = spark.table(self.delta_table_name).schema
        assertSchemaEqual(table_schema, self.expected_schema())


    def test_upsert_delta_table_insert(self):

        sample_data = [
            (datetime.date(2023, 12, 5), 12, 4, 2023),
        ]

        schema = self.expected_schema()
        df = spark.createDataFrame(sample_data, schema)
        delta_table = DeltaTable.forName(spark, self.delta_table_name)
        DimDateWrangler.upsert_delta_table(delta_table, df)

        result_df = spark.sql(f"SELECT * FROM {self.delta_table_name}")
        assert result_df.count() == 1, "One record should be inserted"

        row = result_df.first()
        assert row['ReportedDate'] == datetime.date(2023, 12, 5)
        assert row['Month'] == 12
        assert row['Quarter'] == 4
        assert row['Year'] == 2023

        spark.sql(f'DELETE FROM {self.delta_table_name}')


    def test_upsert_delta_table_no_update_on_match(self):

        sample_data = [
            (datetime.date(2023, 12, 5), 12, 4, 2023),
        ]

        schema = self.expected_schema()
        df = spark.createDataFrame(sample_data, schema)
        delta_table = DeltaTable.forName(spark, self.delta_table_name)
        DimDateWrangler.upsert_delta_table(delta_table, df)

        # Upsert again with the same data
        DimDateWrangler.upsert_delta_table(delta_table, df)
        result_df = spark.sql(f"SELECT * FROM {self.delta_table_name}")

        assert result_df.count() == 1, "No duplicate should be inserted"

        spark.sql(f'DELETE FROM {self.delta_table_name}')


    def test_upsert_delta_table_insert_on_match(self):

        sample_data = [
            (datetime.date(2023, 12, 5), 12, 4, 2023),
            (datetime.date(2024, 1, 15), 1, 1, 2024),
        ]

        schema = self.expected_schema()
        df = spark.createDataFrame([sample_data[0]], schema)
        delta_table = DeltaTable.forName(spark, self.delta_table_name)

        DimDateWrangler.upsert_delta_table(delta_table, df)
        result_df = spark.sql(f"SELECT * FROM {self.delta_table_name}")

        assert result_df.count() == 1, "One record should be inserted"

        # Upsert again with new data
        df = spark.createDataFrame([sample_data[1]], schema)
        DimDateWrangler.upsert_delta_table(delta_table, df)
        result_df = spark.sql(f"SELECT * FROM {self.delta_table_name}")

        assert result_df.count() == 2, "No duplicate should be inserted"

        spark.sql(f'DELETE FROM {self.delta_table_name}')


    def test_upsert_delta_table_insert_multiple(self):
        # Insert multiple new rows
        sample_data = [
            (datetime.date(2023, 12, 5), 12, 4, 2023),
            (datetime.date(2024, 1, 15), 1, 1, 2024),
        ]
        schema = self.expected_schema()
        df = spark.createDataFrame(sample_data, schema)
        delta_table = DeltaTable.forName(self.spark, self.delta_table_name)
        DimDateWrangler.upsert_delta_table(delta_table, df)

        result_df = self.spark.sql(f"SELECT * FROM {self.delta_table_name}")
        assert result_df.count() == 2, f"Two records should be present {result_df.count()}"

        spark.sql(f'DELETE FROM {self.delta_table_name}')


    @classmethod
    def tearDownClass(cls):

        cls.spark.sql(f"DROP TABLE IF EXISTS {cls.delta_table_name}")
        cls.spark.stop()

StatementMeta(, 4c11202f-d9a5-4c15-a382-be1ca2aec3fa, 4, Finished, Available, Finished)



In [3]:
# The tests in this notebook are driven by hand rather than by a unittest
# runner: build one TestCase instance and run the class-level fixture once.
test_case = TestDimDateWrangler()
TestDimDateWrangler.setUpClass()

StatementMeta(, 4c11202f-d9a5-4c15-a382-be1ca2aec3fa, 5, Finished, Available, Finished)

In [4]:
# Check extract_silver_df on an input that contains the same ReportedDate twice.
test_case.test_extract_silver_df()

StatementMeta(, 4c11202f-d9a5-4c15-a382-be1ca2aec3fa, 6, Finished, Available, Finished)

In [5]:
# Check the schema of the table created in setUpClass against expected_schema().
test_case.test_create_delta_table_schema()

StatementMeta(, 4c11202f-d9a5-4c15-a382-be1ca2aec3fa, 7, Finished, Available, Finished)

In [6]:
# Check that upserting a single row into the empty table stores that row.
test_case.test_upsert_delta_table_insert()

StatementMeta(, 4c11202f-d9a5-4c15-a382-be1ca2aec3fa, 8, Finished, Available, Finished)

In [7]:
# Check that upserting the same row twice leaves a single record in the table.
test_case.test_upsert_delta_table_no_update_on_match()

StatementMeta(, 4c11202f-d9a5-4c15-a382-be1ca2aec3fa, 9, Finished, Available, Finished)

In [8]:
# Check that a second upsert carrying a different date adds a second record.
test_case.test_upsert_delta_table_insert_on_match()

StatementMeta(, 4c11202f-d9a5-4c15-a382-be1ca2aec3fa, 10, Finished, Available, Finished)

In [9]:
# Check that one upsert call carrying two dates stores both records.
test_case.test_upsert_delta_table_insert_multiple()

StatementMeta(, 4c11202f-d9a5-4c15-a382-be1ca2aec3fa, 11, Finished, Available, Finished)

In [10]:
# Drop the scratch table and stop the Spark session held by the test class.
# Run this last: the test methods use that session and will not work afterwards.
TestDimDateWrangler.tearDownClass()

StatementMeta(, 4c11202f-d9a5-4c15-a382-be1ca2aec3fa, 12, Finished, Available, Finished)