In [1]:
%run nb_03_dim_desc_wrangler

StatementMeta(, 1fcfb7de-78ab-4869-a8be-ac4a5ade6b50, 3, Finished, Available, Finished)

In [2]:
import unittest
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import col
from pyspark.testing.utils import assertDataFrameEqual, assertSchemaEqual
from delta.tables import DeltaTable

class TestDimDescWrangler(unittest.TestCase):
    """Integration tests for ``DimDescWrangler`` run against a scratch Delta table.

    All tests share one Spark session and one Delta table, both created in
    ``setUpClass``. Every test that reads or writes the table empties it first,
    so each test passes regardless of execution order and regardless of whether
    it is driven by a unittest runner or called directly on an instance.
    """

    # Names of the three description columns that precede DescID in the table.
    _LEVEL_COLUMNS = ('LevelOneDesc', 'LevelTwoDesc', 'LevelThreeDesc')

    @classmethod
    def setUpClass(cls):
        """Create the Spark session and an empty scratch Delta table."""
        cls.spark = SparkSession.builder.appName('dim_desc_test').getOrCreate()
        cls.delta_table_name = 'dim_desc_test'
        DimDescWrangler.create_delta_table(cls.spark, cls.delta_table_name)
        # Start from an empty table even if a previous run left rows behind.
        cls.spark.sql(f'DELETE FROM {cls.delta_table_name}')

    @classmethod
    def tearDownClass(cls):
        """Drop the scratch table and stop the Spark session."""
        cls.spark.sql(f'DROP TABLE IF EXISTS {cls.delta_table_name}')
        # NOTE(review): getOrCreate() may have returned a session shared with
        # the rest of the notebook; stop() would end it for later cells too.
        # Confirm this is intended.
        cls.spark.stop()

    # ------------------------------------------------------------------ helpers

    def _reset_table(self):
        """Delete every row from the scratch table so a test starts clean."""
        self.spark.sql(f'DELETE FROM {self.delta_table_name}')

    def _levels_schema(self):
        """Return the schema of the three description columns (no DescID)."""
        return StructType(
            [StructField(name, StringType()) for name in self._LEVEL_COLUMNS]
        )

    def _table_df(self):
        """Return the current contents of the scratch table as a DataFrame."""
        return self.spark.sql(f'SELECT * FROM {self.delta_table_name}')

    def _upsert(self, rows):
        """Upsert ``rows`` (tuples matching ``expected_schema``) into the table."""
        df = self.spark.createDataFrame(rows, self.expected_schema())
        delta_table = DeltaTable.forName(self.spark, self.delta_table_name)
        DimDescWrangler.upsert_delta_table(delta_table, df)

    def expected_schema(self):
        """Return the schema the Delta table is expected to have."""
        return StructType([
            StructField('LevelOneDesc', StringType()),
            StructField('LevelTwoDesc', StringType()),
            StructField('LevelThreeDesc', StringType()),
            StructField('DescID', IntegerType()),
        ])

    # -------------------------------------------------------------------- tests

    def test_extract_silver_df(self):
        """extract_silver_df should return one row per distinct description triple."""
        sample_data = [
            Row(LevelOneDesc='A', LevelTwoDesc='B', LevelThreeDesc='C'),
            Row(LevelOneDesc='A', LevelTwoDesc='B', LevelThreeDesc='C'),  # duplicate
            Row(LevelOneDesc='X', LevelTwoDesc='Y', LevelThreeDesc='Z'),
        ]
        schema = self._levels_schema()

        df = self.spark.createDataFrame(sample_data, schema)
        result_df = DimDescWrangler.extract_silver_df(df)

        expected_data = [
            ('A', 'B', 'C'),
            ('X', 'Y', 'Z')
        ]
        expected_df = self.spark.createDataFrame(expected_data, schema)
        # Sort both sides on the same column before comparing.
        assertDataFrameEqual(
            result_df.orderBy('LevelOneDesc'),
            expected_df.orderBy('LevelOneDesc'),
        )

    def test_create_delta_table_schema(self):
        """The table created in setUpClass should match ``expected_schema``."""
        table_schema = self.spark.table(self.delta_table_name).schema
        assertSchemaEqual(table_schema, self.expected_schema())

    def test_setup_id_insert(self):
        """With an empty table, setup_id should assign unique DescIDs starting at 1."""
        self._reset_table()

        sample_data = [
            ('A', 'B', 'C'),
            ('X', 'Y', 'Z')
        ]
        df = self.spark.createDataFrame(sample_data, self._levels_schema())
        upsert_df = DimDescWrangler.setup_id(self.spark, self.delta_table_name, df)

        # DescID should be unique and start from 1
        desc_ids = [row['DescID'] for row in upsert_df.collect()]
        self.assertEqual(min(desc_ids), 1)
        self.assertEqual(len(set(desc_ids)), 2)

    def test_setup_id_increment(self):
        """With DescID 1 already stored, the next new row should receive DescID 2."""
        self._reset_table()

        # Seed the table with a single row holding DescID 1.
        self._upsert([('A', 'B', 'C', 1)])

        # Now ask for an ID for a row that is not in the table yet.
        df_new = self.spark.createDataFrame([('X', 'Y', 'Z')], self._levels_schema())
        upsert_df = DimDescWrangler.setup_id(self.spark, self.delta_table_name, df_new)

        desc_id = upsert_df.first()['DescID']
        self.assertEqual(desc_id, 2, f'Expected DescID 2, got {desc_id}')

    def test_upsert_delta_table_insert(self):
        """Upserting one row into an empty table should store exactly that row."""
        self._reset_table()

        self._upsert([('A', 'B', 'C', 1)])

        result_df = self._table_df()
        self.assertEqual(result_df.count(), 1)
        row = result_df.first()
        self.assertEqual(row['LevelOneDesc'], 'A')
        self.assertEqual(row['DescID'], 1)

    def test_upsert_delta_table_no_duplicate(self):
        """Upserting the same row twice should leave a single row in the table."""
        self._reset_table()

        rows = [('A', 'B', 'C', 1)]
        self._upsert(rows)
        self._upsert(rows)

        self.assertEqual(self._table_df().count(), 1)

    def test_upsert_delta_table_multiple(self):
        """Upserting two distinct rows should store both of them."""
        self._reset_table()

        self._upsert([
            ('A', 'B', 'C', 1),
            ('X', 'Y', 'Z', 2)
        ])

        self.assertEqual(self._table_df().count(), 2)

StatementMeta(, 1fcfb7de-78ab-4869-a8be-ac4a5ade6b50, 4, Finished, Available, Finished)



In [3]:
# Build a test-case instance and run the class-level fixture by hand: the cells
# below call the test methods directly instead of going through a unittest
# runner, so nothing else would invoke setUpClass.
test_case = TestDimDescWrangler()
TestDimDescWrangler.setUpClass()

StatementMeta(, 1fcfb7de-78ab-4869-a8be-ac4a5ade6b50, 5, Finished, Available, Finished)

In [4]:
# Check extract_silver_df against sample data containing a duplicate row;
# a mismatch raises from assertDataFrameEqual.
test_case.test_extract_silver_df()

StatementMeta(, 1fcfb7de-78ab-4869-a8be-ac4a5ade6b50, 6, Finished, Available, Finished)

In [5]:
# Compare the schema of the table created in setUpClass with expected_schema().
test_case.test_create_delta_table_schema()

StatementMeta(, 1fcfb7de-78ab-4869-a8be-ac4a5ade6b50, 7, Finished, Available, Finished)

In [6]:
# Check that setup_id hands out two distinct DescIDs whose minimum is 1.
test_case.test_setup_id_insert()

StatementMeta(, 1fcfb7de-78ab-4869-a8be-ac4a5ade6b50, 8, Finished, Available, Finished)

In [7]:
# Check that, after a row with DescID 1 is upserted, a new row gets DescID 2.
test_case.test_setup_id_increment()

StatementMeta(, 1fcfb7de-78ab-4869-a8be-ac4a5ade6b50, 9, Finished, Available, Finished)

In [8]:
# Check that upserting a single row results in exactly that row in the table.
test_case.test_upsert_delta_table_insert()

StatementMeta(, 1fcfb7de-78ab-4869-a8be-ac4a5ade6b50, 10, Finished, Available, Finished)

In [9]:
# Check that upserting the same row twice still leaves one row in the table.
test_case.test_upsert_delta_table_no_duplicate()

StatementMeta(, 1fcfb7de-78ab-4869-a8be-ac4a5ade6b50, 11, Finished, Available, Finished)

In [10]:
# Check that upserting two distinct rows results in two rows in the table.
test_case.test_upsert_delta_table_multiple()

StatementMeta(, 1fcfb7de-78ab-4869-a8be-ac4a5ade6b50, 12, Finished, Available, Finished)

In [11]:
# Class-level cleanup, called by hand: drops the test table and stops the Spark
# session held on the class.
TestDimDescWrangler.tearDownClass()

StatementMeta(, 1fcfb7de-78ab-4869-a8be-ac4a5ade6b50, 13, Finished, Available, Finished)