In [1]:
%run nb_02_sa_crime_record_wrangler

StatementMeta(, 08fbe4f6-c3e6-41ed-9265-576bd3bf7808, 3, Finished, Available, Finished)

In [2]:
import unittest
import datetime
from pyspark.testing.utils import assertDataFrameEqual, assertSchemaEqual

class TestSACrimeRecordWrangler(unittest.TestCase):

    @classmethod
    def setUpClass(cls):

        cls.spark = SparkSession.builder.appName('crime_record_silver_test').getOrCreate()
        cls.delta_table_name = 'crime_record_silver_test'
        cls.delta_table = SACrimeRecordWrangler.create_crime_records_silver_table(cls.spark, cls.delta_table_name)

        spark.sql(f'DELETE FROM {cls.delta_table_name}')


    def expected_schema(self):
        
        return StructType([
            StructField('ReportedDate', DateType()),
            StructField('Suburb', StringType()),
            StructField('Postcode', ShortType()),
            StructField('LevelOneDesc', StringType()),
            StructField('LevelTwoDesc', StringType()),
            StructField('LevelThreeDesc', StringType()),
            StructField('Count', IntegerType()),
        ])


    def test_remove_all_na(self):

        sample_data = [
            {'ReportedDate': None, 'Suburb': None},
            {'ReportedDate': '',   'Suburb': ''}
        ]
        schema = SACrimeRecordWrangler.create_init_schema()

        original_df = spark.createDataFrame(sample_data, schema)
        assert original_df.count() == 2, 'two rows inserted'

        transformed_df = SACrimeRecordWrangler.remove_all_na(original_df)
        expected_df = spark.createDataFrame(sample_data[-1:], schema)
        
        assertDataFrameEqual(transformed_df, expected_df)


    def test_reported_date_str_to_date(self):

        sample_data = [{'ReportedDate': '5/12/2023'}]
        schema = SACrimeRecordWrangler.create_init_schema()

        original_df = spark.createDataFrame(sample_data, schema)
        assert original_df.count() == 1, 'one rows inserted'

        transformed_df = SACrimeRecordWrangler.reported_date_str_to_date(original_df)
        
        fetched_value = transformed_df.dtypes[0]
        assert fetched_value == ('ReportedDate', 'date'), f"date type is expected, but returned with '{fetched_value}'"


    def test_postcode_str_to_short(self):

        sample_data = [{'Postcode': '2233'}]
        schema = SACrimeRecordWrangler.create_init_schema()

        original_df = spark.createDataFrame(sample_data, schema)
        assert original_df.count() == 1, 'one rows inserted'

        transformed_df = SACrimeRecordWrangler.postcode_str_to_short(original_df)
        fetched_value = transformed_df.first()['Postcode']
        assert fetched_value == 2233, f"expected integer 2233, but returned with '{fetched_value}'"


    def test_suburb_fill_null(self):

        sample_data = [{'Suburb': None}]
        schema = SACrimeRecordWrangler.create_init_schema()

        original_df = spark.createDataFrame(sample_data, schema)
        assert original_df.count() == 1, 'one rows inserted'

        transformed_df = SACrimeRecordWrangler.suburb_fill_null_empty(original_df)
        fetched_value = transformed_df.first()['Suburb']
        assert fetched_value == 'N/A', f"expected 'N/A', but returned with '{fetched_value}'"


    def test_suburb_fill_empty(self):

        sample_data = [{'Suburb': ''}]
        schema = SACrimeRecordWrangler.create_init_schema()

        original_df = spark.createDataFrame(sample_data, schema)
        assert original_df.count() == 1, 'one rows inserted'

        transformed_df = SACrimeRecordWrangler.suburb_fill_null_empty(original_df)
        fetched_value = transformed_df.first()['Suburb']
        assert fetched_value == 'N/A', f"expected 'N/A', but returned with '{fetched_value}'"


    def test_postcode_fill_null(self):

        sample_data = [{'Postcode': None}]
        schema = SACrimeRecordWrangler.create_init_schema()

        original_df = spark.createDataFrame(sample_data, schema)
        assert original_df.count() == 1, 'one rows inserted'

        transformed_df = SACrimeRecordWrangler.postcode_str_to_short(original_df)
        transformed_df = SACrimeRecordWrangler.postcode_fill_null(transformed_df)
        fetched_value = transformed_df.first()['Postcode']
        assert fetched_value == 0, f"expected integer 0, but returned with '{fetched_value}'"


    def test_cleanse_df(self):

        sample_data = [
            {'ReportedDate': None, 'Suburb': None},
            {'ReportedDate': '5/12/2023', 'Suburb': '', 'Postcode': None}
        ]
        schema = SACrimeRecordWrangler.create_init_schema()

        original_df = spark.createDataFrame(sample_data, schema)
        assert original_df.count() == 2, 'two rows inserted'

        cleansed_df = SACrimeRecordWrangler.cleanse_df(original_df)
        assert cleansed_df.count() == 1, 'one row left'
        assertSchemaEqual(cleansed_df.schema, self.expected_schema())

        reported_date = cleansed_df.dtypes[0]
        assert reported_date == ('ReportedDate', 'date'), f"date type is expected, but returned with '{reported_date}'"

        postcode = cleansed_df.first()['Postcode']
        assert postcode == 0, f"expected integer 0, but returned with '{postcode}'"

        suburb = cleansed_df.first()['Suburb']
        assert suburb == 'N/A', f"expected 'N/A', but returned with '{suburb}'"


    def test_insert_empty_delta_table(self):

        sample_data = [
            {'ReportedDate': '5/12/2023', 'Suburb': '', 'Postcode': None}
        ]

        schema = SACrimeRecordWrangler.create_init_schema()

        original_df = spark.createDataFrame(sample_data, schema)
        cleansed_df = SACrimeRecordWrangler.cleanse_df(original_df)

        delta_table = DeltaTable.forName(spark, self.delta_table_name)
        SACrimeRecordWrangler.upsert_delta_table(delta_table, cleansed_df)

        fetched_df = spark.sql(f'SELECT * FROM {self.delta_table_name}')

        assert fetched_df.count() == 1, 'one record is inserted'

        updated_ts = fetched_df.first()['UpdatedTS']
        assert updated_ts is not None, f"updated_ts is not None"
        assert isinstance(updated_ts, datetime.datetime), f"datetime is expected, but returned with '{type(updated_ts)}'"

        reported_date = fetched_df.dtypes[0]
        assert reported_date == ('ReportedDate', 'date'), f"date type is expected, but returned with '{reported_date}'"

        postcode = fetched_df.first()['Postcode']
        assert postcode == 0, f"expected integer , but returned with '{postcode}'"

        suburb = fetched_df.first()['Suburb']
        assert suburb == 'N/A', f"expected 'N/A', but returned with '{suburb}'"

        spark.sql(f'DELETE FROM {self.delta_table_name}')


    def test_insert_non_empty_delta_table(self):
        
        schema = SACrimeRecordWrangler.create_init_schema()

        sample_data = [
            {
                'ReportedDate': '5/12/2023',
                'Suburb': 's',
                'Postcode': '12',
                'LevelOneDesc': 'l1',
                'LevelTwoDesc': 'l2',
                'LevelThreeDesc': 'l3',
                'Count': 5
            }
        ]
        original_df = spark.createDataFrame(sample_data, schema)
        original_df = SACrimeRecordWrangler.cleanse_df(original_df)

        delta_table = DeltaTable.forName(spark, self.delta_table_name)
        SACrimeRecordWrangler.upsert_delta_table(delta_table, original_df)

        fetched_df = spark.sql(f'SELECT * FROM {self.delta_table_name}')
        assert fetched_df.count() == 1, f'one record is inserted, but returned {fetched_df.count()}'

        original_row = fetched_df.first()
        assert original_row['Suburb'] == 's', f"expected 's', but returned with '{original_row}'"

        insert_data = [
            {
                'ReportedDate': '5/12/2023',
                'Suburb': 'u',
                'Postcode': '12',
                'LevelOneDesc': 'l1',
                'LevelTwoDesc': 'l2',
                'LevelThreeDesc': 'l3',
                'Count': 7
            }
        ]

        insert_df = spark.createDataFrame(insert_data, schema)
        insert_df = SACrimeRecordWrangler.cleanse_df(insert_df)

        SACrimeRecordWrangler.upsert_delta_table(delta_table, insert_df)

        insert_df = spark.sql(f'SELECT * FROM {self.delta_table_name}')
        assert insert_df.count() == 2, f'one record is updated, but returned {insert_df.count()}'

        insert_row = insert_df.sort('Suburb', ascending=False).first()
        assert insert_row['Suburb'] == 'u', f"expected 'u', but returned with '{insert_row}'"

        spark.sql(f'DELETE FROM {self.delta_table_name}')


    def test_upsert_delta_table(self):
        
        schema = SACrimeRecordWrangler.create_init_schema()

        sample_data = [
            {
                'ReportedDate': '5/12/2023',
                'Suburb': 's',
                'Postcode': '12',
                'LevelOneDesc': 'l1',
                'LevelTwoDesc': 'l2',
                'LevelThreeDesc': 'l3',
                'Count': 5
            }
        ]
        inserted_df = spark.createDataFrame(sample_data, schema)
        inserted_df = SACrimeRecordWrangler.cleanse_df(inserted_df)

        delta_table = DeltaTable.forName(spark, self.delta_table_name)
        SACrimeRecordWrangler.upsert_delta_table(delta_table, inserted_df)

        fetched_df = spark.sql(f'SELECT * FROM {self.delta_table_name}')
        assert fetched_df.count() == 1, f'one record is inserted, but returned {fetched_df.count()}'

        inserted_row = fetched_df.first()
        assert inserted_row['Count'] == 5, f"expected integer 5, but returned with '{inserted_row}'"

        upsert_data = [
            {
                'ReportedDate': '5/12/2023',
                'Suburb': 's',
                'Postcode': '12',
                'LevelOneDesc': 'l1',
                'LevelTwoDesc': 'l2',
                'LevelThreeDesc': 'l3',
                'Count': 7
            }
        ]

        upsert_df = spark.createDataFrame(upsert_data, schema)
        upsert_df = SACrimeRecordWrangler.cleanse_df(upsert_df)

        SACrimeRecordWrangler.upsert_delta_table(delta_table, upsert_df)

        upserted_df = spark.sql(f'SELECT * FROM {self.delta_table_name}')
        assert upserted_df.count() == 1, f'one record is updated, but returned {upserted_df.count()}'

        upserted_row = upserted_df.first()
        assert upserted_row['Count'] == 7, f"expected integer 7, but returned with '{upserted_row}'"
        assert upserted_row['UpdatedTS'] > inserted_row['UpdatedTS'], f"TS was updated, but returned with '{upserted_row}'"

        spark.sql(f'DELETE FROM {self.delta_table_name}')


    @classmethod
    def tearDownClass(cls):
        cls.spark.sql(f'DROP TABLE IF EXISTS {cls.delta_table_name}')
        cls.spark.stop()

StatementMeta(, 08fbe4f6-c3e6-41ed-9265-576bd3bf7808, 4, Finished, Available, Finished)



In [3]:
# Build one test-case instance and run the class-level fixture by hand.
# The test methods are then called directly in the following cells rather than
# through a unittest runner, so unittest itself never invokes the fixtures.
test_case = TestSACrimeRecordWrangler()
TestSACrimeRecordWrangler.setUpClass()

StatementMeta(, 08fbe4f6-c3e6-41ed-9265-576bd3bf7808, 5, Finished, Available, Finished)

In [4]:
# Direct call of the remove_all_na test; a failed assertion raises in this cell.
test_case.test_remove_all_na()

StatementMeta(, 08fbe4f6-c3e6-41ed-9265-576bd3bf7808, 6, Finished, Available, Finished)

In [5]:
# Direct call of the ReportedDate string-to-date test.
test_case.test_reported_date_str_to_date()

StatementMeta(, 08fbe4f6-c3e6-41ed-9265-576bd3bf7808, 7, Finished, Available, Finished)

In [6]:
# Direct call of the Postcode string-to-short test.
test_case.test_postcode_str_to_short()

StatementMeta(, 08fbe4f6-c3e6-41ed-9265-576bd3bf7808, 8, Finished, Available, Finished)

In [7]:
# Direct call of the test for a null Suburb.
test_case.test_suburb_fill_null()

StatementMeta(, 08fbe4f6-c3e6-41ed-9265-576bd3bf7808, 9, Finished, Available, Finished)

In [8]:
# Direct call of the test for an empty-string Suburb.
test_case.test_suburb_fill_empty()

StatementMeta(, 08fbe4f6-c3e6-41ed-9265-576bd3bf7808, 10, Finished, Available, Finished)

In [9]:
# Direct call of the test for a null Postcode.
test_case.test_postcode_fill_null()

StatementMeta(, 08fbe4f6-c3e6-41ed-9265-576bd3bf7808, 11, Finished, Available, Finished)

In [10]:
# Direct call of the cleanse_df test.
test_case.test_cleanse_df()

StatementMeta(, 08fbe4f6-c3e6-41ed-9265-576bd3bf7808, 12, Finished, Available, Finished)

In [11]:
# Direct call of the first test that writes to the Delta table named in setUpClass.
test_case.test_insert_empty_delta_table()

StatementMeta(, 08fbe4f6-c3e6-41ed-9265-576bd3bf7808, 13, Finished, Available, Finished)

In [12]:
# Direct call of the test that upserts a second, non-matching record into the Delta table.
test_case.test_insert_non_empty_delta_table()

StatementMeta(, 08fbe4f6-c3e6-41ed-9265-576bd3bf7808, 14, Finished, Available, Finished)

In [13]:
# Direct call of the test that upserts a matching record into the Delta table.
test_case.test_upsert_delta_table()

StatementMeta(, 08fbe4f6-c3e6-41ed-9265-576bd3bf7808, 15, Finished, Available, Finished)

In [14]:
# Class-level cleanup, run by hand: drops the test table and stops the Spark session,
# so this cell has to come after all test cells.
TestSACrimeRecordWrangler.tearDownClass()

StatementMeta(, 08fbe4f6-c3e6-41ed-9265-576bd3bf7808, 16, Finished, Available, Finished)