# Easier unit testing through the creation of empty DataSets from schemas
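We provide helper functions to generate (partially) empty DataSets from existing schemas: `create_empty_dataset()` returns rows in which every column is null, and `create_partially_filled_dataset()` fills only the columns you specify and leaves the remaining columns null. This can be helpful in certain situations, such as unit testing.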

In [2]:
from pyspark.sql import SparkSession
# Obtain the Spark session used by every example below (getOrCreate reuses an
# already-running session when there is one instead of starting another).
spark = SparkSession.Builder().getOrCreate()

In [2]:
from typedspark import Column, Schema, create_empty_dataset, create_partially_filled_dataset
from pyspark.sql.types import LongType, StringType

# Example schema for the first cells: three columns, which appear in the
# output below in the order they are declared here.
class Person(Schema):
    id: Column[LongType]
    name: Column[StringType]
    age: Column[LongType]

# No values are supplied, so every column of every row is null (see the
# all-null rows in the output below).
df_empty = create_empty_dataset(spark, Person)
df_empty.show()

+----+----+----+
|  id|name| age|
+----+----+----+
|null|null|null|
|null|null|null|
|null|null|null|
+----+----+----+



                                                                                

In [3]:
# Only `id` and `name` are given values; `age` is left out of the dict and
# comes out as null in every row (see the output below).
df_partially_filled = create_partially_filled_dataset(
    spark,
    Person,
    {
        # One list per column, keyed by the schema attribute; element i of each
        # list ends up in row i.
        Person.id: [1, 2, 3],
        Person.name: ["John", "Jane", "Jack"],
    }
)
df_partially_filled.show()

+---+----+----+
| id|name| age|
+---+----+----+
|  1|John|null|
|  2|Jane|null|
|  3|Jack|null|
+---+----+----+



## Complex datatypes

The same helper functions also work for more complex data types, such as arrays, maps, structs, dates, timestamps and decimals.

In [4]:
from typedspark import ArrayType, MapType, StructType

# Flat two-column schema; it is nested inside `B` below as the struct column `B.c`.
class A(Schema):
    a: Column[StringType]
    b: Column[StringType]

# Schema with one column of each complex type: array, map and struct.
class B(Schema):
    a: Column[ArrayType[StringType]]  # array of strings
    b: Column[MapType[StringType, StringType]]  # string -> string map
    c: Column[StructType[A]]  # struct whose fields follow schema `A`

# Rows for the struct column: only `A.a` is filled, so `A.b` is null in each row.
df_a = create_partially_filled_dataset(
    spark, A, {
        A.a: ["a", "b", "c"]
    }
)

df_b = create_partially_filled_dataset(
    spark, B, {
        # Array column: one Python list per row.
        B.a: [["a"], ["b", "c"], ["d"]],
        # Map column: one Python dict per row.
        B.b: [{"a": "1"}, {"b": "2", "c": "3"}, {"d": "4"}],
        # Struct column: the collected rows of a DataSet of the nested schema `A`.
        B.c: df_a.collect(),
    }
)
df_b.show()

+------+----------------+---------+
|     a|               b|        c|
+------+----------------+---------+
|   [a]|        {a -> 1}|{a, null}|
|[b, c]|{b -> 2, c -> 3}|{b, null}|
|   [d]|        {d -> 4}|{c, null}|
+------+----------------+---------+



In [5]:
from datetime import date, datetime
from decimal import Decimal
from pyspark.sql.types import DateType, DecimalType, TimestampType

# Schema with a date, a timestamp and a decimal column.
# NOTE(review): this rebinds the name `A`, which the previous cell already used
# for a different schema — consider a distinct name; confirm this is intended.
class A(Schema):
    a: Column[DateType]
    b: Column[TimestampType]
    c: Column[DecimalType]

# Values are passed as the matching Python objects: `date`, `datetime` and
# `Decimal`. Every column gets a one-element list, so the result has one row.
create_partially_filled_dataset(
    spark, 
    A, 
    {
        A.a: [date(2020, 1, 1)],
        A.b: [datetime(2020, 1, 1, 10, 15)],
        A.c: [Decimal(32)],
    }
).show()

+----------+-------------------+---+
|         a|                  b|  c|
+----------+-------------------+---+
|2020-01-01|2020-01-01 10:15:00| 32|
+----------+-------------------+---+



## Example unit test

The following code snippet shows what a full unit test using typedspark can look like.

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType, StringType
from typedspark import Column, DataSet, Schema, create_partially_filled_dataset, transform_to_schema
from chispa.dataframe_comparer import assert_df_equality


# Schema for the example unit test. It reuses the name `Person` from the first
# cell but declares only `name` and `age` (no `id` column).
class Person(Schema):
    name: Column[StringType]
    age: Column[LongType]


def birthday(df: DataSet[Person]) -> DataSet[Person]:
    """Return ``df`` transformed to the ``Person`` schema with ``age`` set to ``age + 1``."""
    # Columns not listed in the mapping are not given a new definition here.
    column_overrides = {Person.age: Person.age + 1}
    return transform_to_schema(df, Person, column_overrides)


def test_birthday(spark: SparkSession):
    """Check that ``birthday`` turns ages 20 and 30 into 21 and 31, keeping names."""
    input_columns = {Person.name: ["Alice", "Bob"], Person.age: [20, 30]}
    expected_columns = {Person.name: ["Alice", "Bob"], Person.age: [21, 31]}

    people = create_partially_filled_dataset(spark, Person, input_columns)
    observed = birthday(people)

    expected = create_partially_filled_dataset(spark, Person, expected_columns)

    # Row order and column nullability are deliberately excluded from the
    # comparison; only the schema's names/types and the row contents are checked.
    assert_df_equality(
        observed,
        expected,
        ignore_row_order=True,
        ignore_nullable=True,
    )
