<img src="../img/schema_enforcement.png" alt="Schema enforcement" width="750">

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.errors import PySparkTypeError

In [13]:
# Obtain the Spark session for this notebook (reuses an active one if present,
# otherwise creates it) under a descriptive application name.
spark = (
    SparkSession.builder
    .appName("StructuringSchemaDefinition")
    .getOrCreate()
)

# Keep cell output readable: only messages at ERROR level are logged.
spark.sparkContext.setLogLevel("ERROR")

In [14]:
# Declare the expected layout explicitly: three nullable columns,
# two integers (`id`, `age`) and one string (`name`).
schema = StructType(
    [
        StructField("id", IntegerType(), nullable=True),
        StructField("name", StringType(), nullable=True),
        StructField("age", IntegerType(), nullable=True),
    ]
)

# Sample rows whose values match the declared column types.
data = [(1, "Alice", 28), (2, "Bob", 32)]

# Build the DataFrame against the explicit schema instead of letting Spark infer it.
df = spark.createDataFrame(data, schema=schema)

# Show the rows first, then print the schema attached to the DataFrame
# so it can be compared with the definition above.
df.show()
df.printSchema()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 28|
|  2|  Bob| 32|
+---+-----+---+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



In [15]:
# Now try a second data set whose values do NOT match the schema:
#   - the third element of every row (the "age" column) is a string, not an integer
#   - the PySparkTypeError raised while creating the DataFrame is caught below
#
# Why this matters for ETL work: a good approach is to start from a sample of the
# data, clean it, and settle on the proper schema. Later loads then enforce that
# schema, so the loaded data is known to have the expected column types. A
# subsequent ETL step (after the first cleaning process) can consume the expected
# schema from a data catalog and enforce it on load, double-checking that the
# loaded data is what is expected.
data_2 = [(1, "Alice", "28"), (2, "Bob", "32")]

try:
    df_2 = spark.createDataFrame(data_2, schema)
except PySparkTypeError as schema_violation:
    print("Error while creating the DataFrame: {}".format(schema_violation.message))

# NOTE: inside exception handlers the standard practice is to set up a logger and
# log at error level, including the message part, rather than calling print();
# print() is used here only to keep the example short.
# e.g.: LOG.error("Error while creating the DataFrame: %s", repr(schema_violation.message))

Error while creating the DataFrame: `IntegerType()` can not accept object `28` in type `str`.


In [16]:
# Stop the Spark session that was created at the top of this notebook.
spark.stop()