In [1]:
from utils.spark_utils import get_spark_session, get_config, enrich_df, basic_merge
from pyspark.sql.types import StructType, StructField, StringType, DateType, TimestampType

# Sub-directory of this data source; the trailing slash is part of the value
# because it is concatenated directly onto the layer paths.
DATA_SOURCE = "cows/"

# Spark session and configuration mapping, both obtained from utils.spark_utils.
spark = get_spark_session()
config = get_config()

# Bronze to silver

In [2]:

# Input and output locations, built from the configured layer prefixes and
# resolved one directory above the working directory.
source_path = "../" + config["bronze_path"] + DATA_SOURCE
target_path = "../" + config["silver_path"] + DATA_SOURCE

# DATA_SOURCE carries its own trailing "/", so appending another separator
# produced "checkpoints/cows//". Strip it first so exactly one "/" terminates
# the path, whether or not DATA_SOURCE ends with a slash.
# NOTE(review): the checkpoint is stored under the bronze prefix rather than
# the silver one — confirm this is intended.
checkpoint_location = "../" + config["bronze_path"] + f"checkpoints/{DATA_SOURCE.rstrip('/')}/"

# Schema handed to basic_merge as target_schema.
# Each entry is (column name, Spark type, nullable); only "name" may be null.
silver_schema = StructType([
    StructField(column_name, column_type, nullable=is_nullable)
    for column_name, column_type, is_nullable in (
        ("id", StringType(), False),
        ("name", StringType(), True),
        ("birthdate", DateType(), False),
        ("ingestion_date", TimestampType(), False),
    )
])

# Columns handed to basic_merge as unique_columns.
merge_columns = ["id"]

# Open the Delta table at source_path as a streaming source and pass the
# resulting DataFrame through enrich_df.
bronze_df = (
    spark.readStream
    .format("delta")
    .load(source_path)
    .transform(enrich_df)
)

def _merge_batch(batch_df, batch_id):
    """Forward one micro-batch to basic_merge.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        batch_id: Identifier Spark assigns to the micro-batch.

    Returns:
        Whatever basic_merge returns (foreachBatch does not use it).
    """
    return basic_merge(
        batch_df=batch_df,
        batch_id=batch_id,
        unique_columns=merge_columns,
        spark=spark,
        target_path=target_path,
        target_schema=silver_schema,
    )


# Start the streaming write: every micro-batch goes through _merge_batch, and
# progress is tracked at checkpoint_location.
# NOTE(review): recent Spark releases deprecate the `once` trigger in favour of
# `availableNow=True` — confirm the cluster's Spark version before switching.
query = (
    bronze_df.writeStream
    .outputMode("append")
    .option("checkpointLocation", checkpoint_location)
    .foreachBatch(_merge_batch)
    .trigger(once=True)
    .start()
)

# Block the cell until the triggered run has finished.
query.awaitTermination()