<div style="text-align: center; line-height: 0; padding-top: 9px;">
  <img src="https://blog.scholarnest.com/wp-content/uploads/2023/03/scholarnest-academy-scaled.jpg" alt="ScholarNest Academy" style="width: 1400px">
</div>

#####Cleanup previous runs

In [0]:
%run ../utils/cleanup

#####Setup

In [0]:
# Root folder of this chapter's sample data; the ingest() cells build their
# source and checkpoint paths from it.
base_dir = "/mnt/files/dataset_ch8"

# Make sure the catalog and database that will hold invoices_raw exist.
# Both statements use IF NOT EXISTS, so re-running this cell is harmless.
create_catalog_ddl = "CREATE CATALOG IF NOT EXISTS dev"
create_database_ddl = "CREATE DATABASE IF NOT EXISTS dev.demo_db"

spark.sql(create_catalog_ddl)
spark.sql(create_database_ddl)

#####1. Verify you can access the invoices directory

In [0]:
%fs ls /mnt/files/dataset_ch8/invoices

#####2. Create a delta table to ingest invoices data

In [0]:
%sql
-- Target table for the streaming ingestion. The column names and types
-- mirror the schema string that ingest() passes to the CSV reader.
CREATE TABLE IF NOT EXISTS dev.demo_db.invoices_raw (
  InvoiceNo    INT,
  StockCode    STRING,
  Description  STRING,
  Quantity     INT,
  InvoiceDate  TIMESTAMP,
  UnitPrice    DOUBLE,
  CustomerID   INT
)

#####3. Ingest data into invoices_raw table using spark streaming api

In [0]:
def ingest():
  """Load the invoice CSV files into dev.demo_db.invoices_raw and wait for completion.

  Reads the CSV files under ``{base_dir}/invoices`` as a stream using an
  explicit seven-column schema and appends them to the
  ``dev.demo_db.invoices_raw`` table. The query is started with the
  ``availableNow`` trigger and this function blocks until it has terminated,
  so cells that run afterwards see the ingested rows.

  Relies on the notebook-level names ``spark`` and ``base_dir``.

  Returns:
    None.
  """
  invoice_schema = """InvoiceNo int, StockCode string, Description string, Quantity int, 
                    InvoiceDate timestamp, UnitPrice double, CustomerID int"""

  source_df = (spark.readStream
                      .format("csv")
                      .option("header", "true")
                      .schema(invoice_schema)
                      .load(f"{base_dir}/invoices")
  )

  # NOTE: the "chekpoint" spelling is intentional to keep -- the later
  # ingest() definition in this notebook uses the very same checkpoint path,
  # and the two runs must share it.
  write_query = (source_df.writeStream
                          .format("delta")
                          .option("checkpointLocation", f"{base_dir}/chekpoint/invoices")
                          .outputMode("append")
                          .trigger(availableNow = True)
                          .toTable("dev.demo_db.invoices_raw")
  )

  # toTable() only starts the streaming query; without this wait the function
  # would return while the data is still being written, and the next cell
  # could query a partially loaded (or empty) table.
  write_query.awaitTermination()

ingest()

#####4. Check the records after ingestion

In [0]:
%sql
-- Inspect the rows that the ingest() run above wrote to the raw table.
SELECT * FROM dev.demo_db.invoices_raw

#####6. Add some more data to the invoices directory; the new file comes with an additional column

In [0]:
%fs cp /mnt/files/dataset_ch8/invoices_2021.csv /mnt/files/dataset_ch8/invoices

#####7. Your ingestion code will not break, but it will silently ignore the additional column (its explicit schema lists only the original seven columns)

######7.1 Alter table to evolve the schema

In [0]:
%sql
-- Add the Country column to the target table so that the updated ingest()
-- below, whose schema includes Country, can write it.
ALTER TABLE dev.demo_db.invoices_raw ADD COLUMNS (Country string)

######7.2 Modify streaming ingestion to accommodate schema changes

In [0]:
def ingest():
  """Load the invoice CSV files, including the Country column, and wait for completion.

  Replaces the earlier ``ingest()`` defined in this notebook: the reader
  schema now carries an eighth column, ``Country string``. The CSV files
  under ``{base_dir}/invoices`` are read as a stream and appended to
  ``dev.demo_db.invoices_raw`` using the same checkpoint location as before.
  The query is started with the ``availableNow`` trigger and this function
  blocks until it has terminated, so cells that run afterwards see the
  ingested rows.

  Relies on the notebook-level names ``spark`` and ``base_dir``.

  Returns:
    None.
  """
  invoice_schema = """InvoiceNo int, StockCode string, Description string, Quantity int, 
                    InvoiceDate timestamp, UnitPrice double, CustomerID int, Country string"""
  source_df = (spark.readStream
                      .format("csv")
                      .option("header", "true")
                      .schema(invoice_schema)
                      .load(f"{base_dir}/invoices")
  )

  # NOTE: the "chekpoint" spelling is intentional to keep -- it is the same
  # checkpoint path the first ingest() run used, and both runs must share it.
  write_query = (source_df.writeStream
                          .format("delta")
                          .option("checkpointLocation", f"{base_dir}/chekpoint/invoices")
                          .outputMode("append")
                          .trigger(availableNow = True)
                          .toTable("dev.demo_db.invoices_raw")
  )

  # toTable() only starts the streaming query; without this wait the function
  # would return while the data is still being written, and the next cell
  # could query the table before the new file has been loaded.
  write_query.awaitTermination()

ingest()  

#####9. Check the data 

In [0]:
%sql
-- Inspect the raw table again after the second ingest() run; the table now
-- has the Country column added by the ALTER TABLE cell.
SELECT * FROM dev.demo_db.invoices_raw

&copy; 2021-2023 ScholarNest Technologies Pvt. Ltd. All rights reserved.<br/>
Apache, Apache Spark, Spark and the Spark logo are trademarks of the <a href="https://www.apache.org/">Apache Software Foundation</a>.<br/>
Databricks, Databricks Cloud and the Databricks logo are trademarks of the <a href="https://www.databricks.com/">Databricks Inc</a>.<br/>
<br/>
<a href="https://www.scholarnest.com/privacy/">Privacy Policy</a> | 
<a href="https://www.scholarnest.com/terms/">Terms of Use</a> | <a href="https://www.scholarnest.com/contact/">Contact Us</a>