<div style="text-align: center; line-height: 0; padding-top: 9px;">
  <img src="https://blog.scholarnest.com/wp-content/uploads/2023/03/scholarnest-academy-scaled.jpg" alt="ScholarNest Academy" style="width: 1400px">
</div>

##### Cleanup previous runs

In [0]:
%run ../utils/cleanup

##### Setup

In [0]:
# Root folder of the chapter dataset; ingest() builds its source, schema and
# checkpoint paths from this value.
base_dir = "/mnt/files/dataset_ch8"

# Create the target catalog and database if they do not exist yet, so the
# streaming write to dev.demo_db.invoices_raw has a place to land.
create_catalog_ddl = "CREATE CATALOG IF NOT EXISTS dev"
create_database_ddl = "CREATE DATABASE IF NOT EXISTS dev.demo_db"
spark.sql(create_catalog_ddl)
spark.sql(create_database_ddl)

##### 1. Verify you can access the invoices directory

In [0]:
%fs ls /mnt/files/dataset_ch8/invoices

##### 2. Ingest data into the invoices_raw table using the Spark Structured Streaming API

In [0]:
def ingest():
  """Stream the CSV files under ``{base_dir}/invoices`` into ``dev.demo_db.invoices_raw``.

  The source is read with the ``cloudFiles`` (Auto Loader) format and the rows
  are appended to a Delta table. The write uses an ``availableNow`` trigger.

  Depends on the notebook-level ``spark`` session and ``base_dir`` variable.
  Takes no arguments and returns nothing; the query handle produced by
  ``toTable`` is only held in a local variable.

  NOTE(review): nothing here waits for the started query to finish -- confirm
  that the cells which read the table afterwards do not need
  ``awaitTermination()``.
  """
  source_path = f"{base_dir}/invoices"
  schema_location = f"{base_dir}/chekpoint/invoices_schema"
  checkpoint_location = f"{base_dir}/chekpoint/invoices"

  # --- Source: Auto Loader over the CSV landing directory ---
  reader = spark.readStream.format("cloudFiles")
  reader = (reader.option("cloudFiles.format", "csv")
                  .option("header", "true")
                  .option("timestampFormat", "d-M-y H.m"))
  # Column types are inferred, except the two identifiers which are hinted as
  # strings; the tracked schema is stored under schema_location.
  reader = (reader.option("cloudFiles.schemaLocation", schema_location)
                  .option("cloudFiles.inferColumnTypes", "true")
                  .option("cloudFiles.schemaHints", "InvoiceNo string, CustomerID string"))
  invoices_df = reader.load(source_path)

  # --- Sink: append to the Delta table; mergeSchema is enabled on the write ---
  writer = invoices_df.writeStream.format("delta")
  writer = (writer.option("checkpointLocation", checkpoint_location)
                  .option("mergeSchema", "true"))
  writer = writer.outputMode("append").trigger(availableNow = True)

  # toTable starts the streaming query and hands back its handle.
  streaming_query = writer.toTable("dev.demo_db.invoices_raw")

ingest() 

##### 3. Check the records after ingestion

In [0]:
%sql
SELECT * FROM dev.demo_db.invoices_raw

In [0]:
%sql
DESCRIBE dev.demo_db.invoices_raw

##### 4. Add more data to the invoices directory; the new file comes with an additional column

In [0]:
%fs cp /mnt/files/dataset_ch8/invoices_2021.csv /mnt/files/dataset_ch8/invoices/

##### 5. Ingest with a retry

In [0]:
ingest()

##### 6. Check the data

In [0]:
%sql
SELECT * FROM dev.demo_db.invoices_raw

In [0]:
%sql
DESCRIBE dev.demo_db.invoices_raw

##### 7. Ingest more data that may contain bad records

In [0]:
%fs cp /mnt/files/dataset_ch8/invoices_2022.csv /mnt/files/dataset_ch8/invoices/

In [0]:
ingest()

##### 8. Check the rescued data

In [0]:
%sql
SELECT * FROM dev.demo_db.invoices_raw where _rescued_data is not null

&copy; 2021-2023 ScholarNest Technologies Pvt. Ltd. All rights reserved.<br/>
Apache, Apache Spark, Spark and the Spark logo are trademarks of the <a href="https://www.apache.org/">Apache Software Foundation</a>.<br/>
Databricks, Databricks Cloud and the Databricks logo are trademarks of the <a href="https://www.databricks.com/">Databricks Inc</a>.<br/>
<br/>
<a href="https://www.scholarnest.com/privacy/">Privacy Policy</a> | 
<a href="https://www.scholarnest.com/terms/">Terms of Use</a> | <a href="https://www.scholarnest.com/contact/">Contact Us</a>