<div style="text-align: center; line-height: 0; padding-top: 9px;">
  <img src="https://blog.scholarnest.com/wp-content/uploads/2023/03/scholarnest-academy-scaled.jpg" alt="ScholarNest Academy" style="width: 1400px">
</div>

#####Cleanup previous runs

In [0]:
%run ../utils/cleanup

#####Setup

In [0]:
%python
from pyspark.sql import functions as F

# Helper class comes from the notebook pulled in by `%run ../utils/cleanup`.
# NOTE(review): no method is invoked on CL in the cells visible here — confirm
# that constructing it is all that is needed to clear earlier runs.
CL = Cleanup()

# CSV header (contains spaces) -> identifier-friendly column name.
FIRE_CALLS_COLUMN_RENAMES = {
    "Call Number": "CallNumber",
    "Unit ID": "UnitID",
    "Incident Number": "IncidentNumber",
    "Call Date": "CallDate",
    "Watch Date": "WatchDate",
    "Call Final Disposition": "CallFinalDisposition",
    "Available DtTm": "AvailableDtTm",
    "Zipcode of Incident": "Zipcode",
    "Station Area": "StationArea",
    "Final Priority": "FinalPriority",
    "ALS Unit": "ALSUnit",
    "Call Type Group": "CallTypeGroup",
    "Unit sequence in call dispatch": "UnitSequenceInCallDispatch",
    "Fire Prevention District": "FirePreventionDistrict",
    "Supervisor District": "SupervisorDistrict",
}


def setup():
    """Build the demo Delta table ``dev.demo_db.fire_calls_tbl`` from the SF fire-calls CSV.

    Steps:
      1. Create the ``dev`` catalog and the ``dev.demo_db`` schema if they are missing.
      2. Read the sample CSV with a header row and inferred column types.
      3. Rename the space-containing columns, parse the date/timestamp strings,
         round ``Delay`` to two decimals and add ``Year`` derived from ``CallDate``.
      4. Overwrite the Delta table (data and schema) with the result.

    Relies on the notebook-provided ``spark`` session. Returns nothing.
    """
    spark.sql("CREATE CATALOG IF NOT EXISTS dev")
    spark.sql("CREATE DATABASE IF NOT EXISTS dev.demo_db")

    raw_df = (
        spark.read
        .format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load("/databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv")
    )

    # Apply the renames one by one, in the order listed in the mapping.
    renamed_df = raw_df
    for csv_name, column_name in FIRE_CALLS_COLUMN_RENAMES.items():
        renamed_df = renamed_df.withColumnRenamed(csv_name, column_name)

    staging_df = (
        renamed_df
        .withColumn("CallDate", F.to_date("CallDate", "MM/dd/yyyy"))
        .withColumn("WatchDate", F.to_date("WatchDate", "MM/dd/yyyy"))
        .withColumn("AvailableDtTm", F.to_timestamp("AvailableDtTm", "MM/dd/yyyy hh:mm:ss a"))
        .withColumn("Delay", F.round("Delay", 2))
        # Year is computed from the already-parsed CallDate column above.
        .withColumn("Year", F.year("CallDate"))
    )

    (
        staging_df.write
        .format("delta")
        .mode("overwrite")
        # Later cells drop columns from this table. If a table from an earlier
        # run is still present, replacing the schema together with the data
        # keeps a re-run from failing Delta's schema enforcement.
        .option("overwriteSchema", "true")
        .saveAsTable("dev.demo_db.fire_calls_tbl")
    )


setup()

####VACUUM utility

#####1. Describe the extended table details and note the table directory in the Azure container

In [0]:
%sql
-- Column list plus detailed table metadata for the demo table.
DESCRIBE EXTENDED dev.demo_db.fire_calls_tbl

#####2. Apply some transactions

In [0]:
%sql
-- Remove every call logged on 2002-01-24.
-- A typed DATE literal is used instead of a double-quoted string: double quotes
-- are read as an identifier when ANSI double-quoted identifiers are enabled.
DELETE FROM dev.demo_db.fire_calls_tbl
WHERE CallDate = DATE '2002-01-24'

In [0]:
%sql
-- Cast Delay to INT in every row (there is no WHERE clause).
UPDATE dev.demo_db.fire_calls_tbl
SET Delay = CAST(Delay AS INT)

#####3. Show table history

In [0]:
%sql
-- List the table's commit history (one row per table version).
DESCRIBE HISTORY dev.demo_db.fire_calls_tbl

#####4. Query older versions

In [0]:
%sql
-- Time travel: read version 0 (the first commit in the table's history) and
-- look up the date that the DELETE cell removed from the current version.
-- A typed DATE literal avoids double quotes, which are read as an identifier
-- when ANSI double-quoted identifiers are enabled.
SELECT *
FROM dev.demo_db.fire_calls_tbl VERSION AS OF 0
WHERE CallDate = DATE '2002-01-24'

#####5. Vacuum the table

######5.1 Count the data files from the backend

######5.2 Disable safety check

In [0]:
%sql
-- Turn off Delta's retention-duration safety check so that VACUUM accepts RETAIN 0 HOURS.
SET spark.databricks.delta.retentionDurationCheck.enabled = false

######5.3 Vacuum the table with zero retention (dry run: lists the files that would be deleted)

In [0]:
%sql
-- DRY RUN only reports the files a zero-hour-retention VACUUM would delete;
-- nothing is removed from storage by this cell.
VACUUM dev.demo_db.fire_calls_tbl
  RETAIN 0 HOURS
  DRY RUN

######5.4 Count the data files from the backend

######5.5 Read older version of the table

In [0]:
%sql
-- Repeat the version-0 time-travel query to check whether that version is
-- still readable after the VACUUM step.
-- A typed DATE literal avoids double quotes, which are read as an identifier
-- when ANSI double-quoted identifiers are enabled.
SELECT *
FROM dev.demo_db.fire_calls_tbl VERSION AS OF 0
WHERE CallDate = DATE '2002-01-24'

####REORG and VACUUM

#####1. Remove some columns from your table

######1.1 Enable column mapping

In [0]:
%sql
-- Switch the table to name-based column mapping and set the reader/writer
-- protocol versions given alongside it.
ALTER TABLE dev.demo_db.fire_calls_tbl
SET TBLPROPERTIES (
  'delta.minReaderVersion'   = '2',
  'delta.minWriterVersion'   = '5',
  'delta.columnMapping.mode' = 'name'
);

######1.2 Alter table to remove columns

In [0]:
%sql
-- Drop the two district columns from the table schema.
ALTER TABLE dev.demo_db.fire_calls_tbl
DROP COLUMNS (SupervisorDistrict, FirePreventionDistrict)

######1.3 Check history

In [0]:
%sql
-- Show the commit history again, now including the property change and the column drop.
DESCRIBE HISTORY dev.demo_db.fire_calls_tbl

#####2. Reorganize your data files 

######2.1 REORG your table

In [0]:
%sql
-- Rewrite the data files so that the dropped columns' data is purged from them.
REORG TABLE dev.demo_db.fire_calls_tbl
  APPLY (PURGE)

######2.2 Check files from the backend

In [0]:
%sql
-- DRY RUN: list the files a zero-hour-retention VACUUM would delete, without deleting them.
VACUUM dev.demo_db.fire_calls_tbl
  RETAIN 0 HOURS
  DRY RUN

####Optimize and Zorder
1. OPTIMIZE - Create evenly-balanced data files with respect to their size on disk
2. ZORDER  - Colocate the data by column

######1. OPTIMIZE and ZORDER

In [0]:
%sql
-- Compact the table's data files and co-locate rows by Year and CallDate.
OPTIMIZE dev.demo_db.fire_calls_tbl
ZORDER BY (Year, CallDate)

######2. VACUUM

In [0]:
%sql
-- No DRY RUN here: this deletes the data files that the current table version
-- no longer references. RETAIN 0 HOURS relies on the retention-duration check
-- having been disabled earlier in the notebook.
VACUUM dev.demo_db.fire_calls_tbl
  RETAIN 0 HOURS

&copy; 2021-2023 ScholarNest Technologies Pvt. Ltd. All rights reserved.<br/>
Apache, Apache Spark, Spark and the Spark logo are trademarks of the <a href="https://www.apache.org/">Apache Software Foundation</a>.<br/>
Databricks, Databricks Cloud and the Databricks logo are trademarks of the <a href="https://www.databricks.com/">Databricks Inc</a>.<br/>
<br/>
<a href="https://www.scholarnest.com/privacy/">Privacy Policy</a> | 
<a href="https://www.scholarnest.com/terms/">Terms of Use</a> | <a href="https://www.scholarnest.com/contact/">Contact Us</a>