<div style="text-align: center; line-height: 0; padding-top: 9px;">
  <img src="https://blog.scholarnest.com/wp-content/uploads/2023/03/scholarnest-academy-scaled.jpg" alt="ScholarNest Academy" style="width: 1400px">
</div>

##### Cleanup previous runs

In [0]:
%run ../utils/cleanup

##### Setup

In [0]:
%python
# Instantiate the cleanup helper. Cleanup is presumably defined by the
# ../utils/cleanup notebook executed via %run — confirm. CL.cleanup() is
# called before each later re-setup in this notebook.
CL = Cleanup()
def setup():
    spark.sql("CREATE CATALOG IF NOT EXISTS dev")
    spark.sql("CREATE DATABASE IF NOT EXISTS dev.demo_db")
    spark.sql("""CREATE OR REPLACE TABLE dev.demo_db.people_tbl(
                        id INT,
                        firstName STRING,
                        lastName STRING
                        ) USING DELTA""")
    spark.sql("""INSERT INTO dev.demo_db.people_tbl
                    SELECT id, fname, lname
                    FROM json.`/mnt/files/dataset_ch7/people.json`""")    

# Start with automatic schema merging switched off for the session, so the
# manual schema-evolution sections that follow run without it.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "false")    
# Build the demo table, then show its initial contents.
setup()
spark.sql("select * from dev.demo_db.people_tbl").display()

##### Schema Validations Summary
1. INSERT &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&ensp;&nbsp;- Column matching by position, New columns not allowed
2. OVERWRITE &emsp;&emsp;&emsp;&emsp;&ensp;- Column matching by position, New columns not allowed
3. MERGE INSERT &emsp;&emsp;&emsp;&nbsp;- Column matching by name, New columns ignored
4. DataFrame Append &emsp;&nbsp;- Column matching by name, New columns not allowed
5. Data Type Mismatch &emsp;- Not allowed in any case

##### Schema evolution approaches
1. Manual&emsp;&nbsp; - New columns
2. Automatic - New columns

##### 1. Manual schema evolution - New column at the end

In [0]:
%sql
-- Manual evolution: append a birthDate column at the end of the table schema.
ALTER TABLE dev.demo_db.people_tbl
  ADD COLUMNS (birthDate STRING);

In [0]:
%sql
-- INSERT matches columns by position; the aliases document how each source
-- field maps onto (id, firstName, lastName, birthDate).
INSERT INTO dev.demo_db.people_tbl
SELECT
  id,
  fname AS firstName,
  lname AS lastName,
  dob   AS birthDate
FROM json.`/mnt/files/dataset_ch7/people.json`

##### 2. Manual schema evolution - New column in the middle

In [0]:
%sql
-- Manual evolution: add phoneNumber in the middle of the schema, directly
-- after lastName.
ALTER TABLE dev.demo_db.people_tbl
  ADD COLUMNS (phoneNumber STRING AFTER lastName);

In [0]:
%sql
-- The SELECT list follows the table's column order after the ALTER above:
-- id, firstName, lastName, phoneNumber, birthDate (INSERT matches by position).
INSERT INTO dev.demo_db.people_tbl
SELECT
  id,
  fname AS firstName,
  lname AS lastName,
  phone AS phoneNumber,
  dob   AS birthDate
FROM json.`/mnt/files/dataset_ch7/people_2.json`

In [0]:
%sql
-- Inspect the table after both manual schema changes.
SELECT *
FROM dev.demo_db.people_tbl

#### Cleanup and Setup for Automatic Schema Evolution

In [0]:
%python
# CL.cleanup() presumably removes the objects created so far — confirm in
# ../utils/cleanup.
CL.cleanup()
# Reset the session flag *before* re-seeding, in the same order as the initial
# setup cell, so setup() never runs under a leftover autoMerge setting.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "false")
setup()
# Show the freshly seeded table.
spark.sql("select * from dev.demo_db.people_tbl").display()

##### Automatic Schema Evolution - At Session level

In [0]:
%sql
-- Turn on automatic schema merging for the current session.
SET spark.databricks.delta.schema.autoMerge.enabled = true

##### 3. Automatic schema evolution - New column at the end

In [0]:
%sql
-- The table has no birthDate column yet; this section demonstrates the extra
-- trailing column being added automatically with autoMerge enabled.
INSERT INTO dev.demo_db.people_tbl
SELECT
  id,
  fname AS firstName,
  lname AS lastName,
  dob   AS birthDate
FROM json.`/mnt/files/dataset_ch7/people_2.json`

In [0]:
%sql
-- Inspect the table after the automatic end-of-schema evolution.
SELECT *
FROM dev.demo_db.people_tbl

##### 4. Automatic schema evolution - New column in the middle
INSERT matches columns by position, so for a new column in the middle it either:
1. Doesn't work, because the column positions no longer line up
2. Or it corrupts your data

In [0]:
%sql
-- Intentional demonstration: phoneNumber sits in the middle of the SELECT list,
-- but INSERT matches columns by position, so this statement is expected to
-- either fail or put values into the wrong columns.
INSERT INTO dev.demo_db.people_tbl
SELECT
  id,
  fname AS firstName,
  lname AS lastName,
  phone AS phoneNumber,
  dob   AS birthDate
FROM json.`/mnt/files/dataset_ch7/people_2.json`

##### 5. Automatic schema evolution - New column in the middle
Works with MERGE INSERT, because MERGE matches columns by name

In [0]:
%sql
-- MERGE matches source and target columns by name, so the aliases below must
-- carry the table's column names; rows whose id is not yet in the target are
-- inserted with every source column.
MERGE INTO dev.demo_db.people_tbl tgt
USING (
  SELECT
    id,
    fname AS firstName,
    lname AS lastName,
    phone AS phoneNumber,
    dob   AS birthDate
  FROM json.`/mnt/files/dataset_ch7/people_3.json`
) src
ON tgt.id = src.id
WHEN NOT MATCHED THEN
  INSERT *

In [0]:
%sql
-- Inspect the table after the MERGE.
SELECT *
FROM dev.demo_db.people_tbl

#### Cleanup and Setup for Automatic Schema Evolution at Table level

In [0]:
%python
# CL.cleanup() presumably removes the objects created so far — confirm in
# ../utils/cleanup.
CL.cleanup()
# The previous section enabled autoMerge for the whole session. Switch it off
# *before* re-seeding (same order as the initial setup cell) so setup() and the
# writer-level mergeSchema demos below do not run with it still enabled.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "false")
setup()
# Show the freshly seeded table.
spark.sql("select * from dev.demo_db.people_tbl").display()

##### 6. Schema evolution - New column at the end

In [0]:
%python
from pyspark.sql.functions import to_date

# Source-side schema for people_2.json: the three base fields plus dob.
people_2_schema = "id INT, fname STRING, lname STRING, dob STRING"

# Read with the explicit schema, then rename the columns positionally so the
# DataFrame carries the table's column names (DataFrame append matches by name).
people_2_df = (
    spark.read
    .schema(people_2_schema)
    .json("/mnt/files/dataset_ch7/people_2.json")
    .toDF("id", "firstName", "lastName", "birthDate")
)

# mergeSchema is set on this writer only; the session-level autoMerge flag is
# not touched here.
(
    people_2_df.write
    .mode("append")
    .format("delta")
    .option("mergeSchema", "true")
    .saveAsTable("dev.demo_db.people_tbl")
)

In [0]:
%sql
-- Inspect the table after the DataFrame append with mergeSchema.
SELECT *
FROM dev.demo_db.people_tbl

##### 7. Automatic schema evolution - New column in the middle

In [0]:
%python
from pyspark.sql.functions import to_date

# Source-side schema for people_3.json: base fields, then phone, then dob.
people_3_schema = "id INT, fname STRING, lname STRING, phone STRING, dob STRING"

# Read with the explicit schema, then rename the columns positionally so the
# DataFrame carries the table's column names (DataFrame append matches by name).
people_3_df = (
    spark.read
    .schema(people_3_schema)
    .json("/mnt/files/dataset_ch7/people_3.json")
    .toDF("id", "firstName", "lastName", "phoneNumber", "birthDate")
)

# mergeSchema is set on this writer only; phoneNumber sits in the middle of the
# DataFrame's column list.
(
    people_3_df.write
    .mode("append")
    .format("delta")
    .option("mergeSchema", "true")
    .saveAsTable("dev.demo_db.people_tbl")
)

In [0]:
%sql
-- Inspect the final table after the second DataFrame append.
SELECT *
FROM dev.demo_db.people_tbl

&copy; 2021-2023 ScholarNest Technologies Pvt. Ltd. All rights reserved.<br/>
Apache, Apache Spark, Spark and the Spark logo are trademarks of the <a href="https://www.apache.org/">Apache Software Foundation</a>.<br/>
Databricks, Databricks Cloud and the Databricks logo are trademarks of the <a href="https://www.databricks.com/">Databricks Inc</a>.<br/>
<br/>
<a href="https://www.scholarnest.com/privacy/">Privacy Policy</a> | 
<a href="https://www.scholarnest.com/terms/">Terms of Use</a> | <a href="https://www.scholarnest.com/contact/">Contact Us</a>