In [0]:
# dbutils.fs.mkdirs("dbfs:/tables")
# dbutils.fs.rm('dbfs:/tables',True)
# dbutils.fs.rm('dbfs:/tables/delta', recurse=True)  #--more effective than drop table
# spark.sql("drop table if exists deltatable1")

# dbutils.fs.mv("dbfs:/FileStore/mock_data_changed.parquet", "dbfs:/mockarrow/")        # Moves your file from one folder to another
# dbutils.fs.mv("dbfs:/datasets/MOCK_DATA.csv", "dbfs:/mockarrow/MOCK_DATA.csv", True)  # To rename a folder (3 parameters*)

## import libraries

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from delta.tables import DeltaTable

In [0]:
# Evaluate the session object so the notebook renders it as this cell's output.
# NOTE(review): `spark` is never created in this notebook - presumably the
# SparkSession provided by the Databricks runtime; confirm before running elsewhere.
spark

## load parquet file

In [0]:
# Read the mock employee data from DBFS and preview the first five rows.
parquet_path = "dbfs:/mockarrow/mock_data_changed.parquet"
df_parqq = spark.read.parquet(parquet_path)
preview_rows = df_parqq.limit(5)
display(preview_rows)

Emp ID,Emp Name,contract,gender,Country,DOJ,salary,Relocation,Gen_Repr
1,Kara,+853 410 196 7317,Female,Macao,6/27/2021,$709117.57,True,2
2,Hillier,+7 511 334 2980,Male,Russia,8/20/2022,,False,5
3,Sayre,+57 983 888 4293,Female,Colombia,11/30/2022,$78402.39,True,2
4,Roley,+95 337 327 0628,Male,Myanmar,12/26/2022,$641503.36,True,5
5,Merrielle,+62 862 968 5847,Female,Indonesia,3/24/2022,,,2


In [0]:
# Convert DOJ from its "M/d/yyyy" text form (e.g. 6/27/2021) into a real date.
# This cell overwrites df_parqq in place, so it is guarded on the column's
# current type: re-running the cell would otherwise try to parse the
# already-converted dates with the M/d/yyyy pattern, which no longer matches.
if dict(df_parqq.dtypes)["DOJ"] == "string":
    df_parqq = df_parqq.withColumn("DOJ", to_date("DOJ", "M/d/yyyy"))
display(df_parqq.limit(5))

Emp ID,Emp Name,contract,gender,Country,DOJ,salary,Relocation,Gen_Repr
1,Kara,+853 410 196 7317,Female,Macao,2021-06-27,$709117.57,True,2
2,Hillier,+7 511 334 2980,Male,Russia,2022-08-20,,False,5
3,Sayre,+57 983 888 4293,Female,Colombia,2022-11-30,$78402.39,True,2
4,Roley,+95 337 327 0628,Male,Myanmar,2022-12-26,$641503.36,True,5
5,Merrielle,+62 862 968 5847,Female,Indonesia,2022-03-24,,,2


In [0]:
# Strip the "$" prefix from salary and cast the remaining text to a number.
# The pattern is a raw string so that "\$" reaches the regex engine as an
# escaped, literal dollar sign; in a normal string literal "\$" is an invalid
# Python escape sequence and triggers a warning on recent Python versions.
# NOTE(review): FloatType is single precision, which is why $709117.57 shows
# up as 709117.56 in the output; consider DoubleType or DecimalType if exact
# amounts matter (this would change the saved table schema).
salary_without_symbol = regexp_replace("salary", r"\$", "")
df_parqq1 = df_parqq.withColumn("salary", salary_without_symbol.cast(FloatType()))
display(df_parqq1.limit(5))

Emp ID,Emp Name,contract,gender,Country,DOJ,salary,Relocation,Gen_Repr
1,Kara,+853 410 196 7317,Female,Macao,2021-06-27,709117.56,True,2
2,Hillier,+7 511 334 2980,Male,Russia,2022-08-20,,False,5
3,Sayre,+57 983 888 4293,Female,Colombia,2022-11-30,78402.39,True,2
4,Roley,+95 337 327 0628,Male,Myanmar,2022-12-26,641503.4,True,5
5,Merrielle,+62 862 968 5847,Female,Indonesia,2022-03-24,,,2


In [0]:
# Print column names, types and nullability after the DOJ and salary conversions.
df_parqq1.printSchema()

root
 |-- Emp ID: long (nullable = true)
 |-- Emp Name: string (nullable = true)
 |-- contract: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- DOJ: date (nullable = true)
 |-- salary: float (nullable = true)
 |-- Relocation: boolean (nullable = true)
 |-- Gen_Repr: integer (nullable = true)



Replace spaces in column names with underscores, because Delta tables do not support spaces in column names.

In [0]:
# df_parqq.withColumnRenamed("Emp ID", "EmpID").withColumnRenamed('Emp Name', "EmpName")
# lst = []
# for cols in df_parqq.columns:
#     if len(cols.split())>1:
#         res = ''.join(cols.split())
#         lst.append(res)
#     else:
#         lst.append(cols)
# lst


# Build an old-name -> new-name mapping for every column that contains a space,
# then apply the renames one at a time (spaces become underscores).
renames = {
    name: name.replace(" ", "_")
    for name in df_parqq1.columns
    if " " in name
}
for old_name, new_name in renames.items():
    df_parqq1 = df_parqq1.withColumnRenamed(old_name, new_name)

display(df_parqq1.limit(5))

Emp_ID,Emp_Name,contract,gender,Country,DOJ,salary,Relocation,Gen_Repr
1,Kara,+853 410 196 7317,Female,Macao,2021-06-27,709117.56,True,2
2,Hillier,+7 511 334 2980,Male,Russia,2022-08-20,,False,5
3,Sayre,+57 983 888 4293,Female,Colombia,2022-11-30,78402.39,True,2
4,Roley,+95 337 327 0628,Male,Myanmar,2022-12-26,641503.4,True,5
5,Merrielle,+62 862 968 5847,Female,Indonesia,2022-03-24,,,2


## save as delta table

In [0]:
# spark.sql("DROP TABLE IF EXISTS deltatable1")

# If the table already exists in the default database, you must ensure that the .option('path', ...) is not specified, as Delta tables in a Hive Metastore will automatically use their registered location.

# dbutils.fs.rm("dbfs:/mockarrow/tables",True)

Use `mergeSchema = True` so that the Delta table can accept later changes to the schema.

In [0]:
# Save the cleaned DataFrame as the table `deltatable1`, stored at
# dbfs:/tables/delta/. The Delta format is stated explicitly instead of relying
# on the session's default data source. mode('overwrite') replaces the table
# contents on every run, so each run of this cell adds a new table version.
(
    df_parqq1.write
    .format('delta')
    .mode('overwrite')
    .option('mergeSchema', True)
    .option('path', "dbfs:/tables/delta/")
    .saveAsTable('deltatable1')
)

In [0]:
%sql
-- Preview the first five rows of the saved table.
SELECT *
FROM deltatable1
LIMIT 5

Emp_ID,Emp_Name,contract,gender,Country,DOJ,salary,Relocation,Gen_Repr
1,Kara,+853 410 196 7317,Female,Macao,2021-06-27,709117.56,True,2
2,Hillier,+7 511 334 2980,Male,Russia,2022-08-20,,False,5
3,Sayre,+57 983 888 4293,Female,Colombia,2022-11-30,78402.39,True,2
4,Roley,+95 337 327 0628,Male,Myanmar,2022-12-26,641503.4,True,5
5,Merrielle,+62 862 968 5847,Female,Indonesia,2022-03-24,,,2


In [0]:
# Load the table's history: show the full log, then a compact text summary
# of who ran which operation at which version.
df_sql = spark.sql("Describe history deltatable1")
display(df_sql)
history_summary = df_sql.select('UserName', 'Version', 'Operation')
history_summary.show(truncate=False)

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
1,2025-03-05T10:53:31.000+0000,2187969817410935,jithinvyas2001@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(isManaged -> false, description -> null, partitionBy -> [], properties -> {})",,List(841606920512230),0305-103149-hic0xa2v,0.0,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 1000, numOutputBytes -> 38919)",,Databricks-Runtime/12.2.x-scala2.12
0,2025-03-05T10:48:18.000+0000,2187969817410935,jithinvyas2001@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(isManaged -> false, description -> null, partitionBy -> [], properties -> {})",,List(841606920512230),0305-103149-hic0xa2v,,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 1000, numOutputBytes -> 38919)",,Databricks-Runtime/12.2.x-scala2.12


+------------------------+-------+---------------------------------+
|UserName                |Version|Operation                        |
+------------------------+-------+---------------------------------+
|jithinvyas2001@gmail.com|1      |CREATE OR REPLACE TABLE AS SELECT|
|jithinvyas2001@gmail.com|0      |CREATE OR REPLACE TABLE AS SELECT|
+------------------------+-------+---------------------------------+



**version-1**

In [0]:
%sql
-- Change the row with Emp_ID 1 to Emp_ID 1011.
UPDATE deltatable1
SET Emp_ID = 1011
WHERE Emp_ID = 1

num_affected_rows
1


**version-2**

In [0]:
%sql
-- Remove every row whose Relocation value is missing.
DELETE FROM deltatable1
WHERE Relocation IS NULL

num_affected_rows
32


In [0]:
%sql
-- Row count of the current table version.
SELECT COUNT(*) AS TotalRows
FROM deltatable1

TotalRows
968


**Version history**

In [0]:
# Re-read the table history after the UPDATE and DELETE: full log first, then
# a compact summary that also includes each operation's parameters.
df_sql = spark.sql("Describe history deltatable1")
display(df_sql)
(
    df_sql
    .select('UserName', 'Version', 'Operation', 'OperationParameters')
    .show(truncate=False)
)

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
3,2025-03-05T10:53:43.000+0000,2187969817410935,jithinvyas2001@gmail.com,DELETE,"Map(predicate -> [""isnull(Relocation#9021)""])",,List(841606920512230),0305-103149-hic0xa2v,2.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 38925, numCopiedRows -> 968, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 1921, numDeletedRows -> 32, scanTimeMs -> 1036, numAddedFiles -> 1, numAddedBytes -> 37342, rewriteTimeMs -> 884)",,Databricks-Runtime/12.2.x-scala2.12
2,2025-03-05T10:53:39.000+0000,2187969817410935,jithinvyas2001@gmail.com,UPDATE,"Map(predicate -> [""(Emp_ID#8357L = 1)""])",,List(841606920512230),0305-103149-hic0xa2v,1.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 38919, numCopiedRows -> 999, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 1473, scanTimeMs -> 559, numAddedFiles -> 1, numUpdatedRows -> 1, numAddedBytes -> 38925, rewriteTimeMs -> 913)",,Databricks-Runtime/12.2.x-scala2.12
1,2025-03-05T10:53:31.000+0000,2187969817410935,jithinvyas2001@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(isManaged -> false, description -> null, partitionBy -> [], properties -> {})",,List(841606920512230),0305-103149-hic0xa2v,0.0,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 1000, numOutputBytes -> 38919)",,Databricks-Runtime/12.2.x-scala2.12
0,2025-03-05T10:48:18.000+0000,2187969817410935,jithinvyas2001@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(isManaged -> false, description -> null, partitionBy -> [], properties -> {})",,List(841606920512230),0305-103149-hic0xa2v,,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 1000, numOutputBytes -> 38919)",,Databricks-Runtime/12.2.x-scala2.12


+------------------------+-------+---------------------------------+------------------------------------------------------------------------------+
|UserName                |Version|Operation                        |OperationParameters                                                           |
+------------------------+-------+---------------------------------+------------------------------------------------------------------------------+
|jithinvyas2001@gmail.com|3      |DELETE                           |{predicate -> ["isnull(Relocation#9021)"]}                                    |
|jithinvyas2001@gmail.com|2      |UPDATE                           |{predicate -> ["(Emp_ID#8357L = 1)"]}                                         |
|jithinvyas2001@gmail.com|1      |CREATE OR REPLACE TABLE AS SELECT|{isManaged -> false, description -> null, partitionBy -> [], properties -> {}}|
|jithinvyas2001@gmail.com|0      |CREATE OR REPLACE TABLE AS SELECT|{isManaged -> false, description -> null, pa

If the delete step was a mistake and **version-1** is needed again, first inspect that version without changing the table.

In [0]:
%sql
-- Time travel: count the rows as of version 1 without modifying the table.
SELECT COUNT(*) AS TotalRows
FROM deltatable1 VERSION AS OF 1

TotalRows
1000


**Restore** previous version

In [0]:
%sql
-- Roll the table back to version 1.
-- NOTE(review): the version number is hard-coded. In the history output shown
-- in this notebook, version 1 predates the UPDATE (version 2) as well as the
-- DELETE (version 3), so this restore reverts both - confirm that is intended.
RESTORE TABLE deltatable1
VERSION AS OF 1

table_size_after_restore,num_of_files_after_restore,num_removed_files,num_restored_files,removed_files_size,restored_files_size
38919,1,1,1,37342,38919


verify results

In [0]:
%sql
-- Verify the restore: count the rows in the current table version.
SELECT COUNT(*) AS TotalRows
FROM deltatable1

TotalRows
1000


To see everything that has been done so far, look at the table's history log.

In [0]:
# Read the table history once more so the RESTORE appears in the log; show the
# full log, then the compact summary including each operation's parameters.
df_sql = spark.sql("Describe history deltatable1")
display(df_sql)
summary_columns = ['UserName', 'Version', 'Operation', 'OperationParameters']
df_sql.select(*summary_columns).show(truncate=False)

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
4,2025-03-05T10:54:14.000+0000,2187969817410935,jithinvyas2001@gmail.com,RESTORE,"Map(version -> 1, timestamp -> null)",,List(841606920512230),0305-103149-hic0xa2v,3.0,Serializable,False,"Map(numRestoredFiles -> 1, removedFilesSize -> 37342, numRemovedFiles -> 1, restoredFilesSize -> 38919, numOfFilesAfterRestore -> 1, tableSizeAfterRestore -> 38919)",,Databricks-Runtime/12.2.x-scala2.12
3,2025-03-05T10:53:43.000+0000,2187969817410935,jithinvyas2001@gmail.com,DELETE,"Map(predicate -> [""isnull(Relocation#9021)""])",,List(841606920512230),0305-103149-hic0xa2v,2.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 38925, numCopiedRows -> 968, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 1921, numDeletedRows -> 32, scanTimeMs -> 1036, numAddedFiles -> 1, numAddedBytes -> 37342, rewriteTimeMs -> 884)",,Databricks-Runtime/12.2.x-scala2.12
2,2025-03-05T10:53:39.000+0000,2187969817410935,jithinvyas2001@gmail.com,UPDATE,"Map(predicate -> [""(Emp_ID#8357L = 1)""])",,List(841606920512230),0305-103149-hic0xa2v,1.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 38919, numCopiedRows -> 999, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 1473, scanTimeMs -> 559, numAddedFiles -> 1, numUpdatedRows -> 1, numAddedBytes -> 38925, rewriteTimeMs -> 913)",,Databricks-Runtime/12.2.x-scala2.12
1,2025-03-05T10:53:31.000+0000,2187969817410935,jithinvyas2001@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(isManaged -> false, description -> null, partitionBy -> [], properties -> {})",,List(841606920512230),0305-103149-hic0xa2v,0.0,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 1000, numOutputBytes -> 38919)",,Databricks-Runtime/12.2.x-scala2.12
0,2025-03-05T10:48:18.000+0000,2187969817410935,jithinvyas2001@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(isManaged -> false, description -> null, partitionBy -> [], properties -> {})",,List(841606920512230),0305-103149-hic0xa2v,,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 1000, numOutputBytes -> 38919)",,Databricks-Runtime/12.2.x-scala2.12


+------------------------+-------+---------------------------------+------------------------------------------------------------------------------+
|UserName                |Version|Operation                        |OperationParameters                                                           |
+------------------------+-------+---------------------------------+------------------------------------------------------------------------------+
|jithinvyas2001@gmail.com|4      |RESTORE                          |{version -> 1, timestamp -> null}                                             |
|jithinvyas2001@gmail.com|3      |DELETE                           |{predicate -> ["isnull(Relocation#9021)"]}                                    |
|jithinvyas2001@gmail.com|2      |UPDATE                           |{predicate -> ["(Emp_ID#8357L = 1)"]}                                         |
|jithinvyas2001@gmail.com|1      |CREATE OR REPLACE TABLE AS SELECT|{isManaged -> false, description -> null, pa

In [0]:
%sql
-- Preview the first five rows of the restored table.
SELECT *
FROM deltatable1
LIMIT 5

Emp_ID,Emp_Name,contract,gender,Country,DOJ,salary,Relocation,Gen_Repr
1,Kara,+853 410 196 7317,Female,Macao,2021-06-27,709117.56,True,2
2,Hillier,+7 511 334 2980,Male,Russia,2022-08-20,,False,5
3,Sayre,+57 983 888 4293,Female,Colombia,2022-11-30,78402.39,True,2
4,Roley,+95 337 327 0628,Male,Myanmar,2022-12-26,641503.4,True,5
5,Merrielle,+62 862 968 5847,Female,Indonesia,2022-03-24,,,2


In [0]:
# Show the table's column names, data types and comments.
table_details = spark.sql("DESCRIBE FORMATTED deltatable1")
display(table_details)

col_name,data_type,comment
Emp_ID,bigint,
Emp_Name,string,
contract,string,
gender,string,
Country,string,
DOJ,date,
salary,float,
Relocation,boolean,
Gen_Repr,int,
,,


In [0]:
# Read the table through its qualified name in the default database and show two rows.
deltatable1_df = spark.table("default.deltatable1")
display(deltatable1_df.limit(2))

Emp_ID,Emp_Name,contract,gender,Country,DOJ,salary,Relocation,Gen_Repr
1,Kara,+853 410 196 7317,Female,Macao,2021-06-27,709117.56,True,2
2,Hillier,+7 511 334 2980,Male,Russia,2022-08-20,,False,5
