In [0]:
# Reloading October data:
from pyspark.sql.types import StructType, StructField, TimestampType, StringType, LongType, DoubleType

schema = StructType([
    StructField("event_time", TimestampType(), True),
    StructField("event_type", StringType(), True),
    StructField("product_id", LongType(), True),
    StructField("category_id", LongType(), True),
    StructField("category_code", StringType(), True),
    StructField("brand", StringType(), True),
    StructField("price", DoubleType(), True),
    StructField("user_id", LongType(), True),
    StructField("user_session", StringType(), True)
])

events = spark.read.csv(
    "/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv",
    header=True,
    schema=schema
)

In [0]:

# Choosing Delta location inside volume:
delta_path = "/Volumes/workspace/ecommerce/ecommerce_data/delta/events_oct"

events.write.format("delta") \
    .mode("overwrite") \
    .save(delta_path)
     

In [0]:
%fs ls /Volumes/workspace/ecommerce/ecommerce_data/delta/events_oct

path,name,size,modificationTime
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/delta/events_oct/_delta_log/,_delta_log/,0,1768232826090
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/delta/events_oct/part-00000-4092837b-1778-443b-9598-0be5a3d6b84e.c000.snappy.parquet,part-00000-4092837b-1778-443b-9598-0be5a3d6b84e.c000.snappy.parquet,34095145,1768232651000
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/delta/events_oct/part-00001-ebc0f680-e499-4b6a-9b9b-665bc3bba1e0.c000.snappy.parquet,part-00001-ebc0f680-e499-4b6a-9b9b-665bc3bba1e0.c000.snappy.parquet,31651988,1768232656000
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/delta/events_oct/part-00002-403426c3-9112-41f4-91ee-c2d22d88a544.c000.snappy.parquet,part-00002-403426c3-9112-41f4-91ee-c2d22d88a544.c000.snappy.parquet,31833415,1768232652000
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/delta/events_oct/part-00003-4f30947d-db38-411b-8b3a-833a83695ec3.c000.snappy.parquet,part-00003-4f30947d-db38-411b-8b3a-833a83695ec3.c000.snappy.parquet,32537240,1768232651000
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/delta/events_oct/part-00004-559f0d83-9811-47e6-a09e-363239a72868.c000.snappy.parquet,part-00004-559f0d83-9811-47e6-a09e-363239a72868.c000.snappy.parquet,33620729,1768232651000
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/delta/events_oct/part-00005-ff5ffd07-3806-4016-952f-d3a0ba40619a.c000.snappy.parquet,part-00005-ff5ffd07-3806-4016-952f-d3a0ba40619a.c000.snappy.parquet,33562202,1768232652000
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/delta/events_oct/part-00006-f74c87ec-4d0d-46ce-b2ea-0dee572ae1cd.c000.snappy.parquet,part-00006-f74c87ec-4d0d-46ce-b2ea-0dee572ae1cd.c000.snappy.parquet,31716159,1768232656000
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/delta/events_oct/part-00007-58291c06-e42a-4b14-b855-0c281202fa39.c000.snappy.parquet,part-00007-58291c06-e42a-4b14-b855-0c281202fa39.c000.snappy.parquet,32931407,1768232655000
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/delta/events_oct/part-00008-1a36474d-cb8e-4996-9f40-4ce01a04e4fe.c000.snappy.parquet,part-00008-1a36474d-cb8e-4996-9f40-4ce01a04e4fe.c000.snappy.parquet,32967143,1768232657000


In [0]:
# creating managed delta table
events.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("events_table")

In [0]:
# checking the table
spark.sql("SELECT COUNT(*) FROM events_table").show()

+--------+
|COUNT(*)|
+--------+
|42448764|
+--------+



In [0]:
%sql
SELECT current_catalog(), current_schema();

current_catalog(),current_schema()
workspace,default


In [0]:
%sql
CREATE OR REPLACE TABLE events_delta
USING DELTA
AS
SELECT *
FROM workspace.default.events_table;

num_affected_rows,num_inserted_rows


In [0]:
%sql
SHOW TABLES;

database,tableName,isTemporary
default,events_delta,False
default,events_table,False


In [0]:
try:
    wrong_schema = spark.createDataFrame(
        [("a", "b", "c")],
        ["x", "y", "z"]
    )
    
    wrong_schema.write.format("delta") \
        .mode("append") \
        .save(delta_path)

except Exception as e:
    print("Schema enforcement triggered:")
    print(e)
     

Schema enforcement triggered:
name 'delta_path' is not defined


In [0]:
# checking duplicate entry
events.write.format("delta") \
    .mode("append") \
    .save(delta_path)

print("After first append:", spark.read.format("delta").load(delta_path).count())

events.write.format("delta") \
    .mode("append") \
    .save(delta_path)

print("After second append:", spark.read.format("delta").load(delta_path).count())

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-5578529009886047>, line 2[0m
[1;32m      1[0m [38;5;66;03m# checking duplicate entry[39;00m
[0;32m----> 2[0m events[38;5;241m.[39mwrite[38;5;241m.[39mformat([38;5;124m"[39m[38;5;124mdelta[39m[38;5;124m"[39m) \
[1;32m      3[0m     [38;5;241m.[39mmode([38;5;124m"[39m[38;5;124mappend[39m[38;5;124m"[39m) \
[1;32m      4[0m     [38;5;241m.[39msave(delta_path)
[1;32m      6[0m [38;5;28mprint[39m([38;5;124m"[39m[38;5;124mAfter first append:[39m[38;5;124m"[39m, spark[38;5;241m.[39mread[38;5;241m.[39mformat([38;5;124m"[39m[38;5;124mdelta[39m[38;5;124m"[39m)[38;5;241m.[39mload(delta_path)[38;5;241m.[39mcount())
[1;32m      8[0m events[38;5;241m.[39mwrite[38;5;241m.[39mformat([38;5;124m"[39m[38;5;124mdelta[39m[38;5;124m"[39m) \
[1;32m   