# S1 J5 — Delta Lake (Write + Read)

This notebook writes a Delta table and reads it back.

If you are running locally and Delta is missing, install it first, then restart the kernel before creating the Spark session:
- `pip install delta-spark`


In [1]:
pip install delta-spark

Collecting delta-spark
  Downloading delta_spark-4.0.1-py3-none-any.whl.metadata (1.9 kB)
Collecting importlib-metadata>=1.0.0 (from delta-spark)
  Downloading importlib_metadata-8.7.1-py3-none-any.whl.metadata (4.7 kB)
Collecting zipp>=3.20 (from importlib-metadata>=1.0.0->delta-spark)
  Downloading zipp-3.23.0-py3-none-any.whl.metadata (3.6 kB)
Downloading delta_spark-4.0.1-py3-none-any.whl (43 kB)
Downloading importlib_metadata-8.7.1-py3-none-any.whl (27 kB)
Downloading zipp-3.23.0-py3-none-any.whl (10 kB)
Installing collected packages: zipp, importlib-metadata, delta-spark
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [delta-spark]
[1A[2KSuccessfully installed delta-spark-4.0.1 importlib-metadata-8.7.1 zipp-3.23.0
Note: you may need to restart the kernel to use updated packages.


In [11]:
from pathlib import Path

from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

# getOrCreate() hands back any SparkSession that is already running in this
# kernel, and static settings such as spark.sql.extensions may then not take
# effect. Stop a leftover session first so that re-running this cell always
# builds the session from the builder below; on a fresh kernel there is
# nothing to stop and this is a no-op.
# NOTE(review): the Delta jars requested by configure_spark_with_delta_pip
# appear to be resolved only when the driver JVM is launched. If the Delta
# write still fails with DATA_SOURCE_NOT_FOUND, restart the kernel and use
# "Run All" instead of re-running this cell.
leftover_session = SparkSession.getActiveSession()
if leftover_session is not None:
    leftover_session.stop()

builder = (
    SparkSession.builder
    .appName("delta-lake-demo")
    # Register Delta's SQL extension and make Delta the session catalog.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip adds the Delta package that corresponds to the
# pip-installed delta-spark to the builder before the session is created.
spark = configure_spark_with_delta_pip(builder).getOrCreate()



In [12]:
from pyspark.sql import functions as F

# Input CSV and Delta output location (both relative paths).
data_path = "../../data/example.csv"
delta_path = "../../data/delta/users"

# Raw layer: the first CSV line is the header; column types are inferred.
raw = spark.read.csv(data_path, header=True, inferSchema=True)

# Silver layer: same rows, with signup_date converted to a date and
# spend cast to double.
silver = (
    raw
    .withColumn("signup_date", F.to_date("signup_date"))
    .withColumn("spend", F.col("spend").cast("double"))
)


In [13]:
# Persist the silver DataFrame in Delta format at delta_path, replacing any
# data that is already stored there.
silver.write.save(delta_path, format="delta", mode="overwrite")


Py4JJavaError: An error occurred while calling o142.save.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: delta. Make sure the provider name is correct and the package is properly registered and compatible with your Spark version. SQLSTATE: 42K02
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:764)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:686)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:745)
	at org.apache.spark.sql.classic.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:596)
	at org.apache.spark.sql.classic.DataFrameWriter.saveCommand(DataFrameWriter.scala:141)
	at org.apache.spark.sql.classic.DataFrameWriter.save(DataFrameWriter.scala:115)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:184)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:108)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: java.lang.ClassNotFoundException: delta.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:592)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:525)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$6(DataSource.scala:670)
	at scala.util.Try$.apply(Try.scala:217)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:670)
	at scala.util.Failure.orElse(Try.scala:230)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:670)
	... 16 more


In [None]:
# Load the Delta table back from delta_path and display it without
# truncating long cell values.
delta_df = spark.read.load(delta_path, format="delta")
delta_df.show(truncate=False)


In [None]:
# Register the Delta files as a catalog table so they can be queried with SQL (optional).
# delta_path is relative; a relative LOCATION may be resolved against a
# different base directory than the one .save()/.load() used, so pass the
# absolute path (with forward slashes) to the CREATE TABLE statement instead.
table_location = Path(delta_path).resolve().as_posix()

# Drop first so the cell can be re-run without a "table already exists" error.
spark.sql("DROP TABLE IF EXISTS users_delta")
spark.sql(f"CREATE TABLE users_delta USING DELTA LOCATION '{table_location}'")

# Per-plan user count and total spend, rounded to two decimals.
spark.sql("SELECT plan, COUNT(*) AS users, ROUND(SUM(spend), 2) AS total_spend FROM users_delta GROUP BY plan").show()
