# PySpark and Delta Lake Data Processing

This notebook lists common commands for working with data using PySpark and Delta Lake.

## 1. Initialize Spark

In [None]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session (an already-running session is reused).
# NOTE(review): outside a runtime with Delta preinstalled, the session may also
# need the Delta SQL extension / catalog settings — confirm for your environment.
spark = (
    SparkSession.builder
    .appName('delta-demo')
    .getOrCreate()
)

## 2. Read Data from CSV

In [None]:
# Load the CSV, treating its first line as column names. No schema is inferred,
# so every column is read as a string.
df = spark.read.csv('/path/to/input.csv', header=True)

## 3. Write Data as Delta

In [None]:
# Persist the CSV rows as a path-based Delta table, replacing any existing data.
df.write.save('/path/to/delta-table', format='delta', mode='overwrite')

## 4. Read a Delta Table

In [None]:
# Read the Delta table back from its storage path.
delta_df = spark.read.load('/path/to/delta-table', format='delta')

## 5. Transformations

In [None]:
# Keep only ACTIVE rows, then count rows per category.
# The result has two columns: `category` and `count`.
result_df = delta_df.where("status = 'ACTIVE'").groupBy('category').count()

## 6. Save Table to the Metastore

In [None]:
# Register the aggregated result as a Delta table in the metastore,
# replacing the table's contents if it already exists.
result_df.write.saveAsTable(
    'catalog.schema.output_table',
    format='delta',
    mode='overwrite',
)

## 7. Upsert (MERGE) Data

In [None]:
from delta.tables import DeltaTable

# Target of the upsert: the path-based Delta table written in section 3.
delta_table = DeltaTable.forPath(spark, '/path/to/delta-table')

# The merge source must expose the join key (`id`) and columns that line up
# with the target. `result_df` is aggregated down to `category`/`count`, so it
# cannot satisfy `t.id = s.id` and is itself derived from the target table.
# Use the row-level frame `df` from section 2 instead: it is the frame that was
# written to this table, so its columns match the target.
# In a real pipeline the source would be a batch of new or changed rows.
# NOTE(review): assumes the data has an `id` column that is unique in the
# source; MERGE fails when several source rows match one target row — confirm.
(delta_table.alias('t')
 .merge(source=df.alias('s'), condition='t.id = s.id')
 .whenMatchedUpdateAll()       # id already in target: update every column from the source row
 .whenNotMatchedInsertAll()    # id not in target: insert the source row
 .execute())

## 8. Optimize and Vacuum

In [None]:
# Compact the path-based Delta table's files through the SQL OPTIMIZE command.
spark.sql('OPTIMIZE delta.`/path/to/delta-table`')

# Clean up files no longer referenced by the table, using a retention
# window of 168 hours (7 days).
delta_table.vacuum(168)