In [0]:
import pandas as pd

# Sample sales data, written one order per row for readability and then
# pivoted into the column-oriented mapping that pandas receives.
sales_columns = ["OrderID", "OrderDate", "CustomerID", "Product", "Quantity", "Price"]
sales_rows = [
    (1, "2024-01-01 10:00:00", "0001", "Product1", 10, 100.0),
    (2, "2024-01-02 11:00:00", "0002", "Product2", 20, 200.0),
    (3, "2024-01-03 12:00:00", "0003", "Product3", 15, 158.0),
    (4, "2024-01-04 13:00:00", "0004", "Product4", 5, 50.0),
]
sales_data = {
    name: list(values)
    for name, values in zip(sales_columns, zip(*sales_rows))
}

# Materialise the mapping as a pandas DataFrame.
df_sales = pd.DataFrame(sales_data)

# Output locations (note: the CSV path deliberately carries no extension,
# the Spark cell further down reads it back under this exact name).
csv_path = "/Workspace/sales_data"
parquet_path = "/Workspace/sales_data.parquet"

# Persist the same frame in both formats, without the pandas index column.
df_sales.to_csv(csv_path, index=False)
df_sales.to_parquet(parquet_path, index=False)

print(f"Sample data saved to {csv_path} and {parquet_path}")


Sample data saved to /Workspace/sales_data and /Workspace/sales_data.parquet


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Get (or create) the Spark session used by the rest of the notebook.
spark = SparkSession.builder.appName("DeltaExample").getOrCreate()

# Read the CSV produced by the pandas cell. Only the header option is set,
# so no schema is supplied or inferred here.
df_sales = (
    spark.read
    .option("header", "true")
    .csv("file:/Workspace/sales_data")
)

# Derive TotalAmount = Quantity (as int) * Price (as double). The source
# columns themselves are left untouched; only the new column is typed.
quantity_int = col("Quantity").cast("int")
price_double = col("Price").cast("double")
df_transformed = df_sales.withColumn("TotalAmount", quantity_int * price_double)

# Persist the result as a Delta table, replacing any existing data at the path.
delta_table_path = "/delta/sales_data"
(
    df_transformed.write
    .format("delta")
    .mode("overwrite")
    .save(delta_table_path)
)

print("Delta table created and data written successfully.")


Delta table created and data written successfully.


**DELTA LIVE TABLES**

In [0]:
import dlt
from pyspark.sql.functions import col

@dlt.table
def sales_data():
    """Build the ``sales_data`` dataset from the Delta data at ``/delta/sales_data``.

    The six order columns are passed through as stored, and ``TotalAmount`` is
    recomputed as ``Quantity`` cast to int multiplied by ``Price`` cast to double.

    Returns:
        The DataFrame holding the pass-through columns plus ``TotalAmount``.
    """
    passthrough_names = ["OrderID", "OrderDate", "CustomerID", "Product", "Quantity", "Price"]
    total_amount = col("Quantity").cast("int") * col("Price").cast("double")

    # Read the Delta data written by the earlier Spark cell.
    source = spark.read.format("delta").load("/delta/sales_data")

    # Explicit select: any TotalAmount already stored is replaced by the
    # freshly computed expression rather than carried over.
    return source.select(
        *[col(name) for name in passthrough_names],
        total_amount.alias("TotalAmount"),
    )


Name,Type
OrderID,string
OrderDate,string
CustomerID,string
Product,string
Quantity,string
Price,string
TotalAmount,double
