In [20]:
from delta import configure_spark_with_delta_pip
from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [21]:
# Describe the session: application name, cluster master URL, and the two
# settings that register Delta's SQL extension and Delta's catalog
# implementation as `spark_catalog`.
builder = (
    SparkSession.builder
    .appName("curatedApp")
    .master("spark://spark-master:7077")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# The builder is passed through delta's pip helper before the session is
# created (or an already-running one is reused).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [22]:
# Load the cleaned iris Delta table that feeds the curated aggregation.
df = spark.read.load('/opt/workspace/data/cleaned/iris/', format='delta')

In [23]:
df = (
    df
    .groupBy('Species')
    .agg({
        'SepalLengthCm': 'sum', 
        'SepalWidthCm': 'sum', 
        'PetalLengthCm': 'sum', 
        'PetalWidthCm': 'sum'}
    ).withColumn('sum_petal_widhcm', col('sum(PetalWidthCm)').cast('decimal(12, 2)'))
    .withColumn('sum_sepal_widhcm', col('sum(SepalWidthCm)').cast('decimal(12, 2)'))
    .withColumn('sum_petal_lengthcm', col('sum(PetalLengthCm)').cast('decimal(12, 2)'))
    .withColumn('sum_sepal_lengthcm', col('sum(SepalLengthCm)').cast('decimal(12, 2)'))
    .select(
        'sum_petal_widhcm',
        'sum_sepal_widhcm',
        'sum_petal_lengthcm',
        'sum_sepal_lengthcm'
    )
)

In [24]:
df.write.mode('overwrite').format('delta').save('/opt/workspace/data/curated/iris/')

In [25]:
# Shut down the Spark session created earlier in this notebook.
spark.stop()