In [10]:
from delta import configure_spark_with_delta_pip
from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [17]:
# Target the standalone cluster master and name the application.
# NOTE(review): the app name says "raw" although this notebook writes the
# curated table — confirm whether the name is intentional.
builder = SparkSession.builder.master("spark://spark-master:7077").appName("rawApp")

# Enable Delta Lake: register its SQL extension and make its catalog the
# implementation behind `spark_catalog`.
builder = builder.config(
    "spark.sql.extensions",
    "io.delta.sql.DeltaSparkSessionExtension",
).config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

# The delta helper wraps the builder before the session is created (or an
# already-running one is reused).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [18]:
# Load the cleaned iris dataset from its Delta table.
df = (
    spark.read
    .format('delta')
    .load('/opt/workspace/data/cleaned/iris/')
)

In [19]:
# Average every measurement per species, then expose each average as a
# decimal(12, 2) column under its curated name.
#
# The dict form of `agg` names its results `avg(<column>)`, which is why the
# select below refers to columns such as `avg(PetalWidthCm)`.
#
# NOTE(review): `widhcm` in two output names looks like a typo for `widthcm`;
# it is kept as-is because these names are the schema written downstream.
df = (
    df
    .groupBy('Species')
    .agg({
        'SepalLengthCm': 'avg',
        'SepalWidthCm': 'avg',
        'PetalLengthCm': 'avg',
        'PetalWidthCm': 'avg',
    })
    .select(
        'Species',
        col('avg(PetalWidthCm)').cast('decimal(12, 2)').alias('avg_petal_widhcm'),
        col('avg(SepalWidthCm)').cast('decimal(12, 2)').alias('avg_sepal_widhcm'),
        col('avg(PetalLengthCm)').cast('decimal(12, 2)').alias('avg_petal_lengthcm'),
        col('avg(SepalLengthCm)').cast('decimal(12, 2)').alias('avg_sepal_lengthcm'),
    )
)

In [20]:
# Print the aggregated frame as a text table to eyeball the per-species
# averages before they are written out.
df.show()

+---------------+----------------+----------------+------------------+------------------+
|        Species|avg_petal_widhcm|avg_sepal_widhcm|avg_petal_lengthcm|avg_sepal_lengthcm|
+---------------+----------------+----------------+------------------+------------------+
| Iris-virginica|            2.03|            2.97|              5.55|              6.59|
|    Iris-setosa|            0.24|            3.42|              1.46|              5.01|
|Iris-versicolor|            1.33|            2.77|              4.26|              5.94|
+---------------+----------------+----------------+------------------+------------------+



In [22]:
# Persist the aggregated frame as the curated iris Delta table, replacing any
# existing contents; `mergeSchema` is enabled for the write.
(
    df.write
    .format('delta')
    .mode('overwrite')
    .option("mergeSchema", "true")
    .save('/opt/workspace/data/curated/iris/')
)

In [23]:
# Shut down the Spark session now that the curated table has been written.
spark.stop()