# ETLの処理例
オブジェクトストレージ上のParquetファイル(Delta形式のテーブル)をCompute(Spark)で読み込んでデータ変換(集計)し、変換済みのデータをADWにロードするサンプルコードです。

In [None]:
# Load the wine prediction results from object storage.
# NOTE: the data is read as a Delta table (format("delta")); a Delta table
# keeps its data in Parquet files under the given path.
predictions_path = "/Volumes/object_storage/default/output/predictions"
wine_df = spark.read.format("delta").load(predictions_path)

# Preview the first 5 rows without truncating long column values.
wine_df.show(5, truncate=False)

+-----------+-------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+----------------+-------------------+------------------------------------------------+----------+
|class_label|alcohol|malic_acid|ash |alcalinity_of_ash|magnesium|total_phenols|flavanoids|nonflavanoid_phenols|proanthocyanins|color_intensity|hue |od280_od315_of_diluted_wines|proline|features        |rawPrediction      |probability                                     |prediction|
+-----------+-------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+----------------+-------------------+------------------------------------------------+----------+
|1          |13.05  |1.65      |2.55|18.0             |98       |2.45         |2.43      |0.29                |1.44           |4.25           |1.12|2.5

In [1]:
# Simple aggregation: for each class_label, compute the row count, the
# average alcohol content, and the average color intensity.
from pyspark.sql.functions import count, avg

# Aggregate expressions, each aliased to its output column name.
summary_columns = [
    count("*").alias("count"),
    avg("alcohol").alias("avg_alcohol"),
    avg("color_intensity").alias("avg_color_intensity"),
]

wine_average_df = (
    wine_df.groupBy("class_label")
    .agg(*summary_columns)
    .orderBy("class_label")
)

wine_average_df.show()

+-----------+-----+------------------+-------------------+
|class_label|count|       avg_alcohol|avg_color_intensity|
+-----------+-----+------------------+-------------------+
|          1|   14|13.769285714285713|               5.55|
|          2|    9|12.448888888888888| 2.9066666666666667|
|          3|    6|13.266666666666667|              6.555|
+-----------+-----+------------------+-------------------+



In [1]:
# Prerequisite: the ADW catalog must be created before running this cell.

# Load the aggregated result into ADW; "overwrite" mode replaces any
# existing contents of the target table.
target_table = "adw.ksonoda.wine_average"
overwrite_writer = wine_average_df.write.mode("overwrite")
overwrite_writer.saveAsTable(target_table)

In [1]:
%sql
-- Query the table that the previous cell loaded into ADW.
SELECT * FROM adw.ksonoda.wine_average