## groupBy agg

- 원하는 걸로 묶는 것은 groupBy

- groupBy만 실행하면 스파크는 아무 일도 하지 않는다. 그냥 대기 상태(GroupedData 객체)로 있다

- 대기중인 그룹들을 실제로 계산하는 것은 agg의 역할

- 계산된 결과를 다시 데이터프레임으로 변환해 준다

- 스파크는 연산(계산)하는 순간 컬럼 이름이 수식 그대로 바뀐다. 집계 함수도 컬럼명이 바뀌니 꼭 alias를 사용하자!


In [1]:
from pyspark.sql import (
    Row,
    SparkSession)
import pyspark.sql.functions as F

In [2]:
# Build (or reuse) the SparkSession for this notebook, registered under the
# app name "groupby_study" and pointed at the master URL below.
session_builder = SparkSession.builder.appName("groupby_study")
session_builder = session_builder.master("spark://spark-master:7077")
spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/29 10:43:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the food-order CSV; the first line is treated as the header and the
# column types are inferred from the data instead of defaulting to strings.
orders_csv_uri = "file:///workspace/data/orders_food.csv"
csv_reader = spark.read
df = csv_reader.csv(orders_csv_uri, header=True, inferSchema=True)

# Preview the rows first, then print the inferred schema.
df.show()
df.printSchema()

                                                                                

+--------+--------+--------+--------+-----+------+---------+
|order_id|customer|category|quantity|price|rating|   status|
+--------+--------+--------+--------+-----+------+---------+
|       1|     Kim| Chicken|       2|18000|   4.5|completed|
|       2|     Lee|   Pizza|       1|22000|   4.0|completed|
|       3|    Park| Chicken|       3|27000|   5.0|completed|
|       4|    Choi|  Burger|       2|14000|   3.5| canceled|
|       5|    Jung|   Pizza|       2|44000|   4.8|completed|
|       6|     Han|  Burger|       1| 7000|   4.2|completed|
|       7|     Seo| Chicken|       1| 9000|  NULL|completed|
|       8|    Yoon|   Pizza|       3|66000|   4.6|completed|
|       9|    Kang|  Burger|       2|14000|   3.8|completed|
|      10|     Lim| Chicken|       1| 9000|   4.1| canceled|
+--------+--------+--------+--------+-----+------+---------+

root
 |-- order_id: integer (nullable = true)
 |-- customer: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: inte

In [4]:
# group by

In [9]:
# groupBy on its own only yields a GroupedData handle; no aggregation has
# been requested yet. Calling df.groupBy("category").show() is an error.
grouped_by_category = df.groupBy("category")
grouped_by_category

GroupedData[grouping expressions: [category], value: [order_id: int, customer: string ... 5 more fields], type: GroupBy]

In [13]:
# No alias here, so the result column is named after the expression itself:
# sum(quantity).
quantity_sum = F.sum("quantity")
df.groupBy("category").agg(quantity_sum).show()

+--------+-------------+
|category|sum(quantity)|
+--------+-------------+
|  Burger|            5|
| Chicken|            7|
|   Pizza|            6|
+--------+-------------+



In [14]:
# Same per-category sum, but alias() gives the result column a readable name.
total_qty = F.sum("quantity").alias("total_qty")
df.groupBy("category").agg(total_qty).show()

+--------+---------+
|category|total_qty|
+--------+---------+
|  Burger|        5|
| Chicken|        7|
|   Pizza|        6|
+--------+---------+



In [15]:
# 여러 집계 한번에 하기

In [16]:
# Several aggregates per category in a single agg() call, each with its own
# alias so the output columns get readable names.
category_summary = [
    F.count("*").alias("order_count"),
    F.sum("quantity").alias("total_quantity"),
    F.sum("price").alias("total_sales"),
    F.avg("rating").alias("avg_rating"),
    F.max("price").alias("max_price"),
]
df.groupBy("category").agg(*category_summary).show()

+--------+-----------+--------------+-----------+------------------+---------+
|category|order_count|total_quantity|total_sales|        avg_rating|max_price|
+--------+-----------+--------------+-----------+------------------+---------+
|  Burger|          3|             5|      35000|3.8333333333333335|    14000|
| Chicken|          4|             7|      63000| 4.533333333333333|    27000|
|   Pizza|          3|             6|     132000| 4.466666666666667|    66000|
+--------+-----------+--------------+-----------+------------------+---------+



In [17]:
# 취소 주문 제외하고 집계

In [19]:
# Keep only rows whose status is "completed", then aggregate per category.
completed_orders = df.filter(F.col("status") == "completed")
completed_by_category = completed_orders.groupBy("category").agg(
    F.count("*").alias("completed_orders"),
    F.sum("price").alias("completed_sales"),
    # Round the average rating to two decimals for display.
    F.round(F.avg("rating"), 2).alias("avg_rating"),
)
completed_by_category.show()

+--------+----------------+---------------+----------+
|category|completed_orders|completed_sales|avg_rating|
+--------+----------------+---------------+----------+
|  Burger|               2|          21000|       4.0|
| Chicken|               3|          54000|      4.75|
|   Pizza|               3|         132000|      4.47|
+--------+----------------+---------------+----------+



In [22]:
# groupBy() 다중 컬럼
# groupBy(A, B)는 A 값과 B 값이 모두 같은 행끼리 묶으라는 뜻
# 다중 컬럼일 때는 결과가 뒤죽박죽 나오기 쉬우므로 orderBy를 붙여 줘야 깔끔하다

In [24]:
# Count the orders for each (category, status) combination, i.e. rows that
# share both the same category and the same order status.
order_count = F.count("*").alias("order_count")
df.groupBy("category", "status").agg(order_count).show()

+--------+---------+-----------+
|category|   status|order_count|
+--------+---------+-----------+
|  Burger|completed|          2|
| Chicken|completed|          3|
|  Burger| canceled|          1|
|   Pizza|completed|          3|
| Chicken| canceled|          1|
+--------+---------+-----------+



In [25]:
# Same count per (category, status), sorted by both grouping keys so the
# listing is easy to read.
status_counts = df.groupBy("category", "status").agg(
    F.count("*").alias("order_count")
)
status_counts.orderBy("category", "status").show()

+--------+---------+-----------+
|category|   status|order_count|
+--------+---------+-----------+
|  Burger| canceled|          1|
|  Burger|completed|          2|
| Chicken| canceled|          1|
| Chicken|completed|          3|
|   Pizza|completed|          3|
+--------+---------+-----------+



In [28]:
# 옛날 방식: 딕셔너리 방식
# alias를 붙일 수 없다 (결과 컬럼명이 avg(quantity)처럼 수식 그대로 나온다)

In [27]:
# Dict form of agg(): {column name: aggregate function name}. There is no
# place to attach alias(), so the result columns keep generated names.
dict_aggs = {"price": "sum", "quantity": "avg"}
df.groupBy("category").agg(dict_aggs).show()

+--------+------------------+----------+
|category|     avg(quantity)|sum(price)|
+--------+------------------+----------+
|  Burger|1.6666666666666667|     35000|
| Chicken|              1.75|     63000|
|   Pizza|               2.0|    132000|
+--------+------------------+----------+



In [29]:
# Stop the SparkSession now that the notebook is finished.
spark.stop()