In [0]:
# Retail purchase-history data.
# Collapse the loaded data into a small number of partitions (5), then mark it for caching.
# NOTE(review): `spark` is not defined in this notebook; presumably it is the
# session object injected by the Databricks runtime — confirm.

df = (
    spark.read.format('csv')
    .option('header', 'true')       # first line of each file holds column names
    .option('inferSchema', 'true')  # let Spark infer the column types
    .load('/databricks-datasets/definitive-guide/data/retail-data/all/*.csv')
    .coalesce(5)
)

df.cache()
# Register the DataFrame under the view name "dfTable" so SQL can refer to it.
df.createOrReplaceTempView("dfTable")

In [0]:
# DataFrame.count() is an action, not a transformation.
# Apart from reporting the number of rows, it is also used to force the
# cached DataFrame to actually be loaded into memory.
df.count() == 541_909

Out[2]: True

In [0]:
# count used as a lazily evaluated aggregate expression.
# count(*) counts every row, including rows that contain nulls;
# count(column) does not count rows where that column is null.
from pyspark.sql.functions import count

df.select(
    count("StockCode"),
).show()

+----------------+
|count(StockCode)|
+----------------+
|          541909|
+----------------+



In [0]:
# countDistinct: number of unique values in a column.
# NOTE(review): newer PySpark releases also expose this as `count_distinct`;
# confirm the cluster's Spark version before switching spellings.
from pyspark.sql.functions import countDistinct

df.select(
    countDistinct("StockCode"),
).show()

+-------------------------+
|count(DISTINCT StockCode)|
+-------------------------+
|                     4070|
+-------------------------+



In [0]:
# approx_count_distinct: approximate number of unique values in a column.
# The second argument (rsd) is the maximum allowed relative standard deviation
# of the estimate. It is optional in the PySpark API; 0.1 is passed here.
# In the recorded output this returned 3364 against an exact distinct count of 4070.

from pyspark.sql.functions import approx_count_distinct

df.select(
    approx_count_distinct("StockCode", rsd=0.1),
).show()

+--------------------------------+
|approx_count_distinct(StockCode)|
+--------------------------------+
|                            3364|
+--------------------------------+



In [0]:
# first, last: the first and the last value of a column in the DataFrame.
# NOTE(review): which rows count as "first" and "last" depends on row order,
# which Spark does not guarantee after a shuffle — treat the result as illustrative.

from pyspark.sql.functions import first, last

df.select(
    first("StockCode"),
    last("StockCode"),
).show()

+----------------+---------------+
|first(StockCode)|last(StockCode)|
+----------------+---------------+
|          85123A|          22138|
+----------------+---------------+



In [0]:
# min, max: smallest and largest value of a column.
# Caution: this import rebinds the names `min` and `max` in the notebook
# namespace, so Python's built-in min()/max() are shadowed from here on.

from pyspark.sql.functions import max, min

df.select(
    min("Quantity"),
    max("Quantity"),
).show()

+-------------+-------------+
|min(Quantity)|max(Quantity)|
+-------------+-------------+
|       -80995|        80995|
+-------------+-------------+



In [0]:
# sum: total of all values in a column.
# Caution: this import rebinds the name `sum` in the notebook namespace,
# so Python's built-in sum() is shadowed from here on.

from pyspark.sql.functions import sum

df.select(
    sum("Quantity"),
).show()

+-------------+
|sum(Quantity)|
+-------------+
|      5176450|
+-------------+



In [0]:
# sumDistinct: total of the distinct values in a column.
# NOTE(review): `sumDistinct` is deprecated in newer PySpark releases in favor
# of `sum_distinct`; confirm the cluster's Spark version before switching.

from pyspark.sql.functions import sumDistinct

df.select(
    sumDistinct("Quantity"),
).show()

+----------------------+
|sum(DISTINCT Quantity)|
+----------------------+
|                 29310|
+----------------------+



In [0]:
# avg, mean: three ways of computing the average of a column —
# sum divided by count, the avg() function, and the SQL mean() expression.
# Caution: this import rebinds `sum`, shadowing Python's built-in sum().
from pyspark.sql.functions import avg, count, expr, sum

df.select(
    count("Quantity").alias("count"),
    sum("Quantity").alias("sum"),
    avg("Quantity").alias("avg"),
    expr("mean(Quantity)").alias("mean"),
).selectExpr("sum/count", "avg", "mean").show()

+----------------+----------------+----------------+
|   (sum / count)|             avg|            mean|
+----------------+----------------+----------------+
|9.55224954743324|9.55224954743324|9.55224954743324|
+----------------+----------------+----------------+

