In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the notebook's Spark session in local mode ("local[*]") under the
# application name "agg"; getOrCreate() hands back an existing session if any.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("agg")
    .getOrCreate()
)

In [8]:
# Load every retail CSV matched by the glob, treating the first line as the
# header and letting Spark infer column types, then coalesce to 5 partitions.
df = spark.read.csv(
    "../data/retail-data/all/*.csv",
    header=True,
    inferSchema=True,
).coalesce(5)

# Mark the frame for caching; it is reused by every aggregation cell below.
# Left as the last expression so the cell displays the DataFrame schema repr.
df.cache()


DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: int, Country: string]

In [9]:
df.count()

541909

In [10]:
# NOTE: this wildcard import rebinds the Python built-ins `min`, `max` and `sum`
# to their Spark column-function namesakes; the aggregation cells below rely on
# that, so plain-Python uses of those names after this cell get the Spark versions.
from pyspark.sql.functions import *

In [12]:
# Whole-frame aggregate: how many StockCode entries there are.
(
    df
    .select(count(col("StockCode")).alias("count_stock_code"))
    .show()
)

+----------------+
|count_stock_code|
+----------------+
|          541909|
+----------------+



In [18]:
# Whole-frame aggregate: how many different StockCode values there are.
(
    df
    .select(count_distinct(col("StockCode")).alias("count_stock_code"))
    .show()
)

+----------------+
|count_stock_code|
+----------------+
|            4070|
+----------------+



In [19]:
# Approximate version of the distinct count above; 0.03 is the `rsd` argument
# (the relative standard deviation tolerated in the estimate).
(
    df
    .select(
        approx_count_distinct(col("StockCode"), rsd=0.03).alias("count_stock_code")
    )
    .show()
)

+----------------+
|count_stock_code|
+----------------+
|            4068|
+----------------+



In [21]:
# First and last StockCode values encountered by the aggregate.
# NOTE(review): no ordering is applied before this select, so which rows count
# as "first"/"last" presumably depends on read/partition order — confirm before
# relying on the exact values.
(
    df
    .select(
        first(col("StockCode")),
        last(col("StockCode")),
    )
    .show()
)

+----------------+---------------+
|first(StockCode)|last(StockCode)|
+----------------+---------------+
|          85123A|          22138|
+----------------+---------------+



In [22]:
# Smallest and largest Quantity in the frame. `min`/`max` here are the Spark
# aggregate functions brought in by the wildcard import, not the built-ins.
(
    df
    .select(
        min(col("Quantity")),
        max(col("Quantity")),
    )
    .show()
)

+-------------+-------------+
|min(Quantity)|max(Quantity)|
+-------------+-------------+
|       -80995|        80995|
+-------------+-------------+



In [23]:
# Total of Quantity over all rows (Spark's `sum`, not the Python built-in).
(
    df
    .select(sum(col("Quantity")))
    .show()
)

+-------------+
|sum(Quantity)|
+-------------+
|      5176450|
+-------------+



In [25]:
# Sum of the distinct Quantity values only: each value contributes once.
(
    df
    .select(sum_distinct(col("Quantity")))
    .show()
)

+----------------------+
|sum(DISTINCT Quantity)|
+----------------------+
|                 29310|
+----------------------+



In [28]:
# Three routes to the same average Quantity, shown side by side:
#   1. a manual ratio of sum(Quantity) to count(Quantity),
#   2. the avg() column function,
#   3. the SQL mean() function evaluated through expr().
(
    df
    .select(
        count(col("Quantity")).alias("total_transactions"),
        sum(col("Quantity")).alias("total_purchases"),
        avg(col("Quantity")).alias("avg_purchases"),
        expr("mean(Quantity)").alias("mean_purchases"),
    )
    # Second projection: derive the ratio from the two totals and carry the
    # two pre-computed averages through unchanged.
    .select(
        (col("total_purchases") / col("total_transactions")).alias(
            "ratio_purchases_transactions"
        ),
        col("avg_purchases"),
        col("mean_purchases"),
    )
    .show()
)

+----------------------------+----------------+----------------+
|ratio_purchases_transactions|   avg_purchases|  mean_purchases|
+----------------------------+----------------+----------------+
|            9.55224954743324|9.55224954743324|9.55224954743324|
+----------------------------+----------------+----------------+



In [29]:
# Peek at the first five rows to recall the column layout.
df.show(n=5)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
only showing top 5 rows



In [33]:
# Gather the Country column into two arrays on a single output row:
# collect_set keeps each country once, collect_list keeps every occurrence.
(
    df
    .agg(
        collect_set(col("Country")),
        collect_list(col("Country")),
    )
    .show()
)

+--------------------+---------------------+
|collect_set(Country)|collect_list(Country)|
+--------------------+---------------------+
|[Portugal, Italy,...| [United Kingdom, ...|
+--------------------+---------------------+

