In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [2]:
df=spark.read.format("csv")\
.option("header", "true")\
.option("inferSchema", "true")\
.load("../../../data/SparkTheDefinitiveGuide/retail-data/all/*.csv")\
.coalesce(5)
df.cache()
df.createOrReplaceTempView("dfTable")
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [3]:
#  df.count()  : 541909
df.count() == 541909  # count is a method , more specificly here an action

True

In [4]:
from pyspark.sql.functions import count        # a function ( transformation)
df.select(count("StockCode")).show()

+----------------+
|count(StockCode)|
+----------------+
|          541909|
+----------------+



In [5]:
from pyspark.sql.functions import countDistinct
df.select(countDistinct("StockCode")).show()

+-------------------------+
|count(DISTINCT StockCode)|
+-------------------------+
|                     4070|
+-------------------------+



In [6]:
from pyspark.sql.functions import approx_count_distinct
df.select(approx_count_distinct("StockCode", 0.1)).show()

+--------------------------------+
|approx_count_distinct(StockCode)|
+--------------------------------+
|                            3364|
+--------------------------------+



In [13]:
from pyspark.sql.functions import expr,first, last,min, max,sum,avg,sumDistinct
df.select(first("StockCode"), last("StockCode")).show()

+-----------------------+----------------------+
|first(StockCode, false)|last(StockCode, false)|
+-----------------------+----------------------+
|                 85123A|                 22138|
+-----------------------+----------------------+



In [None]:
df.select(min("Quantity"), max("Quantity")).show()

In [None]:
df.select(sum("Quantity")).show() # 5176450

In [None]:
df.select(sumDistinct("Quantity")).show() # 29310

In [15]:
#Variance and Standard Deviation for population and sample 
from pyspark.sql.functions import var_pop, stddev_pop,var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"),
stddev_pop("Quantity"), stddev_samp("Quantity")).show()

+-----------------+------------------+--------------------+---------------------+
|var_pop(Quantity)|var_samp(Quantity)|stddev_pop(Quantity)|stddev_samp(Quantity)|
+-----------------+------------------+--------------------+---------------------+
|47559.30364660923| 47559.39140929892|  218.08095663447835|   218.08115785023455|
+-----------------+------------------+--------------------+---------------------+



In [16]:
from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()

+--------------------+------------------+
|  skewness(Quantity)|kurtosis(Quantity)|
+--------------------+------------------+
|-0.26407557610528376|119768.05495530753|
+--------------------+------------------+



In [17]:
from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
covar_pop("InvoiceNo", "Quantity")).show()

+-------------------------+-------------------------------+------------------------------+
|corr(InvoiceNo, Quantity)|covar_samp(InvoiceNo, Quantity)|covar_pop(InvoiceNo, Quantity)|
+-------------------------+-------------------------------+------------------------------+
|     4.912186085640497E-4|             1052.7280543915997|            1052.7260778754955|
+-------------------------+-------------------------------+------------------------------+



In [18]:
from pyspark.sql.functions import collect_set, collect_list
df.agg(collect_set("Country"), collect_list("Country")).show()

+--------------------+---------------------+
|collect_set(Country)|collect_list(Country)|
+--------------------+---------------------+
|[Portugal, Italy,...| [United Kingdom, ...|
+--------------------+---------------------+



##### Group By

In [22]:
df.groupBy("InvoiceNo", "CustomerId").count().show(2)

+---------+----------+-----+
|InvoiceNo|CustomerId|count|
+---------+----------+-----+
|   536846|     14573|   76|
|   537026|     12395|   12|
+---------+----------+-----+
only showing top 2 rows



In [24]:
from pyspark.sql.functions import count
df.groupBy("InvoiceNo").agg(count("Quantity").alias("quan"),expr("count(Quantity)")).show(2)

+---------+----+---------------+
|InvoiceNo|quan|count(Quantity)|
+---------+----+---------------+
|   536596|   6|              6|
|   536938|  14|             14|
+---------+----+---------------+
only showing top 2 rows



In [26]:
df.groupBy("InvoiceNo").agg(expr("avg(Quantity)"),expr("stddev_pop(Quantity)")).show(2)

+---------+------------------+--------------------+
|InvoiceNo|     avg(Quantity)|stddev_pop(Quantity)|
+---------+------------------+--------------------+
|   536596|               1.5|  1.1180339887498947|
|   536938|33.142857142857146|  20.698023172885524|
+---------+------------------+--------------------+
only showing top 2 rows



#### Window Functions

In [27]:
from pyspark.sql.functions import col, to_date
dfWithDate = df.withColumn("date", to_date(col("InvoiceDate"), "MM/d/yyyy H:mm"))
dfWithDate.createOrReplaceTempView("dfWithDate")

In [28]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc
windowSpec = Window\
.partitionBy("CustomerId", "date")\
.orderBy(desc("Quantity"))\
.rowsBetween(Window.unboundedPreceding, Window.currentRow)

In [29]:
from pyspark.sql.functions import max
maxPurchaseQuantity = max(col("Quantity")).over(windowSpec)

In [30]:
from pyspark.sql.functions import dense_rank, rank
purchaseDenseRank = dense_rank().over(windowSpec)
purchaseRank = rank().over(windowSpec)

In [32]:
from pyspark.sql.functions import col
dfWithDate.where("CustomerId IS NOT NULL").orderBy("CustomerId")\
.select(
col("CustomerId"),
col("date"),
col("Quantity"),
purchaseRank.alias("quantityRank"),
purchaseDenseRank.alias("quantityDenseRank"),
maxPurchaseQuantity.alias("maxPurchaseQuantity")).show(5)

+----------+----+--------+------------+-----------------+-------------------+
|CustomerId|date|Quantity|quantityRank|quantityDenseRank|maxPurchaseQuantity|
+----------+----+--------+------------+-----------------+-------------------+
|     12346|null|   74215|           1|                1|              74215|
|     12346|null|  -74215|           2|                2|              74215|
|     12347|null|     240|           1|                1|                240|
|     12347|null|      36|           2|                2|                240|
|     12347|null|      36|           2|                2|                240|
+----------+----+--------+------------+-----------------+-------------------+
only showing top 5 rows



####  Grouping Sets

In [73]:
dfNoNull = dfWithDate.na.drop(subset=["date"]) #dfWithDate.drop()  # Removing nulls 
dfNoNull.createOrReplaceTempView("dfNoNull")

In [74]:
# Grouping set only avaiable in SQL
# Normal way in SQL
spark.sql("SELECT CustomerId, stockCode, sum(Quantity) FROM dfNoNull \
          GROUP BY customerId, stockCode ORDER BY CustomerId DESC, stockCode DESC").show(2)

+----------+---------+-------------+
|CustomerId|stockCode|sum(Quantity)|
+----------+---------+-------------+
|     18287|    85173|           48|
|     18287|   85039B|           48|
+----------+---------+-------------+
only showing top 2 rows



In [75]:
# With grouping set, it helps when complex/ multiple level grouping is needed.
spark.sql("SELECT CustomerId, stockCode, sum(Quantity) FROM dfNoNull GROUP BY customerId, stockCode \
          GROUPING SETS((customerId, stockCode)) ORDER BY CustomerId DESC, stockCode DESC").show(2)

+----------+---------+-------------+
|customerId|stockCode|sum(Quantity)|
+----------+---------+-------------+
|     18287|    85173|           48|
|     18287|   85039B|           48|
+----------+---------+-------------+
only showing top 2 rows



In [76]:
# can add more sets 
spark.sql("SELECT CustomerId, stockCode, sum(Quantity) FROM dfNoNull GROUP BY customerId, stockCode \
GROUPING SETS((customerId, stockCode),()) ORDER BY CustomerId DESC, stockCode DESC ").show(2)

+----------+---------+-------------+
|customerId|stockCode|sum(Quantity)|
+----------+---------+-------------+
|     18287|    85173|           48|
|     18287|   85039B|           48|
+----------+---------+-------------+
only showing top 2 rows



####  Rollups

In [88]:
rolledUpDF = dfNoNull.rollup("Date", "Country").agg(sum("Quantity"))\
.selectExpr("date", "Country", "`sum(Quantity)` as total_quantity").orderBy("Date")
rolledUpDF.show(5)

+----------+---------+--------------+
|      date|  Country|total_quantity|
+----------+---------+--------------+
|      null|     null|       1879379|
|2010-12-01|Australia|           107|
|2010-12-01|     null|         26814|
|2010-12-01|  Germany|           117|
|2010-12-01|   Norway|          1852|
+----------+---------+--------------+
only showing top 5 rows



#### Cube

In [87]:
#The total across all dates and countries
#The total for each date across all countries
#The total for each country on each date
#The total for each country across all dates
from pyspark.sql.functions import sum
dfNoNull.cube("Date", "Country").agg(sum(col("Quantity"))).select("Date", "Country", "sum(Quantity)").orderBy("Date").show(5)

+----+--------------+-------------+
|Date|       Country|sum(Quantity)|
+----+--------------+-------------+
|null|           USA|          897|
|null|         Spain|         7906|
|null|       Denmark|         2995|
|null|        Norway|        11001|
|null|Czech Republic|          285|
+----+--------------+-------------+
only showing top 5 rows



#### Grouping Metadata

Grouping ID Description

3    This will appear for the highest-level aggregation, which will gives us the total quantity regardless of customerId
and stockCode.

2      This will appear for all aggregations of individual stock codes. This gives us the total quantity per stock code,
regardless of customer.

1      This will give us the total quantity on a per-customer basis, regardless of item purchased.

0      This will give us the total quantity for individual customerId and stockCode combinations.