In [9]:
import glob
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import *

In [5]:
spark = SparkSession.builder\
                .appName("SparkLearning_Charper_5")\
                .getOrCreate()
                    
# sc = SparkContext()

In [6]:
df = spark.read.format("csv")\
                .option("header", "true")\
                .option("inferSchema", "true")\
                .load("Spark-The-Definitive-Guide/data/retail-data/by-day/*.csv")\
                .coalesce(5)
df.cache()
df.createOrReplaceTempView("dfTable")

In [10]:
# // count inside a expression
df.select(count("StockCode")).collect()

[Row(count(StockCode)=541909)]

In [11]:
df.select(
count("Quantity").alias("total_transactions"),
sum("Quantity").alias("total_purchases"),
avg("Quantity").alias("avg_purchases"),
expr("mean(Quantity)").alias("mean_purchases"))\
.selectExpr(
"total_purchases/total_transactions",
"avg_purchases",
"mean_purchases")\
.collect()

[Row((total_purchases / total_transactions)=9.55224954743324, avg_purchases=9.55224954743324, mean_purchases=9.55224954743324)]

In [12]:
# Variance and Standard deviation
df.select(
var_pop("Quantity"),
var_samp("Quantity"),
stddev_pop("Quantity"),
stddev_samp("Quantity"))\
.collect()

[Row(var_pop(Quantity)=47559.30364660885, var_samp(Quantity)=47559.39140929855, stddev_pop(Quantity)=218.0809566344775, stddev_samp(Quantity)=218.0811578502337)]

In [13]:
df.select(
skewness("Quantity"),
kurtosis("Quantity"))\
.collect()

[Row(skewness(Quantity)=-0.264075576105286, kurtosis(Quantity)=119768.05495534562)]

In [14]:
df.select(
corr("InvoiceNo", "Quantity"),
covar_samp("InvoiceNo", "Quantity"),
covar_pop("InvoiceNo", "Quantity"))\
.show()

+-------------------------+-------------------------------+------------------------------+
|corr(InvoiceNo, Quantity)|covar_samp(InvoiceNo, Quantity)|covar_pop(InvoiceNo, Quantity)|
+-------------------------+-------------------------------+------------------------------+
|     4.912186085617365E-4|             1052.7280543863135|            1052.7260778702093|
+-------------------------+-------------------------------+------------------------------+



## Aggregation to complex data types

In [15]:
df.agg(
collect_set("Country"),
collect_list("Country"))\
.show()

+--------------------+---------------------+
|collect_set(Country)|collect_list(Country)|
+--------------------+---------------------+
|[Portugal, Italy,...| [United Kingdom, ...|
+--------------------+---------------------+



In [16]:
df.groupBy("invoiceNo").count().show()

+---------+-----+
|invoiceNo|count|
+---------+-----+
|   574966|    8|
|   575091|   38|
|   578057|   28|
|   537252|    1|
|   578459|    8|
|  C578132|    1|
|   578292|   72|
|   576112|   20|
|   577022|   38|
|   574592|    8|
|  C576393|    2|
|   577511|   46|
|   577541|   21|
|   580739|    2|
|   580906|    4|
|   573726|    1|
|   575671|   20|
|   570264|    1|
|   570281|    3|
|   569823|   69|
+---------+-----+
only showing top 20 rows



In [17]:
df.groupBy("InvoiceNo")\
.agg(
count("Quantity").alias("quan"),
expr("count(Quantity)"))\
.show()

+---------+----+---------------+
|InvoiceNo|quan|count(Quantity)|
+---------+----+---------------+
|   574966|   8|              8|
|   575091|  38|             38|
|   578057|  28|             28|
|   537252|   1|              1|
|   578459|   8|              8|
|  C578132|   1|              1|
|   578292|  72|             72|
|   576112|  20|             20|
|   577022|  38|             38|
|   574592|   8|              8|
|  C576393|   2|              2|
|   577511|  46|             46|
|   577541|  21|             21|
|   580739|   2|              2|
|   580906|   4|              4|
|   573726|   1|              1|
|   575671|  20|             20|
|   570264|   1|              1|
|   570281|   3|              3|
|   569823|  69|             69|
+---------+----+---------------+
only showing top 20 rows



In [19]:
df.groupBy("InvoiceNo")\
.agg(
"Quantity" ->"avg",
"Quantity" -> "stddev_pop")\
.show()

SyntaxError: invalid syntax (<ipython-input-19-a3c07ad708d5>, line 2)

In [29]:
dfWithDate = df.withColumn("date", col("InvoiceDate").cast("date"))
dfWithDate.createOrReplaceTempView("dfWithDate")

In [30]:
from pyspark.sql.window import Window
# from pyspark.sql.functions import desc
windowSpec = Window\
.partitionBy("CustomerId", "date")\
.orderBy(desc("Quantity"))\
.rowsBetween(Window.unboundedPreceding, Window.currentRow)

In [31]:
maxPurchaseQuantity = max(col("Quantity"))\
.over(windowSpec)


In [32]:
purchaseDenseRank = dense_rank()\
.over(windowSpec)
purchaseRank = rank()\
.over(windowSpec)

In [33]:
dfWithDate\
.where("CustomerId IS NOT NULL")\
.orderBy("CustomerId")\
.select(
col("CustomerId"),
col("date"),
col("Quantity"),
purchaseRank.alias("quantityRank"),
purchaseDenseRank.alias("quantityDenseRank"),
maxPurchaseQuantity.alias("maxPurchaseQuantity"))\
.show()

+----------+----------+--------+------------+-----------------+-------------------+
|CustomerId|      date|Quantity|quantityRank|quantityDenseRank|maxPurchaseQuantity|
+----------+----------+--------+------------+-----------------+-------------------+
|   12346.0|2011-01-18|   74215|           1|                1|              74215|
|   12346.0|2011-01-18|  -74215|           2|                2|              74215|
|   12347.0|2010-12-07|      36|           1|                1|                 36|
|   12347.0|2010-12-07|      30|           2|                2|                 36|
|   12347.0|2010-12-07|      24|           3|                3|                 36|
|   12347.0|2010-12-07|      12|           4|                4|                 36|
|   12347.0|2010-12-07|      12|           4|                4|                 36|
|   12347.0|2010-12-07|      12|           4|                4|                 36|
|   12347.0|2010-12-07|      12|           4|                4|             

In [38]:
# rollups
rolledUpDF = dfWithDate.rollup("Date", "Country")\
.agg(sum("Quantity"))\
.selectExpr("Date", "Country", "`sum(Quantity)` as total_quantity")\
.orderBy("Date")
rolledUpDF.show(20)

+----------+--------------+--------------+
|      Date|       Country|total_quantity|
+----------+--------------+--------------+
|      null|          null|       5176450|
|2010-12-01|United Kingdom|         23949|
|2010-12-01|          EIRE|           243|
|2010-12-01|        France|           449|
|2010-12-01|     Australia|           107|
|2010-12-01|        Norway|          1852|
|2010-12-01|       Germany|           117|
|2010-12-01|          null|         26814|
|2010-12-01|   Netherlands|            97|
|2010-12-02|          EIRE|             4|
|2010-12-02|          null|         21023|
|2010-12-02|       Germany|           146|
|2010-12-02|United Kingdom|         20873|
|2010-12-03|        Poland|           140|
|2010-12-03|         Spain|           400|
|2010-12-03|       Germany|           170|
|2010-12-03|       Belgium|           528|
|2010-12-03|        France|           239|
|2010-12-03|      Portugal|            65|
|2010-12-03|         Italy|           164|
+----------

In [40]:
dfWithDate.cube("Date", "Country")\
.agg(sum(col("Quantity")))\
.selectExpr("Date", "Country", "`sum(Quantity)` as total_quantity")\
.orderBy("Date")\
.show(20)

+----+--------------------+--------------+
|Date|             Country|total_quantity|
+----+--------------------+--------------+
|null|               Japan|         25218|
|null|         Unspecified|          3300|
|null|           Australia|         83653|
|null|           Singapore|          5234|
|null|                 RSA|           352|
|null|             Germany|        117448|
|null|            Portugal|         16180|
|null|     Channel Islands|          9479|
|null|             Lebanon|           386|
|null|                null|       5176450|
|null|               Spain|         26824|
|null|           Hong Kong|          4769|
|null|              Cyprus|          6317|
|null|United Arab Emirates|           982|
|null|             Denmark|          8188|
|null|              Norway|         19247|
|null|  European Community|           497|
|null|      Czech Republic|           592|
|null|                 USA|          1034|
|null|             Finland|         10666|
+----+-----

In [36]:
pivoted = dfWithDate\
.groupBy("date")\
.pivot("Country")\
.agg({"quantity":"sum"})

In [37]:
pivoted.where("date > '2011-12-05'").select("USA").show()

+----+
| USA|
+----+
|null|
|null|
|-196|
|null|
+----+

