In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

In [2]:
spark = SparkSession.builder \
    .appName('Six Spark Exercises to Rule Them All') \
    .getOrCreate()

23/01/16 16:08:16 WARN Utils: Your hostname, karlos-300E5M-300E5L resolves to a loopback address: 127.0.1.1; using 10.0.0.89 instead (on interface wlp2s0)
23/01/16 16:08:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/16 16:08:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Import dataset

In [40]:
products = spark.read.parquet('./data/products_parquet/') \
    .withColumn('product_id', F.col('product_id').cast('int')) \
    .withColumn('price', F.col('price').cast('INT'))

products.createOrReplaceTempView('products')

print(products)
products.show(truncate=False)

DataFrame[product_id: int, product_name: string, price: int]
+----------+------------+-----+
|product_id|product_name|price|
+----------+------------+-----+
|0         |product_0   |22   |
|1         |product_1   |30   |
|2         |product_2   |91   |
|3         |product_3   |37   |
|4         |product_4   |145  |
|5         |product_5   |128  |
|6         |product_6   |66   |
|7         |product_7   |145  |
|8         |product_8   |51   |
|9         |product_9   |44   |
|10        |product_10  |53   |
|11        |product_11  |13   |
|12        |product_12  |104  |
|13        |product_13  |102  |
|14        |product_14  |24   |
|15        |product_15  |14   |
|16        |product_16  |38   |
|17        |product_17  |72   |
|18        |product_18  |16   |
|19        |product_19  |46   |
+----------+------------+-----+
only showing top 20 rows



In [57]:
sales = spark.read.parquet('./data/sales_parquet') \
    .withColumn('order_id', F.col('order_id').cast('int')) \
    .withColumn('product_id', F.col('product_id').cast('int')) \
    .withColumn('seller_id', F.col('seller_id').cast('int')) \
    .withColumn('date', F.col('date').cast('date')) \
    .withColumn('num_pieces_sold', F.col('num_pieces_sold').cast('int')) \

sales.createOrReplaceTempView('sales')

print(sales)
sales.show()

DataFrame[order_id: int, product_id: int, seller_id: int, date: date, num_pieces_sold: int, bill_raw_text: string]
+--------+----------+---------+----------+---------------+--------------------+
|order_id|product_id|seller_id|      date|num_pieces_sold|       bill_raw_text|
+--------+----------+---------+----------+---------------+--------------------+
|       1|         0|        0|2020-07-10|             26|kyeibuumwlyhuwksx...|
|       2|         0|        0|2020-07-08|             13|jfyuoyfkeyqkckwbu...|
|       3|         0|        0|2020-07-05|             38|uyjihlzhzcswxcccx...|
|       4|         0|        0|2020-07-05|             56|umnxvoqbdzpbwjqmz...|
|       5|         0|        0|2020-07-05|             11|zmqexmaawmvdpqhih...|
|       6|         0|        0|2020-07-01|             82|lmuhhkpyuoyslwmvX...|
|       7|         0|        0|2020-07-04|             15|zoqweontumefxbgvu...|
|       8|         0|        0|2020-07-08|             79|sgldfgtcxufasnvsc...|
|    

In [39]:
sellers = spark.read.parquet('./data/sellers_parquet') \
    .withColumn('seller_id', F.col('seller_id').cast('int')) \
    .withColumn('daily_target', F.col('daily_target').cast('int'))

sellers.createOrReplaceTempView('sellers')

print(sellers)
sellers.show()

DataFrame[seller_id: int, seller_name: string, daily_target: int]
+---------+-----------+------------+
|seller_id|seller_name|daily_target|
+---------+-----------+------------+
|        0|   seller_0|     2500000|
|        1|   seller_1|      257237|
|        2|   seller_2|      754188|
|        3|   seller_3|      310462|
|        4|   seller_4|     1532808|
|        5|   seller_5|     1199693|
|        6|   seller_6|     1055915|
|        7|   seller_7|     1946998|
|        8|   seller_8|      547320|
|        9|   seller_9|     1318051|
+---------+-----------+------------+



### Warm-up exercises

#### Warm-up #1

Find out how many orders, how many products and how many sellers are in the data.

How many products have been sold at least once? Which is the product contained in more orders?

##### PySpark version

In [50]:
print(f'Number of products: {products.count():_}')
print(f'Number of sales: {sales.count():_}')
print(f'Number of sellers: {sellers.count():_}')

Number of products: 75_000_000


                                                                                

Number of sales: 20_000_040
Number of sellers: 10


In [62]:
sales.selectExpr('COUNT(DISTINCT product_id) AS products_sold').show()



+-------------+
|products_sold|
+-------------+
|       993429|
+-------------+



                                                                                

In [61]:
# Alternative version

sales.select(F.count_distinct('product_id').alias('products_sold')).show()



+-------------+
|products_sold|
+-------------+
|       993429|
+-------------+



                                                                                

In [75]:
sales.groupBy('product_id') \
    .agg(F.count('*').alias('number_of_sales')) \
    .orderBy('number_of_sales', ascending=False) \
    .limit(1) \
    .show()



+----------+---------------+
|product_id|number_of_sales|
+----------+---------------+
|         0|       19000000|
+----------+---------------+



                                                                                

##### Spark SQL version

In [None]:
spark.sql(
    """
    SELECT COUNT(*) AS number_of_products
    FROM products
    """
).show()

In [None]:
spark.sql(
    """
    SELECT COUNT(*) AS number_of_sales
    FROM sales
    """
).show()

In [None]:
spark.sql(
    """
    SELECT COUNT(*) AS number_of_sellers
    FROM sellers
    """
).show()

In [59]:
spark.sql(
    """
    SELECT COUNT(DISTINCT sales.product_id) AS products_sold
    FROM sales
    """
).show()



+-------------+
|products_sold|
+-------------+
|       993429|
+-------------+



                                                                                

In [76]:
spark.sql(
    """
    SELECT sales.product_id, COUNT(*) number_of_sales
    FROM sales
    GROUP BY sales.product_id
    ORDER BY number_of_sales DESC
    LIMIT 1
    """
).show(truncate=False)



+----------+---------------+
|product_id|number_of_sales|
+----------+---------------+
|0         |19000000       |
+----------+---------------+



                                                                                

#### Warm-up #2

How many distinct products have been sold in each day?

##### PySpark version

In [78]:
sales.groupBy('date') \
    .agg(F.count('*').alias('number_of_distinct_products_sold')) \
    .orderBy('date') \
    .show()



+----------+--------------------------------+
|      date|number_of_distinct_products_sold|
+----------+--------------------------------+
|2020-07-01|                         2001370|
|2020-07-02|                         1998985|
|2020-07-03|                         2000651|
|2020-07-04|                         2000949|
|2020-07-05|                         1996618|
|2020-07-06|                         2001302|
|2020-07-07|                         2000807|
|2020-07-08|                         2000451|
|2020-07-09|                         2000402|
|2020-07-10|                         1998505|
+----------+--------------------------------+



                                                                                

##### Spark SQL version

In [79]:
spark.sql(
    """
    SELECT sales.date, COUNT(*) as number_of_distinct_products_sold
    FROM sales
    GROUP BY date
    ORDER BY date
    """
).show()



+----------+--------------------------------+
|      date|number_of_distinct_products_sold|
+----------+--------------------------------+
|2020-07-01|                         2001370|
|2020-07-02|                         1998985|
|2020-07-03|                         2000651|
|2020-07-04|                         2000949|
|2020-07-05|                         1996618|
|2020-07-06|                         2001302|
|2020-07-07|                         2000807|
|2020-07-08|                         2000451|
|2020-07-09|                         2000402|
|2020-07-10|                         1998505|
+----------+--------------------------------+



                                                                                

### Exercises

#### Exercise #1

What is the average revenue of the orders?

##### PySpark version

In [None]:
sales.select() \
    .show()

##### Spark SQL version