In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import (
    DoubleType,
    FloatType,
    IntegerType,
    StructField,
    StructType,
)

In [2]:
# Start (or reuse, via getOrCreate) a Spark session running on the local master.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("TotalSpentByCustomer")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/01 19:58:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Explicit schema applied when reading customer-orders.
# amount_spent is declared as a 64-bit double rather than a 32-bit float:
# single-precision parsing adds representation error to every amount, and that
# error accumulates when the per-customer totals are summed and rounded.
customerOrderSchema = StructType([
    StructField("cust_id", IntegerType(), True),
    StructField("item_id", IntegerType(), True),
    StructField("amount_spent", DoubleType(), True),
])

In [4]:
# Read the customer orders CSV into a DataFrame, applying the explicit schema.
customerDF = spark.read.csv(
    "resources/customer-orders.csv",
    schema=customerOrderSchema,
)

In [5]:
# Preview the first five parsed rows to check the schema was applied as intended.
customerDF.show(n=5)

[Stage 0:>                                                          (0 + 1) / 1]

+-------+-------+------------+
|cust_id|item_id|amount_spent|
+-------+-------+------------+
|     44|   8602|       37.19|
|     35|   5368|       65.89|
|      2|   3391|       40.64|
|     47|   6694|       14.98|
|     29|    680|       13.08|
+-------+-------+------------+
only showing top 5 rows



                                                                                

In [6]:
# Sum amount_spent per customer and round each total to two decimal places.
totalByCustomer = customerDF.groupBy("cust_id").agg(
    func.round(func.sum("amount_spent"), 2).alias("total_spent")
)

In [7]:
# Order customers by total spend, ascending (smallest spender first).
totalByCustomerSorted = totalByCustomer.orderBy("total_spent")

In [10]:
# Display every row by passing the full row count as the display limit.
# NOTE(review): count() and show() are separate actions, so the aggregation is
# likely evaluated twice here; consider caching if the input grows.
totalByCustomerSorted.show(n=totalByCustomerSorted.count())

+-------+-----------+
|cust_id|total_spent|
+-------+-----------+
|     45|    3309.38|
|     79|    3790.57|
|     96|    3924.23|
|     23|    4042.65|
|     99|    4172.29|
|     75|     4178.5|
|     36|    4278.05|
|     98|    4297.26|
|     47|     4316.3|
|     77|    4327.73|
|     13|    4367.62|
|     48|    4384.33|
|     49|     4394.6|
|     94|    4475.57|
|     67|    4505.79|
|     50|    4517.27|
|     78|    4524.51|
|      5|    4561.07|
|     57|     4628.4|
|     83|     4635.8|
|     91|    4642.26|
|     74|    4647.13|
|     84|    4652.94|
|      3|    4659.63|
|     12|    4664.59|
|     66|    4681.92|
|     56|    4701.02|
|     21|    4707.41|
|     80|    4727.86|
|     14|    4735.03|
|     37|     4735.2|
|      7|    4755.07|
|     44|    4756.89|
|     31|    4765.05|
|     82|    4812.49|
|      4|    4815.05|
|     10|     4819.7|
|     88|    4830.55|
|     20|    4836.86|
|     89|    4851.48|
|     95|    4876.84|
|     38|    4898.46|
|     76| 