In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

In [2]:
# // Get the Spark session for this notebook (reuses an active one if present)
spark = (
    SparkSession.builder
    .appName("TotalSpentByCustomer")
    .getOrCreate()
)

In [3]:
# // Schema for the customer orders data: three nullable columns
orderFields = [
    StructField("cust_id", IntegerType(), True),
    StructField("order_id", IntegerType(), True),
    StructField("amount_spent", FloatType(), True),
]
customerOrderSchema = StructType(orderFields)

In [4]:
# // Load the CSV into a DataFrame using the explicit schema (no inference)
ordersPath = "file:///c:/SparkCourse/Dataset/customer-orders.csv"
ordersReader = spark.read.schema(customerOrderSchema)
customersDF = ordersReader.csv(ordersPath)

# Print the column names and types to confirm the schema was applied
customersDF.printSchema()

root
 |-- cust_id: integer (nullable = true)
 |-- order_id: integer (nullable = true)
 |-- amount_spent: float (nullable = true)



In [5]:
# Group by cust_id and sum amount_spent, rounded to 2 decimal places
amountSum = func.sum("amount_spent")
totalSpentCol = func.round(amountSum, 2).alias("total_spent")

totalByCustomer = customersDF.groupBy("cust_id").agg(totalSpentCol)

In [6]:
# Order the per-customer totals by total_spent, ascending
# (the customer with the smallest total comes first)
totalByCustomerSorted = totalByCustomer.orderBy("total_spent", ascending=True)

In [7]:
# Show every row of the sorted totals.
# count() and show() are two separate Spark actions; without caching, each one
# re-runs the whole read -> groupBy -> sort pipeline. Cache the result so the
# pipeline is computed once (by count()) and show() reads it back from the cache.
totalByCustomerSorted.cache()
sortedRowCount = totalByCustomerSorted.count()
totalByCustomerSorted.show(sortedRowCount)

# Release the cached data now that it has been displayed
totalByCustomerSorted.unpersist()

+-------+-----------+
|cust_id|total_spent|
+-------+-----------+
|     45|    3309.38|
|     79|    3790.57|
|     96|    3924.23|
|     23|    4042.65|
|     99|    4172.29|
|     75|     4178.5|
|     36|    4278.05|
|     98|    4297.26|
|     47|     4316.3|
|     77|    4327.73|
|     13|    4367.62|
|     48|    4384.33|
|     49|     4394.6|
|     94|    4475.57|
|     67|    4505.79|
|     50|    4517.27|
|     78|    4524.51|
|      5|    4561.07|
|     57|     4628.4|
|     83|     4635.8|
|     91|    4642.26|
|     74|    4647.13|
|     84|    4652.94|
|      3|    4659.63|
|     12|    4664.59|
|     66|    4681.92|
|     56|    4701.02|
|     21|    4707.41|
|     80|    4727.86|
|     14|    4735.03|
|     37|     4735.2|
|      7|    4755.07|
|     44|    4756.89|
|     31|    4765.05|
|     82|    4812.49|
|      4|    4815.05|
|     10|     4819.7|
|     88|    4830.55|
|     20|    4836.86|
|     89|    4851.48|
|     95|    4876.84|
|     38|    4898.46|
|     76| 

In [8]:
# Stop the Spark session now that all results have been displayed
spark.stop()