# Showing how much each customer spent, based on their orders

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import sum as _sum
from pyspark.sql.functions import round as _round
from pyspark.sql.functions import desc

In [2]:
#Create (or reuse) a local Spark session that uses every available core
spark = (
    SparkSession.builder
    .appName("totalSpend")
    .master("local[*]")
    .getOrCreate()
)

#Explicit schema for the orders file: customer id, item id and amount spent (all nullable)
customerOrdersSchema = StructType([
    StructField("cust_id", StringType(), True),
    StructField("item_id", IntegerType(), True),
    StructField("amount_spent", DoubleType(), True),
])

#Load the CSV applying the schema above
#NOTE(review): header=True makes Spark skip the first line of the file as a header row;
#confirm that data/customer-orders.csv really starts with a header, otherwise one order is lost
customerOrders = spark.read.csv(
    "data/customer-orders.csv",
    schema=customerOrdersSchema,
    header=True,
)

In [3]:
#Total spent per customer: group the orders by cust_id, sum amount_spent
#and round the sum to 2 decimal places (the result keeps the name amount_spent)
customerSpent = (
    customerOrders
    .groupBy("cust_id")
    .agg(
        _round(_sum("amount_spent"), 2).alias("amount_spent")
    )
)

In [4]:
#Print the 10 customers with the highest total, largest amount first
(
    customerSpent
    .orderBy("amount_spent", ascending=False)
    .show(n=10)
)

+-------+------------+
|cust_id|amount_spent|
+-------+------------+
|     68|     6375.45|
|     73|      6206.2|
|     39|     6193.11|
|     54|     6065.39|
|     71|     5995.66|
|      2|     5994.59|
|     97|     5977.19|
|     46|     5963.11|
|     42|     5696.84|
|     59|     5642.89|
+-------+------------+
only showing top 10 rows



# Checking which items were sold the most

In [7]:
#Number of order rows per item_id, printed from the most to the least frequent (first 10 rows)
(
    customerOrders
    .groupBy("item_id")
    .count()
    .sort(desc("count"))
    .show(n=10)
)

+-------+-----+
|item_id|count|
+-------+-----+
|   1827|    7|
|   4809|    6|
|   3094|    6|
|   8395|    6|
|    994|    6|
|   9127|    6|
|    708|    6|
|   8868|    6|
|   6332|    6|
|   2828|    5|
+-------+-----+
only showing top 10 rows



In [42]:
#Stopping the Spark session now that the analysis is done
spark.stop()