# Demo of Aggregating Data in PySpark
## Gold Layer Medallion Architecture

In [0]:
from pyspark.sql.functions import *

In [0]:
# Location of the raw cafe sales extract. Because this demo works straight
# from the flat file, the following cells remove all error markers and null
# values purely for demonstrative purposes.
path = "dbfs:/FileStore/sample_data/dirty_cafe_sales.csv"

# Read the CSV, treating the first row as the header and letting Spark infer
# the column types from the data.
cafe_df = spark.read.load(
    path,
    format="csv",
    header="true",
    inferschema="true",
)

In [0]:
# Placeholder strings the source file uses in place of a real value.
error_values = ['ERROR', "UNKNOWN"]

# Standardize missing values so that every placeholder becomes a true null.
# Only string-typed columns are rewritten: they are the only ones that can
# hold the placeholder text. Testing a non-string column against these
# strings forces an implicit cast that can never match.
# NOTE(review): such a cast may also raise when spark.sql.ansi.enabled is on;
# confirm against the cluster configuration.
cafe_df = cafe_df.select([
    when(col(name).isin(error_values), None).otherwise(col(name)).alias(name)
    if dtype == "string"
    else col(name)
    for name, dtype in cafe_df.dtypes
])

# Keep only complete rows: a row is dropped if any of its columns is null.
cafe_df = cafe_df.dropna(how="any")

In [0]:
# Apply explicit types: the three numeric columns are cast to double and the
# transaction date is parsed with the yyyy-MM-dd pattern.
cafe_df = (
    cafe_df
    .withColumn("Quantity", col("Quantity").cast("double"))
    .withColumn("Price Per Unit", col("Price Per Unit").cast("double"))
    .withColumn("Total Spent", col("Total Spent").cast("double"))
    .withColumn("Transaction Date", to_date("Transaction Date", "yyyy-MM-dd"))
)


### Aggregations

In [0]:
# Total revenue per menu item, highest first.
# `sum` here is the Spark aggregate brought in by the wildcard import from
# pyspark.sql.functions, not Python's builtin.
total_spent_item = (
    cafe_df
    .groupBy('Item')
    .agg(sum('Total Spent').alias('Total Spent'))
    .orderBy(col('Total Spent').desc())
)

total_spent_item.display()

Item,Total Spent
Salad,6360.0
Sandwich,4772.0
Smoothie,4088.0
Juice,3735.0
Cake,3519.0
Coffee,2242.0
Tea,1726.5
Cookie,1163.0


In [0]:
# Total and mean transaction value per menu item, lowest total first.
sum_avg_spent_item = (
    cafe_df
    .groupBy('Item')
    .agg(
        sum('Total Spent').alias('Total Spent'),
        avg('Total Spent').alias('Average Spent'),
    )
    .orderBy(col('Total Spent').asc())
)

sum_avg_spent_item.display()


Item,Total Spent,Average Spent
Cookie,1163.0,2.9744245524296677
Tea,1726.5,4.641129032258065
Coffee,2242.0,6.108991825613079
Cake,3519.0,9.14025974025974
Juice,3735.0,8.747072599531617
Smoothie,4088.0,12.094674556213018
Sandwich,4772.0,12.20460358056266
Salad,6360.0,15.21531100478469


In [0]:
# Revenue per (item, payment method) pair. Rows are sorted alphabetically by
# item and, within each item, by revenue from highest to lowest.
total_spent_item_payment = (
    cafe_df
    .groupBy('Item', 'Payment Method')
    .agg(sum('Total Spent').alias('Total Spent'))
    .orderBy(col('Item').asc(), col('Total Spent').desc())
)

total_spent_item_payment.display()

Item,Payment Method,Total Spent
Cake,Cash,1206.0
Cake,Credit Card,1197.0
Cake,Digital Wallet,1116.0
Coffee,Digital Wallet,812.0
Coffee,Cash,770.0
Coffee,Credit Card,660.0
Cookie,Cash,405.0
Cookie,Credit Card,380.0
Cookie,Digital Wallet,378.0
Juice,Digital Wallet,1428.0


In [0]:
 # This transformation should be done in the silver layer
# Derive the calendar year and the quarter number from the transaction date.
cafe_df = (
    cafe_df
    .withColumn("Transaction Year", year(col("Transaction Date")))
    .withColumn("Transaction Quarter", quarter(col("Transaction Date")))
)

In [0]:
 # This transformation should be done in the silver layer
# Combine year and quarter into a single period label, e.g. "2023 Q3".
cafe_df = cafe_df.withColumn(
    "Year and Quarter",
    concat(col("Transaction Year"), lit(" Q"), col("Transaction Quarter")),
)

# Preview the first ten rows.
display(cafe_df.head(10))

Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date,Transaction Year,Transaction Quarter,Year and Quarter
TXN_1961373,Coffee,2.0,2.0,4.0,Credit Card,Takeaway,2023-09-08,2023,3,2023 Q3
TXN_4977031,Cake,4.0,3.0,12.0,Cash,In-store,2023-05-16,2023,2,2023 Q2
TXN_3160411,Coffee,2.0,2.0,4.0,Digital Wallet,In-store,2023-06-11,2023,2,2023 Q2
TXN_2548360,Salad,5.0,5.0,25.0,Cash,Takeaway,2023-11-07,2023,4,2023 Q4
TXN_7619095,Sandwich,2.0,4.0,8.0,Cash,In-store,2023-05-03,2023,2,2023 Q2
TXN_2847255,Salad,3.0,5.0,15.0,Credit Card,In-store,2023-11-15,2023,4,2023 Q4
TXN_6769710,Juice,2.0,3.0,6.0,Cash,In-store,2023-02-24,2023,1,2023 Q1
TXN_3709394,Juice,4.0,3.0,12.0,Cash,Takeaway,2023-01-15,2023,1,2023 Q1
TXN_3567645,Smoothie,4.0,4.0,16.0,Credit Card,Takeaway,2023-03-30,2023,1,2023 Q1
TXN_5132361,Sandwich,3.0,4.0,12.0,Digital Wallet,Takeaway,2023-12-01,2023,4,2023 Q4


In [0]:
# Total, maximum and average (rounded to 2 decimals) transaction value for
# each quarter and location. Rows are ordered by quarter and, within a
# quarter, by total from highest to lowest.
# Note: despite the variable name, the grouping is by Location, not Item.
# `sum`, `max` and `round` are the Spark functions from the wildcard import,
# not Python's builtins.
year_quarter_item_spent = (
    cafe_df
    .groupBy('Year and Quarter', 'Location')
    .agg(
        sum('Total Spent').alias('Total Spent'),
        max('Total Spent').alias('Max Spent'),
        round(avg('Total Spent'), 2).alias('Average Spent'),
    )
    .orderBy(col('Year and Quarter').asc(), col('Total Spent').desc())
)

year_quarter_item_spent.display()

Year and Quarter,Location,Total Spent,Max Spent,Average Spent
2023 Q1,In-store,3752.0,25.0,9.29
2023 Q1,Takeaway,3456.5,25.0,8.77
2023 Q2,In-store,3348.0,25.0,9.15
2023 Q2,Takeaway,3053.5,25.0,8.8
2023 Q3,Takeaway,3565.0,25.0,8.45
2023 Q3,In-store,3497.0,25.0,9.28
2023 Q4,In-store,3513.5,25.0,8.89
2023 Q4,Takeaway,3420.0,25.0,8.91
