In [None]:
# Entire analysis in a single place
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, hour, to_timestamp, sum, avg, dayofweek

# Initialize Spark session (reuses an already-running session if one exists)
spark = SparkSession.builder.appName("Sales Data Analysis").getOrCreate()

# File location and type
file_location = "/Users/inbravo/Documents/GitHub/python-feature-set/dataset/super-market/sales-less-record.csv"
# The first row is read as the header; column types are inferred from the data.
df = spark.read.option("header", True).option("inferSchema", True).csv(file_location)

# NOTE: the CSV header uses underscore-separated names (Customer_Type,
# Product_Line, Gross_Income, ...). Referring to them with spaces
# ("Customer type", "Product line", "gross income") raises
# AnalysisException [UNRESOLVED_COLUMN], so the names below match the header.

# Data Cleaning
# Parse Date and Time with explicit patterns, then derive the hour and the
# day of the week used by the time-based aggregations further down.
df = df.withColumn("Date", to_date(col("Date"), "M/d/yyyy"))
df = df.withColumn("Time", to_timestamp(col("Time"), "HH:mm"))
df = df.withColumn("Hour", hour(col("Time")))
# Spark's dayofweek() numbers days 1 (Sunday) through 7 (Saturday).
df = df.withColumn("Day_of_Week", dayofweek(col("Date")))

# NOTE: `sum` and `avg` below are the pyspark.sql.functions aggregates
# imported at the top of the file, not Python's builtin `sum`.

# Total Sales Per Branch
df_branch_sales = df.groupBy("Branch").agg(sum("Total").alias("Total_Sales"))

# Average Basket Size (Avg Total per Transaction)
df_avg_basket = df.agg(avg("Total").alias("Avg_Basket_Size"))

# Total Spending Per Customer Type & Gender
df_customer_spend = df.groupBy("Customer_Type", "Gender").agg(sum("Total").alias("Total_Spending"))

# Preferred Payment Methods (ranked later by eye: total amount spent per method)
df_payment_method = df.groupBy("Payment").agg(sum("Total").alias("Total_Spent"))

# Peak Sales Hours (highest total sales first)
df_peak_hours = df.groupBy("Hour").agg(sum("Total").alias("Total_Sales")).orderBy(col("Total_Sales").desc())

# Busiest Days of the Week (highest total sales first)
df_busy_days = df.groupBy("Day_of_Week").agg(sum("Total").alias("Total_Sales")).orderBy(col("Total_Sales").desc())

# Total Revenue & Profit per Branch
df_branch_financials = df.groupBy("Branch").agg(
    sum("Total").alias("Total_Revenue"),
    sum("Gross_Income").alias("Total_Profit")
)

# Average Rating Per Product Line
df_avg_rating = df.groupBy("Product_Line").agg(avg("Rating").alias("Avg_Rating"))

# Most Frequently Purchased Products (largest total quantity first)
df_top_products = df.groupBy("Product_Line").agg(sum("Quantity").alias("Total_Quantity")).orderBy(col("Total_Quantity").desc())

# Showing Results, We can do the same with display for visualization
df_branch_sales.show()
df_avg_basket.show()
df_customer_spend.show()
df_payment_method.show()
df_peak_hours.show()
df_busy_days.show()
df_branch_financials.show()
df_avg_rating.show()
df_top_products.show()

25/05/10 18:52:39 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `Customer type` cannot be resolved. Did you mean one of the following? [`Customer_Type`, `City`, `Payment`, `Quantity`, `Unit_Price`].;
'Aggregate ['Customer type, Gender#21], ['Customer type, Gender#21, sum(Total#26) AS Total_Spending#192]
+- Project [Invoice_ID#17, Branch#18, City#19, Customer_Type#20, Gender#21, Product_Line#22, Unit_Price#23, Quantity#24, Tax_5%#25, Total#26, Date#51, Time#70, Payment#29, cogs#30, Gross_Margin_Percentage#31, Gross_Income#32, Rating#33, Hour#88, dayofweek(Date#51) AS Day_of_Week#107]
   +- Project [Invoice_ID#17, Branch#18, City#19, Customer_Type#20, Gender#21, Product_Line#22, Unit_Price#23, Quantity#24, Tax_5%#25, Total#26, Date#51, Time#70, Payment#29, cogs#30, Gross_Margin_Percentage#31, Gross_Income#32, Rating#33, hour(Time#70, Some(Europe/London)) AS Hour#88]
      +- Project [Invoice_ID#17, Branch#18, City#19, Customer_Type#20, Gender#21, Product_Line#22, Unit_Price#23, Quantity#24, Tax_5%#25, Total#26, Date#51, to_timestamp(Time#28, Some(HH:mm), TimestampType, Some(Europe/London), false) AS Time#70, Payment#29, cogs#30, Gross_Margin_Percentage#31, Gross_Income#32, Rating#33]
         +- Project [Invoice_ID#17, Branch#18, City#19, Customer_Type#20, Gender#21, Product_Line#22, Unit_Price#23, Quantity#24, Tax_5%#25, Total#26, to_date(Date#27, Some(M/d/yyyy), Some(Europe/London), false) AS Date#51, Time#28, Payment#29, cogs#30, Gross_Margin_Percentage#31, Gross_Income#32, Rating#33]
            +- Relation [Invoice_ID#17,Branch#18,City#19,Customer_Type#20,Gender#21,Product_Line#22,Unit_Price#23,Quantity#24,Tax_5%#25,Total#26,Date#27,Time#28,Payment#29,cogs#30,Gross_Margin_Percentage#31,Gross_Income#32,Rating#33] csv
