##### Use case: Get sales data by order status
###### Solutions are given for the following problem statements
###### Problem statement-1: Get sales data for each order status
###### Problem statement-2: Get sales data for each order status per day
###### Problem statement-3: Get total orders for each order status per month
###### Problem statement-4: Get total orders for each order status per year
###### Problem statement-5: Get monthly sales data for each order status for the first quarter of 2014
###### Problem statement-6: Get monthly sales data for the last quarter of 2013 with neither 'COMPLETE' nor 'CLOSED' orders
###### Problem statement-7: Get monthly sales data for the last quarter of 2013 and the first quarter of 2014 with 'CANCELED' orders
###### Problem statement-8: Get monthly sales data for 'PENDING' orders
###### Problem statement-9: Get monthly sales data for 'PROCESSING' or 'ON_HOLD' orders
###### Problem statement-10: Get monthly sales data for 'COMPLETE' and 'CLOSED' orders

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum,round,avg,rank, col
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from pyspark.sql.functions import unix_timestamp, lit
# Build (or fetch the already-running) Spark session for this notebook:
# local master, Hive support enabled.
spark = (
    SparkSession.builder
    .master('local')
    .appName('SalesDataForeachorderstatus')
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
# Read the comma-separated orders file with an explicit DDL schema.
# order_date is loaded as a plain string; later cells slice/parse it.
ordersdf = (
    spark.read
    .schema('order_id int,order_date string,order_customer_id int, order_status string')
    .option("sep", ',')
    .csv("/FileStore/tables/retaildbtext/Orders.txt")
)
# Preview the first 5 rows without truncating cell contents.
ordersdf.show(n=5, truncate=False)

In [4]:
# Read the comma-separated order-items file with an explicit DDL schema.
orderItemsdf = (
    spark.read
    .schema('order_item_id int, order_item_order_id int, order_item_product_id  int, order_item_quantity int,order_item_subtotal float,order_item_product_price float')
    .option("sep", ',')
    .csv("/FileStore/tables/retaildbtext/Order_Items.txt")
)
# Preview the first 5 rows without truncating cell contents.
orderItemsdf.show(n=5, truncate=False)

In [5]:
# Project the orders down to the three columns used by the later cells:
# the join key, the order date and the order status.
ordersMap = ordersdf.select(
    col('order_id'),
    col('order_date'),
    col('order_status'),
)
ordersMap.show(n=5, truncate=False)

In [6]:
# Project the order items down to the join key plus the two measures
# (line subtotal and quantity) that the aggregations use.
orderItemsMap = orderItemsdf.select(
    col('order_item_order_id'),
    col('order_item_subtotal'),
    col('order_item_quantity'),
)
orderItemsMap.show(n=5, truncate=False)

In [7]:
# Inner-join orders to their items on order id, so only orders that have at
# least one item (and items that have a matching order) are kept.
ordersdfjoin = ordersMap.join(
    orderItemsMap,
    on=ordersMap['order_id'] == orderItemsMap['order_item_order_id'],
    how='inner',
)
ordersdfjoin.show(n=5, truncate=False)

In [8]:
# Drop the duplicate join key (order_item_order_id) and keep the columns every
# sales summary below is built from.
ordersdfSalesmap = ordersdfjoin.select(
    col('order_id'),
    col('order_date'),
    col('order_status'),
    col('order_item_subtotal'),
    col('order_item_quantity'),
)
ordersdfSalesmap.show(n=5, truncate=False)

##### Problem statement-1: Get sales data for each order status

In [10]:
# Problem 1: one summary row per order status.
# NOTE: round/sum/min/max/count here are the pyspark.sql.functions versions
# brought in by the wildcard import, not the Python builtins.
SalesDataPerorderPerstatus = (
    ordersdfSalesmap
    .groupBy('order_status')
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avgsales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales"),
        sum('order_item_quantity').alias("Totalquantity_perstatus"),
        count('order_item_subtotal').alias("Numberofsales_perstatus"),
    )
    .orderBy('order_status')
)
# Collapse to a single partition before writing so the CSV output is one part
# file; written uncompressed with a header row.
(
    SalesDataPerorderPerstatus
    .coalesce(1)
    .write
    .csv(
        "/FileStore/tables/SalesDataPerorderPerstatus",
        compression="none",
        header='true',
    )
)
SalesDataPerorderPerstatus.show(n=20, truncate=False)

##### Problem statement-2: Get sales data for each order status per day

In [12]:
# Problem 2: one summary row per (day, order status).
# The day key is the leading 10 characters of order_date
# (presumably the yyyy-MM-dd part of the timestamp string -- confirm against the data).
SalesDataPerstatusPerday = (
    ordersdfSalesmap
    .groupBy(
        substring('order_date', 0, 10).alias("orderdate"),
        'order_status',
    )
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avgsales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales"),
        sum('order_item_quantity').alias("Totalquantity_perstatus"),
        count('order_item_subtotal').alias("Numberofsales_perday"),
    )
    # Order by the aliased grouping key, i.e. the same value the group was built on.
    .orderBy('orderdate', 'order_status')
)
# Single partition -> single uncompressed CSV part file with a header row.
(
    SalesDataPerstatusPerday
    .coalesce(1)
    .write
    .csv(
        "/FileStore/tables/SalesDataPerstatusPerday",
        compression="none",
        header='true',
    )
)
SalesDataPerstatusPerday.show(n=20, truncate=False)

##### Problem statement-3: Get total orders for each order status per month

In [14]:
# Problem 3: one summary row per (month, order status).
# The month key is the leading 7 characters of order_date
# (presumably yyyy-MM -- confirm against the data).
SalesDataPerstatusPermonth = (
    ordersdfSalesmap
    .groupBy(
        substring('order_date', 0, 7).alias("ordermonth"),
        'order_status',
    )
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avgsales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales"),
        sum('order_item_quantity').alias("Totalquantity_perstatus"),
        count('order_item_subtotal').alias("Numberofsales_permonth"),
    )
    # Order by the aliased grouping key, i.e. the same value the group was built on.
    .orderBy('ordermonth', 'order_status')
)
# Single partition -> single uncompressed CSV part file with a header row.
(
    SalesDataPerstatusPermonth
    .coalesce(1)
    .write
    .csv(
        "/FileStore/tables/SalesDataPerstatusPermonth",
        compression="none",
        header='true',
    )
)
SalesDataPerstatusPermonth.show(n=20, truncate=False)

##### Problem statement-4: Get total orders for each order status per year

In [16]:
# Problem 4: one summary row per (year, order status).
# The year key is the leading 4 characters of order_date
# (presumably yyyy -- confirm against the data).
SalesDataPerstatusPeryear = (
    ordersdfSalesmap
    .groupBy(
        substring('order_date', 0, 4).alias("orderyear"),
        'order_status',
    )
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avgsales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales"),
        sum('order_item_quantity').alias("Totalquantity_perstatus"),
        count('order_item_subtotal').alias("Numberofsales_peryear"),
    )
    # Order by the aliased grouping key, i.e. the same value the group was built on.
    .orderBy('orderyear', 'order_status')
)
# Single partition -> single uncompressed CSV part file with a header row.
(
    SalesDataPerstatusPeryear
    .coalesce(1)
    .write
    .csv(
        "/FileStore/tables/SalesDataPerstatusPeryear",
        compression="none",
        header='true',
    )
)
SalesDataPerstatusPeryear.show(n=20, truncate=False)

##### Problem statement-5:Get monthly sales data for each order status for first quarter of 2014

In [18]:
# Problem 5: monthly sales summary per order status for the first quarter of 2014.

# Keep only rows whose order date falls in 2014-01-01 .. 2014-03-31 (inclusive).
SalesDataPerMonth = ordersdfSalesmap.where(
    (to_date(ordersdfSalesmap.order_date) >= lit("2014-01-01"))
    & (to_date(ordersdfSalesmap.order_date) <= lit("2014-03-31"))
)

# The result gets its own name and output directory. Reusing
# SalesDataPerstatusPeryear (and its path) would replace the per-year result
# in memory and make the write fail: the writer's default save mode raises
# when the target path already exists.
SalesDataPerstatusFirstQuarter2014 = (
    SalesDataPerMonth
    .groupBy(
        substring('order_date', 0, 7).alias("ordermonth"),
        'order_status',
    )
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avgsales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales"),
        sum('order_item_quantity').alias("Totalquantity_perstatus"),
        count('order_item_subtotal').alias("Numberofsales_permonth"),
    )
    .sort('ordermonth', 'order_status')
)
# Single partition -> single uncompressed CSV part file with a header row.
SalesDataPerstatusFirstQuarter2014.coalesce(1).write.csv(
    "/FileStore/tables/SalesDataPerstatusFirstQuarter2014",
    compression="none",
    header='true',
)
SalesDataPerstatusFirstQuarter2014.show(20, False)

##### Problem statement-6:Get monthly sales data for last quarter of 2013 with neither 'COMPLETE' nor 'CLOSED' orders

In [20]:
# Problem 6: monthly sales summary for the last quarter of 2013, excluding
# orders whose status is 'COMPLETE' or 'CLOSED'.

# Keep only rows whose order date falls in 2013-10-01 .. 2013-12-31 (inclusive).
SalesDataPerMonth = ordersdfSalesmap.where(
    (to_date(ordersdfSalesmap.order_date) >= lit("2013-10-01"))
    & (to_date(ordersdfSalesmap.order_date) <= lit("2013-12-31"))
)

# The result gets its own name and output directory. Reusing
# SalesDataPerstatusPermonth (and its path) would replace the all-months
# result in memory and make the write fail: the writer's default save mode
# raises when the target path already exists.
SalesDataOpenOrdersLastQuarter2013 = (
    SalesDataPerMonth
    # Drop the unwanted statuses before aggregating; order_status is a grouping
    # key, so this yields the same groups as filtering the aggregated rows.
    .filter((col('order_status') != 'COMPLETE') & (col('order_status') != 'CLOSED'))
    .groupBy(
        substring('order_date', 0, 7).alias("ordermonth"),
        'order_status',
    )
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avgsales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales"),
        sum('order_item_quantity').alias("Totalquantity_perstatus"),
        count('order_item_subtotal').alias("Numberofsales_perstatus"),
    )
    .sort('ordermonth', 'order_status')
)
# Single partition -> single uncompressed CSV part file with a header row.
SalesDataOpenOrdersLastQuarter2013.coalesce(1).write.csv(
    "/FileStore/tables/SalesDataOpenOrdersLastQuarter2013",
    compression="none",
    header='true',
)
SalesDataOpenOrdersLastQuarter2013.show(20, False)

##### Problem statement-7: Get monthly sales data for the last quarter of 2013 and the first quarter of 2014 with 'CANCELED' orders

In [22]:
# Problem 7: monthly sales summary of 'CANCELED' orders for the last quarter
# of 2013 and the first quarter of 2014.

# Rows dated 2013-10-01..2013-12-31 OR 2014-01-01..2014-03-31 (both inclusive).
# Each quarter is parenthesised explicitly; '&' already binds tighter than '|',
# so this is the same predicate as the unparenthesised form.
SalesDataPerMonth = ordersdfSalesmap.where(
    (
        (to_date(ordersdfSalesmap.order_date) >= lit("2013-10-01"))
        & (to_date(ordersdfSalesmap.order_date) <= lit("2013-12-31"))
    )
    | (
        (to_date(ordersdfSalesmap.order_date) >= lit("2014-01-01"))
        & (to_date(ordersdfSalesmap.order_date) <= lit("2014-03-31"))
    )
)

SalesDatacanceledordersquarter = (
    SalesDataPerMonth
    .groupBy(
        substring('order_date', 0, 7).alias("ordermonth"),
        'order_status',
    )
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avgsales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales"),
        sum('order_item_quantity').alias("Totalquantity_perstatus"),
        count('order_item_subtotal').alias("Numberofsales_perstatus"),
    )
    # Keep only the aggregated rows for the CANCELED status.
    .where(col('order_status') == 'CANCELED')
    .orderBy('ordermonth', 'order_status')
)
# Single partition -> single uncompressed CSV part file with a header row.
(
    SalesDatacanceledordersquarter
    .coalesce(1)
    .write
    .csv(
        "/FileStore/tables/SalesDatacanceledordersquarter",
        compression="none",
        header='true',
    )
)
SalesDatacanceledordersquarter.show(n=30, truncate=False)

###### Problem statement-8:Get monthly sales data for 'PENDING' orders

In [24]:
# Problem 8: monthly sales summary for 'PENDING' orders.
#
# Aggregate over the full joined data set (ordersdfSalesmap). The problem
# statement has no date restriction, whereas SalesDataPerMonth only holds
# whatever date window an earlier cell last assigned to it.
SalesDatapendingorder = (
    ordersdfSalesmap
    # Filter on the status before grouping; order_status is a grouping key, so
    # this yields the same rows as filtering the aggregated result.
    .filter(col('order_status') == 'PENDING')
    .groupBy(
        substring('order_date', 0, 7).alias("ordermonth"),
        'order_status',
    )
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avgsales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales"),
        sum('order_item_quantity').alias("Totalquantity_perstatus"),
        count('order_item_subtotal').alias("Numberofsales_perstatus"),
    )
    .sort('ordermonth', 'order_status')
)
# Single partition -> single uncompressed CSV part file with a header row.
SalesDatapendingorder.coalesce(1).write.csv(
    "/FileStore/tables/SalesDatapendingorder",
    compression="none",
    header='true',
)
SalesDatapendingorder.show(30, False)

##### Problem statement-9: Get monthly sales data for 'PROCESSING' or 'ON_HOLD' orders

In [26]:
# Problem 9: monthly sales summary for 'PROCESSING' or 'ON_HOLD' orders.
#
# The result is bound to SalesDataprocessingcanceledorders, the name the write
# step uses. Assigning it to SalesDatacanceledordersquarter instead left that
# name undefined (NameError on write) and overwrote the CANCELED-orders result.
#
# Aggregate over the full joined data set (ordersdfSalesmap). The problem
# statement has no date restriction, whereas SalesDataPerMonth only holds
# whatever date window an earlier cell last assigned to it.
SalesDataprocessingcanceledorders = (
    ordersdfSalesmap
    # Filter on the status before grouping; order_status is a grouping key, so
    # this yields the same rows as filtering the aggregated result.
    .filter((col('order_status') == 'PROCESSING') | (col('order_status') == 'ON_HOLD'))
    .groupBy(
        substring('order_date', 0, 7).alias("ordermonth"),
        'order_status',
    )
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avgsales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales"),
        sum('order_item_quantity').alias("Totalquantity_perstatus"),
        count('order_item_subtotal').alias("Numberofsales_perstatus"),
    )
    .sort('ordermonth', 'order_status')
)
# Single partition -> single uncompressed CSV part file with a header row.
SalesDataprocessingcanceledorders.coalesce(1).write.csv(
    "/FileStore/tables/SalesDataprocessingcanceledorders",
    compression="none",
    header='true',
)
SalesDataprocessingcanceledorders.show(30, False)

##### Problem statement-10: Get monthly sales data for 'COMPLETE' and 'CLOSED' orders

In [28]:
# Problem 10: monthly sales summary for orders whose status is 'COMPLETE' or 'CLOSED'.
#
# Aggregate over the full joined data set (ordersdfSalesmap). The problem
# statement has no date restriction, whereas SalesDataPerMonth only holds
# whatever date window an earlier cell last assigned to it.
SalesDatacompleteandclosedorders = (
    ordersdfSalesmap
    # Filter on the status before grouping; order_status is a grouping key, so
    # this yields the same rows as filtering the aggregated result.
    .filter((col('order_status') == 'COMPLETE') | (col('order_status') == 'CLOSED'))
    .groupBy(
        substring('order_date', 0, 7).alias("ordermonth"),
        'order_status',
    )
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avgsales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales"),
        sum('order_item_quantity').alias("Totalquantity_perstatus"),
        count('order_item_subtotal').alias("Numberofsales_perstatus"),
    )
    .sort('ordermonth', 'order_status')
)
# Single partition -> single uncompressed CSV part file with a header row.
SalesDatacompleteandclosedorders.coalesce(1).write.csv(
    "/FileStore/tables/SalesDatacompleteandclosedorders",
    compression="none",
    header='true',
)
SalesDatacompleteandclosedorders.show(30, False)