##### Use case: Get sales data by order status
###### Solutions are given for the following problem statements
###### Problem statement-1: Get sales data for each order status
###### Problem statement-2: Get sales data for each order status per day
###### Problem statement-3: Get total orders for each order status per month
###### Problem statement-4: Get total orders for each order status per year
###### Problem statement-5: Get monthly sales data for each order status for the first quarter of 2014
###### Problem statement-6: Get monthly sales data for the last quarter of 2013 for orders that are neither 'COMPLETE' nor 'CLOSED'
###### Problem statement-7: Get monthly sales data for the last quarter of 2013 and the first quarter of 2014 for 'CANCELED' orders
###### Problem statement-8: Get monthly sales data for 'PENDING' orders
###### Problem statement-9: Get monthly sales data for 'PROCESSING' or 'ON_HOLD' orders
###### Problem statement-10: Get monthly sales data for 'COMPLETE' and 'CLOSED' orders

In [2]:
from pyspark.sql import SQLContext, Row, SparkSession
from pyspark.sql.functions import round,sum,avg,count,substring
from pyspark.sql.types import *
# SQLContext through which every query cell below is executed.
# NOTE(review): `sc` is not defined in this notebook — presumably the
# SparkContext injected by the hosting runtime; confirm before running elsewhere.
sqlContext = SQLContext(sc)

# Hive-enabled SparkSession; the CSV reads below go through `spark`.
session_builder = SparkSession.builder.master('local').appName('getsalesdatabyorder')
spark = session_builder.enableHiveSupport().getOrCreate()

# Use 10 shuffle partitions for the joins and aggregations that follow.
shuffle_setting = "set spark.sql.shuffle.partitions=10"
sqlContext.sql(shuffle_setting)

In [3]:
# Load the orders file with an explicit schema; order_date is kept as a string
# and is sliced / cast inside the SQL queries below.
ordersdf = spark.read.csv(
    "/FileStore/tables/retaildbtext/Orders.txt",
    sep=',',
    schema='order_id int,order_date string,order_customer_id int, order_status string',
)
# createOrReplaceTempView supersedes registerTempTable (deprecated since Spark 2.0)
# and exposes the frame to SQL under the same name, "ordersTbl".
ordersdf.createOrReplaceTempView("ordersTbl")
ordersdf.show(5, False)

In [4]:
# Load the order-items file with an explicit schema.
# The monetary columns are read as double rather than float: 32-bit floats
# cannot represent most two-decimal amounts exactly, and that error accumulates
# in the sum()/avg() aggregates computed by the queries below.
orderItemsdf = spark.read.csv(
    "/FileStore/tables/retaildbtext/Order_Items.txt",
    sep=',',
    schema=(
        'order_item_id int, order_item_order_id int, order_item_product_id  int, '
        'order_item_quantity int,order_item_subtotal double,order_item_product_price double'
    ),
)
# createOrReplaceTempView supersedes registerTempTable (deprecated since Spark 2.0)
# and exposes the frame to SQL under the same name, "orderItemsTbl".
orderItemsdf.createOrReplaceTempView("orderItemsTbl")
orderItemsdf.show(5, False)

In [5]:
# Sanity check: read ordersTbl back through SQL and print five untruncated rows.
orders_preview = sqlContext.sql("select * from ordersTbl")
orders_preview.show(5, truncate=False)

In [6]:
# Sanity check: read orderItemsTbl back through SQL and print five untruncated rows.
order_items_preview = sqlContext.sql("select * from orderItemsTbl")
order_items_preview.show(5, truncate=False)

##### Problem statement-1: Get sales data for each order status

In [8]:
# Revenue, number of orders and total item quantity for each order status.
# The join yields one row per order item, so orders are counted with
# count(distinct order_id); count(1) would report the number of item rows.
SalesDataPerorderPerstatus = sqlContext.sql('''
    select otl.order_status,
           round(sum(oitl.order_item_subtotal), 2) as Totalrevenue_perstatus,
           count(distinct otl.order_id) as Numberoforder_perstatus,
           sum(oitl.order_item_quantity) as Totalquantity_perstatus
    from ordersTbl otl
    join orderItemsTbl oitl on otl.order_id = oitl.order_item_order_id
    group by otl.order_status
    order by otl.order_status''')
# Single CSV part file with a header row. mode="overwrite" lets the cell be
# re-run; the default mode raises when the output directory already exists.
# The stray trailing space in the output path has been removed.
SalesDataPerorderPerstatus.coalesce(1).write.csv(
    "/FileStore/tables/SalesDataPerorderPerstatus",
    mode="overwrite",
    compression="none",
    header='true',
)
SalesDataPerorderPerstatus.show(20, False)

##### Problem statement-2: Get sales data for each order status per day

In [10]:
# Revenue, number of orders and total item quantity per day and order status.
# The day is the first 10 characters of order_date.
# order_item_quantity is no longer selected or grouped on: grouping by it split
# every (day, status) pair into one row per distinct quantity value.
# Orders are counted with count(distinct order_id) because the join yields one
# row per order item.
SalesDataPerstatusPerday = sqlContext.sql('''
    select substring(otl.order_date,0,10) as orderdate,
           otl.order_status,
           round(sum(oitl.order_item_subtotal), 2) as Totalrevenue_perstatus,
           count(distinct otl.order_id) as Numberoforder_perstatus,
           sum(oitl.order_item_quantity) as Totalquantity_perstatus
    from ordersTbl otl
    join orderItemsTbl oitl on otl.order_id = oitl.order_item_order_id
    group by orderdate, otl.order_status
    order by orderdate, otl.order_status''')
# Single CSV part file with a header row. mode="overwrite" lets the cell be
# re-run; the default mode raises when the output directory already exists.
SalesDataPerstatusPerday.coalesce(1).write.csv(
    "/FileStore/tables/SalesDataPerstatusPerday",
    mode="overwrite",
    compression="none",
    header='true',
)
SalesDataPerstatusPerday.show(20, False)

##### Problem statement-3: Get total orders for each order status per month

In [12]:
# Sales, number of orders and total item quantity per month and order status.
# The month is the first 7 characters of order_date.
# Orders are counted with count(distinct order_id) because the join yields one
# row per order item; count(1) would report the number of item rows.
SalesDataPerstatusPermonth = sqlContext.sql('''
    select substring(otl.order_date,0,7) as ordermonth,
           otl.order_status,
           round(sum(oitl.order_item_subtotal), 2) as Totalsales_perstatus,
           count(distinct otl.order_id) as Numberoforder_perstatus,
           sum(oitl.order_item_quantity) as Totalquantity_perstatus
    from ordersTbl otl
    join orderItemsTbl oitl on otl.order_id = oitl.order_item_order_id
    group by ordermonth, otl.order_status
    order by ordermonth, otl.order_status''')
# Single CSV part file with a header row. mode="overwrite" lets the cell be
# re-run; the default mode raises when the output directory already exists.
SalesDataPerstatusPermonth.coalesce(1).write.csv(
    "/FileStore/tables/SalesDataPerstatusPermonth",
    mode="overwrite",
    compression="none",
    header='true',
)
SalesDataPerstatusPermonth.show(20, False)

##### Problem statement-4: Get total orders for each order status per year

In [14]:
# Sales, number of orders and total item quantity per year and order status.
# The year is the first 4 characters of order_date.
# Orders are counted with count(distinct order_id) because the join yields one
# row per order item; count(1) would report the number of item rows.
SalesDataPerstatusPeryear = sqlContext.sql('''
    select substring(otl.order_date,0,4) as orderyear,
           otl.order_status,
           round(sum(oitl.order_item_subtotal), 2) as Totalsales_perstatus,
           count(distinct otl.order_id) as Numberoforder_perstatus,
           sum(oitl.order_item_quantity) as Totalquantity_perstatus
    from ordersTbl otl
    join orderItemsTbl oitl on otl.order_id = oitl.order_item_order_id
    group by orderyear, otl.order_status
    order by orderyear, otl.order_status''')
# Single CSV part file with a header row. mode="overwrite" lets the cell be
# re-run; the default mode raises when the output directory already exists.
SalesDataPerstatusPeryear.coalesce(1).write.csv(
    "/FileStore/tables/SalesDataPerstatusPeryear",
    mode="overwrite",
    compression="none",
    header='true',
)
SalesDataPerstatusPeryear.show(20, False)

##### Problem statement-5: Get monthly sales data for each order status for the first quarter of 2014

In [16]:
# Monthly sales, number of orders and total item quantity per order status,
# restricted to orders dated 2014-01-01 through 2014-03-31 (inclusive).
# Orders are counted with count(distinct order_id) because the join yields one
# row per order item; count(1) would report the number of item rows.
SalesDataPerstatusfirstquarter = sqlContext.sql('''
    select substring(otl.order_date,0,7) as ordermonth,
           otl.order_status,
           round(sum(oitl.order_item_subtotal), 2) as Totalsales_perstatus,
           count(distinct otl.order_id) as Numberoforder_perstatus,
           sum(oitl.order_item_quantity) as Totalquantity_perstatus
    from ordersTbl otl
    join orderItemsTbl oitl on otl.order_id = oitl.order_item_order_id
    where CAST(otl.order_date AS DATE) >= '2014-01-01'
      AND CAST(otl.order_date AS DATE) <= '2014-03-31'
    group by ordermonth, otl.order_status
    order by ordermonth, otl.order_status''')
# Single CSV part file with a header row. mode="overwrite" lets the cell be
# re-run; the default mode raises when the output directory already exists.
SalesDataPerstatusfirstquarter.coalesce(1).write.csv(
    "/FileStore/tables/SalesDataPerstatusfirstquarter",
    mode="overwrite",
    compression="none",
    header='true',
)
SalesDataPerstatusfirstquarter.show(30, False)

##### Problem statement-6: Get monthly sales data for the last quarter of 2013 for orders that are neither 'COMPLETE' nor 'CLOSED'

In [18]:
# Monthly sales, number of orders and total item quantity per order status for
# orders dated 2013-10-01 through 2013-12-31 (inclusive), excluding the
# 'COMPLETE' and 'CLOSED' statuses.
# The status test is a plain row filter, so it lives in WHERE (applied before
# aggregation) rather than HAVING; the resulting rows are the same.
# Orders are counted with count(distinct order_id) because the join yields one
# row per order item; count(1) would report the number of item rows.
SalesDataPerstatuslastquarter = sqlContext.sql('''
    select substring(otl.order_date,0,7) as ordermonth,
           otl.order_status,
           round(sum(oitl.order_item_subtotal), 2) as Totalsales_perstatus,
           count(distinct otl.order_id) as Numberoforder_perstatus,
           sum(oitl.order_item_quantity) as Totalquantity_perstatus
    from ordersTbl otl
    join orderItemsTbl oitl on otl.order_id = oitl.order_item_order_id
    where CAST(otl.order_date AS DATE) >= '2013-10-01'
      AND CAST(otl.order_date AS DATE) <= '2013-12-31'
      AND otl.order_status NOT IN ('COMPLETE', 'CLOSED')
    group by ordermonth, otl.order_status
    order by ordermonth, otl.order_status''')
# Single CSV part file with a header row. mode="overwrite" lets the cell be
# re-run; the default mode raises when the output directory already exists.
SalesDataPerstatuslastquarter.coalesce(1).write.csv(
    "/FileStore/tables/SalesDataPerstatuslastquarter",
    mode="overwrite",
    compression="none",
    header='true',
)
SalesDataPerstatuslastquarter.show(30, False)

##### Problem statement-7: Get monthly sales data for the last quarter of 2013 and the first quarter of 2014 for 'CANCELED' orders

In [20]:
# Monthly total and average item sales, number of orders and total item
# quantity for 'CANCELED' orders dated 2013-10-01 through 2014-03-31 (inclusive).
# The status test is a plain row filter, so it lives in WHERE (applied before
# aggregation) rather than HAVING, and uses the standard SQL '=' operator.
# Orders are counted with count(distinct order_id) because the join yields one
# row per order item; Averagesales remains the average per order item.
SalesDatacanceledordersquarter = sqlContext.sql('''
    select substring(otl.order_date,0,7) as ordermonth,
           otl.order_status,
           round(sum(oitl.order_item_subtotal), 2) as Totalsales,
           round(avg(oitl.order_item_subtotal), 2) as Averagesales,
           count(distinct otl.order_id) as Numberoforders,
           sum(oitl.order_item_quantity) as Totalquantity
    from ordersTbl otl
    join orderItemsTbl oitl on otl.order_id = oitl.order_item_order_id
    where CAST(otl.order_date AS DATE) >= '2013-10-01'
      AND CAST(otl.order_date AS DATE) <= '2014-03-31'
      AND otl.order_status = 'CANCELED'
    group by ordermonth, otl.order_status
    order by ordermonth, otl.order_status''')
# Single CSV part file with a header row. mode="overwrite" lets the cell be
# re-run; the default mode raises when the output directory already exists.
SalesDatacanceledordersquarter.coalesce(1).write.csv(
    "/FileStore/tables/SalesDatacanceledordersquarter",
    mode="overwrite",
    compression="none",
    header='true',
)
SalesDatacanceledordersquarter.show(30, False)

##### Problem statement-8: Get monthly sales data for 'PENDING' orders

In [22]:
# Monthly total and average item sales, number of orders and total item
# quantity for orders whose status contains "PEND".
# NOTE(review): LIKE '%PEND%' matches every status containing "PEND", not only
# the exact value 'PENDING' — confirm against the status values in the data
# whether an exact match is wanted.
# The status test is a plain row filter, so it lives in WHERE (applied before
# aggregation) rather than HAVING; the resulting rows are the same.
# Orders are counted with count(distinct order_id) because the join yields one
# row per order item; Averagesales remains the average per order item.
SalesDatapendingorder = sqlContext.sql('''
    select substring(otl.order_date,0,7) as ordermonth,
           otl.order_status,
           round(sum(oitl.order_item_subtotal), 2) as Totalsales,
           round(avg(oitl.order_item_subtotal), 2) as Averagesales,
           count(distinct otl.order_id) as Numberoforders,
           sum(oitl.order_item_quantity) as Totalquantity
    from ordersTbl otl
    join orderItemsTbl oitl on otl.order_id = oitl.order_item_order_id
    where otl.order_status LIKE '%PEND%'
    group by ordermonth, otl.order_status
    order by ordermonth, otl.order_status''')
# Single CSV part file with a header row. mode="overwrite" lets the cell be
# re-run; the default mode raises when the output directory already exists.
SalesDatapendingorder.coalesce(1).write.csv(
    "/FileStore/tables/SalesDatapendingorder",
    mode="overwrite",
    compression="none",
    header='true',
)
SalesDatapendingorder.show(30, False)

##### Problem statement-9: Get monthly sales data for 'PROCESSING' or 'ON_HOLD' orders

In [24]:
# Monthly total and average item sales, number of orders and total item
# quantity for orders whose status contains "PROCESS" or "HOLD".
# The status test is a plain row filter, so it lives in WHERE (applied before
# aggregation) rather than HAVING; the resulting rows are the same.
# Orders are counted with count(distinct order_id) because the join yields one
# row per order item; Averagesales remains the average per order item.
SalesDataprocessingcanceledorders = sqlContext.sql('''
    select substring(otl.order_date,0,7) as ordermonth,
           otl.order_status,
           round(sum(oitl.order_item_subtotal), 2) as Totalsales,
           round(avg(oitl.order_item_subtotal), 2) as Averagesales,
           count(distinct otl.order_id) as Numberoforders,
           sum(oitl.order_item_quantity) as Totalquantity
    from ordersTbl otl
    join orderItemsTbl oitl on otl.order_id = oitl.order_item_order_id
    where (otl.order_status LIKE '%PROCESS%' OR otl.order_status LIKE '%HOLD%')
    group by ordermonth, otl.order_status
    order by ordermonth, otl.order_status''')
# Single CSV part file with a header row. mode="overwrite" lets the cell be
# re-run; the default mode raises when the output directory already exists.
SalesDataprocessingcanceledorders.coalesce(1).write.csv(
    "/FileStore/tables/SalesDataprocessingcanceledorders",
    mode="overwrite",
    compression="none",
    header='true',
)
SalesDataprocessingcanceledorders.show(30, False)

##### Problem statement-10: Get monthly sales data for 'COMPLETE' and 'CLOSED' orders

In [26]:
# Monthly sales, number of orders and total item quantity for orders whose
# status is 'COMPLETE' or 'CLOSED'.
# The status test is a plain row filter, so it lives in WHERE (applied before
# aggregation) rather than HAVING, written as an IN list.
# Orders are counted with count(distinct order_id) because the join yields one
# row per order item; count(1) would report the number of item rows.
SalesDatacompleteandclosedorders = sqlContext.sql('''
    select substring(otl.order_date,0,7) as ordermonth,
           otl.order_status,
           round(sum(oitl.order_item_subtotal), 2) as Totalsales_perstatus,
           count(distinct otl.order_id) as Numberoforder_perstatus,
           sum(oitl.order_item_quantity) as Totalquantity_perstatus
    from ordersTbl otl
    join orderItemsTbl oitl on otl.order_id = oitl.order_item_order_id
    where otl.order_status IN ('COMPLETE', 'CLOSED')
    group by ordermonth, otl.order_status
    order by ordermonth, otl.order_status''')
# Single CSV part file with a header row. mode="overwrite" lets the cell be
# re-run; the default mode raises when the output directory already exists.
SalesDatacompleteandclosedorders.coalesce(1).write.csv(
    "/FileStore/tables/SalesDatacompleteandclosedorders",
    mode="overwrite",
    compression="none",
    header='true',
)
SalesDatacompleteandclosedorders.show(30, False)