##### Usecase: Get sales data by orders
###### Solutions are given for following problem statements
###### Problem statement-1:Get sales data per day
###### Problem statement-2:Get sales data per month
###### Problem statement-3:Get sales data per year
###### Problem statement-4:Get monthly sales data from third quarter of 2013 and first quarter of 2014 when Max sales >100 
###### Problem statement-5:Get monthly sales data from last quarter of 2013 where soldquantity_permonth > 25000 
###### Problem statement-6:Get daily sales data from fourth quarter of 2013 and second quarter of 2014 where averagesale > 200 and Numberofsales >=600
###### Problem statement-7:Get daily sales data  where soldquantity_perday > 1500
###### Problem statement-8:Get final quarter of 2013 sales data of TOP5 revenue generating orders for each month
###### Problem statement-9:Get average revenue per day and all the orders which are more than average.
###### Problem statement-10:Percentage of order items in Order revenue

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum,round,avg,rank, col
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from pyspark.sql.functions import unix_timestamp, lit
# Create (or reuse) the notebook's Spark session: local master, Hive support on.
spark = (
    SparkSession.builder
    .master('local')
    .appName('SalesDataForeachorder')
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
# Load the orders file as comma-separated text with an explicit schema
# (no header row is declared, so column names come from the schema string).
ordersdf = spark.read.csv(
    "/FileStore/tables/retaildbtext/Orders.txt",
    schema='order_id int,order_date string,order_customer_id int, order_status string',
    sep=',',
)
# Preview the first five rows without truncating long values.
ordersdf.show(n=5, truncate=False)

In [4]:
# Load the order line items as comma-separated text with an explicit schema.
orderItemsdf = spark.read.csv(
    "/FileStore/tables/retaildbtext/Order_Items.txt",
    schema='order_item_id int, order_item_order_id int, order_item_product_id  int, order_item_quantity int,order_item_subtotal float,order_item_product_price float',
    sep=',',
)
# Preview the first five rows without truncating long values.
orderItemsdf.show(n=5, truncate=False)

In [5]:
# Keep only the order columns the sales reports need: the key and the date.
ordersMap = ordersdf.select(['order_id', 'order_date'])
ordersMap.show(n=5, truncate=False)


In [6]:
# Keep only the line-item columns the sales reports need:
# the owning order id, the line subtotal and the quantity sold.
orderItemsMap = orderItemsdf.select(
    ['order_item_order_id', 'order_item_subtotal', 'order_item_quantity']
)
orderItemsMap.show(n=5, truncate=False)

In [7]:
# Attach every line item to its order (inner join on the order id), so each
# resulting row carries the order date together with the line-item amounts.
ordersdfjoin = ordersMap.join(
    orderItemsMap,
    on=ordersMap['order_id'] == orderItemsMap['order_item_order_id'],
    how='inner',
)
ordersdfjoin.show(n=5, truncate=False)

In [8]:
# Print the column names and types of the joined orders / order-items DataFrame.
ordersdfjoin.printSchema()

In [9]:
# Base DataFrame for all sales reports below: one row per line item with the
# order id, order date, line subtotal and quantity.
ordersdfSalesmap = ordersdfjoin.select(
    ['order_id', 'order_date', 'order_item_subtotal', 'order_item_quantity']
)
ordersdfSalesmap.show(n=5, truncate=False)

###### Problem statement-1:Get sales data per day

In [11]:
# Problem 1: daily sales summary, one row per order_date, ordered by date.
# round/min/max/sum/count/avg are the pyspark.sql.functions column functions
# imported at the top of the notebook, not the Python built-ins.
salesDatePerDay = (
    ordersdfSalesmap
    .groupBy('order_date')
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avgsales_perday"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales_perday"),
        count('order_item_subtotal').alias("Numberofsales_perday"),
    )
    .orderBy('order_date')
)

# Persist the report as a single uncompressed CSV part file with a header row.
salesDatePerDay.coalesce(1).write.csv(
    "/FileStore/tables/salesDatePerDay",
    header='true',
    compression="none",
)
salesDatePerDay.show(n=10, truncate=False)

###### Problem statement-2: Get sales data  per month

In [13]:
# Problem 2: monthly sales summary. The month key is the first seven characters
# of order_date (presumably "yyyy-MM" -- this relies on the date string format).
SalesDataPerMonth = (
    ordersdfSalesmap
    .groupBy(substring('order_date', 1, 7).alias('order_month'))
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avg_sales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales_permonth"),
        count('order_item_subtotal').alias("Numberofsales_permonth"),
    )
    .orderBy('order_month')
)

# Persist the report as a single uncompressed CSV part file with a header row.
SalesDataPerMonth.coalesce(1).write.csv(
    "/FileStore/tables/SalesDataPerMonth",
    header='true',
    compression="none",
)
SalesDataPerMonth.show(n=20, truncate=False)

###### Problem statement-3: Get sales data  per year

In [15]:
# Problem 3: yearly sales summary. The year key is the first four characters
# of order_date (presumably "yyyy" -- this relies on the date string format).
salesPerEachYear = (
    ordersdfSalesmap
    .groupBy(substring('order_date', 1, 4).alias('order_year'))
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avg_sales_peryear"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales_peryear"),
        count('order_item_subtotal').alias("Numberofsales_peryear"),
    )
    .orderBy('order_year')
)

# Persist the report as a single uncompressed CSV part file with a header row.
salesPerEachYear.coalesce(1).write.csv(
    "/FileStore/tables/salesPerEachYear",
    header="true",
    compression="none",
)
salesPerEachYear.show(n=20, truncate=False)

###### Problem statement-4:Get monthly sales data from third quarter of 2013 and first quarter of 2014 when Max sales >100

In [17]:

# Problem 4: monthly sales summary for Q3 2013 (Jul-Sep) and Q1 2014 (Jan-Mar),
# keeping only the months whose largest line-item subtotal is above 100.
#
# The date filter is built from col('order_date') of ordersdfSalesmap itself.
# The name `ordersdfmonthlySalesmap` is not defined anywhere in this notebook,
# so referencing it raises NameError on a fresh kernel.
order_day = to_date(col('order_date'))
in_q3_2013 = (order_day >= lit("2013-07-01")) & (order_day <= lit("2013-09-30"))
in_q1_2014 = (order_day >= lit("2014-01-01")) & (order_day <= lit("2014-03-31"))

# Stored under its own name so the monthly report of Problem 2
# (SalesDataPerMonth) is not overwritten with un-aggregated rows.
salesQ3_2013_Q1_2014 = ordersdfSalesmap.where(in_q3_2013 | in_q1_2014)

# Month key = first seven characters of order_date (presumably "yyyy-MM").
SalesDataPerMonthquarter = (
    salesQ3_2013_Q1_2014
    .groupBy(substring('order_date', 1, 7).alias('order_month'))
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avg_sales_permonth"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales_permonth"),
        count('order_item_subtotal').alias("Numberofsales_permonth"),
    )
)

# Requirement: "Max sales > 100" (strictly greater than 100).
SalesDataPerMonthquarterfinal = (
    SalesDataPerMonthquarter
    .filter(col('max_sales') > 100)
    .sort("order_month")
)

# Persist the report as a single uncompressed CSV part file with a header row.
SalesDataPerMonthquarterfinal.coalesce(1).write.csv("/FileStore/tables/SalesDataPerMonthquarterfinal", compression="none", header="true")
SalesDataPerMonthquarterfinal.show(20, False)

###### Problem statement-5:Get monthly sales data from last quarter of 2013 where soldquantity_permonth>25000

In [19]:
# Problem 5: monthly sales summary for the last quarter of 2013 (Oct-Dec),
# keeping only the months in which more than 25000 units were sold.
#
# The date filter is built from col('order_date') of ordersdfSalesmap itself.
# The name `ordersdfmonthlySalesmap` is not defined anywhere in this notebook,
# so referencing it raises NameError on a fresh kernel.
order_day = to_date(col('order_date'))

# Stored under its own name so the monthly report of Problem 2
# (SalesDataPerMonth) is not overwritten with un-aggregated rows.
salesQ4_2013 = ordersdfSalesmap.where(
    (order_day >= lit("2013-10-01")) & (order_day <= lit("2013-12-31"))
)

# Month key = first seven characters of order_date (presumably "yyyy-MM").
SalesDataPerMonthquant = (
    salesQ4_2013
    .groupBy(substring('order_date', 1, 7).alias('order_month'))
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avg_sales_permonth"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales_permonth"),
        sum('order_item_quantity').alias("soldquantity_permonth"),
    )
)

# Requirement: "soldquantity_permonth > 25000" (strictly greater).
SalesDataPerMonthquantfinal = (
    SalesDataPerMonthquant
    .filter(col('soldquantity_permonth') > 25000)
    .sort("order_month")
)

# Persist the report as a single uncompressed CSV part file with a header row.
SalesDataPerMonthquantfinal.coalesce(1).write.csv("/FileStore/tables/SalesDataPerMonthquantfina", compression="none", header="true")
SalesDataPerMonthquantfinal.show(20, False)

###### Problem statement-6:Get daily sales data from fourth quarter of 2013 and second quarter of 2014 where averagesale > 200 and Numberofsales >=600

In [21]:
# Problem 6: DAILY sales summary for Q4 2013 (Oct-Dec) and Q2 2014 (Apr-Jun),
# keeping only the days whose average line-item subtotal is above 200 and that
# have at least 600 line items.
#
# Corrections made in this cell:
#   * The date filter is built from col('order_date'); `ordersdfmonthlySalesmap`
#     is not defined anywhere in this notebook (NameError on a fresh kernel).
#   * Q2 2014 ends on 2014-06-30 (the window previously ran to 2014-07-31).
#   * Rows are grouped per order_date, since the requirement asks for daily data.
#   * The average threshold is "> 200", as stated in the requirement.
#   * This cell's own result is written to its own output directory. Writing
#     to a directory that already exists fails with Spark's default save mode.
order_day = to_date(col('order_date'))
in_q4_2013 = (order_day >= lit("2013-10-01")) & (order_day <= lit("2013-12-31"))
in_q2_2014 = (order_day >= lit("2014-04-01")) & (order_day <= lit("2014-06-30"))

# Stored under its own name so the monthly report of Problem 2
# (SalesDataPerMonth) is not overwritten with un-aggregated rows.
salesQ4_2013_Q2_2014 = ordersdfSalesmap.where(in_q4_2013 | in_q2_2014)

SalesDataPerdayavgsales = (
    salesQ4_2013_Q2_2014
    .groupBy('order_date')
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avg_sales_perday"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales_perday"),
        count('order_item_subtotal').alias("Numberofsales_perday"),
    )
)

# Requirement: averagesale > 200 and Numberofsales >= 600.
SalesDataPerdayavgsalesfinal = (
    SalesDataPerdayavgsales
    .filter((col('avg_sales_perday') > 200) & (col('Numberofsales_perday') >= 600))
    .sort("order_date")
)

# Persist the report as a single uncompressed CSV part file with a header row.
SalesDataPerdayavgsalesfinal.coalesce(1).write.csv("/FileStore/tables/SalesDataPerdayavgsalesfinal", compression="none", header="true")
SalesDataPerdayavgsalesfinal.show(20, False)

###### Problem statement-7:Get daily sales data  where soldquantity_perday > 1500

In [23]:
# Problem 7: daily sales summary, keeping only the days on which more than
# 1500 units were sold.
SalesDataPerdaysoldquant = (
    ordersdfSalesmap
    .groupBy('order_date')
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avg_sales_perday"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales_perday"),
        sum('order_item_quantity').alias("soldquantity_perday"),
    )
)

# Requirement: "soldquantity_perday > 1500" (strictly greater).
# The result gets its own name: binding it to SalesDataPerMonthquantfinal would
# silently replace the monthly result of Problem 5 with daily data.
SalesDataPerdaysoldquantfinal = (
    SalesDataPerdaysoldquant
    .filter(col('soldquantity_perday') > 1500)
    .sort("order_date")
)

# Written to its own directory: Problem 5 already writes to
# /FileStore/tables/SalesDataPerMonthquantfina, and writing to a directory that
# already exists fails with Spark's default save mode.
SalesDataPerdaysoldquantfinal.coalesce(1).write.csv("/FileStore/tables/SalesDataPerdaysoldquantfinal", compression="none", header="true")
SalesDataPerdaysoldquantfinal.show(20, False)

##### Problem statement-8:Get final quarter of 2013 sales data of TOP5 revenue generating orders for each month

In [25]:
# Problem 8: for each month of the last quarter of 2013 (Oct-Dec), the five
# orders with the highest revenue.
#
# The date filter is built from col('order_date') of ordersdfSalesmap itself.
# The name `ordersdfmonthlySalesmap` is not defined anywhere in this notebook,
# so referencing it raises NameError on a fresh kernel.
order_day = to_date(col('order_date'))

# Stored under its own name so the monthly report of Problem 2
# (SalesDataPerMonth) is not overwritten with un-aggregated rows.
salesQ4_2013 = ordersdfSalesmap.where(
    (order_day >= lit("2013-10-01")) & (order_day <= lit("2013-12-31"))
)

# One row per (month, order). Despite the "_permonth" suffix, the aggregates
# here are per order within a month; the aliases are kept because the output
# CSV header uses them.
top5SalesDataPerMonth = (
    salesQ4_2013
    .groupBy(substring('order_date', 1, 7).alias('order_month'), 'order_id')
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avg_sales_permonth"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales_permonth"),
        count('order_item_subtotal').alias("Numberofsales_permonth"),
    )
)

# Rank orders inside each month by revenue, highest first. The window is not
# named `window`, which would shadow pyspark.sql.functions.window from the
# star import at the top of the notebook.
monthly_revenue_window = Window.partitionBy('order_month').orderBy(col('Totalsales_permonth').desc())
top5orderspurchasedeachmonth = (
    top5SalesDataPerMonth
    .select('order_month', 'order_id', 'Totalsales_permonth')
    .withColumn('rank', rank().over(monthly_revenue_window))
)

# rank() gives tied revenues the same rank, so a month can return more than
# five rows when orders tie at the cut-off.
top5rankedorderseachmonth = (
    top5orderspurchasedeachmonth
    .filter(col('rank') <= 5)
    .orderBy(col('order_month').asc(), col('Totalsales_permonth').desc())
)

# Persist the report as a single uncompressed CSV part file with a header row.
top5rankedorderseachmonth.coalesce(1).write.csv("/FileStore/tables/top5rankedorderseachmonth", compression="none", header ='true')
top5rankedorderseachmonth.show(20, False)


##### Problem statement-9:Get average revenue per day and all the orders which are more than average.

In [27]:
# Problem 9: for every day, the average order revenue and the orders whose
# revenue is more than that day's average.

# Revenue of each order: sum of its line-item subtotals, rounded to 2 decimals.
# (Despite its name, this DataFrame holds one row per order, not per day.)
avgrevenueperday = (
    ordersdfSalesmap
    .groupBy('order_date', 'order_id')
    .agg(round(sum('order_item_subtotal'), 2).alias('order_revenue'))
)

# Attach the day's average order revenue to every order of that day.
ordersmorethanaverage = avgrevenueperday.withColumn(
    'avg_revenue',
    avg('order_revenue').over(Window.partitionBy('order_date')),
)

# Requirement: orders that are MORE than the average, so the comparison is
# strict. With ">=" an order equal to the average (for example the only order
# of a day) would be reported as above average.
ordersmorethanaveragefinal = (
    ordersmorethanaverage
    .filter('order_revenue > avg_revenue')
    .orderBy('order_date', col('order_revenue').desc())
)

# Persist the report as a single uncompressed CSV part file with a header row.
ordersmorethanaveragefinal.coalesce(1).write.csv("/FileStore/tables/ordersmorethanaveragefinal", compression="none", header ='true')
ordersmorethanaveragefinal.show()

##### Problem statement-10:Percentage of order items in Order revenue

In [29]:
# Problem 10: share of each line item in the revenue of its order.

# Window covering all line items that belong to the same order.
items_of_same_order = Window.partitionBy('order_item_order_id')

# order_revenue = total subtotal of the order the line item belongs to.
items_with_order_revenue = orderItemsdf.withColumn(
    'order_revenue',
    sum('order_item_subtotal').over(items_of_same_order),
)

# NOTE(review): 'percentage' holds the ratio subtotal / order_revenue rounded
# to 2 decimals; it is not multiplied by 100 -- confirm this is intended.
percentageoforderItems = items_with_order_revenue.withColumn(
    'percentage',
    round(col('order_item_subtotal') / col('order_revenue'), 2),
)

percentageoforderItemsfinal = (
    percentageoforderItems
    .select(
        'order_item_order_id',
        'order_item_id',
        'order_item_subtotal',
        round('order_revenue', 2).alias('order_revenue'),
        'percentage',
    )
    .orderBy(col('order_item_order_id'))
)

# Persist the report as a single uncompressed CSV part file with a header row.
percentageoforderItemsfinal.coalesce(1).write.csv(
    "/FileStore/tables/percentageoforderItemsfinal",
    header='true',
    compression="none",
)
percentageoforderItemsfinal.show()