##### Usecase: Get sales data by orders
###### Solutions are given for following problem statements
###### Problem statement-1:Get sales data per day
###### Problem statement-2:Get sales data per month
###### Problem statement-3:Get sales data per year
###### Problem statement-4:Get monthly sales data from third quarter of 2013 and first quarter of 2014 when Max sales >100 
###### Problem statement-5:Get monthly sales data from last quarter of 2013 where soldquantity_permonth > 25000 
###### Problem statement-6:Get daily sales data from fourth quarter of 2013 and second quarter of 2014 where averagesale > 200 and Numberofsales >=600
###### Problem statement-7:Get daily sales data  where soldquantity_perday > 1500
###### Problem statement-8:Get final quarter of 2013 sales data of TOP5 revenue generating order for each month
###### Problem statement-9:Get average revenue per day and all the orders which are more than average.
###### Problem statement-10:Percentage of order items in Order revenue

In [2]:
from pyspark.sql import SQLContext, Row
from pyspark.sql import HiveContext
from pyspark.sql.functions import round,sum,avg,count,substring
# Hive-enabled SQL entry point used by every query in this notebook.
# NOTE(review): `sc` is expected to be provided by the notebook environment — confirm.
hiveContext = HiveContext(sc)
# Apply the shuffle-partition setting through the same context that runs the
# queries below, instead of a separate `sqlContext` global that this notebook
# never defines.
hiveContext.sql("set spark.sql.shuffle.partitions=10")

In [3]:
# Create the working database only when it is missing so this cell can be
# re-run (Restart & Run All) without failing on an already-existing database.
hiveContext.sql("create database if not exists retail_db")
# Make retail_db the current database for all following statements.
hiveContext.sql("use retail_db")

In [4]:
# Define the two comma-delimited text tables. `if not exists` keeps the cell
# re-runnable: a second execution no longer fails on an existing table.
hiveContext.sql("create table if not exists orders(order_id int,order_date string,order_customer_id int, order_status string) row format delimited fields terminated by ','")
hiveContext.sql("create table if not exists order_items(order_item_id int, order_item_order_id int, order_item_product_id  int, order_item_quantity int,order_item_subtotal float,order_item_product_price float) row format delimited fields terminated by ','")

In [5]:
# Load the raw text files into the tables. `overwrite` replaces the table
# contents instead of appending, so re-running this cell does not duplicate
# rows (which would inflate every sum/count computed below).
hiveContext.sql("load data local inpath 'dbfs:/FileStore/tables/retaildbtext/Orders.txt' overwrite into table orders")
hiveContext.sql("load data local inpath 'dbfs:/FileStore/tables/retaildbtext/Order_Items.txt' overwrite into table order_items")

In [6]:
# Sanity check: list the tables visible in the current database.
hiveContext.sql("show tables").show(n=20, truncate=True)

In [7]:
# Preview the first five rows of `orders` without truncating column values.
hiveContext.sql("select * from orders").show(n=5, truncate=False)

In [8]:
# Preview the first five rows of `order_items` without truncating column values.
hiveContext.sql("select * from order_items").show(n=5, truncate=False)


###### Problem statement-1:Get sales data per day

In [10]:
# Problem 1: per-day sales summary (total, min, max, count and average of the
# order-item subtotals), joined from orders to order_items.
# The GROUP BY uses the same DATE cast that is selected, so each calendar day
# yields exactly one row even if order_date strings differ in a time-of-day part.
SalesDataPerDay = hiveContext.sql('''
    select cast(otl.order_date as DATE) as orderdate,
           round(sum(oitl.order_item_subtotal),2) as Totalrevenue,
           min(oitl.order_item_subtotal) as Min_sales,
           max(oitl.order_item_subtotal) as Max_sales,
           count(oitl.order_item_subtotal) as Numberofsales_perday,
           round(avg(oitl.order_item_subtotal),2) as avgsales_perday
    from orders otl join order_items oitl on otl.order_id=oitl.order_item_order_id
    group by cast(otl.order_date as DATE)
    order by orderdate''')
# mode="overwrite" keeps the cell re-runnable: the default save mode raises an
# error when the output directory already exists from a previous run.
SalesDataPerDay.coalesce(1).write.csv("/FileStore/tables/SalesDataPerDay", mode="overwrite", compression="none", header='true')
SalesDataPerDay.show(20, False)

###### Problem statement-2: Get sales data per month

In [12]:
# Problem 2: per-month sales summary. The month key is the first seven
# characters of order_date (assumes order_date starts with 'yyyy-MM' — TODO confirm).
# substring() is 1-based in SQL; a start of 0 is tolerated and treated like 1,
# so the explicit 1 returns the same value while stating the intent.
SalesDataPerMonth = hiveContext.sql('''
    select substring(otl.order_date,1,7) as ordermonth,
           round(sum(oitl.order_item_subtotal),2) as Totalrevenue,
           min(oitl.order_item_subtotal) as Min_sales,
           max(oitl.order_item_subtotal) as Max_sales,
           count(oitl.order_item_subtotal) as Numberofsales_permonth,
           round(avg(oitl.order_item_subtotal),2) as avgsales_permonth
    from orders otl join order_items oitl on otl.order_id=oitl.order_item_order_id
    group by ordermonth
    order by ordermonth''')
# mode="overwrite" keeps the cell re-runnable: the default save mode raises an
# error when the output directory already exists from a previous run.
SalesDataPerMonth.coalesce(1).write.csv("/FileStore/tables/SalesDataPerMonth", mode="overwrite", compression="none", header='true')
SalesDataPerMonth.show(20, False)

###### Problem statement-3: Get sales data per year

In [14]:
# Problem 3: per-year sales summary. The year key is the first four characters
# of order_date (assumes order_date starts with 'yyyy' — TODO confirm).
# substring() is 1-based in SQL; a start of 0 is tolerated and treated like 1,
# so the explicit 1 returns the same value while stating the intent.
SalesDataPerYear = hiveContext.sql('''
    select substring(otl.order_date,1,4) as orderyear,
           round(sum(oitl.order_item_subtotal),2) as Totalrevenue,
           min(oitl.order_item_subtotal) as Min_sales,
           max(oitl.order_item_subtotal) as Max_sales,
           count(oitl.order_item_subtotal) as Numberofsales_year,
           round(avg(oitl.order_item_subtotal),2) as avgsales_peryear
    from orders otl join order_items oitl on otl.order_id=oitl.order_item_order_id
    group by orderyear
    order by orderyear''')
# mode="overwrite" keeps the cell re-runnable: the default save mode raises an
# error when the output directory already exists from a previous run.
SalesDataPerYear.coalesce(1).write.csv("/FileStore/tables/SalesDataPerYear", mode="overwrite", compression="none", header='true')
SalesDataPerYear.show(20, False)

###### Problem statement-4:Get monthly sales data from third quarter of 2013 and first quarter of 2014 when Max sales >100

In [16]:
# Problem 4: monthly sales summary restricted to Q3 2013 (Jul-Sep) and
# Q1 2014 (Jan-Mar), keeping only months whose largest item subtotal exceeds 100.
# substring() is 1-based in SQL; a start of 0 is treated like 1, so the
# explicit 1 returns the same month key.
SalesDataPerMonthquarter = hiveContext.sql('''
    select substring(otl.order_date,1,7) as ordermonth,
           round(sum(oitl.order_item_subtotal),2) as Totalrevenue,
           min(oitl.order_item_subtotal) as Min_sales,
           max(oitl.order_item_subtotal) as Max_sales,
           count(oitl.order_item_subtotal) as Numberofsales_permonth,
           round(avg(oitl.order_item_subtotal),2) as avgsales_permonth
    from orders otl join order_items oitl on otl.order_id=oitl.order_item_order_id
    where (CAST(otl.order_date AS DATE)>='2013-07-01' AND CAST(otl.order_date AS DATE)<='2013-09-30')
       OR (CAST(otl.order_date AS DATE)>='2014-01-01' AND CAST(otl.order_date AS DATE)<='2014-03-31')
    group by ordermonth
    having Max_sales > 100
    order by ordermonth''')
# mode="overwrite" keeps the cell re-runnable: the default save mode raises an
# error when the output directory already exists from a previous run.
SalesDataPerMonthquarter.coalesce(1).write.csv("/FileStore/tables/SalesDataPerMonthquarter", mode="overwrite", compression="none", header='true')
SalesDataPerMonthquarter.show(20, False)

###### Problem statement-5:Get monthly sales data from last quarter of 2013 where soldquantity_permonth>25000

In [18]:
# Problem 5: monthly sales summary for Q4 2013 (Oct-Dec), keeping only months
# in which the summed item quantity exceeds 25000.
# substring() is 1-based in SQL; a start of 0 is treated like 1, so the
# explicit 1 returns the same month key.
SalesDataPerMonthquant = hiveContext.sql('''
    select substring(otl.order_date,1,7) as ordermonth,
           round(sum(oitl.order_item_subtotal),2) as Totalrevenue,
           min(oitl.order_item_subtotal) as Min_sales,
           max(oitl.order_item_subtotal) as Max_sales,
           count(oitl.order_item_subtotal) as Numberofsales_permonth,
           sum(oitl.order_item_quantity) as soldquantity_permonth,
           round(avg(oitl.order_item_subtotal),2) as avgsales_permonth
    from orders otl join order_items oitl on otl.order_id=oitl.order_item_order_id
    where CAST(otl.order_date AS DATE)>='2013-10-01' AND CAST(otl.order_date AS DATE)<='2013-12-31'
    group by ordermonth
    having soldquantity_permonth>25000
    order by ordermonth''')
# mode="overwrite" keeps the cell re-runnable: the default save mode raises an
# error when the output directory already exists from a previous run.
SalesDataPerMonthquant.coalesce(1).write.csv("/FileStore/tables/SalesDataPerMonthquant", mode="overwrite", compression="none", header='true')
SalesDataPerMonthquant.show(20, False)

###### Problem statement-6:Get daily sales data from fourth quarter of 2013 and second quarter of 2014 where averagesale > 200 and Numberofsales >=600

In [20]:
# Problem 6: daily sales summary for Q4 2013 (Oct-Dec) and Q2 2014 (Apr-Jun),
# keeping only days with an average item subtotal above 200 and at least 600
# order items. The day key is the first ten characters of order_date
# (assumes order_date starts with 'yyyy-MM-dd' — TODO confirm).
# substring() is 1-based in SQL; a start of 0 is treated like 1, so the
# explicit 1 returns the same day key.
SalesDataPerdayavgsales = hiveContext.sql('''
    select substring(otl.order_date,1,10) as orderdate,
           round(sum(oitl.order_item_subtotal),2) as Totalrevenue,
           min(oitl.order_item_subtotal) as Min_sales,
           max(oitl.order_item_subtotal) as Max_sales,
           count(oitl.order_item_subtotal) as Numberofsales,
           round(avg(oitl.order_item_subtotal),2) as avgsales
    from orders otl join order_items oitl on otl.order_id=oitl.order_item_order_id
    where (CAST(otl.order_date AS DATE)>='2013-10-01' AND CAST(otl.order_date AS DATE)<='2013-12-31')
       OR (CAST(otl.order_date AS DATE)>='2014-04-01' AND CAST(otl.order_date AS DATE)<='2014-06-30')
    group by orderdate
    having avgsales > 200 and Numberofsales >=600
    order by orderdate''')
# mode="overwrite" keeps the cell re-runnable: the default save mode raises an
# error when the output directory already exists from a previous run.
SalesDataPerdayavgsales.coalesce(1).write.csv("/FileStore/tables/SalesDataPerdayavgsales", mode="overwrite", compression="none", header='true')
SalesDataPerdayavgsales.show(20, False)

###### Problem statement-7:Get daily sales data  where soldquantity_perday > 1500

In [22]:
# Problem 7: daily sales summary over all dates, keeping only days on which
# the summed item quantity exceeds 1500. The day key is the first ten
# characters of order_date (assumes 'yyyy-MM-dd' prefix — TODO confirm).
# substring() is 1-based in SQL; a start of 0 is treated like 1, so the
# explicit 1 returns the same day key.
SalesDataPerdaysoldquant = hiveContext.sql('''
    select substring(otl.order_date,1,10) as orderdate,
           round(sum(oitl.order_item_subtotal),2) as Totalrevenue,
           min(oitl.order_item_subtotal) as Min_sales,
           max(oitl.order_item_subtotal) as Max_sales,
           count(oitl.order_item_subtotal) as Numberofsales_perday,
           sum(oitl.order_item_quantity) as soldquantity_perday,
           round(avg(oitl.order_item_subtotal),2) as avgsales_perday
    from orders otl join order_items oitl on otl.order_id=oitl.order_item_order_id
    group by orderdate
    having soldquantity_perday>1500
    order by orderdate''')
# mode="overwrite" keeps the cell re-runnable: the default save mode raises an
# error when the output directory already exists from a previous run.
SalesDataPerdaysoldquant.coalesce(1).write.csv("/FileStore/tables/SalesDataPerdaysoldquant", mode="overwrite", compression="none", header='true')
SalesDataPerdaysoldquant.show(25, False)

###### Problem statement-8:Get final quarter of 2013 sales data of TOP5 revenue generating order for each month

In [24]:
# Problem 8: for each month of Q4 2013, the orders ranked 1-5 by order revenue.
# The inner query totals revenue per order and ranks orders within their month;
# the outer query keeps ranks <= 5. rank() gives tied orders the same rank, so
# a month can return more than five rows when revenues tie.
# Changes from the earlier version: the derived table now has an alias (some
# SQL dialects require one), the ORDER BY inside it was dropped because only
# the outer ORDER BY determines the final row order, and substring() uses the
# 1-based start index (0 is treated like 1, so the month key is unchanged).
SalesDataPermonthquarterly = hiveContext.sql('''
    select *
    from (select substring(otl.order_date,1,7) as ordermonth,
                 otl.order_id,
                 round(sum(oitl.order_item_subtotal),2) as Totalrevenue,
                 rank() over(partition by substring(otl.order_date,1,7)
                             order by round(sum(oitl.order_item_subtotal),2) desc) ranking_order
          from orders otl join order_items oitl on otl.order_id=oitl.order_item_order_id
          where (CAST(otl.order_date AS DATE)>='2013-10-01' AND CAST(otl.order_date AS DATE)<='2013-12-31')
          group by otl.order_id,otl.order_date) ranked_orders
    where ranking_order<=5
    order by 1,3 desc''')
# mode="overwrite" keeps the cell re-runnable: the default save mode raises an
# error when the output directory already exists from a previous run.
SalesDataPermonthquarterly.coalesce(1).write.csv("/FileStore/tables/SalesDataPermonthquarterly", mode="overwrite", compression="none", header='true')
SalesDataPermonthquarterly.show(20, False)

###### Problem statement-9:Get average revenue per day and all the orders which are more than average.

In [26]:
# Problem 9: for every order date, the average revenue per order on that date
# and each order whose revenue is at or above that average.
# The inner query totals revenue per (order_date, order_id) and uses a window
# over order_date to attach the day's average order revenue to each row.
#
# This result gets its own variable name and output directory. It previously
# reused the name and path of the problem-6 result
# ("/FileStore/tables/SalesDataPerdayavgsales"), so the write failed because
# that directory had already been created, and the earlier frame was shadowed.
# The rounded columns are aliased so the CSV header carries readable names,
# and the join condition is an explicit ON clause like the other queries.
SalesDataOrdersAboveDailyAvg = hiveContext.sql('''
    SELECT order_date, order_id,
           round(order_revenue,2) AS order_revenue,
           round(avg_revenue,2) AS avg_revenue
    FROM (SELECT otl.order_date, otl.order_id,
                 sum(oitl.order_item_subtotal) order_revenue,
                 AVG(sum(oitl.order_item_subtotal)) OVER (PARTITION BY otl.order_date) avg_revenue
          FROM orders otl JOIN order_items oitl ON otl.order_id = oitl.order_item_order_id
          GROUP BY otl.order_date, otl.order_id) order_totals
    WHERE order_revenue >= avg_revenue
    ORDER BY 1, 3 DESC''')
# mode="overwrite" keeps the cell re-runnable: the default save mode raises an
# error when the output directory already exists from a previous run.
SalesDataOrdersAboveDailyAvg.coalesce(1).write.csv("/FileStore/tables/SalesDataOrdersAboveDailyAvg", mode="overwrite", compression="none", header='true')
SalesDataOrdersAboveDailyAvg.show(20, False)

###### Problem statement-10:Percentage of order items in Order revenue

In [28]:
# Problem 10: each order item's share of its order's total revenue.
# order_revenue is the windowed sum of item subtotals per order.
# `percentage` is now scaled by 100 so the column really holds a percentage
# (0-100); it previously held the raw 0-1 fraction under that name.
# order_item_id is added as a secondary sort key so rows within one order
# come out in a deterministic order.
SalesDataPcntorders = hiveContext.sql('''
    SELECT order_item_order_id, order_item_id, order_item_subtotal,
           round(SUM(order_item_subtotal) OVER(PARTITION BY order_item_order_id),2) order_revenue,
           round(order_item_subtotal * 100 / SUM(order_item_subtotal) OVER(PARTITION BY order_item_order_id),2) percentage
    FROM order_items
    ORDER BY 1, 2''')
# mode="overwrite" keeps the cell re-runnable: the default save mode raises an
# error when the output directory already exists from a previous run.
SalesDataPcntorders.coalesce(1).write.csv("/FileStore/tables/SalesDataPcntorders", mode="overwrite", compression="none", header='true')
SalesDataPcntorders.show(20, False)