##### Use case: Get sales data by products
###### Solutions are given for the following problem statements
###### Problem statement-1: Get sales data for products per day
###### Problem statement-2: Get sales data for products per month
###### Problem statement-3: Get sales data for products per year
###### Problem statement-4: Get monthly sales data for the most sold products
###### Problem statement-5: Get sales data for the third and final quarters for the top 5 revenue-generating products of each month
###### Problem statement-6: Get sales data for each product category
###### Problem statement-7: Get monthly sales data for the top 5 revenue-generating categories
###### Problem statement-8: Get the highest product price for each category
###### Problem statement-9: Get the sales data of the best-selling and the second best-selling products in every category
###### Problem statement-10: Get the top 5 highest-selling products for each year

In [2]:
from pyspark.sql import SQLContext, Row
from pyspark.sql import HiveContext
from pyspark.sql.functions import round,sum,avg,count,substring
# Build the Hive-enabled SQL context used by every cell below.
# `sc` is not defined in this notebook; presumably the runtime provides it - TODO confirm.
hiveContext = HiveContext(sc)
# Issue the shuffle-partition setting on the same context that runs the queries,
# rather than on a separate `sqlContext` that this notebook never creates.
hiveContext.sql("set spark.sql.shuffle.partitions=10")

In [3]:
# Create the working database only when it is missing, so this cell does not
# raise an "already exists" error when it is run again.
hiveContext.sql("create database if not exists retail_db")
# Make retail_db the current database for the statements that follow.
hiveContext.sql("use retail_db")

In [4]:
# Define the four comma/semicolon-delimited text tables used by the analysis.
# Plain CREATE TABLE (no IF NOT EXISTS) is kept on purpose: it fails fast when
# the tables already exist, which stops the following load cell from appending
# a second copy of the data on a re-run.
hiveContext.sql("create table orders(order_id int,order_date string,order_customer_id int, order_status string) row format delimited fields terminated by ','")
hiveContext.sql("create table order_items(order_item_id int, order_item_order_id int, order_item_product_id  int, order_item_quantity int,order_item_subtotal float,order_item_product_price float) row format delimited fields terminated by ','")
hiveContext.sql("create table customers(customer_id int, customer_fname string, customer_lname string, customer_email string,customer_password string,customer_street string,customer_city string, customer_state string, customer_zipcode string) row format delimited fields terminated by ','")
# The products table is the only one delimited by ';' instead of ','.
# The statement is split with adjacent string literals inside the call's
# parentheses; the previous backslash continuation was followed by a space,
# which left the string literal unterminated (SyntaxError).
hiveContext.sql(
    "create table products(product_id int, product_category_id int, product_name string, "
    "product_desc string, product_price float, product_image string) "
    "row format delimited fields terminated by ';'"
)

In [5]:
# Load each source text file into its matching table.
# LOAD DATA ... INTO TABLE (without OVERWRITE) appends to the table's existing rows.
hiveContext.sql("load data local inpath 'dbfs:/FileStore/tables/retaildbtext/Orders.txt' into table orders")
hiveContext.sql("load data local inpath 'dbfs:/FileStore/tables/retaildbtext/Order_Items.txt' into table order_items")
hiveContext.sql("load data local inpath 'dbfs:/FileStore/tables/retaildbtext/Customers.txt' into table customers")
# Products.txt belongs in `products`; it was previously loaded into `customers`,
# which left `products` empty and mixed product rows into the customer data.
hiveContext.sql("load data local inpath 'dbfs:/FileStore/tables/retaildbtext/Products.txt' into table products")

In [6]:
# Display the result of SHOW TABLES to confirm the tables were created.
(
    hiveContext
    .sql("show tables")
    .show()
)

In [7]:
# Preview the first five rows of orders without truncating column values.
(
    hiveContext
    .sql("select * from orders")
    .show(n=5, truncate=False)
)

In [8]:
# Print the column names and types of the orders table.
(
    hiveContext
    .sql("select * from orders")
    .printSchema()
)

In [9]:
# Preview the first five rows of order_items without truncating column values.
(
    hiveContext
    .sql("select * from order_items")
    .show(n=5, truncate=False)
)


In [10]:
# Print the column names and types of the order_items table.
(
    hiveContext
    .sql("select * from order_items")
    .printSchema()
)

In [11]:
# Preview the first five rows of customers without truncating column values.
(
    hiveContext
    .sql("select * from customers")
    .show(n=5, truncate=False)
)

In [12]:
# Print the column names and types of the customers table.
(
    hiveContext
    .sql("select * from customers")
    .printSchema()
)

In [13]:
# Preview the first five rows of products (long values are truncated by default).
(
    hiveContext
    .sql("select * from products")
    .show(n=5)
)

In [14]:
# Print the column names and types of the products table.
(
    hiveContext
    .sql("select * from products")
    .printSchema()
)

###### Problem statement-1:Get sales data for products per day

In [16]:
# Problem 1: per (day, product) total revenue, number of order items and
# average item subtotal. The day key is substring(order_date, 0, 10) - assumes
# order_date starts with yyyy-MM-dd; TODO confirm against the data.
# orders, order_items and products are inner-joined, so only matching rows count.
productsSalesDataPerDay = hiveContext.sql('''select substring(otl.order_date,0,10) as orderdate,pdtl.product_name,
                                   round(sum(oitl.order_item_subtotal),2) as Totalrevenue_perday,  
                                   count(oitl.order_item_subtotal) as NumberOforders_perday, 
                                   round(avg(oitl.order_item_subtotal),2) as avgsales_perday 
                                   from orders otl join order_items oitl on otl.order_id=oitl.order_item_order_id
                                   join products pdtl on oitl.order_item_product_id = pdtl.product_id
                                   group by orderdate,pdtl.product_name 
                                   order by orderdate,pdtl.product_name''')
# Export through a single partition as uncompressed CSV with a header row.
# mode="overwrite" makes the cell re-runnable: with the default save mode the
# write raises once the output path exists, and the show() below never runs.
productsSalesDataPerDay.coalesce(1).write.csv(
    "/FileStore/tables/productsSalesDataPerDay",
    mode="overwrite",
    compression="none",
    header='true',
)
productsSalesDataPerDay.show(20, False)

###### Problem statement-2: Get sales data for products per month

In [18]:
# Problem 2: per (month, product) total revenue, number of order items and
# average item subtotal. The month key is substring(order_date, 0, 7) - assumes
# order_date starts with yyyy-MM; TODO confirm against the data.
productsSalesDataPerMonth = hiveContext.sql('''select substring(otl.order_date,0,7) as ordermonth,pdtl.product_name,
                                   round(sum(oitl.order_item_subtotal),2) as Totalrevenue_permonth,  
                                   count(oitl.order_item_subtotal) as NumberOforders_permonth, 
                                   round(avg(oitl.order_item_subtotal),2) as avgsales_permonth 
                                   from orders otl join order_items oitl on otl.order_id=oitl.order_item_order_id
                                   join products pdtl on oitl.order_item_product_id = pdtl.product_id
                                   group by ordermonth,pdtl.product_name 
                                   order by ordermonth,pdtl.product_name''')
# Export through a single partition as uncompressed CSV with a header row.
# mode="overwrite" makes the cell re-runnable: with the default save mode the
# write raises once the output path exists, and the show() below never runs.
productsSalesDataPerMonth.coalesce(1).write.csv(
    "/FileStore/tables/productsSalesDataPerMonth",
    mode="overwrite",
    compression="none",
    header='true',
)
productsSalesDataPerMonth.show(20, False)

###### Problem statement-3: Get sales data  for products per year

In [20]:
# Problem 3: per (year, product) total revenue, number of order items and
# average item subtotal. The year key is substring(order_date, 0, 4) - assumes
# order_date starts with yyyy; TODO confirm against the data.
productsSalesDataPerYear = hiveContext.sql('''select substring(otl.order_date,0,4) as orderyear,pdtl.product_name,
                                   round(sum(oitl.order_item_subtotal),2) as Totalrevenue_peryear,  
                                   count(oitl.order_item_subtotal) as NumberOforders_peryear, 
                                   round(avg(oitl.order_item_subtotal),2) as avgsales_peryear 
                                   from orders otl join order_items oitl on otl.order_id=oitl.order_item_order_id
                                   join products pdtl on oitl.order_item_product_id = pdtl.product_id
                                   group by orderyear, pdtl.product_name 
                                   order by orderyear, pdtl.product_name''')
# Export through a single partition as uncompressed CSV with a header row.
# mode="overwrite" makes the cell re-runnable: with the default save mode the
# write raises once the output path exists, and the show() below never runs.
productsSalesDataPerYear.coalesce(1).write.csv(
    "/FileStore/tables/productsSalesDataPerYear",
    mode="overwrite",
    compression="none",
    header='true',
)
productsSalesDataPerYear.show(20, False)

###### Problem statement-4:Get monthly sales data from most sold products between 2013-July and 2014-July

In [22]:
# Problem 4: for every month, the product(s) with the highest summed
# order_item_quantity. rank() is used, so products tied for first place in a
# month are all returned.
# NOTE(review): the section heading mentions 2013-July to 2014-July, but the
# query applies no date filter - confirm the data only spans that period.
productsSalesDataformostsold = hiveContext.sql('''select * from (select substring(otl.order_date,0,7) as ordermonth,pdtl.product_name,
                                   sum(oitl.order_item_quantity) order_quantity,
                                   rank() over(partition by substring(otl.order_date,0,7) order by sum(oitl.order_item_quantity) desc) ranking_order
                                   from orders otl,order_items oitl,products pdtl where otl.order_id=oitl.order_item_order_id
                                   and oitl.order_item_product_id = pdtl.product_id
                                   group by substring(otl.order_date,0,7),pdtl.product_name) s 
                                   WHERE ranking_order=1 ORDER BY 1''')
# Export through a single partition as uncompressed CSV with a header row.
# mode="overwrite" makes the cell re-runnable: with the default save mode the
# write raises once the output path exists, and the show() below never runs.
productsSalesDataformostsold.coalesce(1).write.csv(
    "/FileStore/tables/productsSalesDataformostsold",
    mode="overwrite",
    compression="none",
    header='true',
)
productsSalesDataformostsold.show(20, False)

###### Problem statement-5: Get sales data for the third and final quarters of 2013 for the top 5 revenue-generating products of each month

In [24]:
# Problem 5: for each month from 2013-07-01 through 2013-12-31 (inclusive),
# the products ranked 1-5 by rounded monthly revenue, together with their item
# count and average item subtotal. The result is ordered by month (column 1)
# and then by rank (column 6).
productsSalesDatafortop5revenue = hiveContext.sql('''select * from (select substring(otl.order_date,0,7) as ordermonth,pdtl.product_name,
                                   round(sum(oitl.order_item_subtotal),2) as Totalrevenue_permonth,  
                                   count(oitl.order_item_subtotal) as NumberOforders_permonth, 
                                   round(avg(oitl.order_item_subtotal),2) as avgsales_permonth,
                                   rank() over(partition by substring(otl.order_date,0,7) order by round(sum(oitl.order_item_subtotal),2) desc) ranking_order
                                   from orders otl,order_items oitl,products pdtl where otl.order_id=oitl.order_item_order_id
                                   and oitl.order_item_product_id = pdtl.product_id and CAST(otl.order_date AS DATE)>="2013-07-01" and CAST(otl.order_date AS DATE)<="2013-12-31"
                                   group by substring(otl.order_date,0,7),pdtl.product_name) s 
                                   WHERE ranking_order<=5 ORDER BY 1, 6''')
# Export through a single partition as uncompressed CSV with a header row.
# mode="overwrite" makes the cell re-runnable: with the default save mode the
# write raises once the output path exists, and the show() below never runs.
productsSalesDatafortop5revenue.coalesce(1).write.csv(
    "/FileStore/tables/productsSalesDatafortop5revenue",
    mode="overwrite",
    compression="none",
    header='true',
)
productsSalesDatafortop5revenue.show(30, False)

###### Problem statement-6:Get sales data for each product category

In [26]:
# Problem 6: per product_category_id total revenue, number of order items and
# average item subtotal, ordered by category id.
productsSalesDataPercategory = hiveContext.sql('''select CAST(pdtl.product_category_id AS INT) as categories,
                                   round(sum(oitl.order_item_subtotal),2) as orderrevenue,  
                                   count(oitl.order_item_subtotal) as NumberOforders, 
                                   round(avg(oitl.order_item_subtotal),2) as avgsales 
                                   from orders otl join order_items oitl on otl.order_id=oitl.order_item_order_id
                                   join products pdtl on oitl.order_item_product_id = pdtl.product_id
                                   group by categories
                                   order by categories ''')
# Export through a single partition as uncompressed CSV with a header row.
# mode="overwrite" makes the cell re-runnable: with the default save mode the
# write raises once the output path exists, and the show() below never runs.
productsSalesDataPercategory.coalesce(1).write.csv(
    "/FileStore/tables/productsSalesDataPercategory",
    mode="overwrite",
    compression="none",
    header='true',
)
productsSalesDataPercategory.show(60, False)

###### Problem statement-7:Get monthly sales data from Top5 revenue generating categories

In [28]:
# Problem 7: for each month, the categories ranked 1-5 by rounded monthly
# revenue, with their item count and average item subtotal. The result is
# ordered by month (column 1) and then by rank (column 6).
productsSalesDatatop5categories = hiveContext.sql('''select * from (select substring(otl.order_date,0,7) as ordermonth,pdtl.product_category_id as categories,
                                   round(sum(oitl.order_item_subtotal),2) as Totalrevenue_permonth,  
                                   count(oitl.order_item_subtotal) as NumberOforders_permonth, 
                                   round(avg(oitl.order_item_subtotal),2) as avgsales_permonth,
                                   rank() over(partition by substring(otl.order_date,0,7) order by round(sum(oitl.order_item_subtotal),2) desc) ranking_order
                                   from orders otl,order_items oitl,products pdtl where otl.order_id=oitl.order_item_order_id
                                   and oitl.order_item_product_id = pdtl.product_id 
                                   group by substring(otl.order_date,0,7),pdtl.product_category_id) s 
                                   WHERE ranking_order<=5 ORDER BY 1, 6''')
# Export through a single partition as uncompressed CSV with a header row.
# mode="overwrite" makes the cell re-runnable: with the default save mode the
# write raises once the output path exists, and the show() below never runs.
productsSalesDatatop5categories.coalesce(1).write.csv(
    "/FileStore/tables/productsSalesDatatop5categories",
    mode="overwrite",
    compression="none",
    header='true',
)
productsSalesDatatop5categories.show(30, False)

###### Problem statement-8:Get highest product price for each category

In [30]:
# Problem 8: for each category, the product(s) whose price equals that
# category's maximum product_price. The inner query computes the maximum per
# category; the join plus the WHERE clause keeps only products at that price,
# so several products are returned when they share the top price.
productsSalesDatahighprice = hiveContext.sql('''select CAST(pdtl.product_category_id AS INT) as categories,pdtl.product_name,
                                   max(CAST(pdtl.product_price AS float)) as highestproductprice
                                   from products pdtl join
                                    (SELECT CAST(product_category_id AS INT) as categories, max(CAST(product_price AS float)) as highestproductprice
                                      FROM products 
                                      GROUP BY categories ORDER BY categories  ) cag 
                                      on CAST(pdtl.product_category_id AS INT)=cag.categories
                                    where pdtl.product_price = cag.highestproductprice
                                   group by CAST(pdtl.product_category_id AS INT),pdtl.product_name
                                   order by CAST(pdtl.product_category_id AS INT) ''')
# Export through a single partition as uncompressed CSV with a header row.
# mode="overwrite" makes the cell re-runnable: with the default save mode the
# write raises once the output path exists, and the show() below never runs.
productsSalesDatahighprice.coalesce(1).write.csv(
    "/FileStore/tables/productsSalesDatahighprice",
    mode="overwrite",
    compression="none",
    header='true',
)
productsSalesDatahighprice.show(60, False)

###### Problem statement-9: Get the sales data of best-selling and the second best-selling products in every category

In [32]:
# Problem 9: for each category, the products ranked 1 and 2 by rounded total
# revenue, with their item count and average item subtotal.
# The final ORDER BY sorts by category (column 1) and then by bestselling_order
# (column 6), matching the other ranking queries in this notebook. It previously
# sorted by avgsales (column 5) descending, which could list the second
# best-seller ahead of the best-seller.
productSalesDatabstsellcategories = hiveContext.sql('''select * from (select CAST(pdtl.product_category_id AS INT) as categories,pdtl.product_name,
                                   round(sum(oitl.order_item_subtotal),2) as Totalrevenue,  
                                   count(oitl.order_item_subtotal) as NumberOforders, 
                                   round(avg(oitl.order_item_subtotal),2) as avgsales,
                                   rank() over(partition by CAST(pdtl.product_category_id AS INT) order by round(sum(oitl.order_item_subtotal),2) desc) bestselling_order
                                   from orders otl,order_items oitl,products pdtl where otl.order_id=oitl.order_item_order_id
                                   and oitl.order_item_product_id = pdtl.product_id 
                                   group by CAST(pdtl.product_category_id AS INT),pdtl.product_name) s 
                                   WHERE bestselling_order<=2 ORDER BY 1, 6''')
# Export through a single partition as uncompressed CSV with a header row.
# mode="overwrite" makes the cell re-runnable: with the default save mode the
# write raises once the output path exists, and the show() below never runs.
productSalesDatabstsellcategories.coalesce(1).write.csv(
    "/FileStore/tables/productSalesDatabstsellcategories",
    mode="overwrite",
    compression="none",
    header='true',
)
productSalesDatabstsellcategories.show(30, False)

###### Problem statement-10:Get TOP5 highest selling product for each year

In [34]:
# Problem 10: for each year, the products ranked 1-5 by rounded total revenue,
# with their item count and average item subtotal. The result is ordered by
# year (column 1) and then by rank (column 6).
productsSalesDatatop5peryear = hiveContext.sql('''select * from (select substring(otl.order_date,0,4) as orderyear,pdtl.product_name,
                                   round(sum(oitl.order_item_subtotal),2) as Totalrevenue,  
                                   count(oitl.order_item_subtotal) as NumberOforders, 
                                   round(avg(oitl.order_item_subtotal),2) as avgsales,
                                   rank() over(partition by substring(otl.order_date,0,4) order by round(sum(oitl.order_item_subtotal),2) desc) ranking_order
                                   from orders otl,order_items oitl,products pdtl where otl.order_id=oitl.order_item_order_id
                                   and oitl.order_item_product_id = pdtl.product_id 
                                   group by substring(otl.order_date,0,4),pdtl.product_name) t 
                                   WHERE ranking_order<=5 ORDER BY 1, 6 ''')
# Export through a single partition as uncompressed CSV with a header row.
# mode="overwrite" makes the cell re-runnable: with the default save mode the
# write raises once the output path exists, and the show() below never runs.
productsSalesDatatop5peryear.coalesce(1).write.csv(
    "/FileStore/tables/productsSalesDatatop5peryear",
    mode="overwrite",
    compression="none",
    header='true',
)
productsSalesDatatop5peryear.show(30, False)