##### Usecase: Get sales data by departments
###### Solutions are given for following problem statements
###### Problem statement-1:Get sales data for departments per day
###### Problem statement-2:Get sales data for departments per month
###### Problem statement-3:Get sales data for departments per year
###### Problem statement-4:Get monthly sales data from TOP5 revenue generating departments
###### Problem statement-5:Get third and final quarters of 2013 sales data from TOP5 revenue generating departments for each month with rankings
###### Problem statement-6:Get sales data for Men's product
###### Problem statement-7:Get monthly sales data from kids and women categories for first two quarters of 2014 
###### Problem statement-8:Get monthly sales data of TOP10 revenue generating Nike and adidas products with rankings
###### Problem statement-9:Get quarterly sales data of golf products for 2013 final quarter
###### Problem statement-10:Get sales data of TOP5 Apparel products for each year

In [2]:
from pyspark.sql import SQLContext, Row
from pyspark.sql import HiveContext
from pyspark.sql.functions import round,sum,avg,count,substring
# Hive-enabled SQL entry point used by every query in this notebook.
# `sc` is not defined in this notebook; it is expected to be supplied by the
# notebook runtime as the active SparkContext.
hiveContext = HiveContext(sc)
# Issue the shuffle-partition setting through the same context that runs the
# queries, instead of relying on a separate `sqlContext` global that this
# notebook never creates.
hiveContext.sql("set spark.sql.shuffle.partitions=10")

In [3]:
# Create the working database only when it is missing so that this cell can be
# re-run (e.g. after a kernel restart) without failing on an existing database.
hiveContext.sql("create database if not exists retail_db")
# Make retail_db the current database for all unqualified table names below.
hiveContext.sql("use retail_db")

In [4]:
hiveContext.sql("create table orders(order_id int,order_date string,order_customer_id int, order_status string) row format delimited fields terminated by ','")
hiveContext.sql("create table order_items(order_item_id int, order_item_order_id int, order_item_product_id  int, order_item_quantity int,order_item_subtotal float,order_item_product_price float) \ 
                   row format delimited fields terminated by ','")
hiveContext.sql("create table customers(customer_id int, customer_fname string, customer_lname string, customer_email string,customer_password string,customer_street string,customer_city string, \ 
                 customer_state string, customer_zipcode string) row format delimited fields terminated by ';'")
hiveContext.sql("create table products(product_id int, product_category_id int, product_name string, product_desc string, product_price float, product_image string) \ 
                                                           row format delimited fields terminated by ';'")
hiveContext.sql("create table departments(department_id int, department_name string) row format delimited fields terminated by ','")
hiveContext.sql("create table categories(category_id int, category_department_id  int, category_name string) row format delimited fields terminated by ','")

In [5]:
hiveContext.sql("load data local inpath 'dbfs:/FileStore/tables/retaildbtext/Orders.txt' into table orders")
hiveContext.sql("load data local inpath 'dbfs:/FileStore/tables/retaildbtext/Order_Items.txt' into table order_items")
hiveContext.sql("load data local inpath 'dbfs:/FileStore/tables/retaildbtext/Customers.txt' into table customers")
hiveContext.sql("load data local inpath 'dbfs:/FileStore/tables/retaildbtext/Products.txt' into table products")
hiveContext.sql("load data local inpath 'dbfs:/FileStore/tables/retaildbtext/Departments.txt' into table departments")
hiveContext.sql("load data local inpath 'dbfs:/FileStore/tables/retaildbtext/Categories.txt' into table categories")                

In [6]:

hiveContext.sql("show tables").show()

In [7]:
hiveContext.sql("select * from orders").show(5,False)

In [8]:
hiveContext.sql("select * from orders").printSchema()

In [9]:
hiveContext.sql("select * from order_items").show(5,False)


In [10]:
hiveContext.sql("select * from order_items").printSchema()

In [11]:
hiveContext.sql("select * from customers").show(5,False)

In [12]:
hiveContext.sql("select * from customers").printSchema()

In [13]:
hiveContext.sql("select * from products").show(5)

In [14]:
hiveContext.sql("select * from products").printSchema()

In [15]:
hiveContext.sql("select * from categories").show(5)

In [16]:
hiveContext.sql("select * from categories").printSchema()

In [17]:
hiveContext.sql("select * from departments").show(5)

In [18]:
hiveContext.sql("select * from departments").printSchema()

###### Problem statement-1:Get sales data for departments per day

In [20]:
# Per-day, per-department revenue, line-item count and average line-item value.
# The day key is the first 10 characters of order_date (presumably the
# yyyy-MM-dd part of a timestamp-like string - confirm against the data).
# NOTE(review): NumberOforders_perday counts order_items rows, not distinct orders.
departmentsSalesDataPerDay = hiveContext.sql('''
    select substring(otl.order_date,0,10) as orderdate,
           dept.department_name,
           round(sum(oitl.order_item_subtotal),2) as deptrevenue_perday,
           count(oitl.order_item_subtotal) as NumberOforders_perday,
           round(avg(oitl.order_item_subtotal),2) as avgsales_perday
    from orders otl
    join order_items oitl on otl.order_id = oitl.order_item_order_id
    join products pdtl on oitl.order_item_product_id = pdtl.product_id
    join categories catg on catg.category_id = pdtl.product_category_id
    join departments dept on dept.department_id = catg.category_department_id
    group by orderdate, dept.department_name
    order by orderdate, dept.department_name''')
# mode="overwrite": the default mode raises if the target directory already
# exists, which made this cell fail on every re-run.
# NOTE(review): the directory is named "productsSalesDataPerDay" although the
# data is per department; the path is kept unchanged for existing consumers.
departmentsSalesDataPerDay.coalesce(1).write.csv("/FileStore/tables/productsSalesDataPerDay", mode="overwrite", compression="none", header='true')
departmentsSalesDataPerDay.show(20, False)

###### Problem statement-2: Get sales data for departments per month

In [22]:
# Per-month, per-department revenue, line-item count and average line-item value.
# The month key is the first 7 characters of order_date (presumably yyyy-MM -
# confirm against the data).
# NOTE(review): NumberOforders_permonth counts order_items rows, not distinct orders.
departmentsSalesDataPerMonth = hiveContext.sql('''
    select substring(otl.order_date,0,7) as ordermonth,
           dept.department_name,
           round(sum(oitl.order_item_subtotal),2) as deptrevenue_permonth,
           count(oitl.order_item_subtotal) as NumberOforders_permonth,
           round(avg(oitl.order_item_subtotal),2) as avgsales_permonth
    from orders otl
    join order_items oitl on otl.order_id = oitl.order_item_order_id
    join products pdtl on oitl.order_item_product_id = pdtl.product_id
    join categories catg on catg.category_id = pdtl.product_category_id
    join departments dept on dept.department_id = catg.category_department_id
    group by ordermonth, dept.department_name
    order by ordermonth, dept.department_name''')
# mode="overwrite": the default mode raises if the target directory already
# exists, which made this cell fail on every re-run.
departmentsSalesDataPerMonth.coalesce(1).write.csv("/FileStore/tables/departmentsSalesDataPerMonth", mode="overwrite", compression="none", header='true')
departmentsSalesDataPerMonth.show(20, False)

###### Problem statement-3: Get sales data  for departments per year

In [24]:
# Per-year, per-department revenue, line-item count and average line-item value.
# The year key is the first 4 characters of order_date (presumably yyyy -
# confirm against the data).
# NOTE(review): NumberOforders_peryear counts order_items rows, not distinct orders.
departmentsSalesDataPerYear = hiveContext.sql('''
    select substring(otl.order_date,0,4) as orderyear,
           dept.department_name,
           round(sum(oitl.order_item_subtotal),2) as deptrevenue_peryear,
           count(oitl.order_item_subtotal) as NumberOforders_peryear,
           round(avg(oitl.order_item_subtotal),2) as avgsales_peryear
    from orders otl
    join order_items oitl on otl.order_id = oitl.order_item_order_id
    join products pdtl on oitl.order_item_product_id = pdtl.product_id
    join categories catg on catg.category_id = pdtl.product_category_id
    join departments dept on dept.department_id = catg.category_department_id
    group by orderyear, dept.department_name
    order by orderyear, dept.department_name''')
# mode="overwrite": the default mode raises if the target directory already
# exists, which made this cell fail on every re-run.
departmentsSalesDataPerYear.coalesce(1).write.csv("/FileStore/tables/departmentsSalesDataPerYear", mode="overwrite", compression="none", header='true')
departmentsSalesDataPerYear.show(20, False)

###### Problem statement-4:Get sales data from TOP5 revenue generating departments

In [26]:
# Total revenue and total item quantity per department, restricted to the five
# departments with the highest revenue.
# `limit 5` enforces the "TOP5" in the problem statement and variable name; the
# original query returned every department.
departmentsSalesDatafortop5dept = hiveContext.sql('''
    select dept.department_name,
           round(sum(oitl.order_item_subtotal),2) as deptrevenue,
           sum(oitl.order_item_quantity) order_quantity
    from orders otl
    join order_items oitl on otl.order_id = oitl.order_item_order_id
    join products pdtl on oitl.order_item_product_id = pdtl.product_id
    join categories catg on catg.category_id = pdtl.product_category_id
    join departments dept on dept.department_id = catg.category_department_id
    group by dept.department_name
    order by deptrevenue desc
    limit 5''')
# mode="overwrite": the default mode raises if the target directory already
# exists, which made this cell fail on every re-run.
departmentsSalesDatafortop5dept.coalesce(1).write.csv("/FileStore/tables/departmentsSalesDatafortop5dept", mode="overwrite", compression="none", header='true')
departmentsSalesDatafortop5dept.show(20, False)

###### Problem statement-5:Get third and final quarters of 2013 sales data from TOP5 revenue generating departments for each month with rankings

In [28]:
# Monthly department revenue for 2013-07-01 .. 2013-12-31 with a per-month rank
# (1 = highest revenue in that month).
# The outer `WHERE ranking_order<=5` keeps only the top five ranks per month;
# the original computed the rank but never filtered on it, so every department
# was returned. rank() can yield more than five rows in a month when revenues tie.
departmentSalesDatafortop5perquart = hiveContext.sql('''
    select * from (
        select substring(otl.order_date,0,7) as ordermonth,
               dept.department_name,
               round(sum(oitl.order_item_subtotal),2) as deptrevenue,
               rank() over(partition by substring(otl.order_date,0,7)
                           order by round(sum(oitl.order_item_subtotal),2) desc) ranking_order
        from orders otl, order_items oitl, products pdtl, categories catg, departments dept
        where otl.order_id = oitl.order_item_order_id
          and oitl.order_item_product_id = pdtl.product_id
          and catg.category_id = pdtl.product_category_id
          and dept.department_id = catg.category_department_id
          and CAST(otl.order_date AS DATE)>='2013-07-01'
          and CAST(otl.order_date AS DATE)<='2013-12-31'
        group by substring(otl.order_date,0,7), dept.department_name) t
    WHERE ranking_order<=5
    ORDER BY 1,3 desc''')
# mode="overwrite": the default mode raises if the target directory already
# exists, which made this cell fail on every re-run.
departmentSalesDatafortop5perquart.coalesce(1).write.csv("/FileStore/tables/departmentSalesDatafortop5perquart", mode="overwrite", compression="none", header='true')
departmentSalesDatafortop5perquart.show(36, False)

###### Problem statement-6:Get sales data for Men's product

In [30]:
# Revenue and item quantity per (department, product) for products whose name
# contains the substring 'Men', highest revenue first.
departmentsSalesDataforMensprod = hiveContext.sql('''
    select dept.department_name,
           pdtl.product_name,
           round(sum(oitl.order_item_subtotal),2) as productrevenue,
           sum(oitl.order_item_quantity) order_quantity
    from orders otl
    join order_items oitl on otl.order_id = oitl.order_item_order_id
    join products pdtl on oitl.order_item_product_id = pdtl.product_id
    join categories catg on catg.category_id = pdtl.product_category_id
    join departments dept on dept.department_id = catg.category_department_id
    WHERE pdtl.product_name LIKE '%Men%'
    group by dept.department_name, pdtl.product_name
    order by productrevenue desc''')
# mode="overwrite": the default mode raises if the target directory already
# exists, which made this cell fail on every re-run.
departmentsSalesDataforMensprod.coalesce(1).write.csv("/FileStore/tables/departmentsSalesDataforMensprod", mode="overwrite", compression="none", header='true')
departmentsSalesDataforMensprod.show(20, False)

###### Problem statement-7:Get monthly sales data from kids and women categories for first two quarters of 2014

In [32]:
# Monthly revenue and item quantity per (department, product) for products whose
# name contains 'Kid' or 'Women', for orders dated 2014-01-01 .. 2014-06-30.
# Date bounds use single-quoted SQL string literals (the original mixed in a
# double-quoted one, which some SQL modes treat as an identifier).
departmentsSalesDataforkidwomen = hiveContext.sql('''
    select substring(otl.order_date,0,7) as ordermonth,
           dept.department_name,
           pdtl.product_name,
           round(sum(oitl.order_item_subtotal),2) as productrevenue,
           sum(oitl.order_item_quantity) order_quantity
    from orders otl
    join order_items oitl on otl.order_id = oitl.order_item_order_id
    join products pdtl on oitl.order_item_product_id = pdtl.product_id
    join categories catg on catg.category_id = pdtl.product_category_id
    join departments dept on dept.department_id = catg.category_department_id
    WHERE (pdtl.product_name LIKE '%Kid%' or pdtl.product_name LIKE '%Women%')
      and (CAST(otl.order_date AS DATE)>='2014-01-01' AND CAST(otl.order_date AS DATE)<='2014-06-30')
    group by ordermonth, dept.department_name, pdtl.product_name
    order by ordermonth, productrevenue desc''')
# mode="overwrite": the default mode raises if the target directory already
# exists, which made this cell fail on every re-run.
departmentsSalesDataforkidwomen.coalesce(1).write.csv("/FileStore/tables/departmentsSalesDataforkidwomen", mode="overwrite", compression="none", header='true')
departmentsSalesDataforkidwomen.show(30, False)

###### Problem statement-8:Get monthly sales data of TOP10 revenue generating Nike and adidas products with rankings

In [34]:
# Monthly revenue per product for products whose name contains 'Nike' or
# 'adidas', ranked within each month (1 = highest revenue) and limited to
# ranks 1..10. rank() can yield more than ten rows in a month when revenues tie.
departSalesDatafortop10 = hiveContext.sql('''
    select * from (
        select substring(otl.order_date,0,7) as ordermonth,
               pdtl.product_name,
               round(sum(oitl.order_item_subtotal),2) as deptrevenue,
               rank() over(partition by substring(otl.order_date,0,7)
                           order by round(sum(oitl.order_item_subtotal),2) desc) ranking_order
        from orders otl, order_items oitl, products pdtl, categories catg, departments dept
        where otl.order_id = oitl.order_item_order_id
          and oitl.order_item_product_id = pdtl.product_id
          and catg.category_id = pdtl.product_category_id
          and dept.department_id = catg.category_department_id
          and (pdtl.product_name LIKE '%Nike%' or pdtl.product_name LIKE '%adidas%')
        group by substring(otl.order_date,0,7), pdtl.product_name) t
    WHERE ranking_order<=10
    ORDER BY 1,4''')
# mode="overwrite": the default mode raises if the target directory already
# exists, which made this cell fail on every re-run.
departSalesDatafortop10.coalesce(1).write.csv("/FileStore/tables/departSalesDatafortop10", mode="overwrite", compression="none", header='true')
departSalesDatafortop10.show(30, False)

###### Problem statement-9:Get quarterly sales data of golf products for 2013 final quarter

In [36]:
# Monthly revenue and item quantity per (department, product) for products whose
# name contains 'Golf', for orders dated 2013-10-01 .. 2013-12-31.
# Date bounds use single-quoted SQL string literals (the original mixed in a
# double-quoted one, which some SQL modes treat as an identifier).
golfprodSalesData2013fquart = hiveContext.sql('''
    select substring(otl.order_date,0,7) as ordermonth,
           dept.department_name,
           pdtl.product_name,
           round(sum(oitl.order_item_subtotal),2) as productrevenue,
           sum(oitl.order_item_quantity) order_quantity
    from orders otl
    join order_items oitl on otl.order_id = oitl.order_item_order_id
    join products pdtl on oitl.order_item_product_id = pdtl.product_id
    join categories catg on catg.category_id = pdtl.product_category_id
    join departments dept on dept.department_id = catg.category_department_id
    WHERE (pdtl.product_name LIKE '%Golf%')
      and (CAST(otl.order_date AS DATE)>='2013-10-01' AND CAST(otl.order_date AS DATE)<='2013-12-31')
    group by ordermonth, dept.department_name, pdtl.product_name
    order by ordermonth, productrevenue desc''')
# mode="overwrite": the default mode raises if the target directory already
# exists, which made this cell fail on every re-run.
golfprodSalesData2013fquart.coalesce(1).write.csv("/FileStore/tables/golfprodSalesData2013fquart", mode="overwrite", compression="none", header='true')
golfprodSalesData2013fquart.show(30, False)

###### Problem statement-10:Get sales data of TOP5 Apparel products for each year

In [38]:
# Yearly revenue, line-item count and average line-item value per product in the
# 'Apparel' department, ranked within each year (1 = highest revenue) and limited
# to ranks 1..5. rank() can yield more than five rows in a year when revenues tie.
# The department filter uses a single-quoted SQL string literal (the original
# used double quotes, which some SQL modes treat as an identifier).
# NOTE(review): NumberOforders counts order_items rows, not distinct orders.
apparelSalesDatatop5peryear = hiveContext.sql('''
    select * from (
        select substring(otl.order_date,0,4) as orderyear,
               dept.department_name,
               pdtl.product_name,
               round(sum(oitl.order_item_subtotal),2) as Totalrevenue,
               count(oitl.order_item_subtotal) as NumberOforders,
               round(avg(oitl.order_item_subtotal),2) as avgsales,
               rank() over(partition by substring(otl.order_date,0,4)
                           order by round(sum(oitl.order_item_subtotal),2) desc) ranking_order
        from orders otl, order_items oitl, products pdtl, categories catg, departments dept
        where otl.order_id = oitl.order_item_order_id
          and oitl.order_item_product_id = pdtl.product_id
          and catg.category_id = pdtl.product_category_id
          and dept.department_id = catg.category_department_id
          and dept.department_name = 'Apparel'
        group by substring(otl.order_date,0,4), dept.department_name, pdtl.product_name) t
    WHERE ranking_order<=5
    ORDER BY 1, 7''')
# mode="overwrite": the default mode raises if the target directory already
# exists, which made this cell fail on every re-run.
apparelSalesDatatop5peryear.coalesce(1).write.csv("/FileStore/tables/apparelSalesDatatop5peryear", mode="overwrite", compression="none", header='true')
apparelSalesDatatop5peryear.show(30, False)