##### Usecase: Get sales data by departments
###### Solutions are given for the following problem statements
###### Problem statement-1:Get sales data for departments per day
###### Problem statement-2:Get sales data for departments per month
###### Problem statement-3:Get sales data for departments per year
###### Problem statement-4:Get monthly sales data from TOP5 revenue generating departments
###### Problem statement-5:Get third and final quarters of 2013 sales data from TOP5 revenue generating departments for each month with rankings
###### Problem statement-6:Get sales data for Men's product
###### Problem statement-7:Get monthly sales data from kids and women categories for first two quarters of 2014 
###### Problem statement-8:Get monthly sales data of TOP10 revenue generating Nike and adidas products with rankings
###### Problem statement-9:Get quarterly sales data of golf products for 2013 final quarter
###### Problem statement-10:Get sales data of TOP3 Apparel products for each year

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum,round,avg,rank, col
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from pyspark.sql.functions import unix_timestamp, lit
# Build (or reuse) a local Spark session with Hive support enabled.
spark = (
    SparkSession.builder
    .master('local')
    .appName('SalesDatabydepartments')
    .enableHiveSupport()
    .getOrCreate()
)
# Use 10 shuffle partitions for joins and aggregations in this session.
spark.conf.set("spark.sql.shuffle.partitions", 10)

In [3]:
# Load the comma-separated orders file with an explicit DDL schema.
ordersdf = spark.read.csv(
    "/FileStore/tables/retaildbtext/Orders.txt",
    schema='order_id int,order_date string,order_customer_id int, order_status string',
    sep=',',
)
# Preview the first 5 rows without truncating column values.
ordersdf.show(n=5, truncate=False)

In [4]:
# Load the comma-separated order-items file with an explicit DDL schema.
orderItemsdf = spark.read.csv(
    "/FileStore/tables/retaildbtext/Order_Items.txt",
    schema='order_item_id int, order_item_order_id int, order_item_product_id  int, order_item_quantity int,order_item_subtotal float,order_item_product_price float',
    sep=',',
)
# Preview the first 5 rows without truncating column values.
orderItemsdf.show(n=5, truncate=False)

In [5]:
# Load the customers file; unlike the order files it is semicolon-separated.
customerItemsdf = spark.read.csv(
    "/FileStore/tables/retaildbtext/Customers.txt",
    schema='customer_id int, customer_fname string, customer_lname string, customer_email string,customer_password string,customer_street string,customer_city string, customer_state string, customer_zipcode string',
    sep=';',
)
# Preview the first 5 rows without truncating column values.
customerItemsdf.show(n=5, truncate=False)

In [6]:
# Load the semicolon-separated products file with an explicit DDL schema.
productItemsdf = spark.read.csv(
    "/FileStore/tables/retaildbtext/Products.txt",
    schema='product_id int, product_category_id int, product_name string, product_desc string, product_price float, product_image string ',
    sep=';',
)
# Preview the first 5 rows without truncating column values.
productItemsdf.show(n=5, truncate=False)

In [7]:
# Load the comma-separated categories file with an explicit DDL schema.
categoriesdf = spark.read.csv(
    "/FileStore/tables/retaildbtext/Categories.txt",
    schema='category_id int, category_department_id  int, category_name string',
    sep=',',
)
# Preview the first 5 rows without truncating column values.
categoriesdf.show(n=5, truncate=False)

In [8]:
# Load the comma-separated departments file with an explicit DDL schema.
departmentsdf = spark.read.csv(
    "/FileStore/tables/retaildbtext/Departments.txt",
    schema='department_id int, department_name string',
    sep=',',
)
# Preview the first 5 rows without truncating column values.
departmentsdf.show(n=5, truncate=False)

In [9]:
# Keep only the order columns needed downstream (the customer id is dropped).
ordersMap = ordersdf.select(
    'order_id',
    'order_date',
    'order_status',
)
ordersMap.show(n=5, truncate=False)

In [10]:
# Keep only the order-item columns needed for the join and the revenue figures.
orderItemsMap = orderItemsdf.select(
    'order_item_order_id',
    'order_item_product_id',
    'order_item_subtotal',
    'order_item_quantity',
)
orderItemsMap.show(n=5, truncate=False)

In [11]:
# Attach each order item to its parent order (inner join on the order id).
ordersdfjoin = ordersMap.join(
    orderItemsMap,
    on=ordersMap.order_id == orderItemsMap.order_item_order_id,
    how='inner',
)
ordersdfjoin.show(n=5, truncate=False)


In [12]:
# Project the joined order/order-item rows, dropping the duplicate join key
# order_item_order_id.
ordersdfSalesmap = ordersdfjoin.select(
    'order_id',
    'order_date',
    'order_item_product_id',
    'order_status',
    'order_item_subtotal',
    'order_item_quantity',
)
ordersdfSalesmap.show(n=10, truncate=False)

In [13]:
# Keep only the product columns needed downstream (description and image are dropped).
productsMap = productItemsdf.select(
    'product_id',
    'product_category_id',
    'product_name',
    'product_price',
)
productsMap.show(n=5, truncate=False)

In [14]:
# Enrich every sold item with its product details (inner join on the product id).
prodordersdfjoin = ordersdfSalesmap.join(
    productsMap,
    on=ordersdfSalesmap.order_item_product_id == productsMap.product_id,
    how='inner',
)
prodordersdfjoin.show(n=5, truncate=False)

In [15]:
# Project the order/product rows, dropping the duplicate join key
# order_item_product_id.
prodordersdfsalesmap = prodordersdfjoin.select(
    'order_id',
    'order_date',
    'order_status',
    'order_item_subtotal',
    'order_item_quantity',
    'product_id',
    'product_category_id',
    'product_name',
    'product_price',
)
prodordersdfsalesmap.show(n=5, truncate=False)

In [16]:
# Attach the category of each product (inner join on the category id).
prodcatdfsalesjoin = prodordersdfsalesmap.join(
    categoriesdf,
    on=prodordersdfsalesmap.product_category_id == categoriesdf.category_id,
    how='inner',
)
prodcatdfsalesjoin.show(n=5, truncate=False)

In [17]:
# Rebind prodordersdfsalesmap to the category-enriched rows, listing every
# column explicitly.
prodordersdfsalesmap = prodcatdfsalesjoin.select(
    'order_id',
    'order_date',
    'order_status',
    'order_item_subtotal',
    'order_item_quantity',
    'product_id',
    'product_category_id',
    'product_name',
    'product_price',
    'category_id',
    'category_department_id',
    'category_name',
)
prodordersdfsalesmap.show(n=5, truncate=False)

In [18]:
# Attach the department of each category (inner join on the department id).
# This rebinds prodcatdfsalesjoin, which previously held the category join.
prodcatdfsalesjoin = prodordersdfsalesmap.join(
    departmentsdf,
    on=prodordersdfsalesmap.category_department_id == departmentsdf.department_id,
    how='inner',
)
prodcatdfsalesjoin.show(n=5, truncate=False)

In [19]:
# Final denormalized sales table: one row per order item, carrying its order,
# product, category and department columns. All problem statements below
# aggregate from this DataFrame.
deptprodordersdfsalesmap = prodcatdfsalesjoin.select(
    'order_id',
    'order_date',
    'order_status',
    'order_item_subtotal',
    'order_item_quantity',
    'product_id',
    'product_category_id',
    'product_name',
    'product_price',
    'category_id',
    'category_department_id',
    'category_name',
    'department_id',
    'department_name',
)
deptprodordersdfsalesmap.show(n=5, truncate=False)

In [20]:
# Print the column names and types of the fully joined sales DataFrame.
deptprodordersdfsalesmap.printSchema()

###### Problem statement-1:Get sales data for departments per day

In [22]:
# Per-day, per-department sales statistics: average, min, max and total
# subtotal, total quantity and number of sold items.
departmentsSalesDataPerDay = (
    deptprodordersdfsalesmap
    .groupBy('order_date', 'department_name')
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avgsales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales"),
        sum('order_item_quantity').alias("Totalquantity"),
        count('order_item_subtotal').alias("Numberofsales"),
    )
    .sort('order_date', 'department_name')
)
# Write the result as a single uncompressed CSV part file with a header row.
departmentsSalesDataPerDay.coalesce(1).write.csv(
    "/FileStore/tables/departmentsSalesDataPerDay",
    compression="none",
    header='true',
)
departmentsSalesDataPerDay.show(n=20, truncate=False)

###### Problem statement-2: Get sales data for departments per month

In [24]:
# Per-month, per-department sales statistics. The month key is taken from the
# leading characters of order_date (presumably 'yyyy-MM' -- verify against the
# input data).
departmentsSalesDataPerMonth = (
    deptprodordersdfsalesmap
    .groupBy(substring('order_date', 0, 7).alias("ordermonth"), 'department_name')
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avgsales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales"),
        sum('order_item_quantity').alias("Totalquantity"),
        count('order_item_subtotal').alias("Numberofsales"),
    )
    .sort('ordermonth', 'department_name')
)
# Write the result as a single uncompressed CSV part file with a header row.
departmentsSalesDataPerMonth.coalesce(1).write.csv(
    "/FileStore/tables/departmentsSalesDataPerMonth",
    compression="none",
    header='true',
)
departmentsSalesDataPerMonth.show(n=20, truncate=False)

###### Problem statement-3: Get sales data  for departments per year

In [26]:
# Per-year, per-department sales statistics. The year key is taken from the
# leading characters of order_date (presumably 'yyyy' -- verify against the
# input data).
departmentsSalesDataPerYear = (
    deptprodordersdfsalesmap
    .groupBy(substring('order_date', 0, 4).alias("orderyear"), 'department_name')
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avgsales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales"),
        sum('order_item_quantity').alias("Totalquantity"),
        count('order_item_subtotal').alias("Numberofsales"),
    )
    .sort('orderyear', 'department_name')
)
# Write the result as a single uncompressed CSV part file with a header row.
departmentsSalesDataPerYear.coalesce(1).write.csv(
    "/FileStore/tables/departmentsSalesDataPerYear",
    compression="none",
    header='true',
)
departmentsSalesDataPerYear.show(n=20, truncate=False)

###### Problem statement-4:Get sales data from TOP5 revenue generating departments

In [28]:
# Overall sales statistics per department, keeping only the five departments
# with the highest total revenue.
departmentsSalesDatafortop5dept = (
    deptprodordersdfsalesmap
    .groupBy('department_name')
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avgsales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("deptrevenue"),
        sum('order_item_quantity').alias("Totalquantity"),
        count('order_item_subtotal').alias("Numberofsales"),
    )
    .sort('deptrevenue', ascending=False)  # highest revenue first
    .limit(5)
)
# Write the result as a single uncompressed CSV part file with a header row.
departmentsSalesDatafortop5dept.coalesce(1).write.csv(
    "/FileStore/tables/departmentsSalesDatafortop5dept",
    compression="none",
    header='true',
)
departmentsSalesDatafortop5dept.show(n=5, truncate=False)

###### Problem statement-5:Get third and final quarters of 2013 sales data from TOP5 revenue generating departments for each month with rankings

In [30]:
# Restrict the joined sales data to the third and fourth quarters of 2013
# (1 July through 31 December, both inclusive).
SalesDataPerMonth = deptprodordersdfsalesmap.where(
    (to_date(deptprodordersdfsalesmap.order_date) >= lit("2013-07-01"))
    & (to_date(deptprodordersdfsalesmap.order_date) <= lit("2013-12-31"))
)

# Aggregate per (month, department). The month key is the first 7 characters
# of order_date (presumably 'yyyy-MM' -- verify against the input data).
departmentSalesDatafortop5perquart = (
    SalesDataPerMonth
    .groupBy(substring('order_date', 1, 7).alias('order_month'), 'department_name')
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avg_sales_permonth"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("deptrevenue"),
        count('order_item_subtotal').alias("Numberofsales_permonth"),
    )
)

# Rank departments inside each month by revenue, highest first.
window = (
    Window
    .partitionBy(departmentSalesDatafortop5perquart['order_month'])
    .orderBy(departmentSalesDatafortop5perquart['deptrevenue'].desc())
)
top5deptdataeachmonth = (
    departmentSalesDatafortop5perquart
    .select('order_month', 'department_name', 'deptrevenue')
    .withColumn('rank', rank().over(window).alias('rank'))
)
# Keep ranks 1..5 per month and order the output by month, then revenue descending.
top5rankeddeptdataeachmonth = (
    top5deptdataeachmonth
    .filter(col('rank') <= 5)
    .sort('order_month', 'deptrevenue', ascending=[True, False])
)
# Write the result as a single uncompressed CSV part file with a header row.
top5rankeddeptdataeachmonth.coalesce(1).write.csv(
    "/FileStore/tables/top5rankedorderseachmonth",
    compression="none",
    header='true',
)
top5rankeddeptdataeachmonth.show(n=20, truncate=False)
                               

                                 

###### Problem statement-6:Get sales data for Men's product

In [32]:
# Sales statistics per (department, product), restricted to products whose
# name contains the substring "Men", ordered by revenue descending.
departmentsSalesDataforMensprod = (
    deptprodordersdfsalesmap
    .groupBy('department_name', 'product_name')
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avg_sales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("productrevenue"),
        count('order_item_subtotal').alias("Numberofsales"),
    )
    # product_name is a grouping key, so it is still available after agg().
    .filter(deptprodordersdfsalesmap.product_name.contains("Men"))
    .sort("productrevenue", ascending=False)
)
# Write the result as a single uncompressed CSV part file with a header row.
departmentsSalesDataforMensprod.coalesce(1).write.csv(
    "/FileStore/tables/departmentsSalesDataforMensprod",
    compression="none",
    header='true',
)
departmentsSalesDataforMensprod.show(n=20, truncate=False)

###### Problem statement-7:Get monthly sales data from kids and women categories for first two quarters of 2014

In [34]:
# Restrict the joined sales data to the first two quarters of 2014
# (1 January through 30 June, both inclusive).
SalesDataPerMonth = deptprodordersdfsalesmap.where(
    (to_date(deptprodordersdfsalesmap.order_date) >= lit("2014-01-01"))
    & (to_date(deptprodordersdfsalesmap.order_date) <= lit("2014-06-30"))
)

# Monthly statistics per (department, product), restricted to products whose
# name contains "Women" or "Kid"; ordered by month, then revenue descending.
departmentsSalesDataforkidwomen = (
    SalesDataPerMonth
    .groupBy(substring('order_date', 0, 7).alias('ordermonth'), 'department_name', 'product_name')
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avg_sales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("productrevenue"),
        count('order_item_subtotal').alias("Numberofsales"),
    )
    # product_name is a grouping key, so it is still available after agg().
    .filter(
        deptprodordersdfsalesmap.product_name.contains("Women")
        | deptprodordersdfsalesmap.product_name.contains("Kid")
    )
    .sort('ordermonth', "productrevenue", ascending=[True, False])
)
# Write the result as a single uncompressed CSV part file with a header row.
departmentsSalesDataforkidwomen.coalesce(1).write.csv(
    "/FileStore/tables/departmentsSalesDataforkidwomen",
    compression="none",
    header='true',
)
departmentsSalesDataforkidwomen.show(n=30, truncate=False)

###### Problem statement-8:Get monthly sales data of TOP10 revenue generating Nike and adidas products with rankings

In [36]:
# Monthly statistics per (department, product), restricted to products whose
# name contains "Nike" or "adidas" (the match is case-sensitive).
# The month key is the first 7 characters of order_date (presumably 'yyyy-MM'
# -- verify against the input data); Spark substring positions are 1-based.
departSalesDatafortop10 = (
    deptprodordersdfsalesmap
    .groupBy(substring('order_date', 1, 7).alias('ordermonth'), 'department_name', 'product_name')
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avg_sales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("productrevenue"),
        count('order_item_subtotal').alias("Numberofsales"),
    )
    # product_name is a grouping key, so it is still available after agg().
    .filter(col('product_name').contains("Nike") | col('product_name').contains("adidas"))
    .sort('ordermonth', "productrevenue", ascending=[True, False])
)

# Rank products inside each month by revenue, highest first.
window = (
    Window
    .partitionBy(departSalesDatafortop10['ordermonth'])
    .orderBy(departSalesDatafortop10['productrevenue'].desc())
)
# product_name must be part of the projection: this is a per-product ranking,
# and without it the output rows cannot be attributed to any product.
top10deptdataeachmonth = (
    departSalesDatafortop10
    .select('ordermonth', 'department_name', 'product_name', 'productrevenue')
    .withColumn('rank', rank().over(window))
)
# Keep ranks 1..10 per month (rank() gives tied revenues the same rank, so a
# month may yield more than 10 rows) and order by month, then revenue descending.
top10rankeddeptdataeachmonth = (
    top10deptdataeachmonth
    .filter(col('rank') <= 10)
    .sort('ordermonth', 'productrevenue', ascending=[True, False])
)

# Write the result as a single uncompressed CSV part file with a header row.
top10rankeddeptdataeachmonth.coalesce(1).write.csv(
    "/FileStore/tables/top10rankeddeptdataeachmonth",
    compression="none",
    header='true',
)
top10rankeddeptdataeachmonth.show(n=30, truncate=False)

###### Problem statement-9:Get quarterly sales data of golf products for 2013 final quarter

In [38]:
# Restrict the joined sales data to the final quarter of 2013
# (1 October through 31 December, both inclusive).
SalesDataPerMonth = deptprodordersdfsalesmap.where(
    (to_date(deptprodordersdfsalesmap.order_date) >= lit("2013-10-01"))
    & (to_date(deptprodordersdfsalesmap.order_date) <= lit("2013-12-31"))
)

# Statistics per (month, department, product) for products whose name contains
# "Golf" (case-sensitive). The month key is the first 7 characters of
# order_date; Spark substring positions are 1-based.
# NOTE(review): the problem statement says "quarterly", but the data is broken
# down per month within the quarter -- confirm this is the intended grain.
#
# The result gets its own variable and its own output directory. It used to
# rebind departmentsSalesDataforkidwomen and write to that DataFrame's CSV
# path, which fails under the writer's default save mode because the
# directory already exists after problem statement 7 has run.
golfSalesDataFinalQuarter2013 = (
    SalesDataPerMonth
    .groupBy(substring('order_date', 1, 7).alias('ordermonth'), 'department_name', 'product_name')
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avg_sales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("productrevenue"),
        count('order_item_subtotal').alias("Numberofsales"),
    )
    # product_name is a grouping key, so it is still available after agg().
    .filter(col('product_name').contains("Golf"))
    .sort('ordermonth', "productrevenue", ascending=[True, False])
)
# Write the result as a single uncompressed CSV part file with a header row.
golfSalesDataFinalQuarter2013.coalesce(1).write.csv(
    "/FileStore/tables/golfSalesDataFinalQuarter2013",
    compression="none",
    header='true',
)
golfSalesDataFinalQuarter2013.show(n=30, truncate=False)

###### Problem statement-10:Get sales data of TOP3 Apparel products for each year

In [40]:
# Yearly statistics per (department, product), restricted to the "Apparel"
# department. The year key is the first 4 characters of order_date (presumably
# 'yyyy' -- verify against the input data); Spark substring positions are 1-based.
apparelSalesDatatop3peryear = (
    deptprodordersdfsalesmap
    .groupBy(substring('order_date', 1, 4).alias('orderyear'), 'department_name', 'product_name')
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avg_sales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("productrevenue"),
        count('order_item_subtotal').alias("Numberofsales"),
    )
    # department_name is a grouping key, so it is still available after agg().
    .filter(col('department_name') == "Apparel")
    .sort('orderyear', "productrevenue", ascending=[True, False])
)

# Rank products inside each year by revenue, highest first.
window = (
    Window
    .partitionBy(apparelSalesDatatop3peryear['orderyear'])
    .orderBy(apparelSalesDatatop3peryear['productrevenue'].desc())
)
# NOTE: the "...eachmonth" names are historical; the data is per year.
top3proddataeachmonth = (
    apparelSalesDatatop3peryear
    .select('orderyear', 'department_name', 'product_name', 'Numberofsales', 'avg_sales', 'productrevenue')
    .withColumn('rank', rank().over(window))
)
# Keep ranks 1..3 per year. The previous code referenced the undefined name
# top5proddataeachmonth (NameError) and filtered on rank <= 5 instead of 3.
# rank() gives tied revenues the same rank, so a year may yield more than 3 rows.
top3rankedproddataeachmonth = (
    top3proddataeachmonth
    .filter(col('rank') <= 3)
    .sort('orderyear', 'productrevenue', ascending=[True, False])
)
# Write the result as a single uncompressed CSV part file with a header row.
top3rankedproddataeachmonth.coalesce(1).write.csv(
    "/FileStore/tables/top3rankedproddataeachmonth",
    compression="none",
    header='true',
)
top3rankedproddataeachmonth.show(n=30, truncate=False)