##### Usecase: Get sales data by customers
###### Solutions are given for following problem statements
###### Problem statement-1:Get sales data for customers per day
###### Problem statement-2:Get sales data for customers per month
###### Problem statement-3:Get sales data for customers per year
###### Problem statement-4:Get monthly sales data from customers visited more than 100 times
###### Problem statement-5:Get monthly sales data from Top10 revenue generating customers 
###### Problem statement-6:Get daily sales data from fourth quarter of 2013 and second quarter of 2014 where customer_state is CA
###### Problem statement-7:Get Top10 revenue generating cities
###### Problem statement-8:Get Top10 revenue generating zipcodes and its corresponding cities and states
###### Problem statement-9:Get revenue of  each zipcodes of state COLORADO and its corresponding cities 
###### Problem statement-10:Get total revenue of each state
###### Problem statement-11:Get the list of inactive customers

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum,round,avg,rank, col
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from pyspark.sql.functions import unix_timestamp, lit
# Build (or reuse) the notebook's Spark session: local master, Hive support enabled.
spark = (
    SparkSession.builder
    .master('local')
    .appName('SalesDatabycustomers')
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
# Load the comma-separated orders file with an explicit schema (order_date is kept
# as a string), then preview the first rows without truncating column values.
ordersdf = spark.read.csv(
    "/FileStore/tables/retaildbtext/Orders.txt",
    schema='order_id int,order_date string,order_customer_id int, order_status string',
    sep=',',
)
ordersdf.show(n=5, truncate=False)

In [4]:
# Load the comma-separated order-items file with an explicit schema; the subtotal and
# product price columns are declared as float.
orderItemsdf = spark.read.csv(
    "/FileStore/tables/retaildbtext/Order_Items.txt",
    schema='order_item_id int, order_item_order_id int, order_item_product_id  int, order_item_quantity int,order_item_subtotal float,order_item_product_price float',
    sep=',',
)
orderItemsdf.show(n=5, truncate=False)

In [5]:
# Load the customers file with an explicit schema. Unlike the other two inputs,
# this file is read with ';' as the field delimiter.
customerItemsdf = spark.read.csv(
    "/FileStore/tables/retaildbtext/Customers.txt",
    schema='customer_id int, customer_fname string, customer_lname string, customer_email string,customer_password string,customer_street string,customer_city string, customer_state string, customer_zipcode string',
    sep=';',
)
customerItemsdf.show(n=5, truncate=False)

In [6]:
# Project orders down to the columns used later (order_status is dropped here).
ordersdfMap = ordersdf.select(
    'order_id',
    'order_date',
    'order_customer_id',
)
ordersdfMap.show(n=5, truncate=False)

In [7]:
# Project order items down to the join key, product id, subtotal and quantity.
orderItemsdfMap = orderItemsdf.select(
    'order_item_order_id',
    'order_item_product_id',
    'order_item_subtotal',
    'order_item_quantity',
)
orderItemsdfMap.show(n=5, truncate=False)

In [8]:
# Inner-join order headers to their line items on the order id, so the result has
# one row per order item that has a matching order.
ordersdfjoin = ordersdfMap.join(
    orderItemsdfMap,
    on=ordersdfMap['order_id'] == orderItemsdfMap['order_item_order_id'],
    how='inner',
)
ordersdfjoin.show(n=5, truncate=False)


In [9]:
# Keep only the sales-relevant columns from the joined frame (both order-id columns
# are dropped at this point).
ordersdfSalesmap = ordersdfjoin.select(
    'order_date',
    'order_customer_id',
    'order_item_product_id',
    'order_item_subtotal',
    'order_item_quantity',
)
ordersdfSalesmap.show(n=10, truncate=False)

In [10]:
# Inner-join the sales rows to the customer table on customer id to attach
# name and address attributes to every sale.
customersdfjoin = ordersdfSalesmap.join(
    customerItemsdf,
    on=ordersdfSalesmap['order_customer_id'] == customerItemsdf['customer_id'],
    how='inner',
)
customersdfjoin.show(n=5, truncate=False)

In [11]:
# Base frame for every problem statement below: sale columns plus the customer's
# name, city, state and zipcode.
customersalesmap = customersdfjoin.select(
    'order_date',
    'order_customer_id',
    'order_item_product_id',
    'order_item_subtotal',
    'order_item_quantity',
    'customer_fname',
    'customer_lname',
    'customer_city',
    'customer_state',
    'customer_zipcode',
)
customersalesmap.show(n=10, truncate=False)

###### Problem statement-1:Get sales data for customers per day

In [13]:
# Per-day sales summary (average, min, max, total, quantity, row count) per customer name.
# NOTE(review): the grouping keys are first/last name only, so distinct customers that
# share a name are merged into one row; add 'order_customer_id' to the keys if the
# figures must be per individual customer.
customersSalesDataPerDay = (
    customersalesmap
    .groupBy('order_date', 'customer_fname', 'customer_lname')
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avgsales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales"),
        sum('order_item_quantity').alias("Totalquantity"),
        count('order_item_subtotal').alias("Numberofsales"),
    )
    .sort('order_date', 'customer_fname', 'customer_lname', 'Numberofsales', 'Totalsales')
)
# mode="overwrite" keeps the cell re-runnable: the default save mode raises an error
# when the output directory already exists from a previous run.
customersSalesDataPerDay.coalesce(1).write.csv(
    "/FileStore/tables/customersSalesDataPerDay",
    mode="overwrite",
    compression="none",
    header='true',
)
customersSalesDataPerDay.show(20, False)

###### Problem statement-2: Get sales data for customers per month

In [15]:
# Per-month sales summary per customer name. The month key is the first 7 characters
# of order_date; substring positions are 1-based in Spark, so start at 1 (a start of 0
# yields the same characters but is easy to misread as 0-based indexing).
# NOTE(review): grouping by first/last name only merges distinct customers that share
# a name; add 'order_customer_id' to the keys if per-customer figures are required.
customerSalesDataPerMonth = (
    customersalesmap
    .groupBy(
        substring('order_date', 1, 7).alias('ordermonth'),
        'customer_fname',
        'customer_lname',
    )
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avgsales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales"),
        sum('order_item_quantity').alias("Totalquantity"),
        count('order_item_subtotal').alias("Numberofsales"),
    )
    .sort('ordermonth', 'customer_fname', 'customer_lname', 'Numberofsales', 'Totalsales')
)
# mode="overwrite" keeps the cell re-runnable: the default save mode raises an error
# when the output directory already exists from a previous run.
customerSalesDataPerMonth.coalesce(1).write.csv(
    "/FileStore/tables/customerSalesDataPerMonth",
    mode="overwrite",
    compression="none",
    header='true',
)
customerSalesDataPerMonth.show(20, False)

###### Problem statement-3: Get sales data  for customers per year

In [17]:
# Per-year sales summary per customer name. The year key is the first 4 characters
# of order_date; substring positions are 1-based in Spark, so start at 1 (a start of 0
# yields the same characters but is easy to misread as 0-based indexing).
# NOTE(review): grouping by first/last name only merges distinct customers that share
# a name; add 'order_customer_id' to the keys if per-customer figures are required.
customerSalesDataPerYear = (
    customersalesmap
    .groupBy(
        substring('order_date', 1, 4).alias('orderyear'),
        'customer_fname',
        'customer_lname',
    )
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avgsales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales"),
        sum('order_item_quantity').alias("Totalquantity"),
        count('order_item_subtotal').alias("Numberofsales"),
    )
    .sort('orderyear', 'customer_fname', 'customer_lname', 'Numberofsales', 'Totalsales')
)
# mode="overwrite" keeps the cell re-runnable: the default save mode raises an error
# when the output directory already exists from a previous run.
customerSalesDataPerYear.coalesce(1).write.csv(
    "/FileStore/tables/customerSalesDataPerYear",
    mode="overwrite",
    compression="none",
    header='true',
)
customerSalesDataPerYear.show(20, False)

###### Problem statement-4:Get monthly sales data from customers visited more than 100 times

In [19]:
# Monthly sales summary per customer name, restricted to groups with more than 100 rows.
# NOTE(review): 'customervisit_permonth' counts non-null order_customer_id values, i.e.
# rows of customersalesmap in the group, not distinct orders; confirm this is the
# intended definition of a "visit".
# NOTE(review): grouping by first/last name only merges distinct customers that share
# a name; add 'order_customer_id' to the keys if per-customer figures are required.
customervisitDataPerMonth = (
    customersalesmap
    .groupBy(
        substring('order_date', 1, 7).alias("ordermonth"),
        'customer_fname',
        'customer_lname',
    )
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avgsales"),
        count('order_customer_id').alias("customervisit_permonth"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales"),
        sum('order_item_quantity').alias("Totalquantity_perstatus"),
        count('order_item_subtotal').alias("Numberofsales_perstatus"),
    )
    # Filter on the already-computed aggregate column instead of re-invoking count()
    # on a column that no longer exists in the aggregated frame.
    .filter(col('customervisit_permonth') > 100)
    .sort('ordermonth', 'customer_fname', 'customer_lname', 'customervisit_permonth')
)
# mode="overwrite" keeps the cell re-runnable: the default save mode raises an error
# when the output directory already exists from a previous run.
customervisitDataPerMonth.coalesce(1).write.csv(
    "/FileStore/tables/customervisitDataPerMonth",
    mode="overwrite",
    compression="none",
    header='true',
)
customervisitDataPerMonth.show(20, False)

###### Problem statement-5:Get monthly sales data from Top10 revenue generating customers

In [21]:
# Total revenue per customer name, keeping the 10 highest totals.
# NOTE(review): the problem statement asks for *monthly* sales data of the top-10
# customers, but this cell only produces their overall revenue; confirm which is wanted.
# NOTE(review): grouping by first/last name only merges distinct customers that share
# a name; add 'order_customer_id' to the keys if per-customer figures are required.
customeDataTop10revenue = (
    customersalesmap
    .groupBy('customer_fname', 'customer_lname')
    .agg(round(sum('order_item_subtotal'), 2).alias("customer_revenue"))
    .sort(col('customer_revenue').desc())
    .limit(10)
)
# mode="overwrite" keeps the cell re-runnable: the default save mode raises an error
# when the output directory already exists from a previous run.
customeDataTop10revenue.coalesce(1).write.csv(
    "/FileStore/tables/customeDataTop10revenue",
    mode="overwrite",
    compression="none",
    header='true',
)
customeDataTop10revenue.show(20, False)

###### Problem statement-6:Get daily sales data from fourth quarter of 2013 and second quarter of 2014 where Numberofsales>=110 in state of CA

In [23]:
# Rows whose order date falls in Q4-2013 (Oct 1 - Dec 31) or Q2-2014 (Apr 1 - Jun 30).
# Each quarter's range check is parenthesised explicitly rather than relying on
# '&' binding tighter than '|'.
SalesDataPerMonth = customersalesmap.where(
    (
        (to_date(customersalesmap.order_date) >= lit("2013-10-01"))
        & (to_date(customersalesmap.order_date) <= lit("2013-12-31"))
    )
    | (
        (to_date(customersalesmap.order_date) >= lit("2014-04-01"))
        & (to_date(customersalesmap.order_date) <= lit("2014-06-30"))
    )
)
# Daily summary for state CA, keeping only days with at least 110 sales rows.
# The state filter is applied before grouping (it is a grouping key, so the result is
# the same) and the count filter uses the aliased aggregate column, instead of mixing a
# re-invoked count() with a column of the pre-aggregation frame after agg().
SalesDatacanceledordersquarter = (
    SalesDataPerMonth
    .where(col('customer_state') == 'CA')
    .groupBy(substring('order_date', 1, 10).alias("orderdate"), 'customer_state')
    .agg(
        round(avg('order_item_subtotal'), 2).alias("avgsales"),
        min('order_item_subtotal').alias("min_sales"),
        max('order_item_subtotal').alias("max_sales"),
        round(sum('order_item_subtotal'), 2).alias("Totalsales"),
        sum('order_item_quantity').alias("Totalquantity_perstatus"),
        count('order_item_subtotal').alias("Numberofsales"),
    )
    .filter(col('Numberofsales') >= 110)
    .sort('orderdate')
)
# mode="overwrite" keeps the cell re-runnable: the default save mode raises an error
# when the output directory already exists from a previous run.
SalesDatacanceledordersquarter.coalesce(1).write.csv(
    "/FileStore/tables/SalesDatacanceledordersquarter",
    mode="overwrite",
    compression="none",
    header='true',
)
SalesDatacanceledordersquarter.show(30, False)

###### Problem statement-7:Get Top10 revenue generating cities

In [25]:
# Total revenue per customer city, keeping the 10 highest totals.
customeDataTop10revcities = (
    customersalesmap
    .groupBy('customer_city')
    .agg(round(sum('order_item_subtotal'), 2).alias("city_revenue"))
    .sort(col('city_revenue').desc())
    .limit(10)
)
# mode="overwrite" keeps the cell re-runnable: the default save mode raises an error
# when the output directory already exists from a previous run.
customeDataTop10revcities.coalesce(1).write.csv(
    "/FileStore/tables/customeDataTop10revcities",
    mode="overwrite",
    compression="none",
    header='true',
)
customeDataTop10revcities.show(20, False)

###### Problem statement-8:Get  Top10 revenue generating zipcodes and its corresponding cities and states

In [27]:
# Total revenue per (zipcode, city, state), keeping the 10 highest totals.
# This result previously reused the Top-10-cities variable name and output directory,
# which clobbered the cities frame and made the write fail because that directory
# already exists. It now has its own name and its own output path.
customeDataTop10revzipcodes = (
    customersalesmap
    .groupBy('customer_zipcode', 'customer_city', 'customer_state')
    .agg(round(sum('order_item_subtotal'), 2).alias("zipcode_revenue"))
    .sort(col('zipcode_revenue').desc())
    .limit(10)
)
# mode="overwrite" keeps the cell re-runnable: the default save mode raises an error
# when the output directory already exists from a previous run.
customeDataTop10revzipcodes.coalesce(1).write.csv(
    "/FileStore/tables/customeDataTop10revzipcodes",
    mode="overwrite",
    compression="none",
    header='true',
)
customeDataTop10revzipcodes.show(20, False)

###### Problem statement-9:Get revenue of  each zipcodes of state COLORADO and its corresponding cities

In [29]:
# Revenue per (zipcode, city) for customers whose state is 'CO', ordered by zipcode.
# The state filter runs before the aggregation (it is a grouping key, so the result is
# unchanged) rather than referencing the pre-aggregation frame's column after agg().
zipcoderevenueDataCOstate = (
    customersalesmap
    .where(col('customer_state') == 'CO')
    .groupBy('customer_zipcode', 'customer_city', 'customer_state')
    .agg(round(sum('order_item_subtotal'), 2).alias("zipcode_revenue"))
    .sort(col('customer_zipcode'))
)
# mode="overwrite" keeps the cell re-runnable: the default save mode raises an error
# when the output directory already exists from a previous run.
zipcoderevenueDataCOstate.coalesce(1).write.csv(
    "/FileStore/tables/zipcoderevenueDataCOstate",
    mode="overwrite",
    compression="none",
    header='true',
)
zipcoderevenueDataCOstate.show(20, False)

###### Problem statement-10:Get total revenue of each state

In [31]:
# Total revenue per customer state over all order dates (no date filter is applied
# here), ordered by state code.
SalesDatarevenueperstate = (
    customersalesmap
    .groupBy('customer_state')
    .agg(round(sum('order_item_subtotal'), 2).alias("state_revenue"))
    .sort(col('customer_state'))
)
# mode="overwrite" keeps the cell re-runnable: the default save mode raises an error
# when the output directory already exists from a previous run.
SalesDatarevenueperstate.coalesce(1).write.csv(
    "/FileStore/tables/SalesDatarevenueperstate",
    mode="overwrite",
    compression="none",
    header='true',
)
SalesDatarevenueperstate.show(55, False)

###### Problem statement-11:Get the list of inactive customers

In [33]:
# Inactive customers = customers with no matching row in orders. A left anti join keeps
# only the left-side (customer) rows that have no match, and returns customer columns only.
listofInactivecustomers = customerItemsdf.join(
    ordersdf,
    customerItemsdf.customer_id == ordersdf.order_customer_id,
    "left_anti",
)
# mode="overwrite" keeps the cell re-runnable: the default save mode raises an error
# when the output directory already exists from a previous run.
listofInactivecustomers.coalesce(1).write.csv(
    "/FileStore/tables/listofInactivecustomers",
    mode="overwrite",
    compression="none",
    header='true',
)
listofInactivecustomers.show(20, False)