In [None]:
from pyspark.sql import SparkSession

In [None]:
# List the HDFS directory that the table loads further down read from.
!hdfs dfs -ls /user/indirameduri/import-all


In [None]:
# Create (or reuse) the notebook's SparkSession on YARN.
# 'spark.ui.port' = '0' is passed through as-is; presumably it lets Spark pick a free UI port.
spark = (
    SparkSession.builder
    .appName('AnoosaShetty')
    .config('spark.ui.port', '0')
    .master('yarn')
    .getOrCreate()
)

# Set the shuffle partition count used by the joins/aggregations below.
spark.conf.set('spark.sql.shuffle.partitions', '2')

In [None]:
# Bare expression: renders the SparkSession created above as the cell output.
spark

**1) Most selling product (by quantity, not by cost) for every month in the database (between July 2013 and July 2014)**
**2) Who are the top 10 revenue generating customers?**
**3) What are the top 10 revenue generating products?**
**4) Top 5 revenue generating departments**
**5) Top 5 revenue generating cities (from address of Customers)**

**Please provide answers in Scala or Python or Spark SQL**

In [None]:
# List the kernel's current local working directory (not HDFS).
!ls

In [None]:
# Load the orders CSV with schema inference and assign the column names.
orders = (
    spark.read
    .format('csv')
    .load('/user/indirameduri/import-all/orders', inferSchema='True')
    .toDF('order_id', 'order_date', 'order_customer_id', 'order_status')
)

In [None]:
# Print the column names and inferred types of `orders`.
orders.printSchema()

In [None]:
# Load the categories CSV with schema inference and assign the column names.
categories = (
    spark.read
    .csv('/user/indirameduri/import-all/categories', inferSchema='true')
    .toDF('category_id', 'category_dept_id', 'category_name')
)

In [None]:
# Print the column names and inferred types of `categories`.
categories.printSchema()

In [None]:
# Show the first row of `categories` as a sanity check of the load.
categories.first()

In [None]:
# Load the departments CSV with schema inference and assign the column names.
departments = (
    spark.read
    .csv('/user/indirameduri/import-all/departments', inferSchema='true')
    .toDF('department_id', 'department_name')
)

In [None]:
# Print the column names and inferred types of `departments`.
departments.printSchema()

In [None]:
# Preview the rows of `departments`.
departments.show()

In [None]:
# Load the products CSV with schema inference and assign the column names.
products = (
    spark.read
    .csv('/user/indirameduri/import-all/products', inferSchema='true')
    .toDF(
        'product_id',
        'prod_category_id',
        'product_name',
        'product_desc',
        'product_price',
        'product_image',
    )
)

In [None]:
# Preview the rows of `products`.
products.show()

In [None]:
# Print the column names and inferred types of `products`.
products.printSchema()

In [None]:
# Load the order_items CSV and assign the column names.
# inferSchema is enabled here like for every other table in this notebook; without it
# all columns (including oi_quantity, oi_subtotal and oi_product_price) are read as strings.
order_items = (
    spark.read
    .csv('/user/indirameduri/import-all/order_items', inferSchema='true')
    .toDF(
        'order_item_id',
        'oi_order_id',
        'oi_product_id',
        'oi_quantity',
        'oi_subtotal',
        'oi_product_price',
    )
)

In [None]:
# Preview the rows of `order_items`.
order_items.show()

In [None]:
# Load the customers CSV with schema inference and assign the column names.
customers = (
    spark.read
    .csv('/user/indirameduri/import-all/customers', inferSchema='true')
    .toDF(
        'customer_id',
        'customer_fname',
        'customer_lname',
        'customer_email',
        'customer_password',
        'customer_street',
        'customer_city',
        'customer_state',
        'customer_zipcode',
    )
)

In [None]:
# Preview the rows of `customers`, leaving the e-mail and password columns out of the
# rendered output so credentials/contact data are not saved with the notebook.
customers.drop('customer_email', 'customer_password').show()

In [None]:
# Print the column names and inferred types of `customers`.
customers.printSchema()

***1) Most selling product (by quantity, not by cost) for every month in the database (between July 2013 and July 2014)***

In [None]:
# Join orders -> order_items -> products and keep only orders dated from 2013-07-01
# up to (but not including) 2014-08-01, i.e. July 2013 through July 2014 inclusive.
#
# The previous filter used between("2013-07%", "2014-07%"). `between` is a plain
# lower/upper bound comparison and does not understand the SQL LIKE wildcard `%`,
# so those bounds were not valid dates. Explicit date bounds are used instead; the
# half-open upper bound keeps every timestamp on 2014-07-31.
tJoins = (
    orders
    .join(order_items, orders.order_id == order_items.oi_order_id)
    .join(products, order_items.oi_product_id == products.product_id)
    .filter((orders.order_date >= '2013-07-01') & (orders.order_date < '2014-08-01'))
)
                                                                                                                                                  

In [None]:
# Preview the joined rows, then return the row count as the cell output.
tJoins.show()
tJoins.count()

In [None]:

# Question 1 (DataFrame API): best-selling product per month, measured by units sold.
# The previous version counted order-item rows per product over the whole period,
# which ignores oi_quantity and does not break the result down by month.
from pyspark.sql import Window
import pyspark.sql.functions as f

# Total units sold per (month, product). The cast keeps the sum numeric even when
# oi_quantity was loaded as a string.
monthly_quantity = (
    tJoins
    .groupBy(f.date_format('order_date', 'yyyy-MM').alias('order_month'), 'product_name')
    .agg(f.sum(f.col('oi_quantity').cast('int')).alias('total_quantity'))
)

# Rank products inside each month by units sold; rank() keeps ties for first place.
month_window = Window.partitionBy('order_month').orderBy(f.col('total_quantity').desc())

(
    monthly_quantity
    .withColumn('quantity_rank', f.rank().over(month_window))
    .filter(f.col('quantity_rank') == 1)
    .drop('quantity_rank')
    .orderBy('order_month')
    .show(truncate=False)
)

In [None]:
# Question 1 (Spark SQL): best-selling product per month, measured by units sold.
# The previous query used count(oi_quantity), which counts rows rather than adding up
# the quantities, and it did not group by month.
from pyspark.sql.functions import date_format
tJoins.createOrReplaceTempView('data')
mostSoldProduct = spark.sql('''
    select order_month, product_name, total_quantity
    from (
        select order_month,
               product_name,
               total_quantity,
               rank() over (partition by order_month order by total_quantity desc) as quantity_rank
        from (
            select date_format(order_date, 'yyyy-MM') as order_month,
                   product_name,
                   sum(cast(oi_quantity as int)) as total_quantity
            from data
            group by date_format(order_date, 'yyyy-MM'), product_name
        ) monthly
    ) ranked
    where quantity_rank = 1
    order by order_month
''')
mostSoldProduct.show(truncate=False)

In [None]:
# Describe the columns of the temp view named `data`.
spark.sql('desc data').show()

**2) Who are the top 10 revenue generating customers?**

In [None]:
# Question 2 input: one row per order item, enriched with its order and customer.
# Join keys: order_items.oi_order_id = orders.order_id
#            orders.order_customer_id = customers.customer_id
# Revenue is later summed from oi_subtotal per customer_id.
from pyspark.sql.functions import round
data = (
    order_items
    .join(orders, order_items.oi_order_id == orders.order_id)
    .join(customers, orders.order_customer_id == customers.customer_id)
)
data.show()

In [None]:
# Print the column names and types of the joined `data` frame.
data.printSchema()

In [None]:
# Question 2 (DataFrame API): total oi_subtotal per customer, rounded to 2 decimals,
# highest first, keeping the 10 largest.
import pyspark.sql.functions as f
topCustomers = (
    data
    .groupBy('customer_id')
    .agg(f.round(f.sum("oi_subtotal"), 2).alias('Total Amount'))
    .sort('Total Amount', ascending=False)
    .limit(10)
)
topCustomers.show()

In [None]:
# Question 2 (dict-style aggregation): total oi_subtotal per customer, top 10.
# The aggregate column produced by the dict form is named 'sum(oi_subtotal)'.
# Sorting on it before limit(10) is required; without the sort, limit(10) returns
# ten arbitrary customers rather than the ten highest-revenue ones.
topNCustomers = (
    data
    .groupBy(data.customer_id)
    .agg({"oi_subtotal": "sum"})
    .orderBy('sum(oi_subtotal)', ascending=False)
    .limit(10)
)
topNCustomers.show()

In [None]:
# Question 2 (Spark SQL): top 10 customers by total oi_subtotal.
# The customer join held in the DataFrame `data` is registered under its own view name.
# Previously the query read the temp view `data`, which was registered earlier from
# `tJoins` (the date-filtered product join), so it aggregated the wrong dataset.
from pyspark.sql.functions import round,sum
data.createOrReplaceTempView('customer_revenue')
topCustomers_sql = spark.sql('''select order_customer_id, round(sum(oi_subtotal),2) Total_Amount
                                from customer_revenue
                                group by order_customer_id
                                order by Total_Amount desc
                                limit 10''')

topCustomers_sql.show()

**3) What are the top 10 revenue generating products?**

In [None]:
# Question 3 input: orders joined to their items and each item's product.
# Revenue is later summed from oi_subtotal per product.
data = (
    orders
    .join(order_items, order_items.oi_order_id == orders.order_id)
    .join(products, order_items.oi_product_id == products.product_id)
)
data.show()

In [None]:
# Print the column names and types of the joined `data` frame.
data.printSchema()

In [None]:
# Question 3 (DataFrame API): total oi_subtotal per product name, highest first.
# Revenue is rounded to 2 decimals like every other revenue aggregation in this notebook;
# f.round without a scale rounds to whole units.
import pyspark.sql.functions as f
topNProducts = (
    data
    .groupBy('product_name')
    .agg(f.round(f.sum('oi_subtotal'), 2).alias('Total Amount'))
    .orderBy('Total Amount', ascending=False)
)
# Only the 10 highest-revenue products are displayed.
topNProducts.show(10)

In [None]:
# Question 3 (Spark SQL): top 10 products by total oi_subtotal.
# The query now reads the `topProducts` view registered on the line above it.
# Previously it read the unrelated temp view `data`, so the freshly registered view
# was never used and the result came from a different dataset.
import pyspark.sql.functions
data.createOrReplaceTempView('topProducts')
topNProducts = spark.sql('''
    select product_name, round(sum(oi_subtotal), 2) Total_Amount
    from topProducts
    group by product_name
    order by Total_Amount desc
    limit 10
''')
topNProducts.show()

**4) Top 5 revenue generating departments**

In [None]:
# Question 4 input: departments -> categories -> products -> order_items,
# so every order item carries the department of its product.
data = (
    departments
    .join(categories, categories.category_dept_id == departments.department_id)
    .join(products, products.prod_category_id == categories.category_id)
    .join(order_items, order_items.oi_product_id == products.product_id)
)
data.show()

In [None]:
 # Print the column names and types of the joined `data` frame.
 # NOTE(review): this cell's code carries a stray leading space; it can be removed.
 data.printSchema()

In [None]:
# Question 4: top 5 departments by total oi_subtotal, rounded to 2 decimals.
# limit(5) is applied so the result matches the "top 5" asked for; previously every
# department was returned.
import pyspark.sql.functions as f
topDepts = (
    data
    .groupBy('department_name')
    .agg(f.round(f.sum('oi_subtotal'), 2).alias('Total Amount'))
    .orderBy('Total Amount', ascending=False)
    .limit(5)
)
topDepts.show()

**5) Top 5 revenue generating cities (from address of Customers)**

In [None]:
# Question 5 input: customers -> orders -> order_items,
# so every order item carries the city of the customer who placed the order.
data = (
    customers
    .join(orders, orders.order_customer_id == customers.customer_id)
    .join(order_items, orders.order_id == order_items.oi_order_id)
)
data.show()

In [None]:
# Print the column names and types of the joined `data` frame.
data.printSchema()

In [None]:
# Question 5 (DataFrame API): total oi_subtotal per customer city, rounded to
# 2 decimals, highest first, keeping the 5 largest.
import pyspark.sql.functions as f
top5Cities = (
    data
    .groupBy('customer_city')
    .agg(f.round(f.sum('oi_subtotal'), 2).alias('Total Amount'))
    .sort('Total Amount', ascending=False)
    .limit(5)
)
top5Cities.show()

In [None]:
# Question 5 (Spark SQL): register the city join as the `cities` view and take the
# 5 cities with the highest total oi_subtotal.
import pyspark.sql.functions
data.createOrReplaceTempView('cities')
top5Cities = spark.sql(
    'select customer_city, round(sum(oi_subtotal),2) Total_Amount from cities '
    'group by customer_city order by Total_Amount desc limit 5'
)
top5Cities.show()