In [4]:
from pyspark.sql import SparkSession

In [5]:
# Build (or reuse, via getOrCreate) a Spark session that runs locally on a
# single worker thread ("local[1]") under the application name "dataframe_sql".
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("dataframe_sql")
    .getOrCreate()
)

In [6]:
# Bare expression: the session object becomes this cell's displayed result.
spark

In [7]:
# Create a DataFrame from the Orders CSV file

In [8]:
# Load the Orders CSV. The first line is treated as the header row and the
# column types are inferred from the data instead of being read as strings.
# NOTE(review): this is a hard-coded absolute path on one user's machine; the
# notebook will not run elsewhere until it is replaced with a configurable
# data directory.
order_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(r"C:\Users\user\OneDrive\Desktop\Big-data-trendytech\udemy\RetailDB+SalesData\RetailDB SalesData\Orders\part-00000.csv")
)

In [9]:
# Preview the first 5 rows to check that the header row supplied the column names.
order_df.show(5)

+--------+-------------------+-----------+---------------+
|order_id|         order_date|customer_id|   order_status|
+--------+-------------------+-----------+---------------+
|       1|2013-07-25 00:00:00|      11599|         CLOSED|
|       2|2013-07-25 00:00:00|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|      12111|       COMPLETE|
|       4|2013-07-25 00:00:00|       8827|         CLOSED|
|       5|2013-07-25 00:00:00|      11318|       COMPLETE|
+--------+-------------------+-----------+---------------+
only showing top 5 rows



In [10]:
# Print each column's name, data type and nullability as produced by the CSV load.
order_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [11]:
# Register the DataFrame as a temporary view so it can be queried with Spark SQL

In [12]:
# Expose order_df to Spark SQL under the view name "orders"; the spark.sql()
# queries in this notebook select from that name. Re-running this cell simply
# replaces the existing view.
order_df.createOrReplaceTempView("orders")

In [12]:
# 1. Top 15 customers by number of orders placed, using the DataFrame API

In [16]:
# Count orders per customer, order by that count (highest first) and keep the
# first 15 rows.
# NOTE(review): there is no secondary sort key, so customers tied on count may
# appear in any order, and which tied customers make the cut is not fixed.
result = (
    order_df
    .groupBy("customer_id")
    .count()
    .sort("count", ascending=False)
    .limit(15)
)

In [17]:
# Print the top-15 customers (DataFrame API version) as a text table.
result.show()


+-----------+-----+
|customer_id|count|
+-----------+-----+
|       5897|   16|
|      12431|   16|
|        569|   16|
|       6316|   16|
|      12284|   15|
|       4320|   15|
|       5624|   15|
|       5283|   15|
|        221|   15|
|       5654|   15|
|       6248|   14|
|       3708|   14|
|       1011|   14|
|       8652|   14|
|       4517|   14|
+-----------+-----+



In [18]:
# 1. Top 15 customers by number of orders placed, using Spark SQL

In [21]:
# Spark SQL version of the top-15 query: orders per customer, highest count
# first, first 15 rows. Adjacent string literals are concatenated by Python,
# so the query text sent to Spark is a single line.
result_1_sql = spark.sql(
    "select customer_id,count(order_id) as count "
    "from orders "
    "group by customer_id "
    "order by count desc "
    "limit 15"
)

In [22]:
# Print the top-15 customers (Spark SQL version) as a text table.
result_1_sql.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       5897|   16|
|      12431|   16|
|        569|   16|
|       6316|   16|
|      12284|   15|
|       4320|   15|
|       5624|   15|
|       5283|   15|
|        221|   15|
|       5654|   15|
|       6248|   14|
|       3708|   14|
|       1011|   14|
|       8652|   14|
|       4517|   14|
+-----------+-----+



In [23]:
# 2. Number of orders in each order status, using the DataFrame API

In [24]:
# Count orders per order_status, most frequent status first.
# Deliberately no limit(): the question asks for every status, and a fixed row
# cap would silently drop statuses if the data ever contained more than the cap.
result_df_2 = (
    order_df
    .groupBy("order_status")
    .count()
    .sort("count", ascending=False)
)

In [25]:
# Print the per-status order counts (DataFrame API version) as a text table.
result_df_2.show()

+---------------+-----+
|   order_status|count|
+---------------+-----+
|       COMPLETE|22899|
|PENDING_PAYMENT|15030|
|     PROCESSING| 8275|
|        PENDING| 7610|
|         CLOSED| 7556|
|        ON_HOLD| 3798|
|SUSPECTED_FRAUD| 1558|
|       CANCELED| 1428|
| PAYMENT_REVIEW|  729|
+---------------+-----+



In [26]:
# 2. Number of orders in each order status, using Spark SQL

In [29]:
# Orders per status via Spark SQL, most frequent status first.
# Deliberately no LIMIT clause: every status must be reported, and a fixed row
# cap would silently drop statuses if the data ever contained more than the cap.
result_2_sql = spark.sql(
    "select order_status,count(order_id) as count "
    "from orders "
    "group by order_status "
    "order by count desc"
)

In [30]:
# Print the per-status order counts (Spark SQL version) as a text table.
result_2_sql.show()

+---------------+-----+
|   order_status|count|
+---------------+-----+
|       COMPLETE|22899|
|PENDING_PAYMENT|15030|
|     PROCESSING| 8275|
|        PENDING| 7610|
|         CLOSED| 7556|
|        ON_HOLD| 3798|
|SUSPECTED_FRAUD| 1558|
|       CANCELED| 1428|
| PAYMENT_REVIEW|  729|
+---------------+-----+



In [32]:
# 3. Number of active customers (who placed at least one order), using the DataFrame API

In [13]:
# Number of distinct customer_id values present in the orders data.
# count() is an action, so result_df_3 holds the computed number itself,
# not a DataFrame.
result_df_3 = (
    order_df
    .select("customer_id")
    .distinct()
    .count()
)

In [15]:
# result_df_3 is the value returned by count(), not a DataFrame, so it is
# printed directly instead of calling .show().
print(result_df_3)

12405


In [33]:
# 3. Number of active customers (who placed at least one order), using Spark SQL

In [21]:
# Spark SQL version: count the distinct customer_id values in the "orders"
# view and return them as a one-row DataFrame with column active_customers.
result_ssql_3 = spark.sql(
    "select count(distinct(customer_id)) as active_customers "
    "from orders"
)

In [23]:
# Print the single-row active_customers result as a text table.
result_ssql_3.show()

+----------------+
|active_customers|
+----------------+
|           12405|
+----------------+



In [24]:
# 4. Customers with the most closed orders, using the DataFrame API

In [34]:
# Keep only CLOSED orders, count them per customer, order by that count
# (highest first) and keep the first 15 rows.
# NOTE(review): there is no secondary sort key, so customers tied on count may
# appear in any order, and which tied customers make the cut is not fixed.
result_df_4 = (
    order_df
    .filter("order_status='CLOSED'")
    .groupBy("customer_id")
    .count()
    .sort("count", ascending=False)
    .limit(15)
)

In [35]:
# Print the top customers by CLOSED orders (DataFrame API version) as a text table.
result_df_4.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       1833|    6|
|       1363|    5|
|       1687|    5|
|       5493|    5|
|       7850|    4|
|       3631|    4|
|       2236|    4|
|       1521|    4|
|      10111|    4|
|       4573|    4|
|       2403|    4|
|       2774|    4|
|       7948|    4|
|       2768|    4|
|        437|    4|
+-----------+-----+



In [36]:
# 4. Customers with the most closed orders, using Spark SQL

In [41]:
# Top 15 customers by number of CLOSED orders, via Spark SQL.
# order_id is intentionally unquoted: in SQL, 'order_id' in single quotes is a
# string literal, so count('order_id') counts every row regardless of the
# column's contents instead of counting the order_id column.
result_ssql_4 = spark.sql(
    "select customer_id,count(order_id) as count "
    "from orders "
    "where order_status='CLOSED' "
    "group by customer_id "
    "order by count desc "
    "limit 15"
)

In [42]:
# Print the top customers by CLOSED orders (Spark SQL version) as a text table.
result_ssql_4.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       1833|    6|
|       1363|    5|
|       1687|    5|
|       5493|    5|
|       7850|    4|
|       3631|    4|
|       2236|    4|
|       1521|    4|
|      10111|    4|
|       4573|    4|
|       2403|    4|
|       2774|    4|
|       7948|    4|
|       2768|    4|
|        437|    4|
+-----------+-----+

