In [1]:
import findspark

In [2]:
findspark.init()

In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType, BooleanType, DateType

In [4]:
spark = SparkSession.builder.appName('p').getOrCreate()
spark

In [None]:
# Load the data: Assume the dataset is in CSV format. Load the data into a PySpark DataFrame.

In [8]:
df = spark.read.csv('Book1.csv', header=True, inferSchema=True)
df.show()

+--------+-----------+----------+--------+-----+-------------+
|order_id|customer_id|product_id|quantity|price|purchase_date|
+--------+-----------+----------+--------+-----+-------------+
|       1|        101|      P001|       2|   25|   01-01-2023|
|       2|        102|      P003|       1|   15|   02-01-2023|
|       3|        101|      P002|       5|   10|   03-01-2023|
|       4|        103|      P001|       1|   25|   05-01-2023|
|       5|        102|      P002|       3|   10|   06-01-2023|
+--------+-----------+----------+--------+-----+-------------+



In [9]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: integer (nullable = true)
 |-- purchase_date: string (nullable = true)



In [29]:
df.show()

+--------+-----------+----------+--------+-----+-------------+
|order_id|customer_id|product_id|quantity|price|purchase_date|
+--------+-----------+----------+--------+-----+-------------+
|       1|        101|      P001|       2|   25|   01-01-2023|
|       2|        102|      P003|       1|   15|   02-01-2023|
|       3|        101|      P002|       5|   10|   03-01-2023|
|       4|        103|      P001|       1|   25|   05-01-2023|
|       5|        102|      P002|       3|   10|   06-01-2023|
+--------+-----------+----------+--------+-----+-------------+



In [12]:
# Convert the purchase_date column to a DateType.
# using select 
#df.select(col('purchase_date').cast('int').alias('purchase_date'))

df_dt = df.withColumn('purchase_date', df.purchase_date.cast(DateType()))
df_dt.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: integer (nullable = true)
 |-- purchase_date: date (nullable = true)



In [14]:
#Calculate a new column total_cost, which is the product of quantity and price.
df_total= df.withColumn('total_cost', col('quantity')*col('price'))
df_total.show()

+--------+-----------+----------+--------+-----+-------------+----------+
|order_id|customer_id|product_id|quantity|price|purchase_date|total_cost|
+--------+-----------+----------+--------+-----+-------------+----------+
|       1|        101|      P001|       2|   25|   01-01-2023|        50|
|       2|        102|      P003|       1|   15|   02-01-2023|        15|
|       3|        101|      P002|       5|   10|   03-01-2023|        50|
|       4|        103|      P001|       1|   25|   05-01-2023|        25|
|       5|        102|      P002|       3|   10|   06-01-2023|        30|
+--------+-----------+----------+--------+-----+-------------+----------+



In [41]:
df_total.show()

+--------+-----------+----------+--------+-----+-------------+----------+
|order_id|customer_id|product_id|quantity|price|purchase_date|total_cost|
+--------+-----------+----------+--------+-----+-------------+----------+
|       1|        101|      P001|       2|   25|   01-01-2023|        50|
|       2|        102|      P003|       1|   15|   02-01-2023|        15|
|       3|        101|      P002|       5|   10|   03-01-2023|        50|
|       4|        103|      P001|       1|   25|   05-01-2023|        25|
|       5|        102|      P002|       3|   10|   06-01-2023|        30|
+--------+-----------+----------+--------+-----+-------------+----------+



In [15]:
#Find the total amount spent (total_cost) by each customer_id. 

df_total.groupBy('customer_id').agg(sum('total_cost')).show()

+-----------+---------------+
|customer_id|sum(total_cost)|
+-----------+---------------+
|        101|            100|
|        103|             25|
|        102|             45|
+-----------+---------------+



In [19]:
#Find the total number of orders made by each customer_id.

df_total.groupBy('customer_id').agg(count('product_id')).show() 

+-----------+-----------------+
|customer_id|count(product_id)|
+-----------+-----------------+
|        101|                2|
|        103|                1|
|        102|                2|
+-----------+-----------------+



In [43]:
df_total.show()

+--------+-----------+----------+--------+-----+-------------+----------+
|order_id|customer_id|product_id|quantity|price|purchase_date|total_cost|
+--------+-----------+----------+--------+-----+-------------+----------+
|       1|        101|      P001|       2|   25|   01-01-2023|        50|
|       2|        102|      P003|       1|   15|   02-01-2023|        15|
|       3|        101|      P002|       5|   10|   03-01-2023|        50|
|       4|        103|      P001|       1|   25|   05-01-2023|        25|
|       5|        102|      P002|       3|   10|   06-01-2023|        30|
+--------+-----------+----------+--------+-----+-------------+----------+



In [45]:
#Filter the results to only include customers who have spent more than $50 in total.

df_total.filter(col('total_cost') >= 50).show()

+--------+-----------+----------+--------+-----+-------------+----------+
|order_id|customer_id|product_id|quantity|price|purchase_date|total_cost|
+--------+-----------+----------+--------+-----+-------------+----------+
|       1|        101|      P001|       2|   25|   01-01-2023|        50|
|       3|        101|      P002|       5|   10|   03-01-2023|        50|
+--------+-----------+----------+--------+-----+-------------+----------+



In [21]:
#Sort the results by total_cost in descending order.

df_total.sort(col('total_cost').desc()).show()

+--------+-----------+----------+--------+-----+-------------+----------+
|order_id|customer_id|product_id|quantity|price|purchase_date|total_cost|
+--------+-----------+----------+--------+-----+-------------+----------+
|       1|        101|      P001|       2|   25|   01-01-2023|        50|
|       3|        101|      P002|       5|   10|   03-01-2023|        50|
|       5|        102|      P002|       3|   10|   06-01-2023|        30|
|       4|        103|      P001|       1|   25|   05-01-2023|        25|
|       2|        102|      P003|       1|   15|   02-01-2023|        15|
+--------+-----------+----------+--------+-----+-------------+----------+



In [None]:
Output the Result:

In [52]:
dfts = df_total.groupBy('customer_id').agg(sum('total_cost'))
dfts.show()

+-----------+---------------+
|customer_id|sum(total_cost)|
+-----------+---------------+
|        101|            100|
|        103|             25|
|        102|             45|
+-----------+---------------+



In [57]:
dfts=df_total.groupBy('customer_id').agg(sum('total_cost'))
dfts.show()

+-----------+---------------+
|customer_id|sum(total_cost)|
+-----------+---------------+
|        101|            100|
|        103|             25|
|        102|             45|
+-----------+---------------+



In [58]:
dft = df_total.select('customer_id', 'total_cost')
dft.show()

+-----------+----------+
|customer_id|total_cost|
+-----------+----------+
|        101|        50|
|        102|        15|
|        101|        50|
|        103|        25|
|        102|        30|
+-----------+----------+



In [64]:
dfts.join(dft, dfts.customer_id==dft.customer_id, 'inner').show()

+-----------+---------------+-----------+----------+
|customer_id|sum(total_cost)|customer_id|total_cost|
+-----------+---------------+-----------+----------+
|        101|            100|        101|        50|
|        101|            100|        101|        50|
|        103|             25|        103|        25|
|        102|             45|        102|        30|
|        102|             45|        102|        15|
+-----------+---------------+-----------+----------+



In [23]:
# Print the final result, which should include customer_id, total_spent, and total_orders.

#dfts = df_total.groupBy('customer_id').agg(count('product_id').alias('totoal_order')).select('customer_id','total_cost' )
#dfts.show()

In [29]:
data = [
    (1, 101, "2023-07-01", "A", 2, 10),
    (2, 102, "2023-07-01", "B", 3, 15),
    (3, 101, "2023-07-02", "A", 1, 10),
    (4, 103, "2023-07-02", "C", 2, 20),
    (5, 102, "2023-07-03", "A", 1, 10)
]

columns = ["order_id", "customer_id", "order_date", "product_id", "quantity", "price"]
df = spark.createDataFrame(data, columns)
df.show()

+--------+-----------+----------+----------+--------+-----+
|order_id|customer_id|order_date|product_id|quantity|price|
+--------+-----------+----------+----------+--------+-----+
|       1|        101|2023-07-01|         A|       2|   10|
|       2|        102|2023-07-01|         B|       3|   15|
|       3|        101|2023-07-02|         A|       1|   10|
|       4|        103|2023-07-02|         C|       2|   20|
|       5|        102|2023-07-03|         A|       1|   10|
+--------+-----------+----------+----------+--------+-----+



In [33]:
# Task 2: Calculate total revenue for each order

df = df.withColumn('total_rev', col('quantity')*col('price'))
df.show()

+--------+-----------+----------+----------+--------+-----+---------+
|order_id|customer_id|order_date|product_id|quantity|price|total_rev|
+--------+-----------+----------+----------+--------+-----+---------+
|       1|        101|2023-07-01|         A|       2|   10|       20|
|       2|        102|2023-07-01|         B|       3|   15|       45|
|       3|        101|2023-07-02|         A|       1|   10|       10|
|       4|        103|2023-07-02|         C|       2|   20|       40|
|       5|        102|2023-07-03|         A|       1|   10|       10|
+--------+-----------+----------+----------+--------+-----+---------+



In [37]:
# Task 3: Top-selling products
df2 = df.groupBy('product_id').agg(sum('total_rev').alias('total_quantity_sold'))
df2.show()

df2.orderBy(col('total_quantity_sold').desc()).limit(3).show()

+----------+-------------------+
|product_id|total_quantity_sold|
+----------+-------------------+
|         A|                 40|
|         B|                 45|
|         C|                 40|
+----------+-------------------+

+----------+-------------------+
|product_id|total_quantity_sold|
+----------+-------------------+
|         B|                 45|
|         A|                 40|
|         C|                 40|
+----------+-------------------+



In [38]:
df.show()

+--------+-----------+----------+----------+--------+-----+---------+
|order_id|customer_id|order_date|product_id|quantity|price|total_rev|
+--------+-----------+----------+----------+--------+-----+---------+
|       1|        101|2023-07-01|         A|       2|   10|       20|
|       2|        102|2023-07-01|         B|       3|   15|       45|
|       3|        101|2023-07-02|         A|       1|   10|       10|
|       4|        103|2023-07-02|         C|       2|   20|       40|
|       5|        102|2023-07-03|         A|       1|   10|       10|
+--------+-----------+----------+----------+--------+-----+---------+



In [46]:
# Task 4: Calculate average quantity and price per order 

df.groupBy('order_id').agg(avg('quantity').alias('avg_quantity'), avg('price').alias('avg_price')).show()


+--------+------------+---------+
|order_id|avg_quantity|avg_price|
+--------+------------+---------+
|       1|         2.0|     10.0|
|       2|         3.0|     15.0|
|       3|         1.0|     10.0|
|       4|         2.0|     20.0|
|       5|         1.0|     10.0|
+--------+------------+---------+



In [47]:
# Task 5: Total revenue per customer

df.groupBy('product_id').agg(sum('total_rev').alias('total_revenue')).show()

+----------+-------------+
|product_id|total_revenue|
+----------+-------------+
|         A|           40|
|         B|           45|
|         C|           40|
+----------+-------------+



In [48]:
# Task 6: Date with highest total revenue

highest_revenue_date = df.groupBy("order_date").agg(sum("total_rev").alias("total_revenue"))
highest_revenue_date = highest_revenue_date.orderBy(col("total_revenue").desc()).limit(1)
highest_revenue_date.show()

+----------+-------------+
|order_date|total_revenue|
+----------+-------------+
|2023-07-01|           65|
+----------+-------------+

