In [1]:
import findspark
findspark.init()

In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, desc, count, expr, split, explode, lower, regexp_extract
from pyspark.sql.types import *

In [3]:
spark = SparkSession.builder.appName('i2').getOrCreate()
spark

### Write PySpark code to perform various tasks on an e-commerce transactions DataFrame, such as calculating total revenue, finding top customers, etc.


In [13]:
from datetime import datetime
data = [
 ("T1", "C1", "P1", 2, 50.0, datetime.strptime("2024-06-01 10:00:00", '%Y-%m-%d %H:%M:%S')),
 ("T2", "C2", "P2", 1, 30.0, datetime.strptime("2024-06-02 11:00:00", '%Y-%m-%d %H:%M:%S')),
 ("T3", "C1", "P3", 3, 20.0, datetime.strptime("2024-06-03 12:00:00", '%Y-%m-%d %H:%M:%S')),
 ("T4", "C3", "P1", 1, 50.0, datetime.strptime("2024-06-04 13:00:00", '%Y-%m-%d %H:%M:%S')),
 ("T5", "C2", "P2", 2, 30.0, datetime.strptime("2024-06-05 14:00:00", '%Y-%m-%d %H:%M:%S'))]

columns = ["transaction_id", "customer_id", "product_id", "quantity", "price", "transaction_date"]
transactions = spark.createDataFrame(data, columns)
transactions.printSchema()

root
 |-- transaction_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- quantity: long (nullable = true)
 |-- price: double (nullable = true)
 |-- transaction_date: timestamp (nullable = true)



In [16]:
transactions.show()

+--------------+-----------+----------+--------+-----+-------------------+
|transaction_id|customer_id|product_id|quantity|price|   transaction_date|
+--------------+-----------+----------+--------+-----+-------------------+
|            T1|         C1|        P1|       2| 50.0|2024-06-01 10:00:00|
|            T2|         C2|        P2|       1| 30.0|2024-06-02 11:00:00|
|            T3|         C1|        P3|       3| 20.0|2024-06-03 12:00:00|
|            T4|         C3|        P1|       1| 50.0|2024-06-04 13:00:00|
|            T5|         C2|        P2|       2| 30.0|2024-06-05 14:00:00|
+--------------+-----------+----------+--------+-----+-------------------+



In [15]:
# Convert transaction_date to timestamp

In [21]:
# 1. Calculate Total Revenue 

transactions = transactions.withColumn('total_rev', col('quantity')*col('price'))
transactions.show()

+--------------+-----------+----------+--------+-----+-------------------+---------+
|transaction_id|customer_id|product_id|quantity|price|   transaction_date|total_rev|
+--------------+-----------+----------+--------+-----+-------------------+---------+
|            T1|         C1|        P1|       2| 50.0|2024-06-01 10:00:00|    100.0|
|            T2|         C2|        P2|       1| 30.0|2024-06-02 11:00:00|     30.0|
|            T3|         C1|        P3|       3| 20.0|2024-06-03 12:00:00|     60.0|
|            T4|         C3|        P1|       1| 50.0|2024-06-04 13:00:00|     50.0|
|            T5|         C2|        P2|       2| 30.0|2024-06-05 14:00:00|     60.0|
+--------------+-----------+----------+--------+-----+-------------------+---------+



In [34]:
total_revenue = transactions.agg(sum("total_rev").alias('total_revenue')).collect()[0]['total_revenue']
print(f"Total Revenue: {total_revenue}")

Total Revenue: 300.0


In [37]:
# 2. Find Top Customers by Revenue

top_customers = transactions.groupBy('customer_id').agg(sum('total_rev').alias('tr')).orderBy(desc('tr'))
top_customers.show()

+-----------+-----+
|customer_id|   tr|
+-----------+-----+
|         C1|160.0|
|         C2| 90.0|
|         C3| 50.0|
+-----------+-----+



In [38]:
# 3. Count Transactions per Day

transactions_per_day = transactions.groupBy('transaction_date').agg(count('total_rev')).orderBy('transaction_date')
transactions_per_day.show()

+-------------------+----------------+
|   transaction_date|count(total_rev)|
+-------------------+----------------+
|2024-06-01 10:00:00|               1|
|2024-06-02 11:00:00|               1|
|2024-06-03 12:00:00|               1|
|2024-06-04 13:00:00|               1|
|2024-06-05 14:00:00|               1|
+-------------------+----------------+



In [40]:
# 4. Find the Most Popular Products 

popular_products = transactions.groupBy('product_id').agg(sum('quantity').alias('total_quantity')).orderBy(desc('total_quantity'))
popular_products.show()

+----------+--------------+
|product_id|total_quantity|
+----------+--------------+
|        P1|             3|
|        P2|             3|
|        P3|             3|
+----------+--------------+



In [41]:
# 5. Calculate Average Order Value (AOV)
average_order_value = transactions.groupBy("transaction_id") \
 .agg(sum("total_rev").alias("order_value")) \
 .agg(expr("avg(order_value)").alias("average_order_value")).collect()[0]["average_order_value"]

print(f"Average Order Value: {average_order_value}")


Average Order Value: 60.0


https://sharmashorya1996.medium.com/pyspark-interview-preparation-part-3-coding-practice-a9d66c421c4

### Given a complete E-book of Pride and Prejudice, count the frequency of each word and return top 10 words.

In [8]:
df = spark.read.text('pride_and_prejudice.txt')
df.cache()
df.show(truncate=False)

+--------------------------------------------------------------------+
|value                                                               |
+--------------------------------------------------------------------+
|The Project Gutenberg EBook of Pride and Prejudice, by Jane Austen  |
|                                                                    |
|This eBook is for the use of anyone anywhere at no cost and with    |
|almost no restrictions whatsoever.  You may copy it, give it away or|
|re-use it under the terms of the Project Gutenberg License included |
|with this eBook or online at www.gutenberg.org                      |
|                                                                    |
|                                                                    |
|Title: Pride and Prejudice                                          |
|                                                                    |
|Author: Jane Austen                                                 |
|     

In [16]:
# Tokenize  
df1 = df.select(split(col('value'), ' ').alias('line'))
df1.show(6, truncate=False)

+----------------------------------------------------------------------------------+
|line                                                                              |
+----------------------------------------------------------------------------------+
|[The, Project, Gutenberg, EBook, of, Pride, and, Prejudice,, by, Jane, Austen]    |
|[]                                                                                |
|[This, eBook, is, for, the, use, of, anyone, anywhere, at, no, cost, and, with]   |
|[almost, no, restrictions, whatsoever., , You, may, copy, it,, give, it, away, or]|
|[re-use, it, under, the, terms, of, the, Project, Gutenberg, License, included]   |
|[with, this, eBook, or, online, at, www.gutenberg.org]                            |
+----------------------------------------------------------------------------------+
only showing top 6 rows



In [19]:
# Exploding a column of arrays into rows of elements

df2 = df1.select(explode(col('line')).alias('word'))
df2.show(6, truncate=False)

+---------+
|word     |
+---------+
|The      |
|Project  |
|Gutenberg|
|EBook    |
|of       |
|Pride    |
+---------+
only showing top 6 rows



In [25]:
# Cleaning : lowering and removing punctuations

df3 = df2.select(lower(col('word')).alias('word_lower'))
df3.show(truncate=False)

+----------+
|word_lower|
+----------+
|the       |
|project   |
|gutenberg |
|ebook     |
|of        |
|pride     |
|and       |
|prejudice,|
|by        |
|jane      |
|austen    |
|          |
|this      |
|ebook     |
|is        |
|for       |
|the       |
|use       |
|of        |
|anyone    |
+----------+
only showing top 20 rows



In [24]:
df4 = df3.select(regexp_extract(col('word_lower'), '[a-z]+', 0).alias('word'))
df4.show()

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|       by|
|     jane|
|   austen|
|         |
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
|   anyone|
+---------+
only showing top 20 rows



In [27]:
# Filtering and counting

df5 =df4.filter(col('word') != '')
df6 = df5.groupBy(col('word')).count()
df6.show() 

+-------------+-----+
|         word|count|
+-------------+-----+
|       online|    4|
|         some|  209|
|        still|   72|
|          few|   72|
|         hope|  122|
|        those|   60|
|     cautious|    4|
|    imitation|    1|
|          art|    3|
|      solaced|    1|
|       poetry|    2|
|    arguments|    5|
| premeditated|    1|
|      elevate|    1|
|       doubts|    2|
|    destitute|    1|
|    solemnity|    5|
|   lieutenant|    1|
|gratification|    1|
|    connected|   14|
+-------------+-----+
only showing top 20 rows



In [28]:
# Ordering result 
df6.orderBy('count', ascending=False).show(10)

+----+-----+
|word|count|
+----+-----+
| the| 4496|
|  to| 4235|
|  of| 3719|
| and| 3602|
| her| 2223|
|   i| 2052|
|   a| 1997|
|  in| 1920|
| was| 1844|
| she| 1703|
+----+-----+
only showing top 10 rows



In [29]:
spark.stop()