In [44]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName('Process Large Data').getOrCreate()

#PHASE 1 — INGESTION & FIRST INSPECTION

1. Read the CSV file into a DataFrame

In [45]:
l_orders_df = spark.read \
.option("header", "true") \
.csv("/content/orders_large_bad.csv")

2. Disable schema inference and read everything as string

In [46]:
l_orders_df = spark.read \
.option("header", "true") \
.option("inferSchema", "false") \
.csv("/content/orders_large_bad.csv")

3. Print schema and record count

In [47]:
l_orders_df.printSchema()
l_orders_df.count()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)



300000

4. Display 20 random rows

In [48]:
l_orders_df.orderBy(F.rand()).show(20, truncate=False)

+-----------+-----------+---------+-------------+-----------+-------+----------+---------+
|order_id   |customer_id|city     |category     |product    |amount |order_date|status   |
+-----------+-----------+---------+-------------+-----------+-------+----------+---------+
|ORD00070240|C020240    |Pune     |Fashion      |Shoes      |17506  |2024-02-10|Cancelled|
|ORD00227365|C027365    |Bangalore|Electronics  |Mobile     |67343  |2024-01-26|Completed|
|ORD00118028|C018028    |Pune     |Grocery      |Oil        |invalid|2024-01-09|Completed|
|ORD00250409|C000409    |Delhi    |Grocery      |Rice       |39214  |2024-01-30|Completed|
|ORD00226424|C026424    |Pune     | electronics |Mobile     |69208  |14/02/2024|Completed|
|ORD00202205|C002205    |Bangalore|Home         |Mixer      |77703  |2024-01-06|Completed|
|ORD00089341|C039341    |Mumbai   |Home         |AirPurifier|53004  |2024-01-02|Completed|
|ORD00263393|C013393    |Chennai  |Home         |Mixer      |63140  |2024/02/23|Completed|

5. Identify at least 5 data quality issues by observation

In [49]:
#Incorrect Data Types
#amount column non-numeric characters
#Inconsistent order_date formats
#Inconsistent casing/spaces in categorical columns
#Data in wrong column


6. Read the JSON file and compare schema and row count with CSV

In [50]:
orders_json_df = spark.read \
.option("inferSchema", "false") \
.json("/content/orders_large_bad.json")

In [51]:
orders_json_df.printSchema()
l_orders_df.printSchema()

root
 |-- amount: string (nullable = true)
 |-- category: string (nullable = true)
 |-- city: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- status: string (nullable = true)

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)



In [52]:
orders_json_df.count()

300000

In [53]:
l_orders_df.count()

300000

#PHASE 2 — SCHEMA ENFORCEMENT & VALIDATION

7. Define an explicit schema using StructType

In [54]:
from pyspark.sql.types import *

l_orders_schema = StructType([
    StructField("order_id", StringType()),
    StructField("customer_id", StringType()),
    StructField("city", StringType()),
    StructField("category", StringType()),
    StructField("product", StringType()),
    StructField("amount", StringType()),
    StructField("order_date", StringType()),
    StructField("status", StringType())
])

8. Re-read the CSV using the defined schema


In [55]:
l_orders_df = spark.read \
.option("header", "true") \
.schema(l_orders_schema) \
.csv("/content/orders_large_bad.csv")

9. Identify rows that fail schema expectations


In [56]:
invalid_schema_df = l_orders_df.filter(
    l_orders_df.order_id.isNull() |
    l_orders_df.amount.isNull()
)
invalid_schema_df.show()

+-----------+-----------+---------+-----------+-----------+------+----------+---------+
|   order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+-----------+-----------+---------+-----------+-----------+------+----------+---------+
|ORD00000029|    C000029|Bangalore|    Grocery|      Sugar|  NULL|2024-01-30|Completed|
|ORD00000058|    C000058|   Mumbai|    Grocery|        Oil|  NULL|2024-02-28|Completed|
|ORD00000087|    C000087|    Delhi|Electronics|     Tablet|  NULL|2024-01-28|Completed|
|ORD00000116|    C000116|Bangalore|    Grocery|      Sugar|  NULL|2024-02-26|Completed|
|ORD00000145|    C000145|    Delhi|       Home|      Mixer|  NULL|2024-01-26|Completed|
|ORD00000174|    C000174|Bangalore|    Grocery|        Oil|  NULL|2024-02-24|Completed|
|ORD00000203|    C000203|Hyderabad|       Home|      Mixer|  NULL|2024-01-24|Completed|
|ORD00000232|    C000232|     Pune|    Fashion|     TShirt|  NULL|2024-02-22|Completed|
|ORD00000261|    C000261|  Kolka

10. Explain why schema inference is dangerous at scale

In [57]:
#Slow: Needs full/large scan.
#Wrong types: Mixed data → errors.
#Sampling fails: Bad sample → bad schema.
#Latency: Adds ingestion delay.
#Corruption: Wrong schema → lost data.
#Unstable: Different runs, different results.
#JSON pain: Nested = unpredictable.

#PHASE 3 — STRING CLEANING & STANDARDIZATION

11. Trim leading and trailing spaces from all string columns


In [58]:
string_cols = l_orders_df.columns

for col in string_cols:
  l_orders_df = l_orders_df.withColumn(col, F.trim(F.col(col)))
  l_orders_df.show(10, truncate=False)

+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|order_id   |customer_id|city       |category   |product    |amount |order_date|status   |
+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|ORD00000000|C000000    | hyderabad | grocery   |Oil        |invalid|01/01/2024|Cancelled|
|ORD00000001|C000001    |Pune       |Grocery    |Sugar      |35430  |2024-01-02|Completed|
|ORD00000002|C000002    |Pune       |Electronics|Mobile     |65358  |2024-01-03|Completed|
|ORD00000003|C000003    |Bangalore  |Electronics|Laptop     |5558   |2024-01-04|Completed|
|ORD00000004|C000004    |Pune       |Home       |AirPurifier|33659  |2024-01-05|Completed|
|ORD00000005|C000005    |Delhi      |Fashion    |Jeans      |8521   |2024-01-06|Completed|
|ORD00000006|C000006    |Delhi      |Grocery    |Sugar      |42383  |2024-01-07|Completed|
|ORD00000007|C000007    |Pune       |Grocery    |Rice       |45362  |2024-01-08|Completed|

12. Standardize city , category , and product values


In [59]:
l_orders_df = l_orders_df \
.withColumn("city", F.upper("city")) \
.withColumn("category", F.upper("category")) \
.withColumn("product", F.upper("product"))

l_orders_df.select("city", "category", "product").show(10, truncate=False)

+---------+-----------+-----------+
|city     |category   |product    |
+---------+-----------+-----------+
|HYDERABAD|GROCERY    |OIL        |
|PUNE     |GROCERY    |SUGAR      |
|PUNE     |ELECTRONICS|MOBILE     |
|BANGALORE|ELECTRONICS|LAPTOP     |
|PUNE     |HOME       |AIRPURIFIER|
|DELHI    |FASHION    |JEANS      |
|DELHI    |GROCERY    |SUGAR      |
|PUNE     |GROCERY    |RICE       |
|BANGALORE|FASHION    |JEANS      |
|KOLKATA  |ELECTRONICS|LAPTOP     |
+---------+-----------+-----------+
only showing top 10 rows


13. Convert all categorical columns to a consistent case


In [60]:
l_orders_df = l_orders_df.withColumn("status", F.upper("status"))
l_orders_df.select("status").distinct().show(truncate=False)

+---------+
|status   |
+---------+
|CANCELLED|
|COMPLETED|
+---------+



14. Identify how many distinct city values existed before vs after cleaning

In [61]:
l_orders_df.select("city").distinct().show(20, truncate=False)

+---------+
|city     |
+---------+
|KOLKATA  |
|BANGALORE|
|DELHI    |
|HYDERABAD|
|CHENNAI  |
|PUNE     |
|MUMBAI   |
+---------+



#PHASE 4 — AMOUNT CLEANING (CRITICAL)

15. Identify invalid values in the amount column


In [62]:
l_orders_df.filter(~F.col("amount").rlike("^[0-9,]+$")).show(20, truncate=False)

+-----------+-----------+---------+-----------+-----------+-------+----------+---------+
|order_id   |customer_id|city     |category   |product    |amount |order_date|status   |
+-----------+-----------+---------+-----------+-----------+-------+----------+---------+
|ORD00000000|C000000    |HYDERABAD|GROCERY    |OIL        |invalid|01/01/2024|CANCELLED|
|ORD00000019|C000019    |MUMBAI   |ELECTRONICS|MOBILE     |invalid|2024-01-20|COMPLETED|
|ORD00000038|C000038    |DELHI    |HOME       |VACUUM     |invalid|2024-02-08|COMPLETED|
|ORD00000057|C000057    |KOLKATA  |HOME       |AIRPURIFIER|invalid|2024-02-27|COMPLETED|
|ORD00000076|C000076    |MUMBAI   |HOME       |AIRPURIFIER|invalid|2024-01-17|COMPLETED|
|ORD00000095|C000095    |HYDERABAD|ELECTRONICS|MOBILE     |invalid|2024-02-05|COMPLETED|
|ORD00000114|C000114    |BANGALORE|HOME       |AIRPURIFIER|invalid|2024-02-24|COMPLETED|
|ORD00000133|C000133    |KOLKATA  |HOME       |VACUUM     |invalid|2024-01-14|COMPLETED|
|ORD00000152|C000152 

16. Remove commas from numeric strings


In [63]:
l_orders_df = l_orders_df.withColumn("amount_clean", F.regexp_replace("amount", ",", ""))
l_orders_df.select("amount", "amount_clean").show(20, truncate=False)

+-------+------------+
|amount |amount_clean|
+-------+------------+
|invalid|invalid     |
|35430  |35430       |
|65358  |65358       |
|5558   |5558        |
|33659  |33659       |
|8521   |8521        |
|42383  |42383       |
|45362  |45362       |
|10563  |10563       |
|63715  |63715       |
|66576  |66576       |
|50318  |50318       |
|84768  |84768       |
|79121  |79121       |
|79469  |79469       |
|81018  |81018       |
|64225  |64225       |
|69582  |69582       |
|50424  |50424       |
|invalid|invalid     |
+-------+------------+
only showing top 20 rows


17. Convert amount to IntegerType safely


In [64]:
l_orders_df = l_orders_df.withColumn(
    "amount_int",
    F.when(
        F.col("amount_clean").rlike("^[0-9]+$"),
        F.col("amount_clean").cast("int")
    )
)

l_orders_df.select("amount_clean", "amount_int").show(20, truncate=False)

+------------+----------+
|amount_clean|amount_int|
+------------+----------+
|invalid     |NULL      |
|35430       |35430     |
|65358       |65358     |
|5558        |5558      |
|33659       |33659     |
|8521        |8521      |
|42383       |42383     |
|45362       |45362     |
|10563       |10563     |
|63715       |63715     |
|66576       |66576     |
|50318       |50318     |
|84768       |84768     |
|79121       |79121     |
|79469       |79469     |
|81018       |81018     |
|64225       |64225     |
|69582       |69582     |
|50424       |50424     |
|invalid     |NULL      |
+------------+----------+
only showing top 20 rows


18. Handle empty, null, and invalid values explicitly


In [65]:
l_orders_df.filter(F.col("amount_int").isNull()).show(20, truncate=False)

+-----------+-----------+---------+-----------+-----------+-------+----------+---------+------------+----------+
|order_id   |customer_id|city     |category   |product    |amount |order_date|status   |amount_clean|amount_int|
+-----------+-----------+---------+-----------+-----------+-------+----------+---------+------------+----------+
|ORD00000000|C000000    |HYDERABAD|GROCERY    |OIL        |invalid|01/01/2024|CANCELLED|invalid     |NULL      |
|ORD00000019|C000019    |MUMBAI   |ELECTRONICS|MOBILE     |invalid|2024-01-20|COMPLETED|invalid     |NULL      |
|ORD00000029|C000029    |BANGALORE|GROCERY    |SUGAR      |NULL   |2024-01-30|COMPLETED|NULL        |NULL      |
|ORD00000038|C000038    |DELHI    |HOME       |VACUUM     |invalid|2024-02-08|COMPLETED|invalid     |NULL      |
|ORD00000057|C000057    |KOLKATA  |HOME       |AIRPURIFIER|invalid|2024-02-27|COMPLETED|invalid     |NULL      |
|ORD00000058|C000058    |MUMBAI   |GROCERY    |OIL        |NULL   |2024-02-28|COMPLETED|NULL    

19. Count how many records were affected during amount cleaning

In [66]:
l_orders_df.filter(F.col("amount_int").isNull()).count()

25164

#PHASE 5 — DATE PARSING & NORMALIZATION

20. Identify all date formats present in order_date


In [67]:
l_orders_df.select("order_date").distinct().show(20, False)

+----------+
|order_date|
+----------+
|2024-01-19|
|2024/01/02|
|30/01/2024|
|2024-02-08|
|2024-02-28|
|2024-01-13|
|2024/01/14|
|18/01/2024|
|27/01/2024|
|06/01/2024|
|2024-02-20|
|2024-01-06|
|2024/01/09|
|16/02/2024|
|2024-02-04|
|15/01/2024|
|2024-02-15|
|2024/02/23|
|2024-02-12|
|2024/02/25|
+----------+
only showing top 20 rows


21. Parse valid dates into DateType


In [68]:
l_orders_df = l_orders_df.withColumn(
    "order_date_clean",
    F.coalesce(
        F.expr("try_to_timestamp(order_date, 'yyyy-MM-dd')").cast("date"),
        F.expr("try_to_timestamp(order_date, 'yyyy/MM/dd')").cast("date"),
        F.expr("try_to_timestamp(order_date, 'dd/MM/yyyy')").cast("date"),
        F.expr("try_to_timestamp(order_date, 'dd-MM-yyyy')").cast("date")
    )
)
l_orders_df.select("order_date", "order_date_clean").show(20, truncate=False)

+----------+----------------+
|order_date|order_date_clean|
+----------+----------------+
|01/01/2024|2024-01-01      |
|2024-01-02|2024-01-02      |
|2024-01-03|2024-01-03      |
|2024-01-04|2024-01-04      |
|2024-01-05|2024-01-05      |
|2024-01-06|2024-01-06      |
|2024-01-07|2024-01-07      |
|2024-01-08|2024-01-08      |
|2024-01-09|2024-01-09      |
|2024-01-10|2024-01-10      |
|2024-01-11|2024-01-11      |
|12/01/2024|2024-01-12      |
|2024-01-13|2024-01-13      |
|2024/01/14|2024-01-14      |
|2024-01-15|2024-01-15      |
|2024-01-16|2024-01-16      |
|2024-01-17|2024-01-17      |
|2024-01-18|2024-01-18      |
|2024-01-19|2024-01-19      |
|2024-01-20|2024-01-20      |
+----------+----------------+
only showing top 20 rows


22. Handle invalid dates gracefully


In [69]:
l_orders_df.filter(F.col("order_date_clean").isNull()) \
.select("order_date").show(20, truncate=False)

+------------+
|order_date  |
+------------+
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
|invalid_date|
+------------+
only showing top 20 rows


23. Create a clean order_date_clean column


In [70]:
l_orders_df.select("order_id", "order_date", "order_date_clean").show(20, truncate=False)

+-----------+----------+----------------+
|order_id   |order_date|order_date_clean|
+-----------+----------+----------------+
|ORD00000000|01/01/2024|2024-01-01      |
|ORD00000001|2024-01-02|2024-01-02      |
|ORD00000002|2024-01-03|2024-01-03      |
|ORD00000003|2024-01-04|2024-01-04      |
|ORD00000004|2024-01-05|2024-01-05      |
|ORD00000005|2024-01-06|2024-01-06      |
|ORD00000006|2024-01-07|2024-01-07      |
|ORD00000007|2024-01-08|2024-01-08      |
|ORD00000008|2024-01-09|2024-01-09      |
|ORD00000009|2024-01-10|2024-01-10      |
|ORD00000010|2024-01-11|2024-01-11      |
|ORD00000011|12/01/2024|2024-01-12      |
|ORD00000012|2024-01-13|2024-01-13      |
|ORD00000013|2024/01/14|2024-01-14      |
|ORD00000014|2024-01-15|2024-01-15      |
|ORD00000015|2024-01-16|2024-01-16      |
|ORD00000016|2024-01-17|2024-01-17      |
|ORD00000017|2024-01-18|2024-01-18      |
|ORD00000018|2024-01-19|2024-01-19      |
|ORD00000019|2024-01-20|2024-01-20      |
+-----------+----------+----------

24. Count records with invalid dates

In [71]:
l_orders_df.filter(F.col("order_date_clean").isNull()).count()

2595

#PHASE 6 — BUSINESS FILTERING & DEDUPLICATION

25. Identify duplicate order_id values


In [72]:
l_orders_df.groupBy("order_id").count().filter(F.col("count")>1).show(20, truncate=False)

+--------+-----+
|order_id|count|
+--------+-----+
+--------+-----+



26. Remove duplicate orders safely


In [73]:
l_orders_df = l_orders_df.dropDuplicates(["order_id"])
l_orders_df.count()

300000

In [74]:
before_filter_count = l_orders_df.count()
print("Record count before Filtering: ", before_filter_count)

Record count before Filtering:  300000


27. Keep only records with status = Completed


In [78]:
l_orders_df = l_orders_df.filter(F.col("status") == "COMPLETED")
after_filter_count = l_orders_df.count()
l_orders_df.count()

285000

28. Validate record counts before and after filtering

In [79]:
print("Record count before Filtering: ", before_filter_count)

Record count before Filtering:  300000


In [80]:
print("Record count after Filtering: ", after_filter_count)

Record count after Filtering:  285000


#PHASE 7 — PERFORMANCE & PARTITION AWARENESS

29. Check the default number of partitions


In [81]:
l_orders_df.rdd.getNumPartitions()

2

30. Run a heavy groupBy and observe execution time


In [82]:
l_orders_df.groupBy("city").agg(F.sum("amount_int")).show(20, truncate=False)

+---------+---------------+
|city     |sum(amount_int)|
+---------+---------------+
|KOLKATA  |1624300497     |
|BANGALORE|1628527093     |
|DELHI    |1639639916     |
|HYDERABAD|1642443340     |
|CHENNAI  |1629865247     |
|PUNE     |1646196535     |
|MUMBAI   |1625518096     |
+---------+---------------+



31. Use explain(True) to identify shuffle stages


In [83]:
l_orders_df.groupBy("city").agg(F.sum("amount_int")).explain(True)

== Parsed Logical Plan ==
'Aggregate ['city], ['city, unresolvedalias('sum('amount_int))]
+- Filter (status#1632 = COMPLETED)
   +- Filter (status#1632 = COMPLETED)
      +- Filter (status#1632 = COMPLETED)
         +- Deduplicate [order_id#1344]
            +- Project [order_id#1344, customer_id#1378, city#1616, category#1617, product#1618, amount#1514, order_date#1548, status#1632, amount_clean#1676, amount_int#1685, coalesce(cast(try_to_timestamp(order_date#1548, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false) as date), cast(try_to_timestamp(order_date#1548, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false) as date), cast(try_to_timestamp(order_date#1548, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false) as date), cast(try_to_timestamp(order_date#1548, Some(dd-MM-yyyy), TimestampType, Some(Etc/UTC), false) as date)) AS order_date_clean#1753]
               +- Project [order_id#1344, customer_id#1378, city#1616, category#1617, product#1618, amount#1514, order_date#15

32. Repartition the DataFrame by city


In [84]:
l_orders_df = l_orders_df.repartition("city")
l_orders_df.rdd.getNumPartitions()

3

33. Compare execution plans before and after repartition

In [85]:
l_orders_df.groupBy("city").agg(F.sum("amount_int")).explain(True)

== Parsed Logical Plan ==
'Aggregate ['city], ['city, unresolvedalias('sum('amount_int))]
+- RepartitionByExpression [city#1616]
   +- Filter (status#1632 = COMPLETED)
      +- Filter (status#1632 = COMPLETED)
         +- Filter (status#1632 = COMPLETED)
            +- Deduplicate [order_id#1344]
               +- Project [order_id#1344, customer_id#1378, city#1616, category#1617, product#1618, amount#1514, order_date#1548, status#1632, amount_clean#1676, amount_int#1685, coalesce(cast(try_to_timestamp(order_date#1548, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false) as date), cast(try_to_timestamp(order_date#1548, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false) as date), cast(try_to_timestamp(order_date#1548, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false) as date), cast(try_to_timestamp(order_date#1548, Some(dd-MM-yyyy), TimestampType, Some(Etc/UTC), false) as date)) AS order_date_clean#1753]
                  +- Project [order_id#1344, customer_id#1378, city#1616

#PHASE 8 — ANALYTICS ON LARGE DATA

34. Calculate total revenue per city


In [86]:
city_revenue_df = l_orders_df.groupBy("city").agg(F.sum("amount_int").alias("total_revenue"))
city_revenue_df.show(20, truncate=False)

+---------+-------------+
|city     |total_revenue|
+---------+-------------+
|KOLKATA  |1624300497   |
|BANGALORE|1628527093   |
|DELHI    |1639639916   |
|HYDERABAD|1642443340   |
|CHENNAI  |1629865247   |
|PUNE     |1646196535   |
|MUMBAI   |1625518096   |
+---------+-------------+



35. Calculate total revenue per category


In [87]:
l_orders_df.groupBy("category").agg(F.sum("amount_int").alias("total_revenue")).show(20, truncate=False)

+-----------+-------------+
|category   |total_revenue|
+-----------+-------------+
|HOME       |2868467576   |
|ELECTRONICS|2867568870   |
|GROCERY    |2866272106   |
|FASHION    |2834182172   |
+-----------+-------------+



36. Calculate total revenue per product


In [88]:
l_orders_df.groupBy("product").agg(F.sum("amount_int").alias("total_revenue")).show(20, truncate=False)

+-----------+-------------+
|product    |total_revenue|
+-----------+-------------+
|MOBILE     |944352576    |
|SHOES      |946799102    |
|MIXER      |957140026    |
|AIRPURIFIER|952178123    |
|JEANS      |951286127    |
|RICE       |954494237    |
|TSHIRT     |936096943    |
|TABLET     |960719999    |
|OIL        |963572869    |
|SUGAR      |948205000    |
|VACUUM     |959149427    |
|LAPTOP     |962496295    |
+-----------+-------------+



37. Identify top 10 products by revenue


In [89]:
l_orders_df.groupBy("product").agg(F.sum("amount_int").alias("total_revenue")) \
.orderBy(F.col("total_revenue").desc()) \
.show(10, truncate=False)

+-----------+-------------+
|product    |total_revenue|
+-----------+-------------+
|OIL        |963572869    |
|LAPTOP     |962496295    |
|TABLET     |960719999    |
|VACUUM     |959149427    |
|MIXER      |957140026    |
|RICE       |954494237    |
|AIRPURIFIER|952178123    |
|JEANS      |951286127    |
|SUGAR      |948205000    |
|SHOES      |946799102    |
+-----------+-------------+
only showing top 10 rows


38. Calculate average order value per city

In [90]:
l_orders_df.groupBy("city").agg(F.avg("amount_int").alias("avg_order_value")).show(20, truncate=False)

+---------+------------------+
|city     |avg_order_value   |
+---------+------------------+
|KOLKATA  |43709.816662630175|
|BANGALORE|44098.867908689645|
|DELHI    |43817.20780331374 |
|HYDERABAD|43708.74045293664 |
|CHENNAI  |43628.27900315863 |
|PUNE     |43930.204013556424|
|MUMBAI   |43723.75651612556 |
+---------+------------------+



#PHASE 9 — WINDOW FUNCTIONS (BIG DATA SAFE)

39. Rank cities by total revenue

In [91]:
from pyspark.sql.window import Window

city_window = Window.orderBy(F.col("total_revenue").desc())
city_revenue_df.withColumn("rank", F.rank().over(city_window)).show(20, truncate=False)

+---------+-------------+----+
|city     |total_revenue|rank|
+---------+-------------+----+
|PUNE     |1646196535   |1   |
|HYDERABAD|1642443340   |2   |
|DELHI    |1639639916   |3   |
|CHENNAI  |1629865247   |4   |
|BANGALORE|1628527093   |5   |
|MUMBAI   |1625518096   |6   |
|KOLKATA  |1624300497   |7   |
+---------+-------------+----+



40. Rank products within each category by revenue


In [92]:
product_revenue_df = l_orders_df.groupBy("category", "product").agg(F.sum("amount_int").alias("revenue"))
category_window = Window.partitionBy("category").orderBy(F.col("revenue").desc())
product_revenue_df.withColumn("rank", F.rank().over(category_window)).show(20, truncate=False)

+-----------+-----------+---------+----+
|category   |product    |revenue  |rank|
+-----------+-----------+---------+----+
|ELECTRONICS|LAPTOP     |962496295|1   |
|ELECTRONICS|TABLET     |960719999|2   |
|ELECTRONICS|MOBILE     |944352576|3   |
|FASHION    |JEANS      |951286127|1   |
|FASHION    |SHOES      |946799102|2   |
|FASHION    |TSHIRT     |936096943|3   |
|GROCERY    |OIL        |963572869|1   |
|GROCERY    |RICE       |954494237|2   |
|GROCERY    |SUGAR      |948205000|3   |
|HOME       |VACUUM     |959149427|1   |
|HOME       |MIXER      |957140026|2   |
|HOME       |AIRPURIFIER|952178123|3   |
+-----------+-----------+---------+----+



41. Identify the top product per category


In [93]:
product_revenue_df.withColumn("rank", F.rank().over(category_window)) \
.filter(F.col("rank") == 1).show(20, truncate=False)

+-----------+-------+---------+----+
|category   |product|revenue  |rank|
+-----------+-------+---------+----+
|ELECTRONICS|LAPTOP |962496295|1   |
|FASHION    |JEANS  |951286127|1   |
|GROCERY    |OIL    |963572869|1   |
|HOME       |VACUUM |959149427|1   |
+-----------+-------+---------+----+



42. Identify top 3 cities using window functions

In [94]:
city_revenue_df.withColumn("rank", F.rank().over(city_window)) \
.filter(F.col("rank") <= 3).show()

+---------+-------------+----+
|     city|total_revenue|rank|
+---------+-------------+----+
|     PUNE|   1646196535|   1|
|HYDERABAD|   1642443340|   2|
|    DELHI|   1639639916|   3|
+---------+-------------+----+



#PHASE 10 — CACHING & REUSE

43. Identify DataFrames reused multiple times


In [None]:
#l_orders_df is reused in multiple aggregations

44. Apply caching strategically


In [95]:
l_orders_df.cache()

DataFrame[order_id: string, customer_id: string, city: string, category: string, product: string, amount: string, order_date: string, status: string, amount_clean: string, amount_int: int, order_date_clean: date]

45. Re-run analytics and observe performance


In [96]:
l_orders_df.groupBy("city").agg(F.sum("amount_int").alias("total_revenue")).show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|  KOLKATA|   1624300497|
|BANGALORE|   1628527093|
|    DELHI|   1639639916|
|HYDERABAD|   1642443340|
|  CHENNAI|   1629865247|
|     PUNE|   1646196535|
|   MUMBAI|   1625518096|
+---------+-------------+



46. Unpersist when cache is no longer needed


In [97]:
l_orders_df.unpersist()

DataFrame[order_id: string, customer_id: string, city: string, category: string, product: string, amount: string, order_date: string, status: string, amount_clean: string, amount_int: int, order_date_clean: date]

47. Explain why over-caching is dangerous

In [98]:
#consumes executor memory
#causes disk spill
#slows other spark jobs
#can crash executions

#PHASE 11 — FILE FORMAT STRATEGY

48. Write the cleaned order-level dataset to Parquet


In [104]:
l_orders_df.write.mode("overwrite").parquet("/content/l_orders_parquet")

49. Partition the Parquet output by city


In [105]:
l_orders_df.write.mode("overwrite").partitionBy("city").parquet("/content/l_orders_parquet_city")

50. Write aggregated analytics to ORC


In [106]:
city_revenue_df.write.mode("overwrite").orc("/content/city_revenue_orc")

51. Read both formats back and validate schema


In [107]:
spark.read.parquet("/content/l_orders_parquet_city").printSchema()
spark.read.orc("/content/city_revenue_orc").printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- amount_clean: string (nullable = true)
 |-- amount_int: integer (nullable = true)
 |-- order_date_clean: date (nullable = true)
 |-- city: string (nullable = true)

root
 |-- city: string (nullable = true)
 |-- total_revenue: long (nullable = true)



52. Compare number of output files generated

In [108]:
spark.read.parquet("/content/l_orders_parquet_city").rdd.getNumPartitions()

2

#PHASE 12 — DEBUGGING & FAILURE SCENARIOS

53. Explain why the following line breaks pipelines:

df = df.filter(df.amount > 50000).show()



In [109]:
l_orders_df = l_orders_df.filter(l_orders_df.amount > 50000).show()

{"ts": "2025-12-26 06:03:44.132", "level": "ERROR", "logger": "DataFrameQueryContextLogger", "msg": "[CAST_INVALID_INPUT] The value 'invalid' of the type \"STRING\" cannot be cast to \"BIGINT\" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018", "context": {"file": "line 1 in cell [109]", "line": "", "fragment": "__gt__", "errorClass": "CAST_INVALID_INPUT"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o2818.showString.\n: org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value 'invalid' of the type \"STRING\" cannot be cast to \"BIGINT\" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018\n== DataFrame ==\n\"__gt__\" was called from\nline 1 in cell [109]\n\n\tat org.apache.spark.sql.errors.Q

NumberFormatException: [CAST_INVALID_INPUT] The value 'invalid' of the type "STRING" cannot be cast to "BIGINT" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"__gt__" was called from
line 1 in cell [109]


In [110]:
#show() is an action and returns None
#amount is a string, causing NumberFormatException
#Spark executes lazily - error only appears at .show()

54. Create a scenario that produces a NoneType error


In [111]:
df = l_orders_df.show()
df.count()

+-----------+-----------+-------+-----------+-----------+-------+----------+---------+------------+----------+----------------+
|   order_id|customer_id|   city|   category|    product| amount|order_date|   status|amount_clean|amount_int|order_date_clean|
+-----------+-----------+-------+-----------+-----------+-------+----------+---------+------------+----------+----------------+
|ORD00000011|    C000011|KOLKATA|ELECTRONICS|     TABLET|  50318|12/01/2024|COMPLETED|       50318|     50318|      2024-01-12|
|ORD00000036|    C000036|KOLKATA|    GROCERY|        OIL|  29253|2024-02-06|COMPLETED|       29253|     29253|      2024-02-06|
|ORD00000048|    C000048|KOLKATA|    FASHION|      JEANS|  51000|2024-02-18|COMPLETED|       51000|     51000|      2024-02-18|
|ORD00000054|    C000054|KOLKATA|    GROCERY|        OIL|  26434|2024-02-24|COMPLETED|       26434|     26434|      2024-02-24|
|ORD00000104|    C000104|KOLKATA|    FASHION|      JEANS|  32476|2024/02/14|COMPLETED|       32476|     

AttributeError: 'NoneType' object has no attribute 'count'

55. Identify a transformation that causes a wide shuffle


In [112]:
#groupBy(), join(), distinct(), orderBy()

56. Explain how you would debug a slow Spark job

In [113]:
#explain(True)
#Spark UI
#Check skew
#Optimize partitions
#Broadcast joins

#PHASE 13 — FINAL VALIDATION

57. Validate no nulls in critical columns


In [114]:
l_orders_df.filter(
    F.col("order_id").isNull() |
    F.col("amount_int").isNull() |
    F.col("order_date_clean").isNull()
).count()

26166

58. Confirm correct data types for all columns


In [115]:
l_orders_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- amount_clean: string (nullable = true)
 |-- amount_int: integer (nullable = true)
 |-- order_date_clean: date (nullable = true)



59. Validate final record count


In [116]:
l_orders_df.count()

285000

60. Document three optimization decisions you made

In [117]:
#Explicit schema enforcement
#Repartition by city
#Strategic caching after data cleaning