In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
.appName("Orders") \
.getOrCreate()

In [4]:
orders_data = [
("O001","Delhi ","Laptop","45000","2024-01-05","Completed"),
("O002","Mumbai","Mobile ","32000","05/01/2024","Completed"),
("O003","Bangalore","Tablet","30000","2024/01/06","Completed"),
("O004","Delhi","Laptop","","2024-01-07","Cancelled"),
("O005","Mumbai","Mobile","invalid","2024-01-08","Completed"),
("O006","Chennai","Tablet",None,"2024-01-08","Completed"),
("O007","Delhi","Laptop","47000","09-01-2024","Completed"),
("O008","Bangalore","Mobile","28000","2024-01-09","Completed"),
("O009","Mumbai","Laptop","55000","2024-01-10","Completed"),
("O009","Mumbai","Laptop","55000","2024-01-10","Completed")
]

columns = ["order_id" ,"city" ,  "product" ,"amount" ,"order_date" ,"status" ]
df = spark.createDataFrame(orders_data, columns)
df.show()

+--------+---------+-------+-------+----------+---------+
|order_id|     city|product| amount|order_date|   status|
+--------+---------+-------+-------+----------+---------+
|    O001|   Delhi | Laptop|  45000|2024-01-05|Completed|
|    O002|   Mumbai|Mobile |  32000|05/01/2024|Completed|
|    O003|Bangalore| Tablet|  30000|2024/01/06|Completed|
|    O004|    Delhi| Laptop|       |2024-01-07|Cancelled|
|    O005|   Mumbai| Mobile|invalid|2024-01-08|Completed|
|    O006|  Chennai| Tablet|   NULL|2024-01-08|Completed|
|    O007|    Delhi| Laptop|  47000|09-01-2024|Completed|
|    O008|Bangalore| Mobile|  28000|2024-01-09|Completed|
|    O009|   Mumbai| Laptop|  55000|2024-01-10|Completed|
|    O009|   Mumbai| Laptop|  55000|2024-01-10|Completed|
+--------+---------+-------+-------+----------+---------+



PHASE 1 — DATA INGESTION & SCHEMA

 1: Define Explicit Schema

In [6]:
from pyspark.sql.types import StructType, StructField, StringType

schema = StructType([
    StructField("order_id", StringType(), True),
    StructField("city", StringType(), True),
    StructField("product", StringType(), True),
    StructField("amount", StringType(), True),
    StructField("order_date", StringType(), True),
    StructField("status", StringType(), True)
])

 2: Create DataFrame Using Schema

In [7]:
df = spark.createDataFrame(data=orders_data, schema=schema)


3.Print schema and validate data types

In [8]:
df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)



In [9]:
df.show(truncate=False)

+--------+---------+-------+-------+----------+---------+
|order_id|city     |product|amount |order_date|status   |
+--------+---------+-------+-------+----------+---------+
|O001    |Delhi    |Laptop |45000  |2024-01-05|Completed|
|O002    |Mumbai   |Mobile |32000  |05/01/2024|Completed|
|O003    |Bangalore|Tablet |30000  |2024/01/06|Completed|
|O004    |Delhi    |Laptop |       |2024-01-07|Cancelled|
|O005    |Mumbai   |Mobile |invalid|2024-01-08|Completed|
|O006    |Chennai  |Tablet |NULL   |2024-01-08|Completed|
|O007    |Delhi    |Laptop |47000  |09-01-2024|Completed|
|O008    |Bangalore|Mobile |28000  |2024-01-09|Completed|
|O009    |Mumbai   |Laptop |55000  |2024-01-10|Completed|
|O009    |Mumbai   |Laptop |55000  |2024-01-10|Completed|
+--------+---------+-------+-------+----------+---------+



PHASE 2 — DATA CLEANING

1. Trim all string columns

In [10]:
from pyspark.sql.functions import col, trim

df = df.withColumn("order_id", trim(col("order_id"))) \
       .withColumn("city", trim(col("city"))) \
       .withColumn("product", trim(col("product"))) \
       .withColumn("amount", trim(col("amount"))) \
       .withColumn("order_date", trim(col("order_date"))) \
       .withColumn("status", trim(col("status")))


2. Standardize city and product values

In [11]:
from pyspark.sql.functions import initcap

df = df.withColumn("city", initcap(col("city"))) \
       .withColumn("product", initcap(col("product")))


3. Convert amount to IntegerType

In [12]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import when


In [13]:
df = df.withColumn(
    "amount_int",
    col("amount").cast(IntegerType())
)

4. Handle invalid and null amount values

In [14]:
df = df.withColumn(
    "amount_int",
    when(col("amount_int").isNull(), 0)
    .otherwise(col("amount_int"))
)


5. Remove duplicate orders

In [15]:
df = df.dropDuplicates(["order_id"])


6. Keep only Completed orders

In [16]:
df = df.filter(col("status") == "Completed")


PHASE 3 — BASIC ANALYTICS

In [19]:
from pyspark.sql.functions import sum, col, when, lit, coalesce
from pyspark.sql.types import IntegerType

# Re-define the column used for aggregation to correctly handle malformed strings.
# This ensures that empty strings and 'invalid' are treated as NULL before casting.
corrected_amount_for_sum = when(col("amount") == "", lit(None)) \
                           .when(col("amount") == "invalid", lit(None)) \
                           .otherwise(col("amount"))

# Now, cast to IntegerType, and then replace any resulting NULLs with 0.
# The initial replacement of '' and 'invalid' with None helps the cast function.
corrected_amount_for_sum = coalesce(corrected_amount_for_sum.cast(IntegerType()), lit(0))

revenue_per_city = df.groupBy("city") \
    .agg(sum(corrected_amount_for_sum).alias("total_revenue"))

revenue_per_city.show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|        58000|
|  Chennai|            0|
|   Mumbai|        87000|
|    Delhi|        92000|
+---------+-------------+



In [21]:
from pyspark.sql.functions import sum, col, when, lit, coalesce
from pyspark.sql.types import IntegerType

# Re-define the column used for aggregation to correctly handle malformed strings
# by explicitly converting '' and 'invalid' to None before casting.
corrected_amount_for_sum = when(col("amount") == "", lit(None)) \
                           .when(col("amount") == "invalid", lit(None)) \
                           .otherwise(col("amount"))

# Now, cast to IntegerType, and then replace any resulting NULLs with 0 for aggregation.
corrected_amount_for_sum = coalesce(corrected_amount_for_sum.cast(IntegerType()), lit(0))

revenue_per_product = df.groupBy("product") \
    .agg(sum(corrected_amount_for_sum).alias("total_revenue"))

revenue_per_product.show()

+-------+-------------+
|product|total_revenue|
+-------+-------------+
| Laptop|       147000|
| Mobile|        60000|
| Tablet|        30000|
+-------+-------------+



In [23]:
from pyspark.sql.functions import avg, col, when, lit, coalesce
from pyspark.sql.types import IntegerType

# Create a robust numeric column that handles malformed string inputs
corrected_amount_for_avg = when(col("amount") == "", lit(None)) \
                           .when(col("amount") == "invalid", lit(None)) \
                           .otherwise(col("amount"))

# Cast to IntegerType, and then replace any resulting NULLs with 0 for averaging.
# Using 0 is appropriate for average when NULLs should not contribute or indicate a zero value.
corrected_amount_for_avg = coalesce(corrected_amount_for_avg.cast(IntegerType()), lit(0))

avg_order_value_city = df.groupBy("city") \
    .agg(avg(corrected_amount_for_avg).alias("avg_order_value"))

avg_order_value_city.show()

+---------+---------------+
|     city|avg_order_value|
+---------+---------------+
|Bangalore|        29000.0|
|  Chennai|            0.0|
|   Mumbai|        29000.0|
|    Delhi|        46000.0|
+---------+---------------+



PHASE 4 — WINDOW FUNCTION



In [25]:
from pyspark.sql.functions import sum, rank, col, when, lit, coalesce
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType

# Create a robust numeric column that handles malformed string inputs
corrected_amount_for_sum = when(col("amount") == "", lit(None)) \
                           .when(col("amount") == "invalid", lit(None)) \
                           .otherwise(col("amount"))

# Cast to IntegerType, and then replace any resulting NULLs with 0 for aggregation.
corrected_amount_for_sum = coalesce(corrected_amount_for_sum.cast(IntegerType()), lit(0))

city_revenue_df = df.groupBy("city") \
    .agg(sum(corrected_amount_for_sum).alias("total_revenue"))

window_spec = Window.orderBy(col("total_revenue").desc())

ranked_cities = city_revenue_df.withColumn(
    "city_rank",
    rank().over(window_spec)
)

ranked_cities.show()

+---------+-------------+---------+
|     city|total_revenue|city_rank|
+---------+-------------+---------+
|    Delhi|        92000|        1|
|   Mumbai|        87000|        2|
|Bangalore|        58000|        3|
|  Chennai|            0|        4|
+---------+-------------+---------+



In [26]:
top_city = ranked_cities.filter(col("city_rank") == 1)

top_city.show()

+-----+-------------+---------+
| city|total_revenue|city_rank|
+-----+-------------+---------+
|Delhi|        92000|        1|
+-----+-------------+---------+



PHASE 5 — PERFORMANCE AWARENESS

In [27]:
df.cache()

DataFrame[order_id: string, city: string, product: string, amount: string, order_date: string, status: string, amount_int: int]

In [31]:
from pyspark.sql.functions import sum, col, when, lit, coalesce, avg
from pyspark.sql.types import IntegerType

df.unpersist() # Unpersist to clear any old cached state that might be causing issues.

# Create a robust numeric column that handles malformed string inputs
# and can be used for both sum and average calculations.
corrected_amount_for_agg = when(col("amount") == "", lit(None)) \
                           .when(col("amount") == "invalid", lit(None)) \
                           .otherwise(col("amount"))

# Cast to IntegerType, and then replace any resulting NULLs with 0 for aggregation.
corrected_amount_for_agg = coalesce(corrected_amount_for_agg.cast(IntegerType()), lit(0))

# Aggregation 1
df.groupBy("city") \
  .agg(sum(corrected_amount_for_agg).alias("city_revenue")) \
  .show()

# Aggregation 2
df.groupBy("product") \
  .agg(sum(corrected_amount_for_agg).alias("product_revenue")) \
  .show()

+---------+------------+
|     city|city_revenue|
+---------+------------+
|Bangalore|       58000|
|  Chennai|           0|
|   Mumbai|       87000|
|    Delhi|       92000|
+---------+------------+

+-------+---------------+
|product|product_revenue|
+-------+---------------+
| Laptop|         147000|
| Mobile|          60000|
| Tablet|          30000|
+-------+---------------+



In [30]:
df.explain(True)

== Parsed Logical Plan ==
'Filter '`=`('status, Completed)
+- Deduplicate [order_id#50]
   +- Project [order_id#50, city#56, product#57, amount#53, order_date#54, status#55, CASE WHEN isnull(amount_int#58) THEN 0 ELSE amount_int#58 END AS amount_int#59]
      +- Project [order_id#50, city#56, product#57, amount#53, order_date#54, status#55, cast(amount#53 as int) AS amount_int#58]
         +- Project [order_id#50, city#56, initcap(product#52) AS product#57, amount#53, order_date#54, status#55]
            +- Project [order_id#50, initcap(city#51) AS city#56, product#52, amount#53, order_date#54, status#55]
               +- Project [order_id#50, city#51, product#52, amount#53, order_date#54, trim(status#30, None) AS status#55]
                  +- Project [order_id#50, city#51, product#52, amount#53, trim(order_date#29, None) AS order_date#54, status#30]
                     +- Project [order_id#50, city#51, product#52, trim(amount#28, None) AS amount#53, order_date#29, status#30]
    