Cache

In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark entry point for this demo: getOrCreate() returns the
# already-running session if there is one, otherwise it starts a new one
# with the application name "Cache".
spark = (
    SparkSession.builder
    .appName("Cache")
    .getOrCreate()
)

In [2]:
# Ten sample orders as (order_id, city, product, amount) tuples.
data = [
    ("ORD001", "Delhi", "Laptop", 45000),
    ("ORD002", "Mumbai", "Mobile", 32000),
    ("ORD003", "Bangalore", "Laptop", 52000),
    ("ORD004", "Delhi", "Tablet", 28000),
    ("ORD005", "Mumbai", "Laptop", 61000),
    ("ORD006", "Chennai", "Mobile", 30000),
    ("ORD007", "Delhi", "Laptop", 47000),
    ("ORD008", "Bangalore", "Tablet", 35000),
    ("ORD009", "Mumbai", "Laptop", 58000),
    ("ORD010", "Delhi", "Mobile", 29000),
]

# Column names, in the same order as the tuple fields above.
columns = ["order_id", "city", "product", "amount"]

# Only names are supplied, so Spark infers the column types from the values.
df = spark.createDataFrame(data, schema=columns)

In [3]:
# Orders strictly above 30000; this is the DataFrame reused by the
# aggregation cells below (a transformation only, no job runs here).
high_value_df = df.where(df["amount"] > 30000)

In [4]:
# Baseline: three separate actions on high_value_df before cache() has been
# called on it anywhere in this notebook.
high_value_df.count()

# Total amount per city.
(
    high_value_df
    .groupBy("city")
    .sum("amount")
    .show()
)

# Average amount per product.
(
    high_value_df
    .groupBy("product")
    .avg("amount")
    .show()
)

+---------+-----------+
|     city|sum(amount)|
+---------+-----------+
|Bangalore|      87000|
|   Mumbai|     151000|
|    Delhi|      92000|
+---------+-----------+

+-------+-----------+
|product|avg(amount)|
+-------+-----------+
| Laptop|    52600.0|
| Mobile|    32000.0|
| Tablet|    35000.0|
+-------+-----------+



In [6]:
# Mark high_value_df for caching. cache() is lazy, so the count() that
# follows is the first action that can populate the cache.
high_value_df.cache()

high_value_df.count()

# Same two aggregations as the un-cached cell, now run after cache().
(
    high_value_df
    .groupBy("city")
    .sum("amount")
    .show()
)
(
    high_value_df
    .groupBy("product")
    .avg("amount")
    .show()
)

+---------+-----------+
|     city|sum(amount)|
+---------+-----------+
|Bangalore|      87000|
|   Mumbai|     151000|
|    Delhi|      92000|
+---------+-----------+

+-------+-----------+
|product|avg(amount)|
+-------+-----------+
| Laptop|    52600.0|
| Mobile|    32000.0|
| Tablet|    35000.0|
+-------+-----------+



In [7]:
# Release the cached data once the demo is done. blocking=False is the
# default, spelled out here: do not wait for the blocks to be removed.
high_value_df.unpersist(blocking=False)

DataFrame[order_id: string, city: string, product: string, amount: bigint]

In [10]:
# Rebind `df` to only the orders strictly above 30000.
# NOTE(review): the recorded AttributeError shows `df` was None in the kernel
# when this cell ran, and the execution counts are out of order (In [10]
# precedes In [9]). Re-run the cell that creates `df` first; consider binding
# the result to a new name instead of overwriting `df`.
df = df.where(df["amount"] > 30000)

AttributeError: 'NoneType' object has no attribute 'filter'

In [9]:
# Action: count the rows currently in `df`.
# NOTE(review): the recorded AttributeError shows `df` was None when this ran;
# `df` must be a DataFrame (re-run the cell that creates it) for this to work.
df.count()

AttributeError: 'NoneType' object has no attribute 'count'

Exercise 1

In [1]:

# PySpark setup (once)
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.types import (
    StructType, StructField, StringType, IntegerType, DateType
)

# Session for the capstone exercise; getOrCreate() reuses an already-active
# session if the kernel has one.
spark = (
    SparkSession.builder
    .appName("RetailChainCapstone")
    .getOrCreate()
)

# Provided raw data (simulating CSV/JSON ingestion)
#
# sales_data rows are (txn_id, city, product, category, amount, txn_date, status),
# all as raw strings. The rows are deliberately messy so the cleaning steps
# have something to fix:
#   TXN001 - city has a trailing space ("Delhi ")
#   TXN002 - product has a trailing space, category is lowercase,
#            date is slash-separated ("05/01/2024")
#   TXN003 - category is padded with spaces, date uses "yyyy/MM/dd"-style slashes
#   TXN004 - amount is an empty string, status is "Cancelled"
#   TXN005 - amount is the non-numeric string "invalid"
#   TXN006 - amount is None
#   TXN007 - category is lowercase, date is "09-01-2024"
#   TXN009 - appears twice with identical values (duplicate transaction)
sales_data = [
    ("TXN001","Delhi ","Laptop","Electronics","45000","2024-01-05","Completed"),
    ("TXN002","Mumbai","Mobile ","electronics","32000","05/01/2024","Completed"),
    ("TXN003","Bangalore","Tablet"," Electronics ","30000","2024/01/06","Completed"),
    ("TXN004","Delhi","Laptop","Electronics","","2024-01-07","Cancelled"),
    ("TXN005","Chennai","Mobile","Electronics","invalid","2024-01-08","Completed"),
    ("TXN006","Mumbai","Tablet","Electronics",None,"2024-01-08","Completed"),
    ("TXN007","Delhi","Laptop","electronics","45000","09-01-2024","Completed"),
    ("TXN008","Bangalore","Mobile","Electronics","28000","2024-01-09","Completed"),
    ("TXN009","Mumbai","Laptop","Electronics","55000","2024-01-10","Completed"),
    ("TXN009","Mumbai","Laptop","Electronics","55000","2024-01-10","Completed")
]

# customer_data rows are (customer_id, city, segment).
customer_data = [
    ("C001","Delhi","Premium"),
    ("C002","Mumbai","Standard"),
    ("C003","Bangalore","Premium"),
    ("C004","Chennai","Standard"),
    ("C005","Mumbai","Premium")
]

# city_lookup rows are (city, tier); used later to enrich sales by city.
city_lookup = [
    ("Delhi","Tier-1"),
    ("Mumbai","Tier-1"),
    ("Bangalore","Tier-1"),
    ("Chennai","Tier-2")
]


PHASE 1 — DATA INGESTION & SCHEMA MANAGEMENT

In [2]:

def _all_string_schema(*field_names):
    """Return a StructType of nullable StringType fields, in the given order."""
    return StructType(
        [StructField(field_name, StringType(), True) for field_name in field_names]
    )


# Every sales column is ingested as a string:
#   - amount stays a string so invalid values can be handled gracefully later
#   - txn_date stays a string because several date formats are parsed later
sales_schema = _all_string_schema(
    "txn_id",
    "city",
    "product",
    "category",
    "amount",
    "txn_date",
    "status",
)

customer_schema = _all_string_schema("customer_id", "city", "segment")

city_schema = _all_string_schema("city", "tier")


Load raw data into DataFrames

In [3]:

# Build the three raw DataFrames from the in-memory lists, each with its
# explicit schema so no type inference takes place.
sales_df_raw = spark.createDataFrame(sales_data, sales_schema)
customer_df_raw = spark.createDataFrame(customer_data, customer_schema)
city_df = spark.createDataFrame(city_lookup, city_schema)

# File-based equivalents, if the data came from disk instead:
# sales_df_raw = spark.read.schema(sales_schema).option("mode", "PERMISSIVE").csv("/path/sales.csv")
# customer_df_raw = spark.read.schema(customer_schema).json("/path/customer.json")
# city_df = spark.read.schema(city_schema).csv("/path/city_lookup.csv")


Handle incorrect data types gracefully

In [4]:

# Safe amount parse: accept ONLY values that consist entirely of digits once
# surrounding spaces are trimmed; everything else becomes NULL in amount_int.
#
# Why not strip the non-digit characters and cast what is left? That silently
# invents amounts: "-500" would become 500, "45000.50" would become 4500050
# and "12abc" would become 12. A value that is not a clean whole number must
# be flagged as invalid, not repaired by guesswork.
#
# NULL, "" and "invalid" all fail the pattern (rlike on NULL yields NULL), and
# when() without otherwise() yields NULL for every non-matching row.
_amount_trimmed = F.trim(F.col("amount"))

sales_df_typed = sales_df_raw.withColumn(
    "amount_int",
    F.when(
        _amount_trimmed.rlike(r"^[0-9]+$"),
        _amount_trimmed.cast(IntegerType()),
    ),
)


Identify corrupt and invalid records

PHASE 2 — DATA CLEANING & TRANSFORMATION
5. Trim and normalize string columns

In [1]:
# Date patterns seen in txn_date, tried in this order; the first pattern that
# parses wins.
_TXN_DATE_FORMATS = ("yyyy-MM-dd", "dd/MM/yyyy", "yyyy/MM/dd", "dd-MM-yyyy")

# try_to_timestamp returns NULL (instead of raising) when a value does not
# match the pattern, so coalesce() falls through to the next candidate.
# The format argument must be a Column: a bare Python string is interpreted
# as a column NAME, so each pattern is wrapped in F.lit().
parsed_timestamp = F.coalesce(
    *[
        F.try_to_timestamp(F.col("txn_date"), F.lit(date_format))
        for date_format in _TXN_DATE_FORMATS
    ]
)
parsed_date = parsed_timestamp.cast(DateType())

# Attach the parsed date plus one boolean flag per validation rule:
#   is_invalid_amount - amount could not be parsed into amount_int
#   is_invalid_date   - txn_date matched none of the supported patterns
sales_df_flagged = (
    sales_df_typed
    .withColumn("event_date", parsed_date)
    .withColumn("is_invalid_amount", F.col("amount_int").isNull())
    .withColumn("is_invalid_date", F.col("event_date").isNull())
)

# Split into rows that fail at least one rule and rows that pass both.
invalid_records_df = sales_df_flagged.filter(F.col("is_invalid_amount") | F.col("is_invalid_date"))
valid_records_df   = sales_df_flagged.filter(~(F.col("is_invalid_amount") | F.col("is_invalid_date")))

NameError: name 'F' is not defined

In [7]:

# Free-text columns that get leading/trailing spaces trimmed.
_trimmed_columns = ["txn_id", "city", "product", "category"]

# Columns carried over unchanged, in their output order.
_passthrough_columns = [
    "amount",
    "amount_int",
    "txn_date",
    "event_date",
    "status",
    "is_invalid_amount",
    "is_invalid_date",
]

cleaned_df = sales_df_flagged.select(
    *[F.trim(F.col(name)).alias(name) for name in _trimmed_columns],
    *_passthrough_columns,
)

Convert category to uppercase

In [8]:

# Normalise category in two steps: delete every run of whitespace (inner
# spaces included), then upper-case what remains.
_category_without_whitespace = F.regexp_replace(F.col("category"), r"\s+", "")

cleaned_df = cleaned_df.withColumn(
    "category",
    F.upper(_category_without_whitespace),
)


8. Handle invalid and null amounts

In [9]:

# No rows are dropped here: each row is only tagged with whether amount_int
# holds a value, so rows without one can be filtered out before analytics.
cleaned_df = cleaned_df.withColumn(
    "amount_valid",
    ~F.col("amount_int").isNull(),
)


9. Parse multiple date formats into DateType

In [10]:
# Ensure event_date is typed as a date ("date" is the string form of DateType()).
cleaned_df = cleaned_df.withColumn(
    "event_date",
    F.col("event_date").cast("date"),
)

10. Remove duplicate transactions

In [11]:

# txn_id is assumed to identify a transaction uniquely, so keep one row per
# txn_id. Which of several rows sharing a txn_id survives is up to Spark.
dedup_df = cleaned_df.drop_duplicates(subset=["txn_id"])


11. Keep only Completed transactions

In [12]:

# Rule 1: status equals "COMPLETED" after upper-casing.
_is_completed = F.upper(F.col("status")) == "COMPLETED"

# Rule 2: the parsed amount is present and strictly positive.
_has_positive_amount = F.col("amount_int").isNotNull() & (F.col("amount_int") > 0)

final_sales_df = (
    dedup_df
    .filter(_is_completed)
    .filter(_has_positive_amount)
)


PHASE 3 — DATA ENRICHMENT & JOINS
12. Join sales data with city lookup

In [13]:

# Trim the lookup's city key before joining on it.
city_df_clean = city_df.select(
    F.trim("city").alias("city"),
    "tier",
)

# Left join: every sale is kept, and tier is NULL where the city has no
# entry in the lookup.
sales_enriched_df = final_sales_df.join(city_df_clean, "city", "left")


13. Use broadcast join where appropriate

In [14]:

from pyspark.sql.functions import broadcast

# Same left join on city as the previous cell, but with a broadcast hint on
# the lookup side. This rebinds sales_enriched_df, replacing the un-hinted join.
sales_enriched_df = final_sales_df.join(
    broadcast(city_df_clean),
    "city",
    "left",
)


14. Explain join strategy used

In [15]:
# Print the logical and physical plans. A BroadcastHashJoin is expected in
# the physical plan because the lookup side carries a broadcast hint.
sales_enriched_df.explain(extended=True)

== Parsed Logical Plan ==
'Join UsingJoin(LeftOuter, [city])
:- Filter (isnotnull(amount_int#12) AND (amount_int#12 > 0))
:  +- Filter (upper(status#6) = COMPLETED)
:     +- Deduplicate [txn_id#16]
:        +- Project [txn_id#16, city#17, product#18, category#20, amount#4, amount_int#12, txn_date#5, cast(event_date#13 as date) AS event_date#22, status#6, is_invalid_amount#14, is_invalid_date#15, amount_valid#21]
:           +- Project [txn_id#16, city#17, product#18, category#20, amount#4, amount_int#12, txn_date#5, event_date#13, status#6, is_invalid_amount#14, is_invalid_date#15, isnotnull(amount_int#12) AS amount_valid#21]
:              +- Project [txn_id#16, city#17, product#18, upper(regexp_replace(category#19, \s+, , 1)) AS category#20, amount#4, amount_int#12, txn_date#5, event_date#13, status#6, is_invalid_amount#14, is_invalid_date#15]
:                 +- Project [trim(txn_id#0, None) AS txn_id#16, trim(city#1, None) AS city#17, trim(product#2, None) AS product#18, trim(cate

PHASE 4 — ANALYTICS & WINDOW FUNCTIONS
16. Revenue per city

In [16]:

# One row per city with the sum of its parsed amounts.
revenue_by_city_df = (
    sales_enriched_df
    .groupBy("city")
    .agg(F.sum(F.col("amount_int")).alias("total_revenue"))
)


17. Revenue per product

In [17]:

# One row per product with the sum of its parsed amounts.
revenue_by_product_df = (
    sales_enriched_df
    .groupBy("product")
    .agg(F.sum(F.col("amount_int")).alias("total_revenue"))
)


18. Rank cities by total revenue

In [18]:

# Un-partitioned window: all cities are ranked together, highest revenue
# first. dense_rank gives tied cities the same rank with no gaps after them.
w_city_rank = Window.orderBy(F.col("total_revenue").desc())

city_rank_df = revenue_by_city_df.withColumn(
    "city_rank",
    F.dense_rank().over(w_city_rank),
)



19. Rank products within each city

In [19]:

# Revenue for every (city, product) pair.
revenue_city_product_df = (
    sales_enriched_df
    .groupBy("city", "product")
    .agg(F.sum(F.col("amount_int")).alias("product_revenue"))
)

# Rank products inside each city, highest revenue first.
w_product_in_city = (
    Window
    .partitionBy("city")
    .orderBy(F.col("product_revenue").desc())
)

product_rank_in_city_df = revenue_city_product_df.withColumn(
    "product_rank_in_city",
    F.dense_rank().over(w_product_in_city),
)


20. Identify top-performing city per day



In [21]:

# Revenue for every (event_date, city) pair.
daily_city_revenue_df = sales_enriched_df.groupBy("event_date", "city").agg(
    F.sum("amount_int").alias("daily_revenue")
)

# Within each day, order cities by revenue (highest first). row_number()
# hands out exactly one "1" per day, so when two cities tie on revenue the
# winner would otherwise be arbitrary and could change between runs; the
# secondary sort on city name makes the pick deterministic.
w_top_city_per_day = Window.partitionBy("event_date").orderBy(
    F.desc("daily_revenue"),
    F.asc("city"),
)

# Keep only the first-ranked city per day and drop the helper column.
top_city_per_day_df = daily_city_revenue_df.withColumn(
    "rn", F.row_number().over(w_top_city_per_day)
).filter(F.col("rn") == 1).drop("rn")



21. Identify reusable DataFrames

In [22]:
# 22. Apply caching appropriately
#
# Mark the two DataFrames for caching; cache() is lazy and stores nothing
# by itself.
final_sales_df.cache()
sales_enriched_df.cache()

# Materialize the caches by running an action on each DataFrame.
final_sales_df.count()
sales_enriched_df.count()



DateTimeException: [CANNOT_PARSE_TIMESTAMP] Text '05/01/2024' could not be parsed at index 0. Use `try_to_timestamp` to tolerate invalid input string and return NULL instead. SQLSTATE: 22007