In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf

# Access SparkContext
sc = SparkContext.getOrCreate()

# Example: Print executor memory and cores
print("Executor Memory:", sc._conf.get("spark.executor.memory"))
print("Executor Cores:", sc._conf.get("spark.executor.cores"))
print("Executor Instances:", sc._conf.get("spark.executor.instances"))

StatementMeta(, 0ccc8d4b-1f2d-492c-9bea-11265d0fbf7a, 3, Finished, Available, Finished)

Executor Memory: 56g
Executor Cores: 8
Executor Instances: 1


#### Read Ecommerce csv file

In [1]:
df = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("Files/csv/FileOptimization/ecommerce_transactions.csv")

# Display the DataFrame
print(df.count())
display(df.limit(5))

StatementMeta(, a8475281-590d-44f3-9820-a4401fb64088, 3, Finished, Available, Finished)

5000000


SynapseWidget(Synapse.DataFrame, 4b09d514-6e2f-456e-ab13-c99506745223)

#### Counting number of transaction by Product Category

In [9]:
from pyspark.sql.functions import col

trans_prod_cat = df.groupBy("ProductCategory") \
    .count() \
    .withColumnRenamed("count", "NbrOfTransaction") \
    .orderBy(col("NbrOfTransaction").desc())

display(trans_prod_cat)

StatementMeta(, 4d6ff630-3c64-4201-9a01-a4a8d30a0833, 11, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 54df5ff8-88e0-4691-ad03-199fcfc133d8)

### Exploring Query Plans in PySpark: A Guide to **.explain()** Modes
```python
df.explain()           # Basic physical plan
df.explain(True)       # Extended plan: logical + physical
df.explain(mode="simple")       # Same as df.explain()
df.explain(mode="extended")     # Same as df.explain(True)
df.explain(mode="codegen")      # Shows generated Java code
df.explain(mode="cost")         # Shows estimated cost (if available)
df.explain(mode="formatted")    # Nicely formatted plan
```

### ‚ö†Ô∏è Downsides of Reading a Single Huge File in Apache Spark
While reading from a single file may seem straightforward, it can introduce significant performance and scalability challenges when the file is large. Here's why:

‚ùå Performance & Scalability Issues
- No Partition Pruning: Spark must scan the entire file, even if only a subset of the data is needed.
- Limited Parallelism: A single large file may not be split efficiently across executors, reducing Spark‚Äôs ability to parallelize the workload.
- High Memory Usage: Large files can overwhelm executor memory, leading to disk spills or job failures.
- Longer Job Duration: Without partitioning, Spark performs more I/O and shuffle operations, increasing execution time.
- Shuffle Bottlenecks: Aggregations and joins on unpartitioned data often require expensive shuffles across the cluster.
- Checkpointing & Recovery Overhead: In case of failure, Spark may need to reprocess the entire file, increasing recovery time and cost.

In [3]:
from pyspark.sql.functions import year

# Add a Year column extracted from Timestamp
df_with_year = df.withColumn("Year", year(df["Timestamp"]))

# Filter and aggregate
df_with_year.filter(df_with_year.ProductCategory == "Electronics") \
    .groupBy("Year") \
    .agg({"Amount": "sum"}) \
    .explain(mode="cost")         # Shows estimated cost (if available)

StatementMeta(, 4d6ff630-3c64-4201-9a01-a4a8d30a0833, 5, Finished, Available, Finished)

== Optimized Logical Plan ==
Aggregate [Year#833], [Year#833, sum(Amount#746) AS sum(Amount)#849], Statistics(sizeInBytes=49.1 MiB)
+- Project [Amount#746, year(cast(Timestamp#748 as date)) AS Year#833], Statistics(sizeInBytes=49.1 MiB)
   +- Filter (isnotnull(ProductCategory#745) AND (ProductCategory#745 = Electronics)), Statistics(sizeInBytes=245.4 MiB)
      +- Relation [TransactionID#743,UserID#744,ProductCategory#745,Amount#746,Country#747,Timestamp#748] csv, Statistics(sizeInBytes=245.4 MiB)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[Year#833], functions=[sum(Amount#746)], output=[Year#833, sum(Amount)#849])
   +- Exchange hashpartitioning(Year#833, 200), ENSURE_REQUIREMENTS, [plan_id=368]
      +- HashAggregate(keys=[Year#833], functions=[partial_sum(Amount#746)], output=[Year#833, sum#854])
         +- Project [Amount#746, year(cast(Timestamp#748 as date)) AS Year#833]
            +- Filter (isnotnull(ProductCategory#745) AND (ProductCategor

### üöÄ Optimizing Apache Spark with File Partitioning
Partitioning is a powerful technique in Apache Spark that improves performance by reducing data shuffling and enabling efficient query execution. Here's how and when to use it:

‚úÖ **Why Use File Partitioning?**
- Improves read performance by skipping irrelevant partitions.
- Reduces shuffle operations during joins and aggregations.
- Enables partition pruning, which minimizes I/O.
- Organizes data for scalable storage and processing.

let us partition the efile

In [4]:
from pyspark.sql.functions import col

csv_file_src = "Files/csv/FileOptimization/ecommerce_transactions.csv"
csv_file_partition = "Files/csv/FileOptimization/ecom"

# Read the CSV file
df = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load(csv_file_src)

# Write partitioned CSV files by ProductCategory
df.write \
    .format("csv") \
    .option("header", "true") \
    .partitionBy("ProductCategory") \
    .mode("overwrite") \
    .save(csv_file_partition)


StatementMeta(, 4d6ff630-3c64-4201-9a01-a4a8d30a0833, 6, Finished, Available, Finished)

#### Read the partition csv file

In [6]:
from pyspark.sql.functions import year

# Path to the partitioned CSV folder
csv_file_partition = "Files/csv/FileOptimization/ecom"

# Read the partitioned data
df_partitioned = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load(csv_file_partition)

# Add Year column from Timestamp
df_with_year = df_partitioned.withColumn("Year", year(df_partitioned["Timestamp"]))

# Filter and aggregate for Electronics
electronics_sales_by_year = df_with_year.filter(df_with_year.ProductCategory == "Electronics") \
    .groupBy("Year") \
    .agg({"Amount": "sum"}) \
    .explain(mode="cost")         # Shows estimated cost (if available)

# Display the result
# display(electronics_sales_by_year)

StatementMeta(, 4d6ff630-3c64-4201-9a01-a4a8d30a0833, 8, Finished, Available, Finished)

== Optimized Logical Plan ==
Aggregate [Year#979], [Year#979, sum(Amount#969) AS sum(Amount)#996], Statistics(sizeInBytes=40.5 MiB)
+- Project [Amount#969, year(cast(Timestamp#971 as date)) AS Year#979], Statistics(sizeInBytes=40.5 MiB)
   +- Filter (isnotnull(ProductCategory#972) AND (ProductCategory#972 = Electronics)), Statistics(sizeInBytes=202.4 MiB)
      +- Relation [TransactionID#967,UserID#968,Amount#969,Country#970,Timestamp#971,ProductCategory#972] csv, Statistics(sizeInBytes=202.4 MiB)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[Year#979], functions=[sum(Amount#969)], output=[Year#979, sum(Amount)#996])
   +- Exchange hashpartitioning(Year#979, 200), ENSURE_REQUIREMENTS, [plan_id=485]
      +- HashAggregate(keys=[Year#979], functions=[partial_sum(Amount#969)], output=[Year#979, sum#1001])
         +- Project [Amount#969, year(cast(Timestamp#971 as date)) AS Year#979]
            +- FileScan csv [Amount#969,Timestamp#971,ProductCategory#97

In [3]:
spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true")
spark.conf.set("spark.databricks.delta.optimizeWrite.binSize", "128MB")

StatementMeta(, a8475281-590d-44f3-9820-a4401fb64088, 5, Finished, Available, Finished)

In [2]:
spark.conf.get("spark.microsoft.delta.optimize.fast.enabled")

StatementMeta(, a8475281-590d-44f3-9820-a4401fb64088, 4, Finished, Available, Finished)

'false'

In [17]:
spark.conf.set("spark.microsoft.delta.optimize.fast.enabled", True)

StatementMeta(, 08c19f44-fe41-4ea1-987a-2fc5d90d86d8, 19, Finished, Available, Finished)

In [4]:
from notebookutils import mssparkutils

# Path to the folder
folder_path = "Files/csv/FileOptimization/ecom"

# Delete the folder recursively
mssparkutils.fs.rm(folder_path, recurse=True)

StatementMeta(, a8475281-590d-44f3-9820-a4401fb64088, 6, Finished, Available, Finished)

True

In [5]:
from pyspark.sql.functions import col, to_date

csv_file_src = "Files/csv/FileOptimization/ecommerce_transactions.csv"
csv_file_partition = "Files/csv/FileOptimization/ecom"

# Read the CSV file
df = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load(csv_file_src)

# Extract date from Timestamp column
df = df.withColumn("TransactionDate", to_date(col("Timestamp")))

# Write partitioned CSV files by ProductCategory and TransactionDate
df.write \
    .format("csv") \
    .option("header", "true") \
    .partitionBy("ProductCategory", "TransactionDate") \
    .mode("overwrite") \
    .save(csv_file_partition)

StatementMeta(, a8475281-590d-44f3-9820-a4401fb64088, 7, Finished, Available, Finished)

In [24]:
from pyspark.sql.functions import col

csv_file_src = "Files/csv/FileOptimization/ecommerce_transactions.csv"
csv_file_partition = "Files/csv/FileOptimization/ecom"

# Read the CSV file
df = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load(csv_file_src)

# Write partitioned CSV files by ProductCategory
df.write \
    .format("csv") \
    .option("header", "true") \
    .partitionBy("ProductCategory") \
    .mode("overwrite") \
    .save(csv_file_partition)


StatementMeta(, 08c19f44-fe41-4ea1-987a-2fc5d90d86d8, 26, Finished, Available, Finished)

In [7]:
from pyspark.sql.functions import year

# Path to the partitioned CSV folder
csv_file_partition = "Files/csv/FileOptimization/ecom"

# Read the partitioned data
df_partitioned = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load(csv_file_partition)

# Add Year column from Timestamp
df_with_year = df_partitioned.withColumn("Year", year(df_partitioned["Timestamp"]))

# Filter and aggregate for Electronics
electronics_sales_by_year = df_with_year.filter(df_with_year.ProductCategory == "Electronics") \
    .groupBy("Year") \
    .agg({"Amount": "sum"}) \
    .explain(mode="cost")         # Shows estimated cost (if available)

# Display the result
# display(electronics_sales_by_year)

StatementMeta(, a8475281-590d-44f3-9820-a4401fb64088, 9, Finished, Available, Finished)

== Optimized Logical Plan ==
Aggregate [Year#944], [Year#944, sum(Amount#932) AS sum(Amount)#963], Statistics(sizeInBytes=40.1 MiB)
+- Project [Amount#932, year(cast(Timestamp#934 as date)) AS Year#944], Statistics(sizeInBytes=40.1 MiB)
   +- Filter (isnotnull(ProductCategory#935) AND (ProductCategory#935 = Electronics)), Statistics(sizeInBytes=202.4 MiB)
      +- Relation [TransactionID#930,UserID#931,Amount#932,Country#933,Timestamp#934,ProductCategory#935,TransactionDate#936] csv, Statistics(sizeInBytes=202.4 MiB)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[Year#944], functions=[sum(Amount#932)], output=[Year#944, sum(Amount)#963])
   +- Exchange hashpartitioning(Year#944, 200), ENSURE_REQUIREMENTS, [plan_id=408]
      +- HashAggregate(keys=[Year#944], functions=[partial_sum(Amount#932)], output=[Year#944, sum#968])
         +- Project [Amount#932, year(cast(Timestamp#934 as date)) AS Year#944]
            +- FileScan csv [Amount#932,Timestamp#934

In [15]:
value = spark.sparkContext.getConf().get("spark.microsoft.delta.optimize.fast.enabled", "not set")
print(f"Config value: {value}")

StatementMeta(, 08c19f44-fe41-4ea1-987a-2fc5d90d86d8, 17, Finished, Available, Finished)

Config value: not set


In [21]:
# List all configs from SparkContext
all_configs = spark.sparkContext.getConf().getAll()

# Filter for keys starting with 'spark.microsoft.delta'
delta_configs = [item for item in all_configs if item[0].startswith("spark.microsoft.delta")]

display(delta_configs)

StatementMeta(, 08c19f44-fe41-4ea1-987a-2fc5d90d86d8, 23, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, e1f48fd7-8e1d-4ced-b37d-c7919622a919)

In [28]:
from pyspark.sql.functions import col

csv_file_src = "Files/csv/FileOptimization/ecommerce_transactions.csv"
csv_file_partition = "Files/csv/FileOptimization/ecom"

# Read the CSV file
df = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load(csv_file_src)

# Write partitioned CSV files by ProductCategory
df.write \
    .format("csv") \
    .option("header", "true") \
    .partitionBy("ProductCategory") \
    .mode("overwrite") \
    .save(csv_file_partition)


StatementMeta(, 0ccc8d4b-1f2d-492c-9bea-11265d0fbf7a, 32, Finished, Available, Finished)

In [5]:
total_sales_by_year = df.filter(col("ProductCategory") == "Electronics")
total_sales_by_year.show()

StatementMeta(, 0ccc8d4b-1f2d-492c-9bea-11265d0fbf7a, 7, Finished, Available, Finished)

+-------------+------+---------------+------+---------+---------------+
|TransactionID|UserID|ProductCategory|Amount|  Country|      Timestamp|
+-------------+------+---------------+------+---------+---------------+
|            1| U0913|    Electronics|372.07|   Canada|  3/6/2023 0:48|
|           37| U1962|    Electronics|124.23|   France|12/17/2021 3:46|
|           52| U0807|    Electronics| 204.3|       UK| 7/26/2025 2:05|
|           63| U0469|    Electronics|294.17|   France|  6/5/2024 7:59|
|           64| U1290|    Electronics|480.59|      USA|5/25/2022 17:37|
|           75| U4334|    Electronics|335.59|   France|8/27/2022 12:45|
|           83| U4595|    Electronics| 60.38|    India| 8/31/2023 6:08|
|           84| U4470|    Electronics|418.12|   France|5/23/2024 17:09|
|           94| U1827|    Electronics|331.72|  Germany| 7/22/2022 5:39|
|           97| U2713|    Electronics| 62.09|       UK|  9/5/2022 4:16|
|          101| U2087|    Electronics|355.85|      USA|8/30/2024

In [4]:
from pyspark.sql.functions import col, year, sum as _sum

csv_file = "Files/csv/FileOptimization/ecommerce_transactions.csv"

df = spark.read.format("csv").option("header","true").load(csv_file)
# Aggregate total sales by year for Electronics
total_sales_by_year = df.filter(col("ProductCategory") == "Electronics") \
    .withColumn("Year", year(col("Timestamp"))) \
    .groupBy("Year") \
    .agg(_sum("Amount").alias("TotalSales")) \
    .orderBy("Year")

# Show the result
total_sales_by_year.show()

StatementMeta(, 0ccc8d4b-1f2d-492c-9bea-11265d0fbf7a, 6, Finished, Available, Finished)

+----+------------------+
|Year|        TotalSales|
+----+------------------+
|NULL|1695943.5799999996|
+----+------------------+



In [11]:
from pyspark.sql.functions import col

trans_prod_cat = df.groupBy(["ProductCategory", "Timestamp"]) \
    .count() \
    .withColumnRenamed("count", "NbrOfTransaction") \
    .orderBy(col("NbrOfTransaction").desc())

display(trans_prod_cat["Timestamp==5/26/2022 16:39"])

StatementMeta(, 4d6ff630-3c64-4201-9a01-a4a8d30a0833, 13, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, c4b0b71d-e13a-4f1e-bf7e-82d20ffab42b)

In [17]:
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

from pyspark.sql.functions import col, to_date

# Convert Timestamp to date format and filter
filtered_df = trans_prod_cat.filter(
    to_date(col("Timestamp"), "M/d/yyyy") == "2022-05-26"
)

# Display the result
display(filtered_df)

StatementMeta(, 4d6ff630-3c64-4201-9a01-a4a8d30a0833, 19, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 6d19e3f1-fe84-4354-b286-719f52008d55)