In [2]:
# Load the e-commerce transactions CSV into a Spark DataFrame, using the first
# row of the file as the column names. inferSchema is not set here, so column
# types come from the CSV reader's defaults.
csv_path = "Files/csv/FileOptimization/ecommerce_transactions.csv"
csv_reader = spark.read.format("csv").option("header", "true")
df = csv_reader.load(csv_path)

# Preview only the first five rows instead of rendering the whole DataFrame.
display(df.limit(5))

StatementMeta(, c4357b31-97f4-4b75-8944-8ae097d5d79d, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, b1b4ca53-b651-401e-9898-26a4075dae20)

In [3]:
# Show how many rows the DataFrame loaded from the CSV contains.
csv_row_count = df.count()
display(csv_row_count)

StatementMeta(, c4357b31-97f4-4b75-8944-8ae097d5d79d, 5, Finished, Available, Finished)

5000000

In [6]:
from pyspark.sql import SparkSession

# Bind `spark` to the current SparkSession; `df` must already be defined by an
# earlier cell.
spark = SparkSession.builder.getOrCreate()

# Save df as the Delta table "ElectronicCom". Overwrite mode replaces the
# table's contents if it already exists.
# NOTE(review): the spark.microsoft.delta.stats.* settings are only enabled in a
# later cell, after this write has run -- confirm that ordering is intended.
delta_writer = df.write.format("delta").mode("overwrite")
delta_writer.saveAsTable("ElectronicCom")


StatementMeta(, c4357b31-97f4-4b75-8944-8ae097d5d79d, 8, Finished, Available, Finished)

In [7]:
# Inspect the session's current value for the extended Delta statistics
# collection setting. The bare expression on the last line is what the cell shows.
extended_stats_key = "spark.microsoft.delta.stats.collect.extended"
spark.conf.get(extended_stats_key)

StatementMeta(, c4357b31-97f4-4b75-8944-8ae097d5d79d, 9, Finished, Available, Finished)

'false'

In [8]:
# Switch on both Delta statistics settings for this Spark session, in order:
#   1. extended statistics collection
#   2. statistics injection into the query optimizer
for stats_conf_key in (
    "spark.microsoft.delta.stats.collect.extended",
    "spark.microsoft.delta.stats.injection.enabled",
):
    spark.conf.set(stats_conf_key, "true")

StatementMeta(, c4357b31-97f4-4b75-8944-8ae097d5d79d, 10, Finished, Available, Finished)

In [9]:
%%spark

// scala
// Read the statistics attached to the optimized logical plan of the ElectronicCom table.
val stats = spark.read.table("ElectronicCom").queryExecution.optimizedPlan.stats

// Size in bytes as reported by the plan statistics.
val sizeInBytes = stats.sizeInBytes

// Row count converted to Double; -1.0 is the sentinel when the statistics carry no row count.
val rowCount = stats.rowCount.map(_.toString.toDouble).getOrElse(-1.0)

// Format with zero decimal places so the value prints as a plain integer.
val formattedRowCount = "%.0f".format(rowCount)

println(s"Size in bytes: ${sizeInBytes}")
println(s"Row count: ${formattedRowCount}")

StatementMeta(, c4357b31-97f4-4b75-8944-8ae097d5d79d, 12, Finished, Available, Finished)

Size in bytes: 615000000
Row count: 5000000
stats: org.apache.spark.sql.catalyst.plans.logical.Statistics = Statistics(sizeInBytes=586.5 MiB, rowCount=5.00E+6)
sizeInBytes: BigInt = 615000000
rowCount: Double = 5000000.0
formattedRowCount: String = 5000000


In [10]:
# PySpark version of the plan-statistics lookup for the ElectronicCom table.
df = spark.read.table("ElectronicCom")

# Reach through the underlying JVM DataFrame to the optimized logical plan and
# take its statistics object.
optimized_plan = df._jdf.queryExecution().optimizedPlan()
stats = optimized_plan.stats()

# Size in bytes is always present; the row count is optional, so fall back to
# the string "undefined" when the statistics carry none.
size_in_bytes = stats.sizeInBytes()
row_count_option = stats.rowCount()
if row_count_option.isDefined():
    row_count = row_count_option.get()
else:
    row_count = "undefined"

print(f"Size in bytes: {size_in_bytes}")
print(f"Row count: {row_count}")


StatementMeta(, c4357b31-97f4-4b75-8944-8ae097d5d79d, 13, Finished, Available, Finished)

Size in bytes: 615000000
Row count: 5000000


In [18]:
%%sql
-- Count the rows currently stored in the ElectronicCom table.
SELECT
    COUNT(1)
FROM ElectronicCom;

StatementMeta(, c4357b31-97f4-4b75-8944-8ae097d5d79d, 21, Finished, Available, Finished)

<Spark SQL result set with 1 rows and 1 fields>

In [19]:
%%sql
-- Delete every row whose ProductCategory equals 'Electronics' from ElectronicCom.
DELETE FROM ElectronicCom
WHERE
    ProductCategory = 'Electronics'

StatementMeta(, c4357b31-97f4-4b75-8944-8ae097d5d79d, 22, Finished, Available, Finished)

<Spark SQL result set with 1 rows and 1 fields>

In [20]:
# Re-read the ElectronicCom table and show its current row count.
df = spark.read.table("ElectronicCom")
table_row_count = df.count()
display(table_row_count)

StatementMeta(, c4357b31-97f4-4b75-8944-8ae097d5d79d, 23, Finished, Available, Finished)

4663132

In [21]:
# Look up the plan statistics for ElectronicCom again via PySpark, using a
# fresh read of the table.
df = spark.read.table("ElectronicCom")

# The statistics live on the JVM side: go through the wrapped Java DataFrame to
# the optimized logical plan.
jvm_optimized_plan = df._jdf.queryExecution().optimizedPlan()
stats = jvm_optimized_plan.stats()

# Extract the two metrics. The row count is optional, so use the string
# "undefined" when it is absent.
size_in_bytes = stats.sizeInBytes()
maybe_row_count = stats.rowCount()
row_count = "undefined"
if maybe_row_count.isDefined():
    row_count = maybe_row_count.get()

print(f"Size in bytes: {size_in_bytes}")
print(f"Row count: {row_count}")


StatementMeta(, c4357b31-97f4-4b75-8944-8ae097d5d79d, 24, Finished, Available, Finished)

Size in bytes: 568902104
Row count: 4663132
