In [0]:
import pyspark.sql.functions as F

# Source tables read by this cell (customers is loaded but not used here).
orders = spark.table("workspace.default.silver_orders")
customers = spark.table("workspace.default.silver_customers")

# Row-level steps only: a projection, a filter and a derived column.
# Nothing in this chain groups, joins or sorts the data.
order_columns = ["CustomerID", "OrderId", "Amount", "OrderDate"]
amount_at_least_50 = F.col("Amount") >= 50
doubled_amount = F.col("Amount") * 2

narrow_df = (
    orders
    .select(*order_columns)
    .where(amount_at_least_50)
    .withColumn("Amount2", doubled_amount)
)

# Print the logical and physical plans, then render the rows.
narrow_df.explain(extended=True)
display(narrow_df)

In [0]:
# Sum of Amount per CustomerId, exposed as TotalAmount.
total_amount = F.sum("Amount").alias("TotalAmount")
gb = orders.groupBy("CustomerId").agg(total_amount)

# Print the logical and physical plans for the aggregation, then render the rows.
gb.explain(extended=True)
display(gb)

In [0]:
# spark.sql.shuffle.partitions is a session setting that controls how many
# partitions a shuffle produces; it is NOT the partition count of `orders`,
# so the label names the setting explicitly instead of "Current partitions".
print("spark.sql.shuffle.partitions:", spark.conf.get("spark.sql.shuffle.partitions"))

# Plan of the table as read, for comparison with the coalesced plan below.
orders.explain(True)

# coalesce(2) asks for at most 2 partitions; per the PySpark docs it is a
# narrow dependency, i.e. it merges partitions without a full shuffle.
orders_rep = orders.coalesce(2)
orders_rep.explain(True)
display(orders_rep)


In [0]:
import pyspark.sql.functions as F
import time

orders = spark.table("workspace.default.silver_orders")
customers = spark.table("workspace.default.silver_customers")

# Orders left-joined to each customer's Country, then order count and revenue
# per Country. Used below as the workload whose runtime is measured.
heavy = (orders
  .join(customers.select("CustomerId", "Country"), "CustomerId", "left")
  .groupBy("Country")
  .agg(
      F.count("*").alias("TotalOrders"),
      F.sum("Amount").alias("TotalRevenue")
  )
)

# --- Baseline: two actions without caching --------------------------------
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
t0 = time.perf_counter()
heavy.count()   # action 1 (triggers full compute)
t1 = time.perf_counter()

heavy.count()   # action 2 (recomputes again)
t2 = time.perf_counter()

print("No cache - Run1 seconds:", round(t1 - t0, 3))
print("No cache - Run2 seconds:", round(t2 - t1, 3))


# --- Same two actions with the result cached ------------------------------
heavy_cached = heavy.cache()
try:
    t0 = time.perf_counter()
    heavy_cached.count()           # materialize cache
    t1 = time.perf_counter()

    heavy_cached.count()           # should be faster
    t2 = time.perf_counter()

    print("With cache - Materialize seconds:", round(t1 - t0, 3))
    print("With cache - Run2 seconds:", round(t2 - t1, 3))
finally:
    # Release the cached data. Spark matches cached data by query plan, so a
    # cache left behind here would also serve the "No cache" measurements the
    # next time this cell is run, making the comparison meaningless.
    heavy_cached.unpersist()


In [0]:
import pyspark.sql.functions as F

orders = spark.table("workspace.default.silver_orders")
customers = (
    spark.table("workspace.default.silver_customers")
    .select("CustomerId", "Country")
)

# Plain left join on CustomerId with no join hint; print its plans.
j_normal = orders.join(customers, on="CustomerId", how="left")
j_normal.explain(extended=True)

# Disabled variant with an explicit broadcast hint on the customers side,
# kept for comparing the two plans:
# j_broadcast = orders.join(F.broadcast(customers), "CustomerId", "left")
# j_broadcast.explain(True)

In [0]:
import pyspark.sql.functions as F

orders = spark.table("workspace.default.silver_orders")

# Number of order rows per CustomerId, largest groups first.
rows_per_key = F.count("*").alias("cnt")
key_counts = (
    orders
    .groupBy("CustomerId")
    .agg(rows_per_key)
    .orderBy(F.desc("cnt"))
)
display(key_counts)

# Total row count, used as the denominator for each key's share.
total = orders.count()

# Each key's share of all rows, as a percentage rounded to 2 decimals.
share_pct = F.round(F.col("cnt") / F.lit(total) * 100, 2)
skew_report = key_counts.withColumn("share_pct", share_pct)
display(skew_report)

In [0]:

# Number of salt buckets.
salt_n = 10

# Tag every order row with a random integer salt in 0 .. salt_n - 1
# (rand() is scaled by salt_n and truncated by the int cast).
salt_bucket = (F.rand() * salt_n).cast("int")
fact_salted = orders.withColumn("salt", salt_bucket)
display(fact_salted)

# One row per salt value, with the range's "id" column renamed to "salt".
salts = spark.range(salt_n).withColumnRenamed("id", "salt")
display(salts)

# Disabled follow-up: replicate the customers rows once per salt value.
# dim_salted = customers.crossJoin(salts)
# display(dim_salted)
