# Broadcast Join

In [1]:
import findspark
# Called before the pyspark imports in the next cell.
# NOTE(review): presumably this locates the local Spark installation and makes
# `pyspark` importable -- confirm against the findspark docs for this environment.
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [3]:
# Session shared by every cell below: obtained through the builder's
# getOrCreate() under the application name "Broadcasting Example".
spark = (
    SparkSession.builder
    .appName("Broadcasting Example")
    .getOrCreate()
)

In [None]:


# Small lookup table: one (category, discount) row per product category.
category_data = [
    ("A", 10),
    ("B", 20),
    ("C", 30),
]
category_df = spark.createDataFrame(
    category_data,
    schema=["category", "discount"],
)

In [None]:
# Explicit import of `broadcast`; the wildcard import from pyspark.sql.functions
# in an earlier cell also provides it, but naming it here makes the dependency visible.
from pyspark.sql.functions import broadcast

# Wrap the small category lookup table with broadcast(); the wrapped DataFrame is
# what the join cell below passes to sales_df.join(...).
# NOTE(review): broadcast() is understood to only mark the DataFrame as a
# broadcast-join candidate, with no job run at this point -- confirm in the PySpark docs.
broadcast_category_df = broadcast(category_df)

In [None]:
# Sales rows: (product, category, revenue). This plays the role of the "large"
# side of the join; in this demo it has only three rows.
sales_data = [
    ("product1", "A", 100),
    ("product2", "B", 200),
    ("product3", "C", 300),
]
sales_df = spark.createDataFrame(
    sales_data,
    schema=["product", "category", "revenue"],
)

# Inner equi-join on the shared "category" column against the broadcast-wrapped
# lookup table. how="inner" spells out the join type used when `how` is omitted.
result_df = sales_df.join(
    broadcast_category_df,
    on="category",
    how="inner",
)

# Window Functions

In [14]:
# Sample rows for the window-function examples: (ID, Category, Value).
# Category "A" has four rows (two of them share Value 150); category "B" has three.
data = [
    (1, "A", 100),
    (1, "B", 200),
    (2, "A", 150),
    (2, "B", 250),
    (3, "A", 120),
    (3, "B", 180),
    (4, "A", 150),
]
columns = ["ID", "Category", "Value"]

df = spark.createDataFrame(data, schema=columns)
df.show()

+---+--------+-----+
| ID|Category|Value|
+---+--------+-----+
|  1|       A|  100|
|  1|       B|  200|
|  2|       A|  150|
|  2|       B|  250|
|  3|       A|  120|
|  3|       B|  180|
|  4|       A|  150|
+---+--------+-----+



In [15]:
# `Window` is imported here and reused by the ranking and lag cells further down.
from pyspark.sql.window import Window

# Window specification that groups rows by their "Category" value.
# No ordering is specified on this window.
window_spec = Window.partitionBy("Category")
# Bare expression: the cell output is the WindowSpec object's repr.
window_spec

<pyspark.sql.window.WindowSpec at 0x25626a5e590>

In [16]:
# Per-category sum and average of "Value", attached to every row via window_spec
# (partitioned by Category), so the row count is unchanged.
#
# The aggregates are referenced through the `F` alias on purpose: a bare
# `sum(...)` only means Spark's aggregate while the wildcard import from
# pyspark.sql.functions is shadowing Python's builtin `sum`. Qualifying the
# calls removes that hidden dependency on import order / kernel state.
from pyspark.sql import functions as F

df_with_aggregates = (
    df
    .withColumn("SumValue", F.sum("Value").over(window_spec))
    .withColumn("AvgValue", F.avg("Value").over(window_spec))
)

df_with_aggregates.show()

+---+--------+-----+--------+--------+
| ID|Category|Value|SumValue|AvgValue|
+---+--------+-----+--------+--------+
|  1|       A|  100|     520|   130.0|
|  2|       A|  150|     520|   130.0|
|  3|       A|  120|     520|   130.0|
|  4|       A|  150|     520|   130.0|
|  1|       B|  200|     630|   210.0|
|  2|       B|  250|     630|   210.0|
|  3|       B|  180|     630|   210.0|
+---+--------+-----+--------+--------+



In [18]:
from pyspark.sql.functions import col, row_number

# Window for numbering rows inside each Category, highest Value first.
# "ID" is added as a secondary sort key: category A contains two rows with
# Value 150, and without a tie-breaker the order of tied rows (and therefore
# which one gets 1 vs 2) is not deterministic between runs.
ranking_window = Window.partitionBy("Category").orderBy(
    col("Value").desc(),
    col("ID").asc(),
)

# row_number() assigns consecutive positions 1, 2, 3, ... within each partition;
# tied Values still receive distinct numbers (this is not a tie-aware rank).
ranked_df = df.withColumn("Rank", row_number().over(ranking_window))

# Show the numbered DataFrame
ranked_df.show()

+---+--------+-----+----+
| ID|Category|Value|Rank|
+---+--------+-----+----+
|  2|       A|  150|   1|
|  4|       A|  150|   2|
|  3|       A|  120|   3|
|  1|       A|  100|   4|
|  2|       B|  250|   1|
|  1|       B|  200|   2|
|  3|       B|  180|   3|
+---+--------+-----+----+



In [19]:
# Per-category window ordered by ID, which defines what "the previous row" means.
lag_window = Window.partitionBy("Category").orderBy("ID")

# lag("Value", 1) reads Value from the row one position earlier in the window;
# the first row of each category has no predecessor and shows NULL.
lagged_df = df.withColumn(
    "PreviousValue",
    lag("Value", 1).over(lag_window),
)

# Display the result with the lagged column
lagged_df.show()

+---+--------+-----+-------------+
| ID|Category|Value|PreviousValue|
+---+--------+-----+-------------+
|  1|       A|  100|         NULL|
|  2|       A|  150|          100|
|  3|       A|  120|          150|
|  4|       A|  150|          120|
|  1|       B|  200|         NULL|
|  2|       B|  250|          200|
|  3|       B|  180|          250|
+---+--------+-----+-------------+

