# Generate sample orders.csv (large table)

In [24]:
import os

import pandas as pd
import numpy as np

# Build a synthetic "orders" table (order_id, customer_id, amount) and write
# it to a CSV file that plays the role of the large side of the join.

# Generate dummy data
num_rows = 5_000_000  # Approximate size to get ~80MB (depends on data types)
np.random.seed(42)  # fixed seed so re-running the cell yields the same table

data = {
    "order_id": np.arange(1, num_rows + 1),                     # 1..num_rows, unique
    "customer_id": np.random.randint(100, 200, size=num_rows),  # ids in [100, 200)
    "amount": np.random.randint(10, 1000, size=num_rows)        # amounts in [10, 1000)
}

df = pd.DataFrame(data)

# Save as CSV
file_path = "SampleData/orders.csv"
# DataFrame.to_csv does not create missing parent directories, so make sure the
# target folder exists; otherwise the cell fails on a fresh checkout.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
df.to_csv(file_path, index=False)

# Last expression of the cell: displays the output path and the table shape.
file_path, df.shape

('SampleData/orders.csv', (5000000, 3))

# Generate sample customers.csv (small table)

In [26]:
import os
import random
import numpy as np
import pandas as pd

# Build a synthetic "customers" table (customer_id, name) and write it to a
# CSV file that plays the role of the small side of the join.

# Generate ~50k rows to get around ~1MB CSV
num_rows_customers = 50_000

# The names below come from the stdlib `random` module, which is independent
# of NumPy's generator; seed it so re-running the cell yields the same table.
random.seed(42)

# Some meaningful first names and last names
first_names = ["Alice", "Bob", "Charlie", "David", "Eva", "Frank", "Grace", "Helen", "Ivy", "Jack"]
last_names = ["Smith", "Johnson", "Brown", "Taylor", "Anderson", "Thomas", "Jackson", "White", "Harris", "Martin"]

# Generate random customer data
customer_ids = np.arange(100, 100 + num_rows_customers)  # consecutive ids starting at 100
names = [f"{random.choice(first_names)} {random.choice(last_names)}" for _ in range(num_rows_customers)]

customers_df_dummy = pd.DataFrame({
    "customer_id": customer_ids,
    "name": names
})

# Save to CSV
customers_file_path = "SampleData/customers.csv"
# DataFrame.to_csv does not create missing parent directories, so make sure the
# target folder exists; otherwise the cell fails on a fresh checkout.
os.makedirs(os.path.dirname(customers_file_path), exist_ok=True)
customers_df_dummy.to_csv(customers_file_path, index=False)

# Last expression of the cell: displays the path, the shape and a preview.
customers_file_path, customers_df_dummy.shape, customers_df_dummy.head()


('SampleData/customers.csv',
 (50000, 2),
    customer_id          name
 0          100  Bob Anderson
 1          101  Alice Thomas
 2          102  Frank Harris
 3          103   Frank Brown
 4          104    Jack Smith)

# PySpark DataFrames built in memory (without reading the local CSV files)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import numpy as np
import random
import pandas as pd
# Import `broadcast` explicitly rather than through a wildcard:
# `from pyspark.sql.functions import *` would rebind Python builtins such as
# `round`, `sum`, `min` and `max` to Spark column functions in every later cell.
from pyspark.sql.functions import broadcast

# Start Spark
spark = SparkSession.builder.appName("JoinExample").getOrCreate()

# -------------------------------
# Create Orders DataFrame (~80MB)
# -------------------------------
num_rows_orders = 5_000_000
np.random.seed(42)  # fixed seed so the generated orders are reproducible

# .tolist() converts NumPy integers into plain Python ints before the rows are
# handed to Spark together with the IntegerType schema below.
orders_data = list(zip(
    np.arange(1, num_rows_orders + 1).tolist(),
    np.random.randint(100, 200, size=num_rows_orders).tolist(),
    np.random.randint(10, 1000, size=num_rows_orders).tolist()
))

orders_schema = StructType([
    StructField("order_id", IntegerType(), True),
    StructField("customer_id", IntegerType(), True),
    StructField("amount", IntegerType(), True)
])

orders_df = spark.createDataFrame(orders_data, schema=orders_schema)
print(f"Orders DataFrame rows: {orders_df.count()}")
orders_df.show(5)

# -------------------------------
# Create Customers DataFrame (~1MB)
# -------------------------------
num_rows_customers = 50_000

# Names are drawn with the stdlib `random` module, which NumPy's seed does not
# affect; seed it as well so the customers table is reproducible too.
random.seed(42)

first_names = ["Alice", "Bob", "Charlie", "David", "Eva", "Frank", "Grace", "Helen", "Ivy", "Jack"]
last_names = ["Smith", "Johnson", "Brown", "Taylor", "Anderson", "Thomas", "Jackson", "White", "Harris", "Martin"]

customer_ids = list(range(100, 100 + num_rows_customers))  # plain Python ints
names = [f"{random.choice(first_names)} {random.choice(last_names)}" for _ in range(num_rows_customers)]

customers_data = list(zip(customer_ids, names))

customers_schema = StructType([
    StructField("customer_id", IntegerType(), True),
    StructField("name", StringType(), True)
])

customers_df = spark.createDataFrame(customers_data, schema=customers_schema)
print(f"Customers DataFrame rows: {customers_df.count()}")
customers_df.show(5)



Orders DataFrame rows: 5000000
+--------+-----------+------+
|order_id|customer_id|amount|
+--------+-----------+------+
|       1|        151|   142|
|       2|        192|   165|
|       3|        114|   827|
|       4|        171|   765|
|       5|        160|   211|
+--------+-----------+------+
only showing top 5 rows
Customers DataFrame rows: 50000
+-----------+-------------+
|customer_id|         name|
+-----------+-------------+
|        100|  Helen Brown|
|        101|Helen Jackson|
|        102|  Ivy Jackson|
|        103|  Grace White|
|        104|  Frank White|
+-----------+-------------+
only showing top 5 rows


In [3]:
# Perform a shuffle join
import time

# Disable auto broadcast join
# (with the threshold left at its default Spark is smart enough to broadcast
# the small table automatically). This cell does not restore the setting.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-run.
start = time.perf_counter()

# Trigger Spark action
shuffleJoin_df = orders_df.join(customers_df, on="customer_id", how="inner")
shuffleJoin_df.count()   # count is better than show() for timing because show() only pulls first 20 rows

end = time.perf_counter()

print(f"shuffle Join execution time: {end - start:.2f} seconds")


shuffle Join execution time: 23.04 seconds


In [None]:
# Force broadcast join
import time

# Import the hint function in the cell that uses it, so the cell does not
# depend on a name leaking in from another cell's wildcard import.
from pyspark.sql.functions import broadcast

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-run.
start = time.perf_counter()

# broadcast() marks customers_df with an explicit broadcast hint for this join.
broadcast_join_df = orders_df.join(broadcast(customers_df), on="customer_id", how="inner")

broadcast_join_df.count()  # action that actually executes the join
end = time.perf_counter()

print(f"Broadcast Join execution time: {end - start:.2f} seconds")

Broadcast Join execution time: 10.49 seconds


# Vary num_rows_orders between 1M and 10M
# Vary num_rows_customers between 10k and 100k
# Measure the time taken by the two join strategies (shuffle and broadcast)

In [8]:
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import broadcast
import numpy as np
import random
import time
import matplotlib.pyplot as plt
import pandas as pd

# Start Spark
# NOTE(review): getOrCreate() presumably hands back the session created by an
# earlier cell when one is still running, in which case this appName would not
# take effect — confirm against the SparkSession.builder documentation.
spark = SparkSession.builder.appName("JoinBenchmark").getOrCreate()

def generate_orders(spark, num_rows_orders):
    """Create a Spark DataFrame of synthetic orders.

    Args:
        spark: Session object whose ``createDataFrame`` builds the result.
        num_rows_orders: Number of order rows to generate.

    Returns:
        A DataFrame with integer columns ``order_id`` (1..num_rows_orders),
        ``customer_id`` (random, in [100, 200)) and ``amount`` (random, in
        [10, 1000)). Random values come from NumPy's global generator.
    """
    # Column values are converted to plain Python ints via .tolist().
    order_ids = np.arange(1, num_rows_orders + 1).tolist()
    customer_refs = np.random.randint(100, 200, size=num_rows_orders).tolist()
    amounts = np.random.randint(10, 1000, size=num_rows_orders).tolist()
    rows = list(zip(order_ids, customer_refs, amounts))

    # All three columns are nullable integers.
    column_names = ("order_id", "customer_id", "amount")
    schema = StructType(
        [StructField(column, IntegerType(), True) for column in column_names]
    )

    return spark.createDataFrame(rows, schema=schema)


def generate_customers(spark, num_rows_customers):
    """Create a Spark DataFrame of synthetic customers.

    Args:
        spark: Session object whose ``createDataFrame`` builds the result.
        num_rows_customers: Number of customer rows to generate.

    Returns:
        A DataFrame with an integer ``customer_id`` column (consecutive ids
        starting at 100) and a string ``name`` column ("<first> <last>", each
        part picked with the stdlib ``random`` module).
    """
    given_names = ["Alice", "Bob", "Charlie", "David", "Eva", "Frank", "Grace", "Helen", "Ivy", "Jack"]
    family_names = ["Smith", "Johnson", "Brown", "Taylor", "Anderson", "Thomas", "Jackson", "White", "Harris", "Martin"]

    # One (id, name) tuple per customer; ids are plain Python ints.
    first_id = 100
    rows = [
        (customer_id, f"{random.choice(given_names)} {random.choice(family_names)}")
        for customer_id in range(first_id, first_id + num_rows_customers)
    ]

    id_field = StructField("customer_id", IntegerType(), True)
    name_field = StructField("name", StringType(), True)
    schema = StructType([id_field, name_field])

    return spark.createDataFrame(rows, schema=schema)


def measure_time(orders_df, customers_df, join_type="shuffle"):
    """Time an inner join of orders with customers on ``customer_id``.

    Args:
        orders_df: Left-hand DataFrame of the join.
        customers_df: Right-hand DataFrame of the join.
        join_type: ``"shuffle"`` disables automatic broadcast joins on the
            notebook-level ``spark`` session and joins the frames as they are;
            ``"broadcast"`` wraps ``customers_df`` in an explicit
            ``broadcast()`` hint.

    Returns:
        Elapsed seconds for building the join and running ``count()`` on it,
        rounded to two decimals.

    Raises:
        ValueError: If ``join_type`` is neither ``"shuffle"`` nor
            ``"broadcast"``.
    """
    # A wildcard import from pyspark.sql.functions anywhere in the session
    # rebinds the bare name `round` to Spark's column function, so refer to
    # the builtin explicitly.
    import builtins

    # Reject typos up front instead of silently timing a broadcast join.
    if join_type not in ("shuffle", "broadcast"):
        raise ValueError(
            f"join_type must be 'shuffle' or 'broadcast', got {join_type!r}"
        )

    # Do the configuration work before the clock starts so that only the join
    # itself is measured.
    if join_type == "shuffle":
        spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)  # disable broadcast
        right_side = customers_df
    else:  # broadcast
        right_side = broadcast(customers_df)

    # perf_counter() is a monotonic clock intended for interval timing.
    start = time.perf_counter()
    joined = orders_df.join(right_side, on="customer_id", how="inner")
    joined.count()  # force action
    elapsed = time.perf_counter() - start

    return builtins.round(elapsed, 2)


# Benchmark parameters
orders_sizes = [1_000_000, 5_000_000, 10_000_000]
customers_sizes = [10_000, 50_000, 100_000]

# Every (orders, customers) combination, iterating customers fastest.
size_grid = [(n_orders, n_customers)
             for n_orders in orders_sizes
             for n_customers in customers_sizes]

results = []

for o_size, c_size in size_grid:
    print(f"\nTesting orders={o_size:,}, customers={c_size:,}")

    # Fresh input tables for each combination.
    orders_df = generate_orders(spark, o_size)
    customers_df = generate_customers(spark, c_size)

    # The shuffle join is timed first, then the broadcast join, on the same inputs.
    shuffle_time = measure_time(orders_df, customers_df, join_type="shuffle")
    broadcast_time = measure_time(orders_df, customers_df, join_type="broadcast")

    record = dict(
        orders=o_size,
        customers=c_size,
        shuffle_time=shuffle_time,
        broadcast_time=broadcast_time,
    )
    results.append(record)

# Convert to Pandas for plotting
results_df = pd.DataFrame(results)
print("\nBenchmark Results:")
print(results_df)

# -------------------------------
# Plotting
# -------------------------------
fig, ax = plt.subplots(figsize=(10, 6))

# Two lines per customers size: solid with circles for the shuffle join,
# dashed with squares for the broadcast join.
for c_size in customers_sizes:
    rows_for_size = results_df.loc[results_df["customers"] == c_size]
    order_counts = rows_for_size["orders"]
    ax.plot(
        order_counts,
        rows_for_size["shuffle_time"],
        marker="o",
        label=f"Shuffle (cust={c_size})",
    )
    ax.plot(
        order_counts,
        rows_for_size["broadcast_time"],
        marker="s",
        linestyle="--",
        label=f"Broadcast (cust={c_size})",
    )

ax.set(
    xlabel="Number of Orders",
    ylabel="Execution Time (seconds)",
    title="Shuffle Join vs Broadcast Join Performance",
)
ax.legend()
ax.grid(True)
plt.show()

# Result summary:




In [None]:
'''

# result summary:


                                  Shuffle Join                Broadcast Join
Customers ~1 MB, Orders ~80 MB:   23 seconds                  10.5 seconds


'''
