# Example 4: Joining DataFrames

Learn different types of joins and how to combine datasets.

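This demonstrates:
- Inner joins
- Left and right joins
- Full outer joins
- Left anti joins (finding rows with no match)
- Aggregating after a join
- Handling duplicate column names (joining on a shared column name keeps a single key column)
- Joining when the key columns have different names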

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Reuse the active Spark session if one exists; otherwise create a new one
# with the application name "JoinsExample".
spark = (
    SparkSession.builder
    .appName("JoinsExample")
    .getOrCreate()
)

In [None]:
# Customers dataset: five rows of (customer_id, name, city).
customers_data = [
    (1, "Alice", "New York"),
    (2, "Bob", "Los Angeles"),
    (3, "Charlie", "Chicago"),
    (4, "Diana", "Houston"),
    (5, "Eve", "Phoenix"),
]
customers = spark.createDataFrame(
    customers_data,
    schema=["customer_id", "name", "city"],
)

print("Customers:")
customers.show()

In [None]:
# Orders dataset: rows of (order_id, customer_id, amount, order_date).
# order_date is supplied as a plain string here.
orders_data = [
    (101, 1, 250.00, "2023-01-15"),
    (102, 2, 175.50, "2023-01-16"),
    (103, 1, 300.00, "2023-01-17"),
    (104, 3, 450.00, "2023-01-18"),
    # Customer 6 is not in the customers dataset, so this order has no match.
    (105, 6, 200.00, "2023-01-19"),
]
orders = spark.createDataFrame(
    orders_data,
    schema=["order_id", "customer_id", "amount", "order_date"],
)

print("Orders:")
orders.show()

In [None]:
# Inner join: keep only the rows whose customer_id appears in both
# customers and orders.
inner_join = customers.join(orders, on="customer_id", how="inner")

print("\nInner Join (only customers with orders):")
inner_join.show()

In [None]:
# Left join: keep every customer; the order columns are NULL for customers
# that have no matching order.
left_join = customers.join(orders, on="customer_id", how="left")

print("\nLeft Join (all customers):")
left_join.show()

In [None]:
# Right join: keep every order; the customer columns are NULL for orders
# whose customer_id is not in customers.
right_join = customers.join(orders, on="customer_id", how="right")

print("\nRight Join (all orders):")
right_join.show()

In [None]:
# Full outer join: keep every row from both sides, filling the columns of
# the side without a match with NULL.
outer_join = customers.join(orders, on="customer_id", how="outer")

print("\nFull Outer Join (all customers and orders):")
outer_join.show()

In [None]:
# Left anti join: keep only the customers that have NO matching order.
# The result contains columns from customers only.
no_orders = customers.join(orders, on="customer_id", how="left_anti")

print("\nCustomers without orders:")
no_orders.show()

In [None]:
# Aggregate after join - total spending and order count per customer.
# The left join keeps customers that have no orders. For those rows "amount"
# is NULL, so SUM would return NULL; coalesce it to 0.0 so total_spent agrees
# with order_count, which is already 0 for them because COUNT("order_id")
# skips NULL values.
customer_spending = (
    customers.join(orders, on="customer_id", how="left")
    .groupBy("customer_id", "name", "city")
    .agg(
        F.coalesce(F.sum("amount"), F.lit(0.0)).alias("total_spent"),
        F.count("order_id").alias("order_count"),
    )
    # Highest spenders first; customer_id breaks ties so the printed order
    # is deterministic.
    .orderBy(F.col("total_spent").desc(), F.col("customer_id"))
)

print("\nCustomer Spending Summary:")
customer_spending.show()

In [None]:
# Join when the key columns are named differently on each side.
# Small product lookup table keyed by prod_id.
products_data = [
    (101, "Laptop"),
    (102, "Phone"),
    (103, "Tablet"),
]
products = spark.createDataFrame(
    products_data,
    schema=["prod_id", "product_name"],
)

# orders has no prod_id column, so rename its order_id column to prod_id and
# join by that shared name. The result therefore carries a single prod_id
# column (and no order_id column). Note that this example matches order_id
# values against prod_id values.
orders_renamed = orders.withColumnRenamed("order_id", "prod_id")
order_products = orders_renamed.join(products, on="prod_id", how="inner")

print("\nOrders with Product Names:")
order_products.show()

In [None]:
# Stop the Spark session now that all examples have run.
spark.stop()