In [1]:
import duckdb
import pandas as pd

In [2]:
# Synthesizing customer data
customers_data = {
    'customer_id': [1, 2, 3, 4],
    'customer_name': ['Alice', 'Bob', 'Charlie', 'David'],
    'location': ['New York', 'Los Angeles', 'Chicago', 'Los Angeles']
}

# Synthesizing orders data
orders_data = {
    'order_id': [101, 102, 103, 104, 105],
    'customer_id': [1, 2, 2, 4, 5],
    'amount': [250, 450, 320, 150, 200]
}

# Creating data frames
customers_df = pd.DataFrame(customers_data)
orders_df = pd.DataFrame(orders_data)

# Display the data frames
print("Customers Data:")
display(customers_df)

print("Orders Data:")
display(orders_df)


Customers Data:


Unnamed: 0,customer_id,customer_name,location
0,1,Alice,New York
1,2,Bob,Los Angeles
2,3,Charlie,Chicago
3,4,David,Los Angeles


Orders Data:


Unnamed: 0,order_id,customer_id,amount
0,101,1,250
1,102,2,450
2,103,2,320
3,104,4,150
4,105,5,200


In [3]:
# Loading data into DuckDB
duckdb.sql("CREATE TABLE customers AS SELECT * FROM customers_df")
duckdb.sql("CREATE TABLE orders AS SELECT * FROM orders_df")


In [4]:
# Inner Join: Return only matching records
inner_join_query = """
SELECT customers.customer_name, customers.location, orders.amount
FROM customers
INNER JOIN orders
ON customers.customer_id = orders.customer_id
"""
inner_join_result = duckdb.sql(inner_join_query).df()
print("Inner Join Result:")
display(inner_join_result)


Inner Join Result:


Unnamed: 0,customer_name,location,amount
0,Alice,New York,250
1,Bob,Los Angeles,320
2,David,Los Angeles,150
3,Bob,Los Angeles,450


In [5]:
# Left Join: Return all records from the left table (customers) and matching from right table (orders)
left_join_query = """
SELECT customers.customer_name, customers.location, orders.amount
FROM customers
LEFT JOIN orders
ON customers.customer_id = orders.customer_id
"""
left_join_result = duckdb.sql(left_join_query).df()
print("Left Join Result:")
display(left_join_result)


Left Join Result:


Unnamed: 0,customer_name,location,amount
0,Alice,New York,250.0
1,Bob,Los Angeles,320.0
2,David,Los Angeles,150.0
3,Bob,Los Angeles,450.0
4,Charlie,Chicago,


In [6]:
# Left Join: Return only non-matching records from the left table (customers)
left_join_non_matches_query = """
SELECT customers.customer_name, customers.location, orders.amount
FROM customers
LEFT JOIN orders
ON customers.customer_id = orders.customer_id
WHERE orders.customer_id IS NULL
"""
left_join_non_matches_result = duckdb.sql(left_join_non_matches_query).df()
print("Left Join - Non-Matching Rows Only:")
display(left_join_non_matches_result)


Left Join - Non-Matching Rows Only:


Unnamed: 0,customer_name,location,amount
0,Charlie,Chicago,


In [7]:
# Right Join: Return all records from the right table (orders) and matching from left table (customers)
right_join_query = """
SELECT customers.customer_name, customers.location, orders.amount
FROM customers
RIGHT JOIN orders
ON customers.customer_id = orders.customer_id
"""
right_join_result = duckdb.sql(right_join_query).df()
print("Right Join Result:")
display(right_join_result)


Right Join Result:


Unnamed: 0,customer_name,location,amount
0,Alice,New York,250
1,Bob,Los Angeles,320
2,David,Los Angeles,150
3,Bob,Los Angeles,450
4,,,200


In [8]:
# Right Join: Return only non-matching records from the right table (orders)
right_join_non_matches_query = """
SELECT customers.customer_name, customers.location, orders.amount
FROM customers
RIGHT JOIN orders
ON customers.customer_id = orders.customer_id
WHERE customers.customer_id IS NULL
"""
right_join_non_matches_result = duckdb.sql(right_join_non_matches_query).df()
print("Right Join - Non-Matching Rows Only:")
display(right_join_non_matches_result)


Right Join - Non-Matching Rows Only:


Unnamed: 0,customer_name,location,amount
0,,,200


In [9]:
# Full Outer Join: Return all records from both tables, showing NULL for non-matches
full_outer_join_query = """
SELECT customers.customer_name, customers.location, orders.amount
FROM customers
FULL OUTER JOIN orders
ON customers.customer_id = orders.customer_id
"""
full_outer_join_result = duckdb.sql(full_outer_join_query).df()
print("Full Outer Join Result:")
display(full_outer_join_result)


Full Outer Join Result:


Unnamed: 0,customer_name,location,amount
0,Alice,New York,250.0
1,Bob,Los Angeles,320.0
2,David,Los Angeles,150.0
3,Bob,Los Angeles,450.0
4,Charlie,Chicago,
5,,,200.0


In [10]:
# Full Outer Join: Return only non-matching records from both tables
full_outer_join_non_matches_query = """
SELECT customers.customer_name, customers.location, orders.amount
FROM customers
FULL OUTER JOIN orders
ON customers.customer_id = orders.customer_id
WHERE customers.customer_id IS NULL OR orders.customer_id IS NULL
"""
full_outer_join_non_matches_result = duckdb.sql(full_outer_join_non_matches_query).df()
print("Full Outer Join - Non-Matching Rows Only:")
display(full_outer_join_non_matches_result)


Full Outer Join - Non-Matching Rows Only:


Unnamed: 0,customer_name,location,amount
0,Charlie,Chicago,
1,,,200.0


In [11]:
# Self Join: Comparing customers from the same location
self_join_query = """
SELECT a.customer_name AS customer_1, b.customer_name AS customer_2, a.location
FROM customers a
JOIN customers b
ON a.location = b.location
WHERE a.customer_id <> b.customer_id
"""
self_join_result = duckdb.sql(self_join_query).df()
print("Self Join Result (Customers from the same location):")
display(self_join_result)


Self Join Result (Customers from the same location):


Unnamed: 0,customer_1,customer_2,location
0,Bob,David,Los Angeles
1,David,Bob,Los Angeles
