In [0]:
from pyspark.sql.functions import *

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

# Create (or reuse) the Spark session used by every cell in this notebook.
spark = SparkSession.builder.appName("CreateProductsOrders").getOrCreate()

# ---------------------
# Products Table
# ---------------------
# One tuple per product: (product_id, product_name, product_category).
products_data = [
    (1, "Leetcode Solutions", "Book"),
    (2, "Jewels of Stringology", "Book"),
    (3, "HP", "Laptop"),
    (4, "Lenovo", "Laptop"),
    (5, "Leetcode Kit", "T-shirt")
]

# All product columns are declared non-nullable.
products_schema = StructType([
    StructField("product_id", IntegerType(), False),
    StructField("product_name", StringType(), False),
    StructField("product_category", StringType(), False)
])

products_df = spark.createDataFrame(data=products_data, schema=products_schema)

# Register the DataFrame under the table name the SQL cell queries
# (`FROM products p`); without this the SQL cannot resolve the relation.
products_df.createOrReplaceTempView("products")

# ---------------------
# Orders Table
# ---------------------
# One tuple per order line: (product_id, order_date, unit).
orders_data = [
    (1, "2020-02-05", 60),
    (1, "2020-02-10", 70),
    (2, "2020-01-18", 30),
    (2, "2020-02-11", 80),
    (3, "2020-02-17", 2),
    (3, "2020-02-24", 3),
    (4, "2020-03-01", 20),
    (4, "2020-03-04", 30),
    (4, "2020-03-04", 60),
    (5, "2020-02-25", 50),
    (5, "2020-02-27", 50),
    (5, "2020-03-01", 50)
]

orders_schema = StructType([
    StructField("product_id", IntegerType(), False),
    # Kept as a string: zero-padded yyyy-MM-dd values sort lexicographically
    # in calendar order, so the string range filters used later still work.
    StructField("order_date", StringType(), False),
    StructField("unit", IntegerType(), False)
])

orders_df = spark.createDataFrame(data=orders_data, schema=orders_schema)

# Register under the table name the SQL cell queries (`JOIN orders o`).
orders_df.createOrReplaceTempView("orders")

# Show both tables
print("📘 Products Table:")
products_df.show(truncate=False)

print("🧾 Orders Table:")
orders_df.show(truncate=False)


In [0]:
-- Total units ordered per product name for orders dated within
-- 2020-02-01 .. 2020-02-29 (both bounds inclusive), keeping only the
-- product names whose total is at least 100.
SELECT
    p.product_name,
    SUM(o.unit) AS unit
FROM products AS p
INNER JOIN orders AS o
    ON o.product_id = p.product_id
WHERE o.order_date >= '2020-02-01'
  AND o.order_date <= '2020-02-29'
GROUP BY p.product_name
HAVING SUM(o.unit) >= 100;


In [0]:
# DataFrame-API version of the February report: join products to their
# orders, keep orders dated 2020-02-01..2020-02-29 (inclusive), total the
# units per product name, and keep only totals of at least 100.
# `sum` and `col` here are the Spark column functions brought in by the
# notebook's `from pyspark.sql.functions import *`, not Python built-ins.
result_df = (
    products_df
    .join(orders_df, on="product_id", how="inner")
    .filter(col("order_date").between("2020-02-01", "2020-02-29"))
    .groupBy("product_name")
    .agg(sum("unit").alias("unit"))
    .filter(col("unit") >= 100)
)

display(result_df)

In [0]:
# Per sell_date: count the `product` values and list the distinct products
# as a sorted, comma-separated string; rows are ordered by sell_date.
# NOTE(review): distinct_activities_df is not created in any cell visible
# here — confirm it is defined before this cell runs on a fresh kernel.
result_df = (
    distinct_activities_df
    .groupBy("sell_date")
    .agg(
        count("product").alias("num_sold"),
        concat_ws(",", array_sort(collect_set("product"))).alias("products"),
    )
    .orderBy("sell_date")
)
display(result_df)