In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum as spark_sum, col
from datetime import datetime, timedelta

def create_spark_session(app_name="daily_dump"):
    """Build (or reuse) a SparkSession with Hive support enabled.

    Args:
        app_name: Name handed to the builder's ``appName``.

    Returns:
        The session produced by the builder's ``getOrCreate()``.
    """
    builder = SparkSession.builder.appName(app_name)
    # The warehouse directory and metastore URI are hard-coded here.
    builder = builder.config("spark.sql.warehouse.dir", "/user/hive/warehouse")
    builder = builder.config("hive.metastore.uris", "thrift://localhost:9083")
    return builder.enableHiveSupport().getOrCreate()
spark = create_spark_session()

# Source tables, all read from the "casestudy" Hive database.
fact_sales = spark.table("casestudy.fact_sales")
dim_product = spark.table("casestudy.dim_product")
dim_agent = spark.table("casestudy.dim_agent")
# Assumed (not confirmed) to be the table that carries transaction_date.
dim_sales = spark.table("casestudy.dim_sales")

# Sanity check: print the row count of every input table.
for label, frame in (
    ("fact_sales count:", fact_sales),
    ("dim_product count:", dim_product),
    ("dim_agent count:", dim_agent),
    ("dim_sales count:", dim_sales),
):
    print(label, frame.count())

# Join predicates linking the fact table to each dimension table.
agent_match = fact_sales["sales_agent_id"] == dim_agent["sales_person_id"]
product_match = fact_sales["product_id"] == dim_product["product_id"]
transaction_match = fact_sales["transaction_id"] == dim_sales["transaction_id"]

# Inner-join fact_sales to all three dimensions and keep only the four
# columns that make up the daily dump.
daily_dump_df = (
    fact_sales
    .join(dim_agent, agent_match, "inner")
    .join(dim_product, product_match, "inner")
    .join(dim_sales, transaction_match, "inner")
    .select(
        dim_agent["sales_agent_name"],
        dim_product["product_name"],
        fact_sales["total_paid_price_after_discount"],
        dim_sales["transaction_date"],
    )
)

# NOTE(review): the output recorded for this cell reports 0 joined rows even
# though every input table is non-empty, so at least one of the three inner
# joins matches nothing. Which key pair is responsible cannot be determined
# from this code alone -- TODO check the key columns' values and types.
print("daily_dump_df count:", daily_dump_df.count())


fact_sales count: 1500
dim_product count: 30
dim_agent count: 10
dim_sales count: 1500
daily_dump_df count: 0
