# In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit

# Obtain the Spark session for this job (an already-active session is reused).
session_builder = SparkSession.builder.appName("gold_layer_conform_zone")
spark = session_builder.getOrCreate()

# Storage locations of the three silver-layer Delta tables.
sales_silver_path = '/Volumes/workspace/hls_demo_2/refined_data/sales_data_silver_delta'
inventory_silver_path = '/Volumes/workspace/hls_demo_2/refined_data/inventory_data_silver_delta'
clinical_trial_silver_path = '/Volumes/workspace/hls_demo_2/refined_data/clinical_trial_data_silver_delta'

# Read each silver table as a DataFrame, in the same order as the paths above.
sales_silver_df, inventory_silver_df, clinical_trial_silver_df = (
    spark.read.format('delta').load(silver_path)
    for silver_path in (
        sales_silver_path,
        inventory_silver_path,
        clinical_trial_silver_path,
    )
)

# 1. Join sales and inventory on product_code.
# Sales is expected to carry product_name while inventory is keyed by
# product_code, so a product_name -> product_code mapping is required.
#
# The earlier placeholder lookup filled product_code with lit(None). NULL keys
# never satisfy an equi-join condition, so the inner join with inventory always
# produced zero rows and the gold table was overwritten with an empty dataset.
# The mapping is now taken from whichever silver table actually provides it,
# and the job fails fast when neither does.
# TODO: replace this with a governed product master table once one exists.
sales_columns = set(sales_silver_df.columns)
inventory_columns = set(inventory_silver_df.columns)

if "product_code" in sales_columns:
    # Sales already carries the join key; no lookup join is needed.
    product_lookup = sales_silver_df.select("product_name", "product_code").distinct()
    sales_with_code_df = sales_silver_df
elif "product_name" in inventory_columns:
    # Inventory exposes both identifiers, so it can act as the mapping source.
    # NOTE: if one product_name maps to several product_codes, sales rows are
    # repeated once per code.
    product_lookup = (
        inventory_silver_df
        .select("product_name", "product_code")
        .where(col("product_code").isNotNull())
        .distinct()
    )
    sales_with_code_df = sales_silver_df.join(product_lookup, on="product_name", how="left")
else:
    raise ValueError(
        "Cannot map sales.product_name to inventory.product_code: neither silver "
        "table contains both columns. Provide a product master lookup before "
        "building the gold layer."
    )

# Drop inventory's own product_name (a no-op when the column is absent) so the
# select below resolves product_name unambiguously from the sales side.
inventory_for_join_df = inventory_silver_df.drop("product_name")

# Join sales (now keyed by product_code) with inventory.
gold_sales_inventory_df = sales_with_code_df.join(
    inventory_for_join_df,
    on="product_code",
    how="inner"
).select(
    'product_code',
    'product_name',
    'region',
    'year',
    'month',
    'total_sales_qty',
    'total_sales_amount',
    'location',
    'total_stock_qty',
    'total_reorder_point',
    'stock_to_reorder_ratio'
)

# 2. Enrich the sales/inventory rows with clinical trial metrics.
# Clinical trial data is keyed by drug_name rather than by product, so the key
# is renamed to product_name and matched by name. The join is a left join, so
# rows without a matching trial are kept with NULL trial metrics.
clinical_by_product_df = clinical_trial_silver_df.withColumnRenamed('drug_name', 'product_name')

# Final column layout of the gold table.
gold_output_columns = [
    'product_code',
    'product_name',
    'region',
    'year',
    'month',
    'total_sales_qty',
    'total_sales_amount',
    'location',
    'total_stock_qty',
    'total_reorder_point',
    'stock_to_reorder_ratio',
    'avg_success_rate',
    'avg_trial_duration_days',
]

gold_with_trials_df = gold_sales_inventory_df.join(
    clinical_by_product_df,
    on='product_name',
    how='left',
)
gold_combined_df = gold_with_trials_df.select(*gold_output_columns)

# Preview the first 10 gold rows without truncating wide column values.
gold_combined_df.show(n=10, truncate=False)

# Persist the gold layer as a Delta table, replacing any existing data at the target path.
gold_path_delta = '/Volumes/workspace/hls_demo_2/conform_data/gold_layer_delta'
gold_writer = gold_combined_df.write.format('delta').mode('overwrite')
gold_writer.save(gold_path_delta)
