## Silver Table
The silver table cleanses the bronze billing data, keeps only the columns we need, and aggregates usage and cost values for the selected billing period.

In [0]:
from pyspark.sql import functions as F

In [0]:
# Retrieve job parameters passed in from the Databricks workflow.
# Every parameter is declared as a text widget with an empty default, then read
# back with surrounding whitespace removed so that stray spaces in the job
# configuration cannot end up inside table names or the billing-period filter.
_PARAM_NAMES = (
    "billing_period",
    "bronze_table_name",
    "target_catalog_name",
    "target_schema_name",
    "silver_table_name",
    "tracker_table_name",
)

for _param_name in _PARAM_NAMES:
    dbutils.widgets.text(_param_name, "")

_params = {
    _param_name: dbutils.widgets.get(_param_name).strip()
    for _param_name in _PARAM_NAMES
}

# Fail fast with a readable message instead of letting an empty value produce a
# malformed table name (e.g. "catalog..table") in a later cell.
_missing_params = [name for name, value in _params.items() if not value]
if _missing_params:
    raise ValueError(
        f"Missing required job parameter(s): {', '.join(_missing_params)}"
    )

bronze_table_name = _params["bronze_table_name"]
target_catalog_name = _params["target_catalog_name"]
target_schema_name = _params["target_schema_name"]
silver_table_name = _params["silver_table_name"]
tracker_table_name = _params["tracker_table_name"]
billing_period = _params["billing_period"]

In [0]:
# Build the fully qualified (catalog.schema.table) name of each table used below.
_namespace = f'{target_catalog_name}.{target_schema_name}'
bronze_tbl, target_tbl, tracker_tbl = [
    f'{_namespace}.{_table_name}'
    for _table_name in (bronze_table_name, silver_table_name, tracker_table_name)
]

In [0]:
# Create the silver table if it does not already exist.
# The column list matches, name for name and in the same order, the aliases
# produced by the SELECT in the load cell below. The table is declared with
# CLUSTER BY AUTO rather than with explicit partition columns.
spark.sql(f"""
          CREATE TABLE IF NOT EXISTS {target_tbl} (
            record_id STRING,
            billing_period STRING,
            usage_start_time TIMESTAMP,
            usage_end_time TIMESTAMP,
            aws_account_id STRING,
            aws_account_name STRING,
            product_code STRING,
            product_name STRING,
            product_family STRING,
            product_region_code STRING,
            instance_family STRING,
            instance_type STRING,
            resource_id STRING,
            item_type STRING,
            operation STRING,
            usage_type STRING,
            cluster_id STRING,
            cluster_name STRING,
            cluster_creator STRING,
            job_run_name STRING,
            warehouse_id STRING,
            instance_pool_id STRING,
            job_id STRING,
            currency_code STRING,
            pricing_term STRING,
            pricing_unit STRING,
            reservation_id STRING,
            savings_plan_id STRING,
            usage_amount DOUBLE,
            discount DOUBLE,
            unblended_cost DOUBLE,
            net_unblended_cost DOUBLE,
            amortized_cost DOUBLE,
            net_amortized_cost DOUBLE)
            CLUSTER BY AUTO
""")

In [0]:
# Build the silver rows for one billing period, replace that period's rows in
# the silver table, and record the outcome in the tracker table.
#
# On success the tracker row for the billing period is set to status 'SILVER'.
# On any failure it is set to 'FAILED' with the (truncated) error message, and
# the original exception is re-raised so the job run is marked as failed.
try:
    # The replaceWhere option below takes a predicate string and cannot use bind
    # parameters, so refuse values that could break out of the quoted literal.
    if "'" in billing_period or "\\" in billing_period:
        raise ValueError(f"Invalid billing_period: {billing_period!r}")

    # Read bronze table and grab important fields.
    # The billing period is bound as a named parameter rather than spliced into
    # the SQL text.
    df = spark.sql(f"""
        SELECT
            --General data
            identity_line_item_id as record_id,
            billing_period as billing_period,
            line_item_usage_start_date as usage_start_time,
            line_item_usage_end_date as usage_end_time,
            line_item_usage_account_id as aws_account_id,
            line_item_usage_account_name as aws_account_name,
            --Product data
            line_item_product_code as product_code,
            product['product_name'] as product_name,
            product_product_family as product_family,
            product['region'] as product_region_code,
            product_instance_family as instance_family,
            product_instance_type as instance_type,
            line_item_resource_id as resource_id,
            line_item_line_item_type as item_type,
            line_item_operation as operation,
            line_item_usage_type as usage_type,

            --Tags
            resource_tags['user_cluster_id'] as cluster_id,
            resource_tags['user_cluster_name'] as cluster_name,
            resource_tags['user_creator'] as cluster_creator,
            resource_tags['user_run_name'] as job_run_name,
            resource_tags['user_sql_endpoint_id'] as warehouse_id,
            resource_tags['user_databricks_instance_pool_id'] as instance_pool_id,
            COALESCE(
                resource_tags['user_job_id'],
                CASE
                    WHEN resource_tags['user_cluster_name'] LIKE 'job-%'
                        THEN split(resource_tags['user_cluster_name'], '-')[1]
                END) AS job_id,

            --Cost attributes
            line_item_currency_code as currency_code,
            CASE
                    WHEN savings_plan_savings_plan_a_r_n <> '' THEN 'SavingsPlan'
                    WHEN reservation_reservation_a_r_n <> '' THEN 'Reserved'
                    WHEN line_item_usage_type LIKE '%Spot%' THEN 'Spot'
                    ELSE 'OnDemand'
            END AS pricing_term,
            pricing_unit,
            reservation_reservation_a_r_n as reservation_id,
            savings_plan_savings_plan_a_r_n as savings_plan_id,

            --Cost
            SUM(line_item_usage_amount) as usage_amount,
            SUM(discount_total_discount) as discount,
            --we 0 out unblended for SP? Otherwise for SP we show OD usage since we cannot tag the SP Negation; RI is already 0ed out since the actual cost comes in RIFee
            SUM(
                CASE
                    WHEN line_item_line_item_type = 'Usage' THEN line_item_unblended_cost
                    ELSE 0
                END
            ) AS unblended_cost,
            SUM(line_item_net_unblended_cost) as net_unblended_cost,
            SUM(
                CASE
                    WHEN line_item_line_item_type = 'SavingsPlanCoveredUsage' THEN savings_plan_savings_plan_effective_cost
                    WHEN line_item_line_item_type = 'DiscountedUsage' THEN reservation_effective_cost
                    WHEN line_item_line_item_type = 'Usage' THEN line_item_unblended_cost
                    ELSE 0
                END
            ) AS amortized_cost,
            SUM(
                CASE
                    WHEN line_item_line_item_type = 'SavingsPlanCoveredUsage' THEN savings_plan_net_savings_plan_effective_cost
                    WHEN line_item_line_item_type = 'DiscountedUsage' THEN reservation_net_effective_cost
                    WHEN line_item_line_item_type = 'Usage' THEN line_item_net_unblended_cost
                    ELSE 0
                END
            ) AS net_amortized_cost
            FROM {bronze_tbl}
            WHERE billing_period = :billing_period
            GROUP BY ALL
    """, args={"billing_period": billing_period})

    # Replace only the rows of this billing period in the target Delta table;
    # rows belonging to other billing periods are left untouched.
    (df.write.mode("overwrite")
       .option("mergeSchema", "true")
       .option("replaceWhere", f"billing_period = '{billing_period}'")
       .saveAsTable(target_tbl))

    # Mark this billing period as having reached the SILVER stage and clear any
    # error left over from a previous attempt.
    spark.sql(f"""
      UPDATE {tracker_tbl}
      SET status='SILVER',
          last_error=NULL,
          last_update=current_timestamp()
      WHERE billing_period = :billing_period
    """, args={"billing_period": billing_period})
except Exception as e:
    # Keep at most 2000 characters of the raw message. It is bound as a
    # parameter, so quotes or backslashes in the text cannot break the UPDATE.
    err = str(e)[:2000]
    try:
        spark.sql(f"""
          UPDATE {tracker_tbl}
          SET status='FAILED',
              last_error=:last_error,
              last_update=current_timestamp()
          WHERE billing_period = :billing_period
        """, args={"last_error": err, "billing_period": billing_period})
    except Exception as tracker_exc:
        # A failure to record the status must not hide the original error,
        # which is re-raised below.
        print(f"Could not record FAILED status in {tracker_tbl}: {tracker_exc}")
    raise