In [1]:
from pyspark.sql.functions import col, unix_timestamp, lit, when, row_number
from pyspark.sql.window import Window

StatementMeta(, 4e489f79-a876-4283-95de-1e536f5f3e09, 3, Finished, Available, Finished)

In [None]:
# Load the silver-layer table that feeds the gold transformation below.
df_silver = spark.table("carbon_emission_silver")

In [2]:
# Register the silver DataFrame as the temp view "silver_view" so it can be queried by name
# (replaces any existing temp view with that name).
df_silver.createOrReplaceTempView("silver_view")

StatementMeta(, 4e489f79-a876-4283-95de-1e536f5f3e09, 4, Finished, Available, Finished)

In [3]:
# Build the gold-layer projection from silver_view with the DataFrame API:
# pass the source columns through and derive two extra columns.
silver_rows = spark.table("silver_view")

# Interval length in minutes (difference of the two epoch-second values divided by 60).
interval_minutes = (
    unix_timestamp(col("time_to")) - unix_timestamp(col("time_from"))
) / 60

# Rows without an actual intensity reading are flagged as low quality.
quality_flag = (
    when(col("actual_intensity").isNull(), lit("low_quality"))
    .otherwise(lit("good_quality"))
)

df_gold_new = silver_rows.select(
    col("time_from"),
    col("time_to"),
    col("forecast_intensity"),
    col("actual_intensity"),
    col("intensity_index"),
    interval_minutes.alias("duration_minutes"),
    quality_flag.alias("data_quality"),
)

# Expose the projection to SQL under the name "gold_new_view".
df_gold_new.createOrReplaceTempView("gold_new_view")

# Preview up to 200 rows without truncating cell values.
df_gold_new.show(n=200, truncate=False)

StatementMeta(, 4e489f79-a876-4283-95de-1e536f5f3e09, 5, Finished, Available, Finished)

+-------------------+-------------------+------------------+----------------+---------------+----------------+------------+
|time_from          |time_to            |forecast_intensity|actual_intensity|intensity_index|duration_minutes|data_quality|
+-------------------+-------------------+------------------+----------------+---------------+----------------+------------+
|2024-12-23 00:00:00|2024-12-23 00:30:00|44                |32              |very low       |30.0            |good_quality|
|2024-12-23 00:30:00|2024-12-23 01:00:00|37                |33              |very low       |30.0            |good_quality|
|2024-12-23 01:00:00|2024-12-23 01:30:00|38                |33              |very low       |30.0            |good_quality|
|2024-12-23 01:30:00|2024-12-23 02:00:00|35                |32              |very low       |30.0            |good_quality|
|2024-12-23 02:00:00|2024-12-23 02:30:00|34                |33              |very low       |30.0            |good_quality|
|2024-12

In [4]:
# Reconcile this run's rows with the existing gold table.
#
# df_gold_combined is appended to "carbon_emission_gold" further down, so it must hold
# only rows that are NOT already stored; otherwise every re-run duplicates earlier data.
#
# The stored table carries an extra "id" column that gold_new_view does not have, so the
# two cannot be combined with a plain `SELECT * ... UNION ...` (column counts differ).
# The comparison is therefore done on the columns produced by this run only.
#
# Table existence is checked explicitly instead of wrapping everything in a blanket
# `except Exception`, so genuine failures (schema drift, permissions, ...) surface
# instead of being reported as "table not found".
if spark.catalog.tableExists("carbon_emission_gold"):
    df_gold_existing = spark.read.table("carbon_emission_gold")
    df_gold_existing.createOrReplaceTempView("gold_existing_view")

    # subtract() is EXCEPT DISTINCT: it drops rows already present in the gold table
    # and de-duplicates the remaining new rows (NULLs compare as equal).
    df_gold_combined = df_gold_new.subtract(
        df_gold_existing.select(*df_gold_new.columns)
    )
else:
    print("Existing gold table not found. Creating a new one.")
    df_gold_combined = df_gold_new

StatementMeta(, 4e489f79-a876-4283-95de-1e536f5f3e09, 6, Finished, Available, Finished)

Existing gold table not found. Creating a new one.


In [5]:
# Assign a sequential "id" to every row, ordered by time_from.
#
# The result is appended to "carbon_emission_gold", so numbering continues after the
# highest id already stored there; restarting at 1 on each run would reuse existing ids.
id_offset = 0
if spark.catalog.tableExists("carbon_emission_gold"):
    gold_stored = spark.read.table("carbon_emission_gold")
    if "id" in gold_stored.columns:
        # max() over an empty table yields None -> fall back to 0.
        id_offset = gold_stored.agg({"id": "max"}).first()[0] or 0

# NOTE: a window with no partitionBy makes Spark process all rows in a single partition;
# acceptable for small batches, revisit if the data volume grows.
window_spec = Window.orderBy("time_from")
df_gold_combined = df_gold_combined.withColumn(
    "id", row_number().over(window_spec) + lit(id_offset)
)

StatementMeta(, 4e489f79-a876-4283-95de-1e536f5f3e09, 7, Finished, Available, Finished)

In [6]:
# Persist the prepared rows by appending them to the gold table
# (the table is created if it does not exist yet).
df_gold_combined.write.saveAsTable("carbon_emission_gold", mode="append")

StatementMeta(, 4e489f79-a876-4283-95de-1e536f5f3e09, 8, Finished, Available, Finished)