In [0]:
# Feature engineering

import pyspark.sql.functions as F
from pyspark.sql.window import Window
import numpy as np

# Configuration: fully qualified names of the input table and of the
# enriched output table. Both live under the same catalog.schema prefix.
_GOLD_SCHEMA = "fire_risk_project.03_gold"
TABLE_GOLD = f"{_GOLD_SCHEMA}.gold_dataset_full"
TABLE_GOLD_ENRICHED = f"{_GOLD_SCHEMA}.gold_dataset_features"

In [0]:
# Rolling-window lengths expressed in rows: 168 = 7 * 24 and 72 = 3 * 24, i.e.
# the frames are sized for one row per grid cell per hour.
# NOTE(review): row-based frames only span exactly 7 / 3 days if no hours are
# missing or duplicated per grid_id — confirm against the gold table.
ROWS_7D = 7 * 24
ROWS_3D = 3 * 24

df = spark.read.table(TABLE_GOLD)

# rowsBetween bounds are inclusive and the frame contains the current row, so
# an N-row window is (-(N - 1), 0); (-N, 0) would cover N + 1 rows.
grid_timeline = Window.partitionBy("grid_id").orderBy("timestamp_clima")
w_7d = grid_timeline.rowsBetween(-(ROWS_7D - 1), 0)
w_3d = grid_timeline.rowsBetween(-(ROWS_3D - 1), 0)

# Angles used for the cyclic encodings of hour-of-day and month-of-year.
hour_angle = 2 * np.pi * F.hour("timestamp_clima") / 24
month_angle = 2 * np.pi * F.month("timestamp_clima") / 12

df_features = (
    df
    # Accumulation: rolling precipitation totals per grid cell.
    .withColumn("precip_acum_7d", F.sum("precipitation").over(w_7d))
    .withColumn("precip_acum_3d", F.sum("precipitation").over(w_3d))
    # Trends: rolling maximum temperature per grid cell.
    .withColumn("temp_max_3d", F.max("temperature_2m").over(w_3d))
    # Interactions: wind * temperature scaled down by humidity. The +1 keeps
    # the denominator non-zero when relative humidity is 0.
    .withColumn(
        "fire_spread_index",
        (F.col("wind_speed_10m") * F.col("temperature_2m"))
        / (F.col("relative_humidity_2m") + 1),
    )
    # Cyclicity: sin/cos pairs so that hour 23 -> 0 and month 12 -> 1 are adjacent.
    .withColumn("hour_sin", F.sin(hour_angle))
    .withColumn("hour_cos", F.cos(hour_angle))
    .withColumn("month_sin", F.sin(month_angle))
    .withColumn("month_cos", F.cos(month_angle))
)

# Drop the warm-up period of each grid cell. A bounded frame is evaluated over
# whatever rows exist, so the first rows of a partition get partial sums rather
# than nulls, and na.drop() alone would keep them. Count the rows actually
# present in the 7-day frame and keep only complete windows; the 3-day frame is
# necessarily complete whenever the 7-day one is.
# na.drop() is kept afterwards: it removes rows with a null in ANY column.
df_final = (
    df_features
    .withColumn("_rows_in_7d", F.count(F.lit(1)).over(w_7d))
    .where(F.col("_rows_in_7d") >= ROWS_7D)
    .drop("_rows_in_7d")
    .na.drop()
)

# Replace the enriched table, allowing the schema to change between runs.
(
    df_final.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(TABLE_GOLD_ENRICHED)
)


In [0]:
%sql
-- Preview ten rows of the enriched feature table.
SELECT *
FROM fire_risk_project.03_gold.gold_dataset_features
LIMIT 10;