In [0]:
# Load every column of the silver job table into a DataFrame and preview it.
# The query has no interpolated values, so a plain string (no f-prefix) is used.
df = spark.sql("SELECT * FROM dev.job_prospects.job_1900_silver")
display(df)

In [0]:
from pyspark.sql.functions import current_date, col, datediff, max as spark_max

# Largest value found in the `timestamp` column, pulled back to the driver.
latest_row = df.agg(spark_max("timestamp").alias("max_ts")).first()
max_timestamp = latest_row["max_ts"]

# Keep only rows carrying that latest `timestamp` whose `posted_at` is at most
# 30 days before today.
# NOTE(review): there is no lower bound, so rows with a future `posted_at`
# (negative day difference) also pass this filter - confirm that is intended.
posted_within_30_days = datediff(current_date(), col("posted_at")) <= 30
has_latest_timestamp = col("timestamp") == max_timestamp
df_recent = df.filter(posted_within_30_days & has_latest_timestamp)

In [0]:
from pyspark.sql.functions import max as spark_max

from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Imported here so this cell does not depend on an import elsewhere; aliased
# (like `spark_max`) so the Python builtin `sum` is not shadowed.
from pyspark.sql.functions import sum as spark_sum

# Ranking window: within each `timestamp`, order job fields by their summed
# quantity, largest first. `job_field` acts as a tiebreaker so that equal
# totals are always ranked the same way and the top-10 cut is reproducible.
rank_window = Window.partitionBy("timestamp").orderBy(
    col("job_count").desc(), col("job_field").asc()
)

# Sum `quantity` per (timestamp, job_field), keep the 10 highest-ranked job
# fields for each timestamp, and sort the result for display.
df_top10 = (
    df_recent.groupBy("timestamp", "job_field")
    # Explicit alias instead of renaming Spark's generated "sum(quantity)" column.
    .agg(spark_sum("quantity").alias("job_count"))
    .withColumn("row_num", row_number().over(rank_window))
    .filter(col("row_num") <= 10)
    .drop("row_num")
    .sort(col("timestamp").desc(), col("job_count").desc(), col("job_field").asc())
)

display(df_top10)

# Replace the target table on every run; `overwriteSchema` lets the stored
# schema be replaced along with the data.
df_top10.write.mode("overwrite").option("overwriteSchema", True).saveAsTable(
    "dev.job_prospects.fct_top_10_recent_job"
)