In [0]:
from pyspark.sql import functions as F

# Load the silver table and keep only the rows belonging to the most recent
# snapshot (the rows whose `timestamp` equals the table-wide maximum).
# The unfiltered frame keeps its own name so `df` is bound exactly once.
silver_df = spark.sql("SELECT * FROM dev.job_prospects.job_1900_silver_inferred")

# A global aggregate always yields exactly one row; `max_ts` is None when the
# table is empty, in which case the equality filter below matches no rows.
max_timestamp = silver_df.agg(F.max("timestamp").alias("max_ts")).first()["max_ts"]

df = silver_df.filter(F.col("timestamp") == max_timestamp)
df.display()

In [0]:
def _leading_number(column_name):
    """Build a Column with the first run of digits in `column_name` as a double.

    Args:
        column_name: Name of the column to parse.

    Returns:
        A Column that is the first digit run cast to double, or null when the
        value contains no digits (or is itself null).
    """
    digits = F.regexp_extract(F.col(column_name), r"(\d+)", 1)
    # regexp_extract returns "" when nothing matches; only cast real matches.
    # `when` without `otherwise` yields null for every other row.
    return F.when(digits != "", digits.cast("double"))


# Per-job-field averages of the salary midpoint and the experience midpoint.
# No sort is applied before the aggregation: groupBy/agg does not preserve
# input ordering, so sorting the row-level frame would only add wasted work.
avg_salary_exp_df = (
    df.withColumn("minimum_experience", _leading_number("minimum_experience"))
    .withColumn("maximum_experience", _leading_number("maximum_experience"))
    # When only one salary bound is present, use it for both bounds so the
    # midpoint below is still defined.
    .withColumn("salary_low", F.coalesce(F.col("salary_low"), F.col("salary_high")))
    .withColumn("salary_high", F.coalesce(F.col("salary_high"), F.col("salary_low")))
    .withColumn(
        "avg_salary", F.round((F.col("salary_low") + F.col("salary_high")) / 2, 2)
    )
    # Same back-filling for the experience bounds.
    .withColumn(
        "minimum_experience",
        F.coalesce(F.col("minimum_experience"), F.col("maximum_experience")),
    )
    .withColumn(
        "maximum_experience",
        F.coalesce(F.col("maximum_experience"), F.col("minimum_experience")),
    )
    .withColumn(
        "avg_experience",
        F.round((F.col("minimum_experience") + F.col("maximum_experience")) / 2, 2),
    )
    .select("job_field", "avg_salary", "avg_experience")
    .groupBy("job_field")
    .agg(
        # F.avg ignores nulls, so rows with no usable salary/experience do not
        # drag the per-field averages down.
        F.round(F.avg("avg_salary"), 1).alias("avg_salary"),
        F.round(F.avg("avg_experience"), 1).alias("avg_experience"),
    )
)

In [0]:
# Persist the aggregated frame as a managed table, replacing any existing
# data and allowing the stored schema to be replaced as well.
fact_writer = avg_salary_exp_df.write.mode("overwrite").option(
    "overwriteSchema", "true"
)
fact_writer.saveAsTable("dev.job_prospects.fct_avg_salary_exp")

In [0]:
# Render the per-job-field averages (avg_salary, avg_experience) for visual
# inspection. NOTE(review): this displays the in-memory DataFrame, not a
# read-back of the saved table — confirm that is the intent.
avg_salary_exp_df.display()