In [0]:
# Work-type predictions; later cells read its userid, paydate and
# predicted_work_type columns.
df = spark.read.table('main_prod.ml_data.static_moving_worktype')

# Pull the distinct user ids from the daily earnings fact table onto the driver
# and keep them as a de-duplicated Python list.
earnings_user_rows = (
    spark.read.table('main_prod.earnings_analysis.fact_user_earnings_daily')
    .select('userid')
    .distinct()
    .collect()
)
userids = list({row.userid for row in earnings_user_rows})

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window
from datetime import datetime
# Inclusive bounds of the feature calendar (ISO yyyy-MM-dd strings).
start_date_str = "2024-03-01"
end_date_str = "2025-10-01"

# One-row seed frame -> array of every day in [start, end] -> one row per day.
calendar_days = F.sequence(
    F.lit(start_date_str).cast("date"),
    F.lit(end_date_str).cast("date"),
    F.expr("interval 1 day"),
)
date_df = spark.range(1).select(F.explode(calendar_days).alias("paydate"))

display(date_df)


In [0]:
# Build the user dimension straight from the earnings table so the ids stay in
# Spark: there is no driver-side list to ship back to the cluster, and the
# column keeps the table's own dtype (createDataFrame cannot infer a row schema
# from a flat list of scalars paired with a list of column names).
user_df = (
    spark.read.table('main_prod.earnings_analysis.fact_user_earnings_daily')
    .select('userid')
    .distinct()
)
display(user_df)

In [0]:
# Dense scaffold: pair every user in user_df with every day in date_df, giving
# one (userid, paydate) row per user per calendar day.
full_calendar = user_df.crossJoin(date_df)
display(full_calendar)

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window
from datetime import datetime

# min_date = F.lit('2024-07-01')
# max_date = F.lit('2025-07-01')
# # df: userid (string), date (date), feat (numeric)

# 1) Per-user observed span: earliest and latest paydate present in `df`.
date_bounds = (
    df.groupBy("userid")
    .agg(
        F.min("paydate").alias("min_date"),
        F.max("paydate").alias("max_date"),
    )
)

# 2) Expand each user's span into one row per day, both ends included.
daily_span = F.sequence(F.col("min_date"), F.col("max_date"), F.expr("interval 1 day"))
full_calendar_present = date_bounds.select(
    "userid",
    F.explode(daily_span).alias("paydate"),
)

# 3) Left join the original rows back on; days with no source row keep nulls
#    in the remaining columns of `df`.
full_df = full_calendar_present.join(df, on=["userid", "paydate"], how="left")

display(full_df)

In [0]:
# Running frame per user: every row from the start of the partition up to and
# including the current row, in paydate order.
w = (
    Window.partitionBy("userid")
    .orderBy("paydate")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Forward fill: the most recent non-null predicted_work_type seen so far.
last_known_work_type = F.last("predicted_work_type", ignorenulls=True).over(w)

filled_df_present = full_df.select(
    "userid",
    "paydate",
    last_known_work_type.alias("predicted_work_type"),
)

display(filled_df_present)

In [0]:
# Project the forward-filled work types onto the full user x day calendar.
# (userid, paydate) pairs without a match in filled_df_present stay null.
filled_df = full_calendar.join(
    filled_df_present,
    on=["userid", "paydate"],
    how="left",
)
display(filled_df)

In [0]:
# Drop main_prod.ml_features.static_moving_features if it exists. The table is
# absent from here until the saveAsTable cell further down recreates it.
# NOTE(review): presumably dropped so the rewrite and the ALTER TABLE
# constraint statements start from a clean table — confirm.
spark.sql("DROP TABLE IF EXISTS main_prod.ml_features.static_moving_features")

In [0]:
# replace static with 1 and moving with 0 in filled df
from pyspark.sql import functions as F

# Encode the label as a nullable flag: "static" -> 1, "moving" -> 0, and
# anything else (including null) -> null.
work_type = F.col("predicted_work_type")
is_static_flag = (
    F.when(work_type == "static", 1)
    .when(work_type == "moving", 0)
    .otherwise(None)
)

# Swap the text label for the flag and rename the date column to predtime.
final_df = (
    filled_df
    .withColumn("is_static", is_static_flag)
    .drop("predicted_work_type")
    .withColumnRenamed("paydate", "predtime")
)

# The userid / predtime type casts are applied later, just before the write.
display(final_df)


In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window

def _trailing_rows_window(n_rows):
    """Per-user window over the current row plus the n_rows - 1 rows before it.

    Rows are ordered by predtime within each userid. rowsBetween bounds are
    inclusive on both ends, so a trailing window of exactly n_rows rows that
    ends on the current row starts at -(n_rows - 1), not -n_rows.
    """
    return (
        Window.partitionBy("userid")
        .orderBy("predtime")
        .rowsBetween(-(n_rows - 1), Window.currentRow)
    )


# final_df comes from the user x day calendar, so N rows correspond to N days
# (assumes exactly one row per userid per predtime — TODO confirm upstream data
# has no duplicate (userid, paydate) rows).
w_7rows = _trailing_rows_window(7)
w_14rows = _trailing_rows_window(14)
w_30rows = _trailing_rows_window(30)

rolling_specs = (
    ("is_static_7d", w_7rows),
    ("is_static_14d", w_14rows),
    ("is_static_30d", w_30rows),
)

# Trailing share of "static" days; the mean skips null is_static values.
final_df_v2 = final_df
for rolling_col, rolling_window in rolling_specs:
    final_df_v2 = final_df_v2.withColumn(
        rolling_col,
        F.mean("is_static").over(rolling_window),
    )

# If is_static is null on a given day, blank out that day's rolling features
# as well, so a row never carries rolling values without a same-day label.
final_df_v3 = final_df_v2
for rolling_col, _ in rolling_specs:
    final_df_v3 = final_df_v3.withColumn(
        rolling_col,
        F.when(F.col("is_static").isNull(), None).otherwise(F.col(rolling_col)),
    )



In [0]:
# Spot-check the computed features for a single user.
display(final_df_v3.filter('userid = 996'))

In [0]:
# Cast the key columns to their stored types: userid as int, predtime as string.
final_df_v3 = (
    final_df_v3
    .withColumn("userid", F.col("userid").cast("int"))
    .withColumn("predtime", F.col("predtime").cast("string"))
)

In [0]:
# Persist the feature frame, replacing any existing contents of the table.
(
    final_df_v3.write
    .mode("overwrite")
    .saveAsTable("main_prod.ml_features.static_moving_features")
)

In [0]:
%sql
-- Declare (userid, predtime) as the primary key of the feature table.
-- Both key columns are marked NOT NULL first; keep this order, since
-- Databricks requires primary-key columns to be non-nullable.
-- NOTE(review): the ADD CONSTRAINT statement fails if the constraint already
-- exists, so this cell presumably relies on the table having been dropped and
-- rewritten earlier in the notebook — confirm before re-running in isolation.
ALTER TABLE main_prod.ml_features.static_moving_features ALTER COLUMN userid SET NOT NULL;
ALTER TABLE main_prod.ml_features.static_moving_features ALTER COLUMN predtime SET NOT NULL;
ALTER TABLE main_prod.ml_features.static_moving_features ADD CONSTRAINT static_moving_features_pk PRIMARY KEY( userid, predtime );