In [0]:
# Load the static/moving backfill predictions from the scratchpad volume and preview them.
SM_PREDICTIONS_PATH = "/Volumes/main_prod/datascience_scratchpad/jatin/trajcl_exp/usa/backfill_static_moving_improved_all"
sm_df = spark.read.parquet(SM_PREDICTIONS_PATH)
display(sm_df)

In [0]:
# Fraction of prediction rows labelled "moving" (numerator first, then the total row count).
moving_row_count = sm_df.filter('predicted_work_type="moving"').count()
all_row_count = sm_df.count()
moving_row_count / all_row_count

In [0]:
# Distinct combinations of the pay-period columns from the daily earnings fact table.
fact_columns = ['userid', 'employerid', 'paydate', 'prev_paydate', 'total_pck_amt']
facts_df = (
    spark.read
    .table("main_prod.earnings_analysis.fact_user_earnings_daily")
    .select(*fact_columns)
    .distinct()
)
display(facts_df)

In [0]:
# Keep pay periods paid on or after 2025-07-01 that have a positive paycheck amount.
facts_df_fil = facts_df.filter('paydate>="2025-07-01" and total_pck_amt>0')

In [0]:
# Number of distinct (userid, employerid) pairs left after the pay-period filter.
user_employer_pairs = facts_df_fil.select("userid", "employerid").distinct()
user_employer_pairs.count()

In [0]:
# Attach pay-period facts to each prediction; only rows present in both inputs survive.
paycheck_keys = ['userid', 'employerid', 'paydate']
combined_df = sm_df.join(facts_df_fil, on=paycheck_keys, how='inner')
display(combined_df)


In [0]:
# Work dwell-time table; later cells read its userid, calc_date and
# timespend_work_hours columns.
DWELLTIME_TABLE = 'main_prod.ml_data.cm_work_dwell_time_v3_2'
dwelltime_df = spark.read.table(DWELLTIME_TABLE)

In [0]:
from pyspark.sql.functions import dayofweek

# Drop weekend rows. dayofweek() numbers Sunday as 1 and Saturday as 7.
# Rows whose calc_date is null are dropped as well, because the predicate
# evaluates to null for them.
calc_day_index = dayofweek("calc_date")
is_weekend = (calc_day_index == 7) | (calc_day_index == 1)
dwelltime_weekdays_df = dwelltime_df.filter(~is_weekend)

In [0]:
from pyspark.sql import functions as F

# Distinct pay periods. combined_df can hold several rows for the same
# (userid, paydate, prev_paydate) -- the aggregation below does not group by
# employerid -- and joining the dwell-time rows against each of those
# duplicates would multiply sum_timespend_work_hours and days_count.
pay_periods_df = combined_df.select("userid", "paydate", "prev_paydate").distinct()

# Match every weekday dwell-time row to the pay period(s) of the same user
# whose [prev_paydate, paydate] window (inclusive on both ends) contains
# calc_date. The range filter discards unmatched rows, so the join is
# written as the inner join it effectively is. joined_df therefore carries
# only the pay-period keys plus the dwell-time columns.
# NOTE(review): prev_paydate itself is included, so that day can be counted
# in two consecutive pay periods -- confirm this is intended.
joined_df = pay_periods_df.join(
    dwelltime_weekdays_df,
    ["userid"],
    "inner"
).filter(
    (dwelltime_weekdays_df.calc_date >= pay_periods_df.prev_paydate) &
    (dwelltime_weekdays_df.calc_date <= pay_periods_df.paydate)
)

# Per pay period: total work hours and the number of dwell-time rows matched.
# NOTE(review): days_count counts rows; it equals days only if the dwell-time
# table has one row per user per calc_date -- confirm.
agg_df = joined_df.groupBy(
    "userid", "paydate", "prev_paydate"
).agg(
    F.sum("timespend_work_hours").alias("sum_timespend_work_hours"),
    F.count("*").alias("days_count")
)

# Left join back so paychecks without any dwell-time rows are kept; they get
# null sum_timespend_work_hours and days_count.
result_df = combined_df.join(
    agg_df,
    ["userid", "paydate", "prev_paydate"],
    "left"
)

display(result_df)

In [0]:
# Average work hours per counted dwell-time row in the pay period. The value
# is null when either input column is null.
avg_daily_hours_col = F.col("sum_timespend_work_hours") / F.col("days_count")
result_df_v2 = result_df.withColumn("avg_daily_work_hours", avg_daily_hours_col)
display(result_df_v2)

In [0]:
# Split the paycheck-level rows by predicted work type.
moving_df = result_df_v2.filter('predicted_work_type="moving"')
static_df = result_df_v2.filter('predicted_work_type="static"')

In [0]:
from pyspark.sql.functions import sum as _sum, count as _count

# Mean avg_daily_work_hours over all moving rows. sum() skips nulls while
# count('*') counts every row, so rows with a null average weigh as zero.
# Both aggregates are read from a single Spark job; the original ran a second
# job for the count and discarded the one computed in the first.
moving_sum_and_count = moving_df.agg(_sum('avg_daily_work_hours'), _count('*')).collect()[0]
moving_sum_and_count[0] / moving_sum_and_count[1]



In [0]:
# Mean avg_daily_work_hours over all static rows. sum() skips nulls while
# count('*') counts every row, so rows with a null average weigh as zero.
# Both aggregates are read from a single Spark job; the original ran a second
# job for the count and discarded the one computed in the first.
static_sum_and_count = static_df.agg(_sum('avg_daily_work_hours'), _count('*')).collect()[0]
static_sum_and_count[0] / static_sum_and_count[1]

In [0]:
# Fraction of moving rows with avg_daily_work_hours below 2. Rows with a null
# average fail the predicate but still count in the denominator.
moving_under_2h = moving_df.filter('avg_daily_work_hours<2').count()
moving_under_2h / moving_df.count()


In [0]:
# Fraction of moving rows with avg_daily_work_hours above 3.
moving_over_3h = moving_df.filter('avg_daily_work_hours>3').count()
moving_over_3h / moving_df.count()

In [0]:
# Inspect the moving rows whose average exceeds 3 hours.
moving_over_3h_rows = moving_df.filter('avg_daily_work_hours>3')
display(moving_over_3h_rows)

In [0]:
# Fraction of static rows with avg_daily_work_hours below 1.
static_under_1h = static_df.filter('avg_daily_work_hours<1').count()
static_under_1h / static_df.count()

In [0]:
# Inspect the static rows whose average is under 1 hour.
static_under_1h_rows = static_df.filter('avg_daily_work_hours<1')
display(static_under_1h_rows)

In [0]:
# Spot check: all predictions for one user, oldest paydate first.
single_user_predictions = sm_df.filter("userid  = 8946239").orderBy('paydate')
display(single_user_predictions)

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col, first

# Rank each (userid, employerid)'s rows from newest to oldest paydate.
# NOTE(review): rows that share a paydate get an arbitrary relative order
# from row_number() -- confirm paydate is unique per user/employer.
window_spec = Window.partitionBy("userid", "employerid").orderBy(col("paydate").desc())
df_with_rn = result_df_v2.withColumn("rn", row_number().over(window_spec))

# Keep the two most recent paychecks (rn = 1 is the latest, rn = 2 the one before)
last_two_df = df_with_rn.filter(col("rn") <= 2).select(
    "userid", "employerid", "paydate", "avg_daily_work_hours", "predicted_work_type", "rn"
)

# One row per (userid, employerid) with the two paychecks side by side.
# With several aggregations Spark names pivot columns "<pivot value>_<alias>",
# so the result has 1_predicted_work_type, 1_paydate, 1_avg_daily_work_hours
# and the matching 2_* columns. Later cells rely on exactly these names.
# The earlier withColumnRenamed chain targeted "<alias>_<pivot value>" names
# that do not exist and was therefore a silent no-op; it has been removed.
pivot_df = last_two_df.groupBy("userid", "employerid").pivot("rn", [1, 2]).agg(
    first("predicted_work_type").alias("predicted_work_type"),
    first("paydate").alias("paydate"),
    first("avg_daily_work_hours").alias("avg_daily_work_hours")
)

display(pivot_df)

In [0]:
# Attach income-type information per user; users missing from the income table are dropped.
# NOTE(review): if the income table holds more than one row per userid this
# join duplicates pivot rows -- confirm uniqueness.
income_df = spark.read.table("main_prod.ml_data.income_type_v2")
pivot_df_with_incometype = pivot_df.join(income_df, on="userid", how="inner")

In [0]:
# Keep pairs whose latest two paychecks carry the same predicted work type.
# Pairs with only one paycheck (null 2_predicted_work_type) fail the equality
# and are dropped.
same_label_condition = "1_predicted_work_type = 2_predicted_work_type"
pivod_df_fil = pivot_df_with_incometype.filter(same_label_condition)
display(pivod_df_fil)

In [0]:
# Row counts: (all pivoted pairs, pairs kept after the income join and the same-label filter).
(pivot_df.count(), pivod_df_fil.count())

In [0]:
# Split all pivoted pairs by the label of their most recent paycheck (rn = 1).
latest_is_moving = "1_predicted_work_type = 'moving'"
latest_is_static = "1_predicted_work_type = 'static'"
moving_df_tot = pivot_df.filter(latest_is_moving)
static_df_tot = pivot_df.filter(latest_is_static)

(moving_df_tot.count(), static_df_tot.count())

In [0]:
# Share of pairs whose latest paycheck is "moving" among those labelled
# moving or static. Computed from the DataFrames so the value tracks the data.
# The original divided pasted counts (92619 / 1058699); 1058699 equals
# 92619 + 966080, which appear to be the moving and static totals from a
# previous run.
latest_moving_total = moving_df_tot.count()
latest_static_total = static_df_tot.count()
latest_moving_total / (latest_moving_total + latest_static_total)

In [0]:
# Differences between hard-coded counts pasted from an earlier run.
# NOTE(review): 92619 and 966080 look like the moving and static totals from
# moving_df_tot / static_df_tot; 45347 and 815116 are presumably the moving and
# static counts whose last two paychecks agree -- confirm, and derive them
# from the DataFrames so a re-run cannot leave these numbers stale.
(92619-45347), (966080-815116)

In [0]:
# Rebind moving_df / static_df to the pair-level frame (same label on the last
# two paychecks). These names previously held the paycheck-level splits of
# result_df_v2, which have no 1_* columns.
moving_df = pivod_df_fil.filter("1_predicted_work_type = 'moving'")
static_df = pivod_df_fil.filter("1_predicted_work_type = 'static'")

In [0]:
# Split each group by income type: ids 1 and 2 go to the *_salaried frames,
# ids 3 and 4 to the *_hourly frames.
salaried_condition = "incometypeid = 1 or incometypeid = 2"
hourly_condition = "incometypeid = 3 or incometypeid = 4"
moving_df_salaried = moving_df.filter(salaried_condition)
moving_df_hourly = moving_df.filter(hourly_condition)
static_df_salaried = static_df.filter(salaried_condition)
static_df_hourly = static_df.filter(hourly_condition)

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.functions import sum as _sum, count as _count

# Mean 1_avg_daily_work_hours (latest paycheck) over all moving pairs. sum()
# skips nulls while count('*') counts every row, so null averages weigh as
# zero. Both aggregates are read from a single Spark job instead of two.
moving_pair_stats = moving_df.agg(_sum('1_avg_daily_work_hours'), _count('*')).collect()[0]
moving_pair_stats[0] / moving_pair_stats[1]

In [0]:
# Mean 1_avg_daily_work_hours over moving salaried pairs. Null averages weigh
# as zero because the denominator is the full row count. Sum and count are
# read from one Spark job instead of two.
moving_salaried_stats = moving_df_salaried.agg(_sum('1_avg_daily_work_hours'), _count('*')).collect()[0]
moving_salaried_stats[0] / moving_salaried_stats[1]

In [0]:
# Mean 1_avg_daily_work_hours over moving hourly pairs. Null averages weigh
# as zero because the denominator is the full row count. Sum and count are
# read from one Spark job instead of two.
moving_hourly_stats = moving_df_hourly.agg(_sum('1_avg_daily_work_hours'), _count('*')).collect()[0]
moving_hourly_stats[0] / moving_hourly_stats[1]

In [0]:
# Mean 1_avg_daily_work_hours over all static pairs. Null averages weigh as
# zero because the denominator is the full row count. Sum and count are read
# from one Spark job instead of two.
static_pair_stats = static_df.agg(_sum('1_avg_daily_work_hours'), _count('*')).collect()[0]
static_pair_stats[0] / static_pair_stats[1]

In [0]:
# Mean 1_avg_daily_work_hours over static salaried pairs. Null averages weigh
# as zero because the denominator is the full row count. Sum and count are
# read from one Spark job instead of two.
static_salaried_stats = static_df_salaried.agg(_sum('1_avg_daily_work_hours'), _count('*')).collect()[0]
static_salaried_stats[0] / static_salaried_stats[1]

In [0]:
# Mean 1_avg_daily_work_hours over static hourly pairs. Null averages weigh
# as zero because the denominator is the full row count. Sum and count are
# read from one Spark job instead of two.
static_hourly_stats = static_df_hourly.agg(_sum('1_avg_daily_work_hours'), _count('*')).collect()[0]
static_hourly_stats[0] / static_hourly_stats[1]

In [0]:
# Number of rows currently bound to moving_df.
moving_pair_count = moving_df.count()
moving_pair_count

In [0]:
# Moving row counts by income split: (salaried, hourly).
(moving_df_salaried.count(), moving_df_hourly.count())

In [0]:
# Static row counts by income split: (salaried, hourly).
(static_df_salaried.count(), static_df_hourly.count())

In [0]:
# Number of rows currently bound to static_df.
static_pair_count = static_df.count()
static_pair_count

In [0]:
# Moving rows whose latest-paycheck average exceeds 3 hours: (salaried, hourly).
over_3h_condition = '1_avg_daily_work_hours>3'
(moving_df_salaried.filter(over_3h_condition).count(), moving_df_hourly.filter(over_3h_condition).count())

In [0]:
# Static rows whose latest-paycheck average is under 1 hour: (salaried, hourly).
under_1h_condition = '1_avg_daily_work_hours<1'
(static_df_salaried.filter(under_1h_condition).count(), static_df_hourly.filter(under_1h_condition).count())

In [0]:
# Fraction of static rows whose latest-paycheck average is under 1 hour.
static_pairs_under_1h = static_df.filter('1_avg_daily_work_hours<1').count()
static_pairs_under_1h / static_df.count()

In [0]:
# Spot check one user among the static rows with under 1 hour on the latest paycheck.
static_low_hours = static_df.filter('1_avg_daily_work_hours<1')
display(static_low_hours.filter('userid = 4852'))

In [0]:
# Manual review notes (the leading numbers are presumably userids):
# 21992186, good enough to be static, dwell time issue
# 20831911, can be static
# 4515091, static, dwell time issue
# 5093908, static, dwell time issue
# 18205899, static, different location, dwell time issue
# 16599394, static
# 1600742, static, dwell time issue
# 6645060, static, dwell time issue
# 17049707, difficult to say whether static or moving; the location does not seem static because of the GPS points.


