In [0]:
# Load every column of the daily earnings fact rows whose paydate is yesterday
# (current_date - 1, evaluated by Spark SQL when the query runs).
query = """
select * from main_prod.earnings_analysis.fact_user_earnings_daily where paydate = current_date - 1
"""

df = spark.sql(query)
# display(df)

In [0]:
# Keep only yesterday's rows that carry a strictly positive paycheck amount.
df_pos = df.where('total_pck_amt > 0')
# display(df_pos)

In [0]:
# df_pos.count()

In [0]:
# Unique (userid, employerid) pairs among yesterday's positive paychecks.
# Only these two columns are kept; employername is picked up later from the
# earnings history.
userid_emp = df_pos.select('userid', 'employerid').dropDuplicates()
# display(userid_emp)

In [0]:
# userid_emp.count()

In [0]:
# Earnings history: every fact row with a non-null paydate up to and including
# yesterday, restricted to the columns the rest of the notebook uses.
# NOTE(review): prev_paydate is not filtered here, so it may be NULL — confirm
# against the table's contract.
query = """
select userid, employerid, employername, paydate, prev_paydate, total_pck_amt from main_prod.earnings_analysis.fact_user_earnings_daily where paydate is not NULL and paydate<= current_date -1 
"""

facts_df = spark.sql(query)
# display(facts_df)

In [0]:
# Collapse rows that are identical across all selected columns.
facts_df = facts_df.dropDuplicates()
# display(facts_df)

In [0]:
# Restrict the earnings history to the (userid, employerid) pairs paid yesterday.
facts_df_fil = facts_df.join(
    userid_emp,
    on=['userid', 'employerid'],
    how='inner',
)
# display(facts_df_fil)

In [0]:
# facts_df_fil.count()

In [0]:
# For each (userid, employerid) pair, rank pay records from newest to oldest
# paydate and keep those ranked 1-4 that also have a positive paycheck amount.

from pyspark.sql.functions import desc, row_number
from pyspark.sql.window import Window

window = Window.partitionBy('userid', 'employerid').orderBy(desc('paydate'))
ranked_facts = facts_df_fil.withColumn('rn', row_number().over(window))
# NOTE(review): the amount filter is applied after ranking, so a pair can end
# up with fewer than four rows when one of its newest four records has a
# non-positive amount — confirm this is intended.
facts_df_fil_2 = ranked_facts.where('rn <= 4 and total_pck_amt>0')
# display(facts_df_fil_2)


In [0]:
# facts_df_pd = facts_df_fil_2.toPandas()
# facts_df_pd.head()

In [0]:
# Trajectory embeddings table; later cells read its userid, traj_date and
# embedding columns.
traj_emb_df = spark.table("main_prod.datascience_scratchpad.traj_emb")

display(traj_emb_df)

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import broadcast

# A trajectory row belongs to a pay period when it is the same user's and its
# date lies in the half-open interval [prev_paydate, paydate).
# (Switch `<` to `<=` on the paydate bound for a closed interval.)
in_pay_period = (
    (facts_df_fil_2.userid == traj_emb_df.userid)
    & (traj_emb_df.traj_date >= facts_df_fil_2.prev_paydate)
    & (traj_emb_df.traj_date < facts_df_fil_2.paydate)
)

# Left join so pay periods without any trajectory survive with NULL trajectory
# columns. Drop broadcast() if traj_emb_df is too large to ship to every executor.
joined = facts_df_fil_2.join(broadcast(traj_emb_df), on=in_pay_period, how="left")



# display(joined)

In [0]:
# Aggregate per pay period: how many distinct trajectory dates fall inside
# [prev_paydate, paydate) for the user.
#
# countDistinct is used instead of count because `joined` repeats every
# trajectory row once per matching fact row. The grouping key has no employerid,
# so a user with two employers on the same pay cycle (or duplicate fact rows for
# one period) would otherwise get an inflated count and pass the downstream
# coverage threshold too easily. Unmatched periods (NULL traj_date from the left
# join) still yield 0.
# NOTE(review): assumes traj_emb holds at most one row per user per traj_date —
# confirm against the table.
facts_df_count = (
    joined.groupBy(facts_df_fil_2.userid, facts_df_fil_2.prev_paydate, facts_df_fil_2.paydate, facts_df_fil_2.rn)
          .agg(F.countDistinct(traj_emb_df.traj_date).alias("count"))
          .select("userid", "prev_paydate", "paydate", "rn", "count")
)

# display(facts_df_count)

In [0]:
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import col

def filter_by_count(prev_date, current_date, count, min_coverage=0.6):
    """Tell whether a pay period has enough trajectory rows to be usable.

    Args:
        prev_date: start of the pay period (``datetime.date``), or None.
        current_date: end of the pay period (``datetime.date``), or None.
        count: number of trajectory rows observed inside the period, or None.
        min_coverage: minimum ratio of ``count`` to the number of days in the
            period; defaults to the original hard-coded 0.6.

    Returns:
        True when ``count`` is at least ``min_coverage`` times the number of
        days between the two dates; False otherwise, including when any input
        is missing.
    """
    # Spark hands SQL NULLs to a Python UDF as None; subtracting None would
    # raise and fail the whole job, so treat an incomplete period as unusable.
    if prev_date is None or current_date is None or count is None:
        return False
    n_days = (current_date - prev_date).days
    return count >= n_days * min_coverage
    
# Wrap the Python predicate as a boolean Spark UDF, then keep only the pay
# periods it accepts.
has_enough_trajectories = F.udf(filter_by_count, returnType=BooleanType())
df_filtered_by_count = facts_df_count.filter(
    has_enough_trajectories(col("prev_paydate"), col("paydate"), col("count"))
)
# display(df_filtered_by_count)

In [0]:
# Keep only the fact rows whose pay period survived the trajectory-count filter.
final_facts_df = facts_df_fil_2.join(
    df_filtered_by_count,
    on=["userid", "prev_paydate", "paydate", "rn"],
    how="inner",
)
# display(final_facts_df)

In [0]:
from pyspark.sql.functions import broadcast
from pyspark.sql.functions import col
# Match each trajectory to the retained pay period of the same user that
# contains its date: prev_paydate <= traj_date < paydate.
traj_in_period = (
    (traj_emb_df.userid == final_facts_df.userid)
    & (col("traj_date") >= col("prev_paydate"))
    & (col("traj_date") < col("paydate"))
)

# Inner join: trajectories outside every retained pay period are dropped
# (use "left" to keep them).
df = (
    traj_emb_df
    .join(broadcast(final_facts_df), on=traj_in_period, how="inner")
    .select(
        traj_emb_df.userid,
        final_facts_df.employerid,
        final_facts_df.employername,
        traj_emb_df.traj_date,
        traj_emb_df.embedding,
        final_facts_df.rn,
    )
)
display(df)

In [0]:
# Collect the joined trajectories to the driver as a pandas DataFrame; the
# clustering below runs locally with scikit-learn.
df_pd = df.toPandas()

In [0]:
# Day of week per trajectory via date.weekday(): 0 = Monday ... 6 = Sunday.
df_pd['weekday'] = df_pd['traj_date'].map(lambda traj_date: traj_date.weekday())


In [0]:
# Preview the first rows, including the derived weekday column.
df_pd.head()

In [0]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
from sklearn.cluster import DBSCAN 
def apply_dbscan(embs, target_min_similarity=0.9):
    """Fit DBSCAN with cosine distance on an array of embeddings.

    Args:
        embs: array whose first axis indexes samples (``embs.shape[0]`` is
            used as the sample count).
        target_min_similarity: cosine similarity two samples need to count as
            neighbours; the DBSCAN ``eps`` is ``1 - target_min_similarity``.

    Returns:
        The fitted ``DBSCAN`` estimator.
    """
    sample_count = embs.shape[0]
    # min_samples is a quarter of all samples, but never fewer than 4.
    min_samples = max(int(sample_count * 0.25), 4)
    clusterer = DBSCAN(
        eps=1.0 - target_min_similarity,  # cosine distance threshold
        min_samples=min_samples,
        metric="cosine",
        n_jobs=-1,
    )
    return clusterer.fit(embs)

def get_cluster(df):
    """Cluster the rows of ``df`` by their ``embedding`` column.

    The per-row embeddings are stacked into one array and passed to
    ``apply_dbscan`` with its default similarity threshold.

    Returns:
        Whatever ``apply_dbscan`` returns for the stacked embeddings.
    """
    return apply_dbscan(np.stack(df['embedding'].values))

def cluster_exists(db):
    """Return True when at least one entry of ``db.labels_`` is not -1.

    An empty label array yields False.
    """
    return any(label != -1 for label in db.labels_)


def dbscan_predict_all(db, X_train, X_new):
    """Label new points by their nearest clustered neighbour in ``X_train``.

    For every row of ``X_new``, the training points within ``db.eps`` (under
    ``db.metric``) are looked up. Among those whose label in ``db.labels_`` is
    not -1, the closest one gives the new point its label.

    Args:
        db: object exposing ``eps``, ``metric`` and ``labels_``.
        X_train: points the neighbour index is built on. Neighbour indices are
            used to index ``db.labels_``, so ``db.labels_[i]`` is presumably the
            label of ``X_train[i]`` — verify at call sites.
        X_new: points to label.

    Returns:
        Integer array of length ``len(X_new)``; -1 where no clustered training
        point lies within the radius.
    """
    neighbour_index = NearestNeighbors(radius=db.eps, metric=db.metric).fit(X_train)
    distances, indices = neighbour_index.radius_neighbors(X_new, return_distance=True)
    train_labels = db.labels_
    pred = np.full(len(X_new), -1, dtype=int)
    for position, (point_dists, point_idxs) in enumerate(zip(distances, indices)):
        if len(point_idxs) == 0:
            # Nothing within the radius: keep the -1 default.
            continue
        neighbour_labels = train_labels[point_idxs]
        is_clustered = neighbour_labels != -1
        if is_clustered.any():
            closest = np.argmin(point_dists[is_clustered])
            pred[position] = neighbour_labels[is_clustered][closest]
    return pred
    
def works_on_weekends_fn(db, weekday_df, weekend_df):
    """Decide whether weekend trajectories fall into the weekday clusters.

    Args:
        db: clustering passed through to ``dbscan_predict_all``.
        weekday_df: rows whose ``embedding`` values serve as training points.
        weekend_df: rows whose ``embedding`` values are labelled against them.

    Returns:
        False when either frame is empty; otherwise True when strictly more
        than 40% of the weekend rows receive a label other than -1.
    """
    if min(len(weekday_df), len(weekend_df)) == 0:
        return False
    train_embs = np.stack(weekday_df['embedding'].values)
    candidate_embs = np.stack(weekend_df['embedding'].values)
    pred = dbscan_predict_all(db, train_embs, candidate_embs)
    # int() keeps the comparison result a plain Python bool.
    assigned = int(np.count_nonzero(pred != -1))
    return assigned > 0.4*len(pred)



In [0]:
# tqdm is imported in the next cell to show a progress bar over the loop.
# NOTE(review): the version is unpinned; consider pinning it and moving the
# install to the top of the notebook.
!pip install tqdm


In [0]:
import pandas as pd
import datetime
from tqdm import tqdm

# current date, stamped on every prediction produced by this run
today = datetime.date.today()
visited_userid_employerid = set()
output_list = []


def _classify_work_pattern(latest_weekday_df, latest_weekend_df, old_weekday_df, old_weekend_df):
    """Classify one (userid, employerid) pair as "static" or "moving".

    Clustering is tried on progressively narrower data and the first attempt
    that yields a cluster wins:

    1. weekdays of all retained pay periods - weekend work is decided by
       checking the weekend trajectories against those clusters;
    2. all days of all retained pay periods - weekend work is True;
    3. weekdays of the latest pay period only - weekend work decided as in 1;
    4. all days of the latest pay period - weekend work is True.

    If no attempt finds a cluster the pair is "moving" with no weekend work.

    Returns:
        Tuple ``(work_type, works_on_weekends)``.
    """
    all_weekday_df = pd.concat([latest_weekday_df, old_weekday_df])
    all_weekend_df = pd.concat([latest_weekend_df, old_weekend_df])

    cluster = get_cluster(all_weekday_df)
    if cluster_exists(cluster):
        return "static", works_on_weekends_fn(cluster, all_weekday_df, all_weekend_df)

    cluster = get_cluster(pd.concat([latest_weekday_df, latest_weekend_df, old_weekday_df, old_weekend_df]))
    if cluster_exists(cluster):
        return "static", True

    # Fit on the latest weekdays only. The weekend check below uses
    # latest_weekday_df as the training set, so the clustering must be fitted
    # on exactly those rows. Fitting on weekdays + weekends here (as before)
    # also made the next fallback an exact repeat of this one.
    cluster = get_cluster(latest_weekday_df)
    if cluster_exists(cluster):
        return "static", works_on_weekends_fn(cluster, latest_weekday_df, latest_weekend_df)

    cluster = get_cluster(pd.concat([latest_weekday_df, latest_weekend_df]))
    if cluster_exists(cluster):
        return "static", True

    return "moving", False


# One representative row (the first occurrence) per (userid, employerid) pair,
# in order of first appearance.
pair_rows = df_pd.drop_duplicates(subset=['userid', 'employerid'])

# Row masks that do not depend on the pair are computed once, outside the loop.
is_weekday = df_pd['weekday'].isin([0, 1, 2, 3, 4])
is_weekend = df_pd['weekday'].isin([5, 6])
is_latest = df_pd['rn'] == 1  # rn == 1 marks the most recent pay period

for _, row in tqdm(pair_rows.iterrows(), total=len(pair_rows)):
    visited_userid_employerid.add((row['userid'], row['employerid']))
    in_pair = (df_pd['userid'] == row['userid']) & (df_pd['employerid'] == row['employerid'])

    latest_weekday_df = df_pd.loc[in_pair & is_weekday & is_latest]
    if len(latest_weekday_df) == 0:
        # No weekday trajectory in the latest pay period: no prediction is made.
        continue
    latest_weekend_df = df_pd.loc[in_pair & is_weekend & is_latest]
    old_weekday_df = df_pd.loc[in_pair & is_weekday & ~is_latest]
    old_weekend_df = df_pd.loc[in_pair & is_weekend & ~is_latest]

    # Both values come from a single return, so a pair can never inherit the
    # previous pair's weekend flag (the old "moving" branch assigned to a
    # misspelled variable and left works_on_weekends unset or stale).
    work_type, works_on_weekends = _classify_work_pattern(
        latest_weekday_df, latest_weekend_df, old_weekday_df, old_weekend_df
    )
    output_list.append({
        "userid": row['userid'],
        "employerid": row['employerid'],
        "employername": row['employername'],
        "predicted_work_type": work_type,
        "predicted_on": today,
        "works_on_weekends": works_on_weekends
    })

            
            



In [0]:
# Number of (userid, employerid) pairs that received a prediction.
len(output_list)

In [0]:
# Create a Spark DataFrame from output_list (one dict per predicted pair).
# NOTE(review): no schema is passed, so column types are inferred from the dict
# values; an explicit schema would make the types deterministic — confirm the
# source column types before adding one.

output_df = spark.createDataFrame(output_list)

display(output_df)

In [0]:
from pyspark.sql.types import IntegerType

# Cast both id columns to Spark's 32-bit integer type.
output_df = (
    output_df
    .withColumn("userid", col("userid").cast(IntegerType()))
    .withColumn("employerid", col("employerid").cast(IntegerType()))
)

In [0]:
# Show the predictions again after the id columns were cast to integers.
display(output_df)

In [0]:
# Sanity check: how many pairs were classified as "moving".
output_df.where('predicted_work_type = "moving"').count()

In [0]:
# userid = 23122754
# df_pd[df_pd['userid']==userid].sort_values("traj_date").reset_index(drop=True)

In [0]:
# test_df = df_pd[df_pd['userid']==userid].sort_values("traj_date").reset_index(drop=True)
# embs = np.stack(test_df['embedding'].to_list())
# target_min_similarity = 0.9
# eps = 1.0 - target_min_similarity    # cosine distance threshold
# n_embs = embs.shape[0]
# db = DBSCAN(eps=eps, min_samples=5, metric="cosine", n_jobs=-1).fit(embs)
# labels = db.labels_
# print(labels)

In [0]:
# # print cosine similarity between all
# from sklearn.metrics.pairwise import cosine_similarity
# for i in range(n_embs):
#     for j in range(i+1, n_embs):
#         print(test_df['traj_date'][i], test_df['traj_date'][j], cosine_similarity(embs[i].reshape(1,-1), embs[j].reshape(1,-1)))

In [0]:
# # # create empty table in df

# from pyspark.sql.types import *
# from pyspark.sql.functions import *

# schema = StructType([
#     StructField("userid", IntegerType(), True),
#     StructField("employerid", IntegerType(), True),
#     StructField("employername", StringType(), True),
#     StructField("predicted_work_type", StringType(), True),
#     StructField("predicted_on", DateType(), True),
#     StructField("works_on_weekends", BooleanType(), True)
# ])

# empty_df = spark.createDataFrame([], schema)

# empty_df.write.mode("overwrite").saveAsTable("main_prod.datascience_scratchpad.static_moving_worktype")

In [0]:
# output_df

In [0]:
# Append this run's predictions to the results table.
# NOTE(review): the write mode is "append", so re-running the notebook adds the
# same pairs again — confirm whether duplicates per predicted_on date are acceptable.
output_df.write.mode("append").saveAsTable("main_prod.datascience_scratchpad.static_moving_worktype")

In [0]:
# display(spark.read.table("main_prod.datascience_scratchpad.static_moving_worktype"))