In [0]:
# Load the inference configuration; later cells read table names from `cfg`
# (`traj_emb_table_name`, `went_to_work_table_name`).
from config_infer import InferenceConfig
cfg = InferenceConfig()

In [0]:
# Pay-period facts: one row per paycheck with a positive amount whose paydate
# is strictly before yesterday (current_date - 1). Exact duplicate rows are
# dropped right after the query runs.
query = """
select userid, employerid, employername, paydate, prev_paydate, total_pck_amt from main_prod.earnings_analysis.fact_user_earnings_daily where paydate < current_date - 1 and total_pck_amt > 0
"""

facts_df = spark.sql(query).distinct()

In [0]:
# Load the trajectory-embedding table named in the config and preview it.
traj_emb_df = spark.read.table(cfg.traj_emb_table_name)

display(traj_emb_df)

In [0]:
# Distinct users that have an embedding row dated yesterday (current_date - 1).
# The table name is substituted from the config; `query` is reused from the
# earlier facts cell and overwritten here.
query = """
select distinct(userid) from {} where traj_date = current_date - 1
""".format(cfg.traj_emb_table_name)

userids = spark.sql(query)
display(userids)


In [0]:
# Keep only pay-period facts for users that have an embedding dated yesterday
# (inner join on userid drops everyone else).
facts_df_fil = facts_df.join(userids, on='userid', how='inner')
# display(facts_df_fil)

In [0]:
# For each (userid, employerid) pair keep the 4 most recent pay periods,
# ranked by prev_paydate descending. `rn` = 1 is the most recent period.

from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
from pyspark.sql.functions import desc

window = Window.partitionBy('userid','employerid').orderBy(desc('prev_paydate'))
facts_df_fil_2 = (
    facts_df_fil
    .withColumn('rn', row_number().over(window))
    .where('rn <= 4')
)
# display(facts_df_fil_2)

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import broadcast

# Attach to every retained pay period the embedding rows of the same user whose
# traj_date falls in the half-open interval [prev_paydate, paydate). The left
# join keeps pay periods that have no embedding at all (they end up with 0).
joined = (
    facts_df_fil_2.join(
        broadcast(traj_emb_df),  # remove broadcast() if traj_emb_df is large
        (facts_df_fil_2.userid == traj_emb_df.userid)
        & (traj_emb_df.traj_date >= facts_df_fil_2.prev_paydate)
        & (traj_emb_df.traj_date < facts_df_fil_2.paydate),  # use <= for a closed interval
        "left"
    )
)

# Number of distinct days with an embedding per (userid, prev_paydate, paydate, rn).
# countDistinct (not count) is deliberate: the grouping key has no employerid, so
# two employers sharing a pay period -- or several embedding rows for the same
# user/day -- would otherwise be counted more than once, while the downstream
# coverage check compares this value against the length of the period in days.
# Null traj_date values produced by the left join are ignored, giving 0.
facts_df_count = (
    joined.groupBy(facts_df_fil_2.userid, facts_df_fil_2.prev_paydate, facts_df_fil_2.paydate, facts_df_fil_2.rn)
          .agg(F.countDistinct(traj_emb_df.traj_date).alias("count"))
          .select("userid", "prev_paydate", "paydate", "rn", "count")
)

In [0]:
# display(facts_df_count)

In [0]:
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import col

def filter_by_count(prev_date, current_date, count):
    """Tell whether a pay period has enough days with data.

    Args:
        prev_date: Start of the period (the call site passes ``prev_paydate``).
        current_date: End of the period (the call site passes ``paydate``).
        count: Number of days with data observed inside the period.

    Returns:
        True when ``count`` is at least 60% of the number of days between
        ``prev_date`` and ``current_date``; False otherwise, including when
        any input is missing.
    """
    # Spark hands SQL NULLs to Python UDFs as None; subtracting or comparing
    # None would raise TypeError and fail the whole job, so treat a period
    # with missing bounds or count as not covered.
    if prev_date is None or current_date is None or count is None:
        return False
    n_days = (current_date - prev_date).days
    return count >= n_days * 0.6
    
# Wrap the Python predicate as a boolean Spark UDF, then keep only the pay
# periods for which it returns True.
_count_filter_udf = F.udf(filter_by_count, returnType=BooleanType())
df_filtered_by_count = facts_df_count.filter(
    _count_filter_udf(col("prev_paydate"), col("paydate"), col("count"))
)
# display(df_filtered_by_count)

In [0]:
# Restrict the per-employer pay-period rows to the periods that passed the
# coverage filter (default inner join on the four shared key columns); this
# also carries the `count` column along.
final_facts_df = facts_df_fil_2.join(df_filtered_by_count, ["userid", "prev_paydate", "paydate", "rn"])
# display(final_facts_df)


In [0]:
from pyspark.sql.functions import broadcast
from pyspark.sql.functions import col
# Historical embeddings: every embedding row of a user whose traj_date lies in
# one of that user's retained pay periods, i.e. in [prev_paydate, paydate).
# Each output row carries the employer of the matching period and its rank `rn`.
df = (
    traj_emb_df.join(
        broadcast(final_facts_df),
        (traj_emb_df.userid == final_facts_df.userid)
        & (col("traj_date") >= col("prev_paydate"))
        & (col("traj_date") < col("paydate")),
        "inner"       # use "left" if you want to keep df2 rows without a matching interval
    )
    .select(traj_emb_df.userid, final_facts_df.employerid, final_facts_df.employername, traj_emb_df.traj_date, traj_emb_df.embedding, final_facts_df.rn)
)
# display(df)

In [0]:
# df.count()

In [0]:
# Collect the historical embeddings to the driver as a pandas DataFrame for
# the per-user clustering done in later cells.
df_pd = df.toPandas()

In [0]:
# Preview the first rows of the collected history.
df_pd.head()

In [0]:
# Embedding rows dated yesterday (current_date - 1): the trajectories to score.
current_traj_df = traj_emb_df.where('traj_date  = current_date - 1')
# display(current_traj_df)

In [0]:
# Collect yesterday's embeddings to the driver as pandas and preview them.
current_traj_df_pd = current_traj_df.toPandas()
current_traj_df_pd.head()

In [0]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
from sklearn.cluster import DBSCAN 
def apply_dbscan(embs, target_min_similarity=0.9):
    """Fit DBSCAN with cosine distance on a 2-D array of embeddings.

    Args:
        embs: Array of embeddings; ``embs.shape[0]`` is taken as the sample count.
        target_min_similarity: Cosine similarity threshold; the DBSCAN ``eps``
            (a cosine distance) is ``1.0 - target_min_similarity``.

    Returns:
        The fitted ``DBSCAN`` estimator.
    """
    # A core point needs a quarter of all samples as neighbours, but never fewer than 4.
    min_cluster_size = max(int(embs.shape[0] * 0.25), 4)
    clusterer = DBSCAN(
        eps=1.0 - target_min_similarity,
        min_samples=min_cluster_size,
        metric="cosine",
        n_jobs=-1,
    )
    return clusterer.fit(embs)

def get_cluster(df):
    """Stack the ``embedding`` column of ``df`` into one array and fit DBSCAN on it.

    Returns the fitted estimator produced by ``apply_dbscan`` with its default
    similarity threshold.
    """
    return apply_dbscan(np.stack(df['embedding'].values))

def cluster_exists(db):
    """Return True when at least one entry of ``db.labels_`` differs from -1.

    An empty ``labels_`` sequence yields False.
    """
    return any(label != -1 for label in db.labels_)


def dbscan_predict_all(db, X_train, X_new):
    """Check whether the first row of ``X_new`` lands next to a clustered training point.

    Each new point is given the label of its closest training neighbour that is
    within ``db.eps`` (using ``db.metric``) and whose label is not -1; points
    without such a neighbour keep -1.

    Args:
        db: Fitted clustering object exposing ``eps``, ``metric`` and ``labels_``.
        X_train: Training points; ``db.labels_`` is indexed by neighbour position
            in this array, so it is presumably the data ``db`` was fitted on.
        X_new: Points to classify. Only the result for row 0 is reported.

    Returns:
        0 if the first new point received label -1, otherwise 1.
    """
    neighbor_index = NearestNeighbors(radius=db.eps, metric=db.metric).fit(X_train)
    distances, indices = neighbor_index.radius_neighbors(X_new, return_distance=True)
    train_labels = db.labels_
    pred = np.full(len(X_new), -1, dtype=int)
    for pos, (row_dists, row_idxs) in enumerate(zip(distances, indices)):
        if len(row_idxs) > 0:
            neighbor_labels = train_labels[row_idxs]
            clustered = neighbor_labels != -1
            if clustered.any():
                # Among clustered neighbours, adopt the label of the nearest one.
                nearest = np.argmin(row_dists[clustered])
                pred[pos] = neighbor_labels[clustered][nearest]
    return 0 if pred[0] == -1 else 1
    


In [0]:
# Install tqdm through the shell; a later cell imports it for the progress bar.
!pip install tqdm

In [0]:
from datetime import datetime
# One timestamp for the whole run; it is written as `predicted_on` on every
# output record. datetime.now() is called without a timezone argument.
current_time = datetime.now()
current_time

In [0]:
# Add the day-of-week of each historical trajectory via .weekday(); the scoring
# loop keeps rows with weekday < 5 (Monday-Friday for date/datetime values).
df_pd['weekday'] = df_pd['traj_date'].apply(lambda x: x.weekday())

In [0]:
import mlflow

# Disable MLflow autologging for all supported libraries.
mlflow.autolog(disable=True)

# Additionally disable the sklearn flavor explicitly (both calls are executed).
# Presumably this keeps the many per-user DBSCAN fits from being logged -- confirm.
mlflow.sklearn.autolog(disable=True)

In [0]:
from tqdm import tqdm

# Score yesterday's trajectory of every user against that user's weekday history.
# `went_to_work` codes produced below:
#   1 -> the embedding lies within eps of a clustered historical point
#   0 -> a cluster exists but the embedding is not close to it
#   2 -> no cluster could be formed from the user's weekday history

# Split the history by user once instead of re-scanning the whole frame for
# every scored row; row order inside each group is preserved by groupby.
history_by_user = {uid: grp for uid, grp in df_pd.groupby('userid', sort=False)}

output_list = []
# iterrows() walks rows positionally, so this no longer relies on the index
# labels happening to equal row positions (as `.iloc[label]` did).
for _, row in tqdm(current_traj_df_pd.iterrows(), total=len(current_traj_df_pd)):
    user_df = history_by_user.get(row.userid)
    if user_df is None:
        # No qualifying pay-period history for this user: nothing to compare with.
        continue

    weekday_df = user_df[user_df.weekday < 5]
    query_emb = np.expand_dims(row.embedding, axis=0)

    pred = 2
    # Guard: a user whose history has no weekday rows would make np.stack raise
    # on an empty sequence inside get_cluster and abort the whole run.
    if len(weekday_df) > 0:
        db = get_cluster(weekday_df)
        if cluster_exists(db):
            pred = dbscan_predict_all(db, np.stack(weekday_df.embedding.values), query_emb)
        else:
            # Fall back to the most recent retained pay period only (smallest rn).
            latest_weekday_df = weekday_df[weekday_df.rn == weekday_df.rn.min()]
            db = get_cluster(latest_weekday_df)
            if cluster_exists(db):
                pred = dbscan_predict_all(db, np.stack(latest_weekday_df.embedding.values), query_emb)

    output_list.append({
        "userid": int(row.userid),
        # The employer is taken from the first history row of the user.
        "employerid": int(user_df.employerid.iloc[0]),
        "employername": user_df.employername.iloc[0],
        "traj_date": row.traj_date,
        "predicted_on": current_time,
        "went_to_work": pred,
        "model_version": row.model_version
    })




In [0]:
# # # # create empty table in df

# from pyspark.sql.types import *
# from pyspark.sql.functions import *

# schema = StructType([
#     StructField("userid", IntegerType(), True),
#     StructField("employerid", IntegerType(), True),
#     StructField("employername", StringType(), True),
#     StructField("traj_date", DateType(), True),
#     StructField("predicted_on", TimestampType(), True),
#     StructField("went_to_work", IntegerType(), True),
#     StructField("model_version", StringType(), True),
# ])

# empty_df = spark.createDataFrame([], schema)

# empty_df.write.mode("overwrite").saveAsTable("main_prod.datascience_scratchpad.went_to_work_traj")

In [0]:
# Build a Spark DataFrame from the list of per-user prediction dicts. No schema
# is passed, so column types are inferred from the values.
# NOTE(review): with an empty output_list there is nothing to infer a schema
# from and this call presumably fails -- confirm and guard if that can happen.

output_df = spark.createDataFrame(output_list)

output_df.count()

In [0]:
# Show the DataFrame repr (column names and inferred types).
output_df

In [0]:
from pyspark.sql.types import IntegerType

# Cast the id and prediction columns to 32-bit integers, one column at a time.
for int_column in ("userid", "employerid", "went_to_work"):
    output_df = output_df.withColumn(int_column, col(int_column).cast(IntegerType()))

In [0]:
# Append the predictions to the results table named in the config.
# NOTE(review): append mode adds rows on every execution, so re-running this
# cell for the same day presumably duplicates records -- confirm this is intended.
output_df.write.mode("append").saveAsTable(cfg.went_to_work_table_name)