In [0]:
import sys
sys.path.append("..")
sys.path.append("../model")
from trajcl import TrajCL
from config import Config

In [0]:
conf = Config()

conf.dataset = 'nyc'
conf.post_value_updates()

In [0]:
model = TrajCL()

In [0]:
model

In [0]:
import pandas as pd
test_df = pd.read_parquet("/home/sagemaker-user/TrajCL/data/parquet_files/test/nyc_df_v3_with_time/traj_test_df_v3_with_ts.parquet")

In [0]:
userids = test_df['userid'].unique()

In [0]:
import torch
# Use the first GPU when one is visible; otherwise fall back to CPU so the notebook still runs.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
checkpoint_file = "/home/sagemaker-user/TrajCL/exp/v2.1/nyc_TrajCL_best.pt"
# map_location remaps the stored tensors onto `device`, so a checkpoint written on a
# different device (e.g. another GPU index) can still be restored here.
checkpoint = torch.load(checkpoint_file, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)

In [0]:
from utils.traj import *
import pickle

from torch.nn.utils.rnn import pad_sequence
# NOTE(security): pickle.load can execute arbitrary code; only load artefacts produced by this project.
# Context managers make sure the file handles are closed once the objects are deserialised.
with open("/home/sagemaker-user/TrajCL/data/nyc_cell250_embdim256_embs.pkl", 'rb') as embs_file:
    embs = pickle.load(embs_file).to('cpu').detach() # tensor
with open("/home/sagemaker-user/TrajCL/data/nyc_cell250_cellspace.pkl", 'rb') as cellspace_file:
    cellspace = pickle.load(cellspace_file)
# Upper bound on the number of trajectories sent through the model in one forward pass.
max_batch_size = 512

def infer_batch(traj, time_indices):
    """Embed one batch of trajectories with the loaded model.

    Args:
        traj: iterable of trajectories; each one is passed to ``merc2cell2`` together
            with the global ``cellspace``.
        time_indices: iterable, parallel to ``traj``, of per-point integer time indices.

    Returns:
        Whatever ``model.interpret`` returns for the batch (presumably one embedding
        row per trajectory -- ``infer`` concatenates these along dim 0).
    """
    # merc2cell2 yields a (cell-id sequence, point sequence) pair per trajectory.
    traj_cell, traj_p = zip(*[merc2cell2(t, cellspace) for t in traj])
    traj_emb_p = [torch.tensor(generate_spatial_features(t, cellspace)) for t in traj_p]
    # batch_first=False -> padded tensors are laid out as (max_len, batch, ...).
    traj_emb_p = pad_sequence(traj_emb_p, batch_first = False).to(device)
    # Look up the pre-trained cell embedding for every cell id of every trajectory.
    traj_emb_cell = [embs[list(t)] for t in traj_cell]
    traj_emb_cell = pad_sequence(traj_emb_cell, batch_first = False).to(device)
    traj_len = torch.tensor(list(map(len, traj_cell)), dtype = torch.long, device = device)
    # Keep this tensor on the same `device` as the model and the other inputs
    # (the previous code used Config.device, which is not guaranteed to match).
    # Padding positions are marked with -1.
    time_indices = pad_sequence(
        [torch.tensor(t, dtype=torch.long) for t in time_indices],
        batch_first=False,
        padding_value=-1,
    ).to(device)
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        traj_embs = model.interpret(traj_emb_cell.float(), traj_emb_p.float(), traj_len, time_indices)
    return traj_embs

def infer(traj, time_indices):
    """Embed trajectories, splitting the work into chunks of ``max_batch_size``.

    Inputs that fit in a single batch are forwarded to ``infer_batch`` directly;
    larger inputs are processed chunk by chunk and the per-chunk results are
    concatenated along dimension 0.
    """
    total = len(traj)
    if total <= max_batch_size:
        return infer_batch(traj, time_indices)

    chunk_embs = [
        infer_batch(
            traj[start:start + max_batch_size],
            time_indices[start:start + max_batch_size],
        )
        for start in range(0, total, max_batch_size)
    ]
    return torch.cat(chunk_embs, dim=0)



In [0]:
from sklearn.metrics.pairwise import cosine_similarity


In [0]:
test_df.head()

In [0]:
test_df['time_index_list'].values[0]

In [0]:
from tqdm import tqdm
model.eval()
gt_list = []
pred_list = []
def get_gt_and_pred_label(userid, top_k=3, sim_threshold=0.85):
    """Return ``(gt_label, pred_label)`` for one user via nearest-neighbour matching.

    The user's rows in ``test_df`` are split by ``train_test_tag``. The ground-truth
    label is 1 when the summed ``paycheck_amount`` of the user's test rows is positive.
    The predicted label is 1 when, for any test trajectory, one of its ``top_k`` most
    cosine-similar train trajectories has similarity above ``sim_threshold`` and a
    positive ``paycheck_amount``.

    Args:
        userid: value matched against ``test_df['userid']``.
        top_k: number of nearest train trajectories inspected per test trajectory.
        sim_threshold: minimum cosine similarity for a neighbour to count as a match.

    Returns:
        Tuple ``(gt_label, pred_label)`` of ints in {0, 1}.
    """
    # Local import: the notebook otherwise only gets `np` implicitly (if at all)
    # through `from utils.traj import *`.
    import numpy as np

    user_data = test_df[test_df['userid'] == userid].reset_index(drop=True)
    train_data = user_data[user_data['train_test_tag'] == 'train'].reset_index(drop=True)
    test_data = user_data[user_data['train_test_tag'] == 'test'].reset_index(drop=True)

    gt_label = 1 if sum(test_data['paycheck_amount'].values) > 0 else 0

    # With no train or no test trajectories there is nothing to compare (and the
    # embedding helpers cannot handle an empty batch), so predict the negative class.
    if len(train_data) == 0 or len(test_data) == 0:
        return gt_label, 0

    train_embs = infer(
        train_data['merc_seq_filtered'].values, train_data['time_index_list'].values
    ).detach().cpu().numpy()
    test_embs = infer(
        test_data['merc_seq_filtered'].values, test_data['time_index_list'].values
    ).detach().cpu().numpy()
    train_paycheck = train_data['paycheck_amount'].values  # hoisted out of the loops

    # One call gives the full (n_test, n_train) similarity matrix.
    similarity_matrix = cosine_similarity(test_embs, train_embs)
    for similarity in similarity_matrix:
        # Indices of the top_k most similar train trajectories, best first.
        top_indices = np.argsort(similarity)[-top_k:][::-1]
        for idx in top_indices:
            if similarity[idx] > sim_threshold and train_paycheck[idx] > 0:
                # A single match decides the user; no need to scan further.
                return gt_label, 1
    return gt_label, 0

# Score every user and record ground-truth / predicted labels in parallel lists.
for userid in tqdm(userids):
    user_labels = get_gt_and_pred_label(userid)
    gt_list.append(user_labels[0])
    pred_list.append(user_labels[1])
    # break
        


In [0]:
# Fraction of users labelled 1 (positive) in the ground truth and in the predictions
sum(gt_list) / len(gt_list), sum(pred_list) / len(pred_list)

In [0]:
from sklearn.metrics import accuracy_score

# Calculate accuracy
accuracy = accuracy_score(gt_list, pred_list)
print("Accuracy:", accuracy)

In [0]:
from sklearn.metrics import confusion_matrix

# Calculate confusion matrix.
# Pinning labels=[0, 1] guarantees a 2x2 matrix even when only one class occurs in
# gt_list/pred_list; without it the 4-way unpacking below would raise.
cm = confusion_matrix(gt_list, pred_list, labels=[0, 1])
# With labels=[0, 1] the matrix is laid out as:
# [[TN, FP],
#  [FN, TP]]

tn, fp, fn, tp = cm.ravel()
print("True Positives:", tp)
print("False Positives:", fp)
print("True Negatives:", tn)
print("False Negatives:", fn)

In [0]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Calculate precision and recall
precision = precision_score(gt_list, pred_list)
recall = recall_score(gt_list, pred_list)
f1 = f1_score(gt_list, pred_list)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

In [0]:
def get_gt_and_pred_label(userid, top_k=10, sim_threshold=0.85):
    """Verbose (debugging) variant: return ``(gt_label, pred_label)`` for one user.

    NOTE: this redefinition shadows the earlier, silent function of the same name.

    For every test trajectory of the user it prints the ``top_k`` most and least
    cosine-similar train trajectories with their similarities and paycheck amounts.
    The predicted label is 1 when any of the ``top_k`` most similar train
    trajectories has similarity above ``sim_threshold`` and a positive
    ``paycheck_amount``. The ground-truth label is 1 when the summed
    ``paycheck_amount`` of the user's test rows is positive.

    Args:
        userid: value matched against ``test_df['userid']``.
        top_k: number of nearest / farthest train trajectories to report and inspect.
        sim_threshold: minimum cosine similarity for a neighbour to count as a match.

    Returns:
        Tuple ``(gt_label, pred_label)`` of ints in {0, 1}.
    """
    # Local import: the notebook otherwise only gets `np` implicitly (if at all)
    # through `from utils.traj import *`.
    import numpy as np

    user_data = test_df[test_df['userid'] == userid].reset_index(drop=True)
    train_data = user_data[user_data['train_test_tag'] == 'train'].reset_index(drop=True)
    test_data = user_data[user_data['train_test_tag'] == 'test'].reset_index(drop=True)

    gt_label = 1 if sum(test_data['paycheck_amount'].values) > 0 else 0
    pred_label = 0

    # Nothing to compare (and the embedding helpers cannot handle an empty batch).
    if len(train_data) == 0 or len(test_data) == 0:
        return gt_label, pred_label

    train_embs = infer(
        train_data['merc_seq_filtered'].values, train_data['time_index_list'].values
    ).detach().cpu().numpy()
    test_embs = infer(
        test_data['merc_seq_filtered'].values, test_data['time_index_list'].values
    ).detach().cpu()

    train_traj_ids = train_data['traj_id'].values
    train_paycheck = train_data['paycheck_amount'].values
    test_traj_ids = test_data['traj_id'].values

    for i in range(len(test_embs)):
        test_vector = test_embs[i].unsqueeze(0)
        similarity = cosine_similarity(test_vector.numpy(), train_embs)[0]
        order = np.argsort(similarity)  # ascending; computed once, sliced twice
        top_indices = order[-top_k:][::-1]
        bottom_indices = order[:top_k]
        top_similarity = similarity[top_indices]
        bottom_similarity = similarity[bottom_indices]
        print(f"User: {userid}, Test Trajectory {test_traj_ids[i]}, Top {top_k} Train Trajectories: {train_traj_ids[top_indices]}, similarity: {top_similarity}, PCK Amount: {train_paycheck[top_indices]}")
        print(f"Bottom {top_k} Train Trajectories: {train_traj_ids[bottom_indices]}, similarity: {bottom_similarity}, PCK Amount: {train_paycheck[bottom_indices]}")
        # Pair each top index with ITS OWN similarity. The previous code zipped the
        # full `similarity` array with the top indices, so the threshold was tested
        # against scores of unrelated train trajectories.
        for sim, idx in zip(top_similarity, top_indices):
            if sim > sim_threshold and train_paycheck[idx] > 0:
                pred_label = 1
                break
    return gt_label, pred_label

In [0]:
correct_indices = [i for i, (gt, pred) in enumerate(zip(gt_list, pred_list)) if gt == pred]
print("Correct Indices:", correct_indices)

In [0]:
fn_indices = [i for i, (gt, pred) in enumerate(zip(gt_list, pred_list)) if gt == 1 and pred == 0]
print("False Negative Indices:", fn_indices)

In [0]:
fp_indices = [i for i, (gt, pred) in enumerate(zip(gt_list, pred_list)) if gt == 0 and pred == 1]
print("False Positive Indices:", fp_indices)

In [0]:
idx = 37
userid = userids[idx]
get_gt_and_pred_label(userid)

In [0]:
'''
CCC problematic user
14763364
'''

In [0]:
len(test_df)

In [0]:
# Number of distinct users having at least one test row with label == 1.
# Both conditions are combined into one mask; chaining `df[mask_a][mask_b]` applies a
# mask built on the full frame to an already-filtered frame (pandas warns and reindexes).
len(test_df[(test_df['train_test_tag'] == 'test') & (test_df['label'] == 1)]['userid'].unique())

In [0]:
790/1051

In [0]:
test_df[test_df['userid']==8320675].sort_values('traj_date',ascending=False).head(50)

In [0]:
def similarity_of_trajids(trajid_1, trajid_2):
    """Cosine similarity between the model embeddings of two trajectories.

    Each trajectory is looked up in ``test_df`` by ``traj_id`` (first matching row),
    embedded on its own via ``infer``, and the two embeddings are compared.
    """
    rows_a = test_df[test_df['traj_id'] == trajid_1]
    rows_b = test_df[test_df['traj_id'] == trajid_2]

    seq_a = rows_a['merc_seq_filtered'].values[0]
    seq_b = rows_b['merc_seq_filtered'].values[0]
    times_a = rows_a['time_index_list'].values[0]
    times_b = rows_b['time_index_list'].values[0]

    # Embed each trajectory as a batch of one.
    vec_a = infer([seq_a], [times_a]).detach().cpu().numpy()
    vec_b = infer([seq_b], [times_b]).detach().cpu().numpy()

    pairwise = cosine_similarity(vec_a, vec_b)
    return pairwise[0][0]

In [0]:
similarity_of_trajids("8320675_2025-03-15", "8320675_2025-06-12")

In [0]:
# Number of test trajectories for every false-negative user.
# The test-row mask is built once and AND-ed with the per-user mask; the previous
# `df[mask_a][mask_b]` chaining applied a full-frame mask to a filtered frame.
is_test_row = test_df['train_test_tag'] == 'test'
num_test_trajs_list = []
for idx in fn_indices:
    userid = userids[idx]
    num_test_trajs = int((is_test_row & (test_df["userid"] == userid)).sum())
    num_test_trajs_list.append(num_test_trajs)


In [0]:
# Share of false-negative users that have at least 3 test trajectories
# (no histogram is drawn here; only the counts and the ratio are reported).
count = sum(1 for traj_count in num_test_trajs_list if traj_count >= 3)
total = len(num_test_trajs_list)

# Guard the ratio so the cell still evaluates when there are no false negatives.
total, count, (count / total if total else 0.0)

In [0]:
%sql
select * from main_prod.ml_features.earnings_dwell_time_features

In [0]:
df = spark.read.table('main_prod.ml_features.earnings_dwell_time_features')

display(df)

In [0]:
df

In [0]:
query = """
select count(*) from main_prod.ml_features.earnings_dwell_time_features where predtime = '2025-10-17'
"""
display(spark.sql(query))


In [0]:
%sql
select * from main_prod.ml_data.static_moving_worktype

In [0]:
df = spark.read.table('main_prod.ml_data.static_moving_worktype')

In [0]:
# get distinct userids from 3 tables main_prod.datascience_scratchpad.earnings_if_v6_dev_train, main_prod.datascience_scratchpad.earnings_if_v6_dev_val, main_prod.datascience_scratchpad.earnings_if_v6_dev_test

# userids = spark.read.table('main_prod.datascience_scratchpad.earnings_if_v6_dev_train').select('userid').distinct().collect()
# userids += spark.read.table('main_prod.datascience_scratchpad.earnings_if_v6_dev_val').select('userid').distinct().collect()
# userids += spark.read.table('main_prod.datascience_scratchpad.earnings_if_v6_dev_test').select('userid').distinct().collect()
# userids = list(set([x.userid for x in userids]))

userids = spark.read.table('main_prod.earnings_analysis.fact_user_earnings_daily').select('userid').distinct().collect()
userids = list(set([x.userid for x in userids]))



In [0]:
len(userids)

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window
from datetime import datetime
start_date_str = "2024-03-01"
end_date_str = "2025-10-01"

date_df = spark.range(1).select(
    F.explode(
        F.sequence(F.lit(start_date_str).cast("date"), F.lit(end_date_str).cast("date"), F.expr("interval 1 day"))
    ).alias("paydate")
)

display(date_df)


In [0]:
# `userids` is a flat list of scalars; createDataFrame needs row-like records to
# infer a schema, so wrap every id in a 1-tuple.
user_df = spark.createDataFrame([(uid,) for uid in userids], ["userid"])
display(user_df)

In [0]:
full_calendar = user_df.crossJoin(date_df)
display(full_calendar)


In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window
from datetime import datetime

# min_date = F.lit('2024-07-01')
# max_date = F.lit('2025-07-01')
# # df: userid (string), date (date), feat (numeric)

# 1) Build full date range per user
date_bounds = df.groupBy("userid").agg(
    F.min("paydate").alias("min_date"),
    F.max("paydate").alias("max_date")
)

full_calendar_present = date_bounds.select(
    "userid",
    F.explode(
        F.sequence(F.col("min_date"), F.col("max_date"), F.expr("interval 1 day"))
    ).alias("paydate")
)

# 2) Left join to bring existing feats
full_df = full_calendar_present.join(df, ["userid", "paydate"], "left")

display(full_df)

In [0]:

# 3) Carry forward last known past value (LOCF)
w = (Window
     .partitionBy("userid")
     .orderBy("paydate")
     .rowsBetween(Window.unboundedPreceding, 0))

filled_df_present = full_df.withColumn(
    "predicted_work_type_filled",
    F.last("predicted_work_type", ignorenulls=True).over(w)
).select("userid", "paydate", F.col("predicted_work_type_filled").alias("predicted_work_type"))

display(filled_df_present)

# 'filled_df_present' now has every date between each user's first and last observed paydate;
# for any gap, predicted_work_type is the last available past value (or null if none yet).

In [0]:
display(filled_df_present.where('userid = 118'))

In [0]:
filled_df = full_calendar.join(filled_df_present, ["userid", "paydate"], "left")
display(filled_df)

In [0]:
# drop table main_prod.ml_features.static_moving_features (if it exists) before it is rewritten below
spark.sql("DROP TABLE IF EXISTS main_prod.ml_features.static_moving_features")

In [0]:
# Encode predicted_work_type as is_static: "static" -> 1, "moving" -> 0, anything else -> null.
from pyspark.sql import functions as F

filled_df = filled_df.withColumn(
    "is_static",
    F.when(
        F.col("predicted_work_type") == "static", 1
    ).when(
        F.col("predicted_work_type") == "moving", 0
    ).otherwise(None)
).drop("predicted_work_type").withColumnRenamed("paydate", "predtime")

# Cast this frame's OWN columns via F.col. The previous code used df["userid"] /
# df["predtime"], i.e. columns of a different DataFrame (which has no `predtime`).
filled_df = filled_df.withColumn("userid", F.col("userid").cast("int"))
filled_df = filled_df.withColumn("predtime", F.col("predtime").cast("string"))
display(filled_df.where('userid = 118'))

In [0]:
filled_df.write.mode("overwrite").saveAsTable("main_prod.ml_features.static_moving_features")

In [0]:
df = spark.read.table("main_prod.ml_features.static_moving_features")
display(df)

In [0]:
df = df.withColumn("userid", df["userid"].cast("int"))
df = df.withColumn("predtime", df["predtime"].cast("string"))

In [0]:
%sql
ALTER TABLE main_prod.ml_features.static_moving_features ALTER COLUMN userid TYPE INT

In [0]:
%sql
ALTER TABLE main_prod.ml_features.static_moving_features ALTER COLUMN userid SET NOT NULL;
ALTER TABLE main_prod.ml_features.static_moving_features ALTER COLUMN predtime SET NOT NULL;
ALTER TABLE main_prod.ml_features.static_moving_features ADD CONSTRAINT static_moving_features_pk PRIMARY KEY( userid, predtime );



In [0]:
%sql
describe extended main_prod.ml_features.static_moving_features

In [0]:
%sql
SHOW TBLPROPERTIES main_prod.ml_features.static_moving_features('primary_key');


In [0]:
# cast predtime col to string
df = df.withColumn("predtime", df["predtime"].cast("string"))
df

In [0]:
df

In [0]:
df = df.withColumnRenamed("preddate", 'predtime')
display(df)

In [0]:
static_moving_df = spark.read.table('main_prod.ml_features.static_moving_features')
# FIXME: the statement below was cut off mid-string (unterminated literal -> SyntaxError),
# so it is disabled until the full table name is restored.
# facts_table = spark.read.table("main_prod.earnings_analytics.facts_us

In [0]:
%sql
select * from main_prod.datascience_scratchpad.earnings_if_v6_dev_train

In [0]:
query = """
select df2.userid, df2.prediction_day, df1.predicted_work_type from main_prod.datascience_scratchpad.static_moving_features df1
right JOIN main_prod.datascience_scratchpad.earnings_if_v6_dev_train df2
on df1.userid = df2.userid and df1.paydate = df2.prediction_day where df2.userid = 118
"""
display(spark.sql(query))

In [0]:
filled_df = spark.read.table("main_prod.datascience_scratchpad.static_moving_features")

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window

# filled_df: userid, date (date), feat (string/categorical)

# Rolling window per user over the current row and the 29 preceding rows (past only).
# NOTE: this is a ROW-based frame; it equals 30 calendar days only if filled_df has
# exactly one row per user per day -- TODO confirm for the table read above.
w30 = (Window
       .partitionBy("userid")
       .orderBy(F.col("paydate"))
       .rowsBetween(-29, 0))

out = (
    filled_df
    # Collect the categorical values inside the window
    .withColumn("win_vals_raw", F.collect_list("predicted_work_type").over(w30))
    # Filter out nulls in the collected window
    .withColumn("win_vals", F.expr("filter(win_vals_raw, x -> x is not null)"))
    # Unique categorical values present in the window
    .withColumn("keys", F.array_distinct("win_vals"))
    # Per value: occurrence count and (for tie-breaking) index of its last occurrence.
    # Counting with filter()/size() replaces the former aggregate()+map_concat()
    # frequency map: map_concat raises on a repeated key under Spark 3's default
    # spark.sql.mapKeyDedupPolicy=EXCEPTION, i.e. as soon as a value occurs twice.
    .withColumn(
        "rank_arr",
        F.expr("""
          transform(
            keys,
            k -> struct(
              size(filter(win_vals, v -> v = k)) AS cnt,
              size(win_vals) - array_position(reverse(win_vals), k) AS last_idx,
              k AS k
            )
          )
        """)
    )
    # Sort by count desc, then most recent occurrence desc; pick the top element's key.
    # The CASE guard yields null for windows without any non-null value instead of
    # indexing into an empty array (an error when ANSI mode is enabled).
    .withColumn(
        "predicted_work_type_mode_30d",
        F.expr("""
          CASE WHEN size(rank_arr) > 0 THEN
            element_at(
              transform(
                array_sort(
                  transform(rank_arr, x -> struct(-x.cnt AS a, -x.last_idx AS b, x.k AS k))
                ),
                y -> y.k
              ),
              1
            )
          END
        """)
    )
    .select("userid", "paydate", "predicted_work_type", "predicted_work_type_mode_30d")
)

In [0]:
display(out)

In [0]:
df = spark.read.format('parquet').load("s3://ml-datasets-datalakeprod-us-west-2-sagemaker/earnings_if_v6_dev/12/train")
display(df)

In [0]:
df.count()

In [0]:
df2 = df.where('is_static is not null')
display(df2)

In [0]:
df2.write.mode('overwrite').parquet("s3://ml-datasets-datalakeprod-us-west-2-sagemaker/earnings_if_v6_dev/3/train_static")

In [0]:
df.where('dwell_time_pct_change_7d_30d is null').count()/df.count()

In [0]:
df.createOrReplaceTempView("earnings_if_v6_dev_train")


In [0]:
query = """
select userid, predtime, is_static from earnings_if_v6_dev_train
"""
display(spark.sql(query))

In [0]:
%sql
select * from main_prod.ml_data.static_moving_worktype where userid = 7847

In [0]:
static_moving_df = spark.read.table("main_prod.ml_data.static_moving_worktype")

static_moving_df_userids = static_moving_df.select("userid").distinct()
train_userids = df.select("userid").distinct()

intersection = train_userids.join(static_moving_df_userids, "userid", "inner").count()
print(intersection)

In [0]:
static_moving_df_userids.count(), train_userids.count(), intersection

In [0]:
718905/2728375

In [0]:
df

In [0]:
df.write.mode("overwrite").saveAsTable("main_prod.datascience_scratchpad.earnings_if_v6_dev_v10_test_100pct_static_moving_fil")

In [0]:
%sql
select * from main_prod.datascience_scratchpad.earnings_if_v6_dev_test_100pct_static_moving_unknown_clf_metrics_98_0

In [0]:
%sql
select * from main_prod.earnings_analysis.fact_user_earnings_daily where userid = 10556051 order by prediction_day desc

In [0]:
%sql
select * from main_prod.bank_transactions.bank_transactions_v2 where userid = 10556051 order by postedon desc

In [0]:
# Convert an epoch timestamp in milliseconds to a timezone-aware UTC datetime.
epoch = 1757204518875
from datetime import datetime, timezone
# datetime.utcfromtimestamp() is deprecated since Python 3.12 and returns a naive
# datetime; fromtimestamp(..., tz=timezone.utc) gives the same instant with tzinfo set.
date = datetime.fromtimestamp(epoch / 1000, tz=timezone.utc)
print(date)

In [0]:
%sql
select * from main_prod.datascience_scratchpad.earnings_if_v6_dev_v10_test_100pct_static_moving_fil_unknown where userid = 22151029

In [0]:
%sql
select * from main_prod.datascience_scratchpad.fact_user_earnings_daily_static_moving_fil where userid= 22151029

In [0]:
%sql
select * from main_prod.datascience_scratchpad.earnings_if_v6_dev_v10_test_100pct_static_moving_fil where userid = 22151029

In [0]:
%sql
select count(*) from main_prod.ml_data.traj_data

In [0]:
df  = spark.read.table("main_prod.datascience_scratchpad.jatin_data_analysis_df_v4")
display(df)

In [0]:
df.count()

In [0]:
df_v2  = spark.read.table("main_prod.datascience_scratchpad.jatin_data_analysis_df_v5")
display(df_v2)

In [0]:
df_v2.count()

In [0]:
2665112/3536693

In [0]:
%sql
select count(*) from main_prod.ml_data.traj_emb

In [0]:
%sql
select * from main_prod.ml_data.traj_emb where userid = 15646982 order by traj_date

In [0]:
df = spark.read.parquet("s3://ml-datasets-datalakeprod-us-west-2-sagemaker/earnings_if_v6_dev/12/test_100pct")
display(df)

In [0]:
df.write.mode("overwrite").saveAsTable("main_prod.datascience_scratchpad.earnings_if_v6_dev_12_test_100pct")

In [0]:
%sql
select * from main_prod.earnings_analysis.fact_user_earnings_daily where userid  = 996

In [0]:
df = spark.read.parquet("/Volumes/main_prod/datascience_scratchpad/jatin/trajcl_exp/usa/backfill_traj_emb_cond_relaxed")
df.count()

In [0]:
display(df.where('userid = 15646982').orderBy('traj_date'))

In [0]:
# A notebook URL had been pasted into the middle of `spark.read.parquet`; restored the call.
df = spark.read.parquet("s3://ml-datasets-datalakeprod-us-west-2-sagemaker/earnings_if_v6_dev/13/test_100pct")
display(df)


In [0]:
df.write.mode("overwrite").saveAsTable("main_prod.datascience_scratchpad.earnings_if_v6_dev_13_test_100pct")

In [0]:
df = spark.read.parquet("/Volumes/main_prod/datascience_scratchpad/jatin/trajcl_exp/usa/backfill_static_moving_relax_condition_v2")
display(df)

In [0]:
df.count()

In [0]:
df.where(" predicted_work_type = 'moving'").count()

In [0]:
df.where(" predicted_work_type = 'moving'").count()/df.count()

In [0]:
%sql
select count(*) from main_prod.ml_data.static_moving_worktype

In [0]:
33742658/13151282