In [None]:
import sys
sys.path.append("..")
sys.path.append("../model")
from trajcl import TrajCL
from config import Config

In [None]:
# Create a configuration object, select the 'nyc' dataset, then let the config
# recompute whatever values it derives from that choice.
# NOTE(review): `dataset` is assigned on this instance, while TrajCL() is built
# without receiving `conf`. If the project reads class-level `Config` attributes,
# this assignment may not take effect — confirm against config.py.
conf = Config()

conf.dataset = 'nyc'
conf.post_value_updates()

In [None]:
# Instantiate the TrajCL model with its default constructor; weights are loaded in a later cell.
model = TrajCL()

In [None]:
# Show the model's repr as the cell output.
model

In [None]:
import pandas as pd
# Load the evaluation trajectories into a DataFrame.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
test_df = pd.read_parquet("/home/sagemaker-user/TrajCL/data/parquet_files/test/nyc_df_v3_with_time/traj_test_df_v3_with_ts.parquet")

In [None]:
# Distinct user ids, in order of first appearance in test_df; the evaluation loop iterates over these.
userids = test_df['userid'].unique()

In [None]:
import torch
# Restore trained weights into `model` and move the model to the inference device.
device = torch.device("cuda:0")
checkpoint_file = "/home/sagemaker-user/TrajCL/exp/v2.2/nyc_TrajCL_best.pt"
# map_location remaps the stored tensors onto `device`, so the checkpoint loads
# even if it was saved from a different device index than the one used here.
checkpoint = torch.load(checkpoint_file, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)

In [None]:
from utils.traj import *
import pickle

from torch.nn.utils.rnn import pad_sequence
# NOTE: pickle.load can execute arbitrary code contained in the file; only load
# artifacts that were produced by this project.
# `with` closes each file handle as soon as the object has been deserialized.
with open("/home/sagemaker-user/TrajCL/data/nyc_cell250_embdim256_embs.pkl", 'rb') as embs_file:
    # Cell embedding table, kept on the CPU and detached from any autograd graph.
    embs = pickle.load(embs_file).to('cpu').detach() # tensor
with open("/home/sagemaker-user/TrajCL/data/nyc_cell250_cellspace.pkl", 'rb') as cellspace_file:
    cellspace = pickle.load(cellspace_file)

# Upper bound on the number of trajectories sent through the model in one call (see infer).
max_batch_size = 512
def infer_batch(traj, time_indices):
    """Embed one batch of trajectories with the loaded model.

    Args:
        traj: Sequence of trajectories; each one is passed to ``merc2cell2``
            together with the global ``cellspace``.
        time_indices: Sequence aligned with ``traj``; entry ``i`` holds the time
            indices of trajectory ``i`` and must be convertible to a long tensor.

    Returns:
        Whatever ``model.interpret`` returns for the batch (one embedding per
        trajectory is assumed by the callers — TODO confirm). The result is
        computed without gradient tracking.
    """
    # merc2cell2 returns a pair per trajectory; zip(*) regroups them into two
    # parallel tuples (cell-id sequences and the matching point sequences).
    traj_cell, traj_p = zip(*[merc2cell2(t, cellspace) for t in traj])

    traj_emb_p = [
        torch.tensor(generate_spatio_temporal_features(t, time_indices[i], cellspace))
        for i, t in enumerate(traj_p)
    ]
    # batch_first=False: padded tensors are laid out with the sequence axis first.
    traj_emb_p = pad_sequence(traj_emb_p, batch_first=False).to(device)

    # Look up the pretrained embedding of every visited cell.
    traj_emb_cell = [embs[list(t)] for t in traj_cell]
    traj_emb_cell = pad_sequence(traj_emb_cell, batch_first=False).to(device)

    traj_len = torch.tensor(list(map(len, traj_cell)), dtype=torch.long, device=device)

    # Padded positions are marked with -1. The tensor goes to `device` like every
    # other input (and the model itself) so all operands share one device.
    time_indices = pad_sequence(
        [torch.tensor(t, dtype=torch.long) for t in time_indices],
        batch_first=False,
        padding_value=-1,
    ).to(device)

    # Inference only: skip building an autograd graph that callers would discard.
    with torch.no_grad():
        traj_embs = model.interpret(traj_emb_cell.float(), traj_emb_p.float(), traj_len, time_indices)
    return traj_embs

def infer(traj, time_indices):
    """Embed trajectories, chunking the work when the input is large.

    Inputs with at most ``max_batch_size`` trajectories go through
    ``infer_batch`` in a single call. Larger inputs are processed in
    consecutive slices of ``max_batch_size`` and the per-slice results are
    concatenated along dimension 0.
    """
    total = len(traj)

    # Small enough for a single forward pass.
    if total <= max_batch_size:
        return infer_batch(traj, time_indices)

    chunk_embs = [
        infer_batch(
            traj[start:start + max_batch_size],
            time_indices[start:start + max_batch_size],
        )
        for start in range(0, total, max_batch_size)
    ]
    return torch.cat(chunk_embs, dim=0)



In [None]:
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Preview the first rows of the evaluation frame.
test_df.head()

In [None]:
# Inspect the time-index list stored for the first row.
test_df['time_index_list'].values[0]

In [None]:
from tqdm import tqdm
# Put the model in evaluation mode before running inference.
model.eval()
# Per-user ground-truth and predicted labels, appended in `userids` order by the loop in this cell.
gt_list = []
pred_list = []
def get_gt_and_pred_label(userid, sim_threshold=0.85, top_k=3, verbose=False):
    """Return ``(gt_label, pred_label)`` for one user via nearest-neighbour matching.

    The user's rows in ``test_df`` are split by ``train_test_tag``. Each 'test'
    trajectory is embedded and compared, by cosine similarity, with the
    embeddings of the same user's 'train' trajectories.

    Args:
        userid: Value matched against ``test_df['userid']``.
        sim_threshold: A neighbour only counts when its similarity is strictly
            greater than this value.
        top_k: Number of most-similar train trajectories inspected per test
            trajectory; must be >= 1.
        verbose: When True, print the top matches of every test trajectory.

    Returns:
        tuple[int, int]: ``gt_label`` is 1 when the summed ``paycheck_amount``
        of the user's test rows is positive, else 0. ``pred_label`` is 1 when
        any test trajectory has a top-k train neighbour above ``sim_threshold``
        whose ``paycheck_amount`` is positive, else 0.
    """
    user_data = test_df[test_df['userid'] == userid]
    train_data = user_data[user_data['train_test_tag'] == 'train'].reset_index(drop=True)
    test_data = user_data[user_data['train_test_tag'] == 'test'].reset_index(drop=True)

    gt_label = 1 if sum(test_data['paycheck_amount'].values) > 0 else 0

    # With no trajectories on either side there is nothing to match, and the
    # embedding helper cannot process an empty batch, so predict the negative class.
    if len(train_data) == 0 or len(test_data) == 0:
        return gt_label, 0

    train_embs = infer(
        train_data['merc_seq_filtered'].values,
        train_data['time_index_list'].values,
    ).detach().cpu().numpy()
    test_embs = infer(
        test_data['merc_seq_filtered'].values,
        test_data['time_index_list'].values,
    ).detach().cpu().numpy()

    # Loop-invariant column lookup, hoisted out of the per-trajectory loop.
    train_amounts = train_data['paycheck_amount'].values

    pred_label = 0
    for i in range(len(test_embs)):
        # Slice (not index) to keep the 2-D shape cosine_similarity expects.
        similarity = cosine_similarity(test_embs[i:i + 1], train_embs)[0]
        # ndarray.argsort is ascending: take the last top_k, most similar first.
        # Using the array method avoids depending on a module-level `np` name.
        top_indices = similarity.argsort()[-top_k:][::-1]
        top_similarity = similarity[top_indices]
        if verbose:
            print(f"User: {userid}, Test Trajectory {test_data['traj_id'].values[i]}, Top {top_k} Train Trajectories: {train_data['traj_id'].values[top_indices]}, similarity: {top_similarity}, PCK Amount: {train_amounts[top_indices]}")
        if any(
            sim > sim_threshold and train_amounts[idx] > 0
            for sim, idx in zip(top_similarity, top_indices)
        ):
            pred_label = 1
            # The label cannot change any more; keep going only to print the rest.
            if not verbose:
                break
    return gt_label, pred_label

# Score every user; position k of gt_list / pred_list corresponds to userids[k].
for userid in tqdm(userids):
    gt_label, pred_label = get_gt_and_pred_label(userid)
    gt_list.append(gt_label)
    pred_list.append(pred_label)
        


In [None]:
# Fraction of users labelled 1 (positive) in the ground truth and in the predictions.
# Labels are 0/1, so sum/len is the share of 1s, not of 0s.
sum(gt_list) / len(gt_list), sum(pred_list) / len(pred_list)

In [None]:
from sklearn.metrics import accuracy_score

# Share of users whose predicted label equals the ground-truth label.
accuracy = accuracy_score(gt_list, pred_list)
print("Accuracy:", accuracy)

In [None]:
from sklearn.metrics import confusion_matrix

# Calculate the confusion matrix.
# labels=[0, 1] pins the matrix to 2x2 even when only one class occurs in
# gt_list / pred_list; without it the four-way unpack below raises ValueError.
# Layout for binary classification:
# [[TN, FP],
#  [FN, TP]]
cm = confusion_matrix(gt_list, pred_list, labels=[0, 1])

tn, fp, fn, tp = cm.ravel()
print("True Positives:", tp)
print("False Positives:", fp)
print("True Negatives:", tn)
print("False Negatives:", fn)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Precision, recall and F1 of pred_list measured against gt_list.
precision = precision_score(gt_list, pred_list)
recall = recall_score(gt_list, pred_list)
f1 = f1_score(gt_list, pred_list)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

In [None]:
def get_gt_and_pred_label(userid, sim_threshold=0.85, top_k=3, verbose=True):
    """Return ``(gt_label, pred_label)`` for one user, printing the matches found.

    The user's rows in ``test_df`` are split by ``train_test_tag``. Each 'test'
    trajectory is embedded and compared, by cosine similarity, with the
    embeddings of the same user's 'train' trajectories.

    Args:
        userid: Value matched against ``test_df['userid']``.
        sim_threshold: A neighbour only counts when its similarity is strictly
            greater than this value.
        top_k: Number of most-similar train trajectories inspected per test
            trajectory; must be >= 1.
        verbose: When True (the default for this diagnostic definition), print
            the top matches of every test trajectory.

    Returns:
        tuple[int, int]: ``gt_label`` is 1 when the summed ``paycheck_amount``
        of the user's test rows is positive, else 0. ``pred_label`` is 1 when
        any test trajectory has a top-k train neighbour above ``sim_threshold``
        whose ``paycheck_amount`` is positive, else 0.
    """
    user_data = test_df[test_df['userid'] == userid]
    train_data = user_data[user_data['train_test_tag'] == 'train'].reset_index(drop=True)
    test_data = user_data[user_data['train_test_tag'] == 'test'].reset_index(drop=True)

    gt_label = 1 if sum(test_data['paycheck_amount'].values) > 0 else 0

    # With no trajectories on either side there is nothing to match, and the
    # embedding helper cannot process an empty batch, so predict the negative class.
    if len(train_data) == 0 or len(test_data) == 0:
        return gt_label, 0

    train_embs = infer(
        train_data['merc_seq_filtered'].values,
        train_data['time_index_list'].values,
    ).detach().cpu().numpy()
    test_embs = infer(
        test_data['merc_seq_filtered'].values,
        test_data['time_index_list'].values,
    ).detach().cpu().numpy()

    # Loop-invariant column lookup, hoisted out of the per-trajectory loop.
    train_amounts = train_data['paycheck_amount'].values

    pred_label = 0
    for i in range(len(test_embs)):
        # Slice (not index) to keep the 2-D shape cosine_similarity expects.
        similarity = cosine_similarity(test_embs[i:i + 1], train_embs)[0]
        # ndarray.argsort is ascending: take the last top_k, most similar first.
        # Using the array method avoids depending on a module-level `np` name.
        top_indices = similarity.argsort()[-top_k:][::-1]
        top_similarity = similarity[top_indices]
        if verbose:
            print(f"User: {userid}, Test Trajectory {test_data['traj_id'].values[i]}, Top {top_k} Train Trajectories: {train_data['traj_id'].values[top_indices]}, similarity: {top_similarity}, PCK Amount: {train_amounts[top_indices]}")
        if any(
            sim > sim_threshold and train_amounts[idx] > 0
            for sim, idx in zip(top_similarity, top_indices)
        ):
            pred_label = 1
            # The label cannot change any more; keep going only to print the rest.
            if not verbose:
                break
    return gt_label, pred_label

In [None]:
# Positions (into gt_list / pred_list) where the prediction matches the ground truth.
correct_indices = [i for i, (gt, pred) in enumerate(zip(gt_list, pred_list)) if gt == pred]
print("Correct Indices:", correct_indices)

In [None]:
# Positions of false negatives: ground truth 1, predicted 0.
fn_indices = [i for i, (gt, pred) in enumerate(zip(gt_list, pred_list)) if gt == 1 and pred == 0]
print("False Negative Indices:", fn_indices)

In [None]:
# Positions of false positives: ground truth 0, predicted 1.
fp_indices = [i for i, (gt, pred) in enumerate(zip(gt_list, pred_list)) if gt == 0 and pred == 1]
print("False Positive Indices:", fp_indices)

In [None]:
# Inspect the labels for a single user.
# NOTE(review): `idx` and `userid` are assigned here, but the call below passes a
# hard-coded id instead of `userid` — confirm which user was intended.
idx = 4
userid = userids[idx]
get_gt_and_pred_label(6542144)

In [None]:
# Total number of rows in the evaluation frame.
len(test_df)

In [None]:
# Number of distinct users with at least one 'test' row whose label is 1.
# Both conditions are combined into a single mask: chaining a second boolean
# filter built from the unfiltered frame forces pandas to reindex that mask
# and emits a UserWarning.
len(
    test_df.loc[
        (test_df['train_test_tag'] == 'test') & (test_df['label'] == 1),
        'userid',
    ].unique()
)

In [None]:
# NOTE(review): hand-typed ratio; presumably two counts copied from earlier cell outputs —
# confirm what 790 and 1051 represent and derive them from the data instead.
790/1051

In [None]:
# Show up to 50 rows for user 5499415, ordered by traj_date descending.
test_df[test_df['userid']==5499415].sort_values('traj_date',ascending=False).head(50)