In [0]:
# Load the filtered NYC training trajectories table as a Spark DataFrame and preview it.
df = spark.read.table("main_prod.datascience_scratchpad.nyc_train_filtered")
display(df)

In [0]:
# Select every trajectory whose traj_id starts with "16808692_" (a later cell
# uses traj_id "16808692_2024-06-18" from this result).
# `rlike` takes a regular expression, not a glob: the pattern must be anchored
# with `^`, otherwise it matches any traj_id that merely contains the digits.
query = """
select * from main_prod.datascience_scratchpad.nyc_train_filtered where traj_id rlike '^16808692_'
"""

# NOTE: this rebinds `df` from the full table to the filtered subset.
df = spark.sql(query)
display(df)

In [0]:
# Collect the filtered Spark rows into a pandas DataFrame for local experimentation.
df_test = df.toPandas()

In [0]:
def ts_to_min_index_list(ts_list):
    """Map timestamps to 10-minute time-of-day bucket indices.

    Each element of ``ts_list`` must expose ``hour`` and ``minute`` attributes.
    The index is minutes-since-midnight floor-divided by 10, so only the time
    of day is used; the date part is ignored.

    Returns:
        A list of integer bucket indices, one per input timestamp.
    """
    return [(stamp.hour * 60 + stamp.minute) // 10 for stamp in ts_list]


# Derive, per trajectory, the list of 10-minute bucket indices from its timestamps.
df_test['ts_min_index'] = df_test['timestamps_filtered'].apply(ts_to_min_index_list)
df_test.head()

In [0]:
# Show the pandas frame, now including the `ts_min_index` column.
display(df_test)

In [0]:
import math
def meters2lonlat(merc_points):
    """Convert spherical-Mercator (x, y) points in meters to [lon, lat] in degrees.

    Args:
        merc_points: iterable of points; only ``point[0]`` (x) and ``point[1]``
            (y) are read.

    Returns:
        A list of ``[lon, lat]`` lists, one per input point, in input order.
    """
    semimajoraxis = 6378137.0
    rad_per_deg = 0.017453292519943295  # pi / 180

    def _to_lon_lat(point):
        x, y = point[0], point[1]
        lon = x / semimajoraxis / rad_per_deg
        # 3189068.5 is half the sphere radius, so t = exp(2y / R) and
        # (t - 1) / (t + 1) = tanh(y / R), whose arcsine is the latitude.
        t = math.exp(y / 3189068.5)
        lat = math.asin((t - 1) / (t + 1)) / rad_per_deg
        return [lon, lat]

    return [_to_lon_lat(point) for point in merc_points]

In [0]:
import numpy as np
def dist(a,b):
    """Euclidean distance between two points, using only their first two coordinates."""
    dx = a[0] - b[0]
    dy = a[1] - b[1]
    return np.sqrt(np.square(dx) + np.square(dy))

def get_imputed_merc_seq(merc_seq,ts_min_idx_list):
    """Fill time gaps in a trajectory with evenly spaced, linearly interpolated points.

    Args:
        merc_seq: sequence of points; only ``point[0]`` and ``point[1]`` are
            used for interpolation.
        ts_min_idx_list: integer time-bucket index per point, parallel to
            ``merc_seq``.

    Returns:
        A new list of points. Every original point is kept (the same objects,
        not copies). Whenever two consecutive indices differ by ``d > 1``,
        ``d - 1`` points are inserted on the straight segment between the two
        original points. Returns ``[]`` when ``merc_seq`` is None or empty.
    """
    # Guard: the original indexed merc_seq[0] unconditionally and crashed here.
    if merc_seq is None or len(merc_seq) == 0:
        return []

    merc_seq_imputed = [merc_seq[0]]
    for i in range(1, len(ts_min_idx_list)):
        prev_pt, cur_pt = merc_seq[i - 1], merc_seq[i]
        n_segments = ts_min_idx_list[i] - ts_min_idx_list[i - 1]
        if n_segments > 1:
            # Step directly along each axis. This is the same geometry as
            # distance * cos/sin(arctan2(dy, dx)) without the trig round trip.
            step_x = (cur_pt[0] - prev_pt[0]) / n_segments
            step_y = (cur_pt[1] - prev_pt[1]) / n_segments
            for k in range(1, n_segments):
                merc_seq_imputed.append([prev_pt[0] + k * step_x, prev_pt[1] + k * step_y])
        # Gaps of <= 1 (including non-increasing indices) add only the point itself.
        merc_seq_imputed.append(cur_pt)
    return merc_seq_imputed
# Row-wise: impute each trajectory's Mercator points using its 10-minute bucket indices.
df_test['merc_seq_imputed'] = df_test.apply(lambda x: get_imputed_merc_seq(x['merc_seq_filtered'],x['ts_min_index']),axis=1)
display(df_test)

In [0]:
# Convert the imputed Mercator points to [lon, lat] degrees so they can be plotted.
df_test['wgs_seq_imputed'] = df_test['merc_seq_imputed'].apply(meters2lonlat)
display(df_test)

In [0]:
# Install folium at run time, then import it together with its plugins module
# (plugins provides the text-path arrows drawn by display_traj).
# NOTE(review): the install is unpinned and uses `!pip`; confirm it targets this
# kernel's environment and consider pinning a version.
!pip install folium
import folium

from folium import plugins

def display_traj(lon_lat_list, ts_list = None):
    """Render a trajectory on a folium map with per-point markers and direction arrows.

    Args:
        lon_lat_list: sequence of points where index 0 is longitude and index 1
            is latitude. Must contain at least one point.
        ts_list: optional sequence parallel to ``lon_lat_list``. When given, the
            popup of every marker after the first shows ``ts_list[k]``;
            otherwise it shows the point's position ``k`` in the sequence.

    Returns:
        The folium map. As a side effect the map is also written to
        ``map_with_arrows.html`` (relative path).
    """
    # folium expects [lat, lon]; the input is ordered [lon, lat].
    path = [[pt[1], pt[0]] for pt in lon_lat_list]
    origin = path[0]

    traj_map = folium.Map(location=origin, zoom_start=15)

    # The first point gets a distinct green "start" marker.
    folium.Marker(
        location=origin,
        popup="Start Location",
        icon=folium.Icon(color='green'),
    ).add_to(traj_map)

    # Remaining points: default markers labelled by ts_list entry or by position.
    for k in range(1, len(path)):
        label = k if ts_list is None else ts_list[k]
        folium.Marker(path[k], popup=label).add_to(traj_map)

    # One blue segment per consecutive pair, decorated with repeated arrow glyphs
    # to show the direction of travel.
    for seg_start, seg_end in zip(path, path[1:]):
        segment = folium.PolyLine([seg_start, seg_end], color="blue", weight=3, opacity=0.7).add_to(traj_map)
        plugins.PolyLineTextPath(
            segment,
            '➤',
            repeat=True,
            offset=7,
            attributes={'fill': 'blue', 'font-weight': 'bold', 'font-size': '16'},
        ).add_to(traj_map)

    traj_map.save("map_with_arrows.html")
    return traj_map


In [0]:
# Hard-coded sample path for a visual check of display_traj; each entry is [lon, lat]
# (display_traj reads index 0 as longitude and index 1 as latitude).
lon_lat_list = [[-118.2491903,34.143541],[-118.2312814,34.14513],[-118.2027699,34.1062418],[-118.2149212,34.0893875],[-118.2255016,34.0847909],[-118.2016728,34.1051449],[-118.2027239,34.1063563],[-118.2152673,34.0778707],[-118.2049629,34.1075268],[-118.2027935,34.106452],[-118.2026069,34.1063128],[-118.2027695,34.1074327],[-118.2028208,34.1062947],[-118.2232401,34.1407364],[-118.2569078,34.1433059],[-118.256865,34.1435445],[-118.2572734,34.1444554],[-118.2418265,34.1429183],[-118.2035006,34.108211],[-118.2016728,34.1057851],[-118.2226625,34.0851249],[-118.2064365,34.1108443],[-118.2197235,34.1235245],[-118.2265297,34.1219972],[-118.203112,34.1131974],[-118.2082529,34.1172702],[-118.2031279,34.1131763],[-118.2030931,34.1133257],[-118.203123,34.1133915],[-118.2088545,34.1143289],[-118.2309159,34.1452212],[-118.2491903,34.143541],[-118.2466651,34.1463677],[-118.2480939,34.1447743],[-118.2467169,34.1464383]]

In [0]:
# Plot the hard-coded path; markers are labelled by position because no ts_list is passed.
display_traj(lon_lat_list)

In [0]:
# Second hard-coded sample path (tuples); rebinds `lon_lat_list`.
# NOTE(review): display_traj reads index 0 as longitude and index 1 as latitude,
# and both components here are ~40. Confirm these really are lon/lat degrees in
# that order.
lon_lat_list = [(40.43116766093053, 40.38062059358551),
 (40.40652216879592, 40.38045216033828),
 (40.4057085377607, 40.383764670450745),
 (40.43660827574164, 40.37958192099031),
 (40.436558649018274, 40.37537106403637),
 (40.457481350981766, 40.361531130966995),
 (40.42744431568493, 40.22707012054006),
 (40.34843588499669, 40.245153850686116),
 (40.34314180518816, 40.24652975984849),
 (40.34251864901827, 40.24832685998045),
 (40.367953511255145, 40.290865700480786),
 (40.38198073465079, 40.32517491711977),
 (40.410793149018275, 40.33693833679479),
 (40.418285451558965, 40.370486425532526),
 (40.43912960241792, 40.38062059358551)]

In [0]:
# Plot whichever `lon_lat_list` was assigned most recently.
display_traj(lon_lat_list)

In [0]:
# Plot the filtered WGS points of one trajectory, labelling each marker after the
# first with its 10-minute bucket index.
traj_id = "16808692_2024-06-18"

display_traj(
    df_test.loc[df_test['traj_id'] == traj_id, 'wgs_seq_filtered'].iloc[0],
    df_test.loc[df_test['traj_id'] == traj_id, 'ts_min_index'].iloc[0],
)

In [0]:
# Plot the same trajectory after imputation; markers are labelled by position.
display_traj(
    df_test.loc[df_test['traj_id'] == traj_id, 'wgs_seq_imputed'].iloc[0]
)

## Apply Imputation on the Train set (Spark)

In [0]:
# df_test = spark.read.parquet("/Volumes/main_prod/datascience_scratchpad/jatin/trajcl_exp/nyc/inference_data_v3/traj_test_df_v3_with_ts.parquet")
# NOTE: despite the name `df_test`, the active line loads the *train* table with
# time indices. It also rebinds `df_test` from a pandas frame to a Spark DataFrame.
df_test = spark.read.table("main_prod.datascience_scratchpad.nyc_train_with_time_index")
display(df_test)

In [0]:
import numpy as np

def dist(a, b):
    """Return the straight-line distance between ``a`` and ``b`` (first two coordinates only)."""
    delta_x, delta_y = a[0] - b[0], a[1] - b[1]
    squared_sum = np.square(delta_x) + np.square(delta_y)
    return np.sqrt(squared_sum)

def get_imputed_merc_seq(merc_seq, ts_min_idx_list):
    """Fill time gaps in a trajectory with evenly spaced, linearly interpolated points.

    Every coordinate in the result is a native Python ``float`` (not a numpy
    scalar), which is what the original code enforced with explicit casts.

    Args:
        merc_seq: sequence of points; ``point[0]`` and ``point[1]`` are used
            for interpolation.
        ts_min_idx_list: integer time-bucket index per point, parallel to
            ``merc_seq``.

    Returns:
        A new list of points as lists of floats. Original points are copied
        with every coordinate cast to float. Whenever two consecutive indices
        differ by ``d > 1``, ``d - 1`` points are inserted on the straight
        segment between them. Returns ``[]`` when ``merc_seq`` is None or empty.
    """
    # Guard: the original indexed merc_seq[0] unconditionally and crashed here.
    if merc_seq is None or len(merc_seq) == 0:
        return []

    def _as_floats(point):
        # Cast every coordinate to a native Python float.
        return [float(coord) for coord in point]

    merc_seq_imputed = [_as_floats(merc_seq[0])]

    for i in range(1, len(ts_min_idx_list)):
        prev_x, prev_y = float(merc_seq[i - 1][0]), float(merc_seq[i - 1][1])
        cur_pt = _as_floats(merc_seq[i])
        n_segments = ts_min_idx_list[i] - ts_min_idx_list[i - 1]

        if n_segments > 1:
            # Step directly along each axis. This is the same geometry as
            # distance * cos/sin(arctan2(dy, dx)) without the trig round trip.
            step_x = (cur_pt[0] - prev_x) / n_segments
            step_y = (cur_pt[1] - prev_y) / n_segments
            for k in range(1, n_segments):
                merc_seq_imputed.append([prev_x + k * step_x, prev_y + k * step_y])

        # Gaps of <= 1 (including non-increasing indices) add only the point itself.
        merc_seq_imputed.append(cur_pt)

    return merc_seq_imputed


In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, ArrayType, DoubleType

# Wrap the Python imputation function as a Spark UDF returning array<array<double>>.
get_imputed_merc_seq_udf = udf(
    get_imputed_merc_seq,
    returnType=ArrayType(ArrayType(DoubleType())),
)

# Add the imputed point sequence as a new column, computed per row from the
# filtered Mercator points and their time indices.
df_test = df_test.withColumn(
    'merc_seq_imputed',
    get_imputed_merc_seq_udf(df_test['merc_seq_filtered'], df_test['time_index_list']),
)

display(df_test)

In [0]:
# Expose the imputed Spark DataFrame to SQL as the temp view `train_imputed`.
df_test.createOrReplaceTempView("train_imputed")
# Keep only trajectories whose traj_id starts with "4474029_" (a later cell uses
# traj_id "4474029_2019-10-07"). The anchored regex requires the underscore, so
# longer IDs that merely share the digits as a prefix (e.g. "44740291_...") are
# excluded; `like '4474029%'` would let them through.
query = """
select * from train_imputed where traj_id rlike '^4474029_'
"""
df_test_filtered = spark.sql(query)
display(df_test_filtered)

In [0]:
# from pyspark.sql.functions import col
# df_test_filtered = df_test.filter(col('traj_id').like("4474029_"))
# Collect the filtered subset to the driver as a pandas DataFrame for plotting.
df_test_pd = df_test_filtered.toPandas()

In [0]:
# Inspect the collected pandas frame.
df_test_pd

In [0]:
# Plot the filtered WGS points of one trajectory, labelling each marker after the
# first with its entry from `time_index_list`.
traj_id = "4474029_2019-10-07"

display_traj(
    df_test_pd.loc[df_test_pd['traj_id'] == traj_id, 'wgs_seq_filtered'].iloc[0],
    df_test_pd.loc[df_test_pd['traj_id'] == traj_id, 'time_index_list'].iloc[0],
)

In [0]:
# Convert the UDF-imputed Mercator points to [lon, lat] degrees for plotting.
df_test_pd['wgs_seq_imputed'] = df_test_pd['merc_seq_imputed'].apply(meters2lonlat)
display(df_test_pd)

In [0]:
# Plot the same trajectory after imputation; markers are labelled by position.
display_traj(
    df_test_pd.loc[df_test_pd['traj_id'] == traj_id, 'wgs_seq_imputed'].iloc[0]
)

In [0]:
# df_test.write.mode("overwrite").parquet("/Volumes/main_prod/datascience_scratchpad/jatin/trajcl_exp/nyc/inference_data_v3/traj_test_df_v3_imputed.parquet")

In [0]:
# Persist the Spark DataFrame (including `merc_seq_imputed`) as a table.
# mode("overwrite") replaces any existing contents of the target table.
df_test.write.mode("overwrite").saveAsTable("main_prod.datascience_scratchpad.nyc_train_imputed")