In [0]:
# Load every row of the scratchpad trajectory test table.
query = """
select * from main_prod.datascience_scratchpad.all_traj_data_test
"""
temp_df = spark.sql(query)
# Register the DataFrame for SQL access.
# NOTE: a later cell re-registers this same view name from `df`, replacing this one.
temp_df.createOrReplaceTempView("all_traj_data_test")
display(temp_df)

In [0]:
!pip install timezonefinder

In [0]:
# Spot-check the lookup on a single (lat, lon) pair.
# NOTE: latlon_to_timezone is defined in a later cell; on a fresh kernel that
# cell must be run before this one.
latlon_to_timezone(40.8817792,-73.8167686 )

In [0]:
from timezonefinder import TimezoneFinder
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# NOTE: the TimezoneFinder is created inside latlon_to_timezone below, not here.


# Define a UDF
def latlon_to_timezone(lat, lon):
    """Return the time zone name for a coordinate, or None if it cannot be resolved.

    Args:
        lat: Latitude of the point (None for null rows).
        lon: Longitude of the point (None for null rows).

    Returns:
        The value of ``TimezoneFinder.timezone_at`` for the point, or None when
        either coordinate is missing or the lookup raises.
    """
    # Null coordinates can never resolve, so skip the lookup entirely.
    if lat is None or lon is None:
        return None
    try:
        # NOTE(review): a new TimezoneFinder is built on every call; if this
        # proves slow per row, consider reusing one instance per worker.
        tf = TimezoneFinder()
        return tf.timezone_at(lng=lon, lat=lat)
    except Exception:
        # Best-effort lookup: a failing row yields None instead of failing the
        # job, while KeyboardInterrupt/SystemExit still propagate.
        return None

# Wrap the Python lookup as a Spark UDF that produces a string column.
get_timezone_udf = udf(latlon_to_timezone, StringType())

# Add a "timezone" column computed from each row's latitude and longitude columns.
df_with_tz = temp_df.withColumn("timezone", get_timezone_udf("latitude", "longitude"))

display(df_with_tz)

In [0]:
# Add localized_ts: location_timestamp is divided by 1000 (i.e. treated as epoch
# milliseconds) and shifted from UTC to America/New_York.
# NOTE(review): from_unixtime formats in the session time zone, so this presumably
# assumes the session time zone is UTC — confirm.
# NOTE(review): nyc_traj_data is not created anywhere in the visible notebook; it
# must already exist as a table or view.
query = """
select *, from_utc_timestamp(from_unixtime(location_timestamp/1000), 'America/New_York') as localized_ts from nyc_traj_data
"""
df = spark.sql(query)
display(df)

In [0]:
# Keep rows whose local timestamp is on/after 2022-01-01 and strictly before 2025-07-23.
df = df.where(
    (df["localized_ts"] >= '2022-01-01') & (df["localized_ts"] < '2025-07-23')
)
display(df)

In [0]:
# Register the date-filtered DataFrame for SQL access.
# NOTE: this replaces the "all_traj_data_test" view registered from temp_df in the
# first cell, so the name now refers to the localized, date-filtered rows.
df.createOrReplaceTempView("all_traj_data_test")


In [0]:
# Keep only rows with a usable user id (not '0', not null) and non-null
# localized timestamp, latitude and longitude.
query = """
select * from all_traj_data_test where userid != '0' and userid is not null and localized_ts is not null and latitude is not null and longitude is not null
"""
df_fil = spark.sql(query)

# Report how many rows survive the filter.
df_fil.count()

In [0]:
# Register the cleaned rows under the view name read by the aggregation query below.
df_fil.createOrReplaceTempView("nyc_traj_data_loc_ts")

In [0]:

# Build one row per (userid, local calendar date) holding that day's points as
# three parallel arrays: timestamps, latitudes and longitudes.
# NOTE(review): the three COLLECT_LIST arrays are assumed to be positionally aligned
# with each other; element order itself is not relied on because a later cell
# re-sorts the points by timestamp — confirm the alignment assumption.
query = """
SELECT 
    userid,
    DATE(localized_ts) AS traj_date,
    COLLECT_LIST(localized_ts) AS timestamps,
    COLLECT_LIST(latitude) AS latitudes,
    COLLECT_LIST(longitude) AS longitudes
FROM 
    nyc_traj_data_loc_ts
WHERE
    userid IS NOT NULL 
    AND localized_ts IS NOT NULL 
    AND latitude IS NOT NULL 
    AND longitude IS NOT NULL
GROUP BY 
    userid, DATE(localized_ts)
ORDER BY 
    userid, traj_date
"""

result_df = spark.sql(query)
display(result_df)

In [0]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StructType, StructField, ArrayType, DoubleType, DateType, TimestampType
def sort_by_time(timestamps, longitudes, latitudes):
    """Order one trajectory's points chronologically.

    Args:
        timestamps: Sortable timestamps, one per point.
        longitudes: Longitudes, positionally aligned with ``timestamps``.
        latitudes: Latitudes, positionally aligned with ``timestamps``.

    Returns:
        A ``(sorted_ts, polylines)`` tuple: the timestamps in ascending order and
        the matching ``[lon, lat]`` float pairs in that same order.
    """
    # Sort on the timestamp only, so points sharing a timestamp keep their input order.
    ordered = sorted(zip(timestamps, longitudes, latitudes), key=lambda point: point[0])
    sorted_ts = [point[0] for point in ordered]
    polylines = [[float(point[1]), float(point[2])] for point in ordered]
    return sorted_ts, polylines


# Spark UDF around sort_by_time; it returns a struct with the sorted timestamps
# and the matching [lon, lat] pairs.
sort_and_extract_udf = udf(
    sort_by_time,
    StructType([
        StructField("sorted_ts", ArrayType(TimestampType())),
        StructField("polylines", ArrayType(ArrayType(DoubleType()))),
    ]),
)

# Compute the struct once, unpack its two fields into top-level columns, then
# drop the temporary struct column.
result_df_v2 = (
    result_df
    .withColumn("sorted_data", sort_and_extract_udf("timestamps", "longitudes", "latitudes"))
    .withColumn("sorted_ts", col("sorted_data.sorted_ts"))
    .withColumn("polylines", col("sorted_data.polylines"))
    .drop("sorted_data")
)
display(result_df_v2)

In [0]:
# Drop the unsorted arrays; sorted_ts and polylines carry the same points in time order.
result_df_v3 = result_df_v2.drop("timestamps", "longitudes", "latitudes")
display(result_df_v3)

In [0]:
from pyspark.sql.types import IntegerType, ArrayType, DoubleType, BooleanType
from pyspark.sql.functions import udf, col

import math
def lonlat2meters(lon, lat):
    """Project a longitude/latitude pair in degrees with the spherical Mercator formula.

    Args:
        lon: Longitude in degrees.
        lat: Latitude in degrees.

    Returns:
        An ``(x, y)`` tuple: ``x = R * lon_rad`` and
        ``y = (R / 2) * ln((1 + sin(lat_rad)) / (1 - sin(lat_rad)))`` with
        ``R = 6378137.0``.
    """
    deg_to_rad = 0.017453292519943295  # pi / 180
    earth_radius = 6378137.0
    lon_rad = lon * deg_to_rad
    sin_lat = math.sin(lat * deg_to_rad)
    x = earth_radius * lon_rad
    # 3189068.5 is earth_radius / 2.
    y = 3189068.5 * math.log((1 + sin_lat) / (1 - sin_lat))
    return x, y


# UDF that projects every point of a trajectory with lonlat2meters, reading
# p[0] as longitude and p[1] as latitude.
lonlat2meters_udf = udf(lambda traj: [list(lonlat2meters(p[0], p[1])) for p in traj], ArrayType(ArrayType(DoubleType())))
# merc_seq holds the projected [x, y] pairs in the same order as polylines.
result_df_v4 = result_df_v3.withColumn("merc_seq", lonlat2meters_udf(col("polylines")))
display(result_df_v4)

In [0]:
def filter_based_on_timestamps(ts_list):
    """Tell whether the timestamps span more than seven distinct hours of the day.

    Args:
        ts_list: Iterable of objects exposing an ``hour`` attribute.

    Returns:
        True when more than 7 different ``hour`` values occur, otherwise False.
    """
    distinct_hours = {ts.hour for ts in ts_list}
    return len(distinct_hours) > 7

# Boolean UDF around filter_based_on_timestamps; keep only the rows for which it
# returns True when applied to sorted_ts.
filter_based_on_timestamps_udf = udf(filter_based_on_timestamps, BooleanType())
result_df_v5 = result_df_v4.filter(filter_based_on_timestamps_udf(col("sorted_ts")))
display(result_df_v5)

In [0]:
# Rename the derived columns to their final names: sorted_ts -> timestamps,
# polylines -> wgs_seq.

result_df_v6 = result_df_v5.withColumnRenamed("sorted_ts", "timestamps").withColumnRenamed("polylines", "wgs_seq")


# Persist the result, overwriting any existing contents of the target table.
result_df_v6.write.mode("overwrite").saveAsTable("main_prod.datascience_scratchpad.nyc_traj_all_ts")

In [0]:
# Row count of the final trajectory DataFrame.
# NOTE(review): result_df_v6 is not cached in the visible code, so this presumably
# re-runs the upstream query and UDFs — confirm.
result_df_v6.count()

In [0]:
import math
# Scratch: natural logarithms of a few constants.
math.log(50), math.log(480), math.log(2), math.log(28800)

In [0]:
import math
def lonlat2meters(lon, lat):
    """Project a longitude/latitude pair in degrees with the spherical Mercator formula.

    This repeats the ``lonlat2meters`` definition from an earlier cell of this
    notebook so the scratch cells that follow can run on their own.

    Args:
        lon: Longitude in degrees.
        lat: Latitude in degrees.

    Returns:
        An ``(x, y)`` tuple of projected coordinates.
    """
    semimajoraxis = 6378137.0
    radians_per_degree = 0.017453292519943295  # pi / 180
    east, north = lon * radians_per_degree, lat * radians_per_degree
    sin_north = math.sin(north)
    # 3189068.5 is semimajoraxis / 2.
    log_ratio = math.log((1 + sin_north) / (1 - sin_north))
    return semimajoraxis * east, 3189068.5 * log_ratio


# Corner points of a bounding box, each as [lon, lat].
# NOTE(review): these values look like a continental-US extent — confirm the source.
min_lon_lat = [-124.73306, 25.11833]
max_lon_lat = [-66.94978, 49.38447]


# Project both corners to planar x/y coordinates.
x_min,y_min = lonlat2meters(min_lon_lat[0], min_lon_lat[1])
x_max,y_max = lonlat2meters(max_lon_lat[0], max_lon_lat[1])

In [0]:
# Show the projected corners plus the box width and height divided by 1000.
x_min, y_min, x_max, y_max, (x_max-x_min)/1000, (y_max-y_min)/1000

In [0]:
# Spot-check the projection on a single (lon, lat) point.
lonlat2meters(-117.7394035, 34.0157826)

In [0]:
# Scratch: compare three nearby ratios.
5000/6000, 4999/6000, 5001/6000

In [0]:
# Scratch: natural logs of three nearby ratios (the third numerator here is 5010).
math.log(5000/6000), math.log(4999/6000), math.log(5010/6000)

In [0]:

# Repeat the projection for a small box from (0, 0) to (0.9, 0.9) degrees.
# NOTE: this overwrites min_lon_lat, max_lon_lat and x_min/y_min/x_max/y_max set
# by the earlier bounding-box cell.
min_lon_lat = [0, 0]
max_lon_lat = [0.9, 0.9]


x_min,y_min = lonlat2meters(min_lon_lat[0], min_lon_lat[1])
x_max,y_max = lonlat2meters(max_lon_lat[0], max_lon_lat[1])

In [0]:
# Show the projected corners plus the box width and height divided by 1000.
x_min, y_min, x_max, y_max, (x_max-x_min)/1000, (y_max-y_min)/1000

In [0]:
def meters2lonlat(x, y):
    """Convert spherical Mercator x/y coordinates back to longitude/latitude in degrees.

    Algebraic inverse of the ``lonlat2meters`` formula used in this notebook.

    Args:
        x: Projected easting.
        y: Projected northing.

    Returns:
        A ``(lon, lat)`` tuple in degrees.
    """
    deg_to_rad = 0.017453292519943295  # pi / 180
    earth_radius = 6378137.0
    # exp(y / (R / 2)) recovers (1 + sin(lat)) / (1 - sin(lat)); solve it for sin(lat).
    ratio = math.exp(y / 3189068.5)
    sin_lat = (ratio - 1) / (ratio + 1)
    lon = x / earth_radius / deg_to_rad
    lat = math.asin(sin_lat) / deg_to_rad
    return lon, lat

# Spot-check the inverse projection at x = y = 50000.
meters2lonlat(50000,50000)

In [0]:
%sql
-- Preview up to 10 rows of the userpiphistory table.
select * from main_prod.datascience.userpiphistory limit 10