In [0]:
# Load the raw location table into a Spark DataFrame; every later step in this
# notebook is derived from this frame.
# NOTE(review): both `query` and `df` are rebound by a later cell (the one that
# builds "gps_data_loc_ts"), so re-running the cells that consume this `df`
# out of order will pick up the wrong frame.
query = """
select * from clientstate.full_locations
"""

df = spark.sql(query)
# df.createOrReplaceTempView("gps_data")
display(df)


In [0]:
# Keep only rows whose calendar date falls in the 20 days before today
# (today itself is excluded by the strict "<" on the upper bound).
# location_timestamp is divided by 1000 before from_unixtime, so it is
# presumably epoch milliseconds — TODO confirm against the table schema.
df_fil = df.where(
  "to_date(from_unixtime(location_timestamp/1000)) >= date_sub(current_date(), 20) and " +
  "to_date(from_unixtime(location_timestamp/1000)) < current_date()"
)
display(df_fil)

In [0]:
# df_fil.selectExpr("min(location_timestamp)", "max(location_timestamp)").show()

In [0]:
from pyspark.sql import functions as F

# Read the user history table and discard rows without a creation time, so a
# null createdon can never be picked as a user's "latest" record below.
userpiphistory = (
    spark.read.table("main_prod.datascience.userpiphistory")
    .where(F.col("createdon").isNotNull())
)

# One row per user: max() over the (createdon, timezone) struct selects the
# struct with the greatest createdon, i.e. the timezone of the most recent
# record. Users whose most recent record has no timezone are dropped.
userid_timezone_df = (
    userpiphistory
    .groupBy("userid")
    .agg(F.max(F.struct("createdon", "timezone")).alias("maxrow"))
    .select("userid", F.col("maxrow.timezone").alias("timezone"))
    .where(F.col("timezone").isNotNull())
)

display(userid_timezone_df)

In [0]:
# Attach each user's timezone to the date-filtered location rows. The inner
# join drops locations whose userid has no entry in userid_timezone_df.
df_with_tz = df_fil.join(userid_timezone_df, on=["userid"], how="inner")

# Make the joined frame queryable from Spark SQL as "df_with_tz".
df_with_tz.createOrReplaceTempView("df_with_tz")
display(df_with_tz)

In [0]:
# De-duplicate the joined rows and add `localized_timestamp`: the location
# time shifted into the user's timezone.
# NOTE(review): from_unixtime formats its result in the Spark session time
# zone, while from_utc_timestamp interprets its input as UTC — the shift looks
# correct only when the session time zone is UTC; confirm the cluster setting.
# NOTE(review): this rebinds `query` and `df`, replacing the raw-table frame
# created in the first cell.
query = """
select distinct userid, latitude, longitude, from_utc_timestamp(from_unixtime(location_timestamp / 1000), timezone) AS localized_timestamp, timezone from df_with_tz"""

df = spark.sql(query)
df.createOrReplaceTempView("gps_data_loc_ts")
display(df)

In [0]:
# Drop rows with a missing coordinate, localized timestamp or timezone; the
# Python UDFs further down call float() on coordinates and read .hour on
# timestamps, neither of which tolerates None.
df_v2 = df.where("latitude is not NULL and longitude is not NULL and localized_timestamp is not NULL and timezone is not NULL")
display(df_v2)

In [0]:
from datetime import datetime
from zoneinfo import ZoneInfo
from pyspark.sql.types import *
from pyspark.sql.functions import udf

def get_current_date_in_tz(tz):
    """Return today's calendar date as observed in the time zone ``tz``.

    Parameters
    ----------
    tz : str
        Time zone key accepted by ``zoneinfo.ZoneInfo`` (for example
        ``"America/Chicago"``).

    Returns
    -------
    datetime.date or None
        The current date in ``tz``, or ``None`` when ``tz`` is missing, is not
        a valid key, or cannot be resolved to a time zone.
    """
    try:
        return datetime.now(ZoneInfo(tz)).date()
    except Exception:
        # Deliberate best-effort: an unusable timezone value yields None so the
        # row can be filtered out later instead of failing the whole job.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate normally.
        return None

# Wrap the helper as a Spark UDF returning a DATE column.
get_current_date_in_tz_udf = udf(get_current_date_in_tz, DateType())

# current_tz_date is "today" in the row's timezone at the moment the UDF runs
# (the helper calls datetime.now), and is NULL when the timezone value cannot
# be resolved.
df_v3  = df_v2.withColumn("current_tz_date", get_current_date_in_tz_udf("timezone"))
display(df_v3)


In [0]:
# Discard rows for which the UDF returned NULL, i.e. rows whose timezone value
# could not be turned into a date.
df_v4 = df_v3.where("current_tz_date is not NULL")
display(df_v4)

In [0]:
# Register the cleaned rows as the temp view read by the aggregation query in
# the next cell.
df_v4.createOrReplaceTempView("all_traj_data_loc_ts")

In [0]:
# Build one row per (userid, local calendar date) holding that day's
# timestamps, latitudes and longitudes as three parallel arrays.
# NOTE(review): the arrays are zipped element-by-element by sort_by_time in a
# later cell, which relies on the three COLLECT_LIST results sharing the same
# element order — confirm this holds for the Spark version in use.
query = """
SELECT 
    userid,
    DATE(localized_timestamp) AS traj_date,
    COLLECT_LIST(localized_timestamp) AS timestamps,
    COLLECT_LIST(latitude) AS latitudes,
    COLLECT_LIST(longitude) AS longitudes
FROM 
    all_traj_data_loc_ts
GROUP BY 
    userid, DATE(localized_timestamp)
ORDER BY 
    userid, traj_date
"""

result_df = spark.sql(query)
display(result_df)

In [0]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StructType, StructField, ArrayType, DoubleType, DateType, TimestampType
def sort_by_time(timestamps, longitudes, latitudes):
    """Order one trajectory's points chronologically.

    The three sequences are paired element-by-element (pairing stops at the
    shortest one) and sorted by timestamp only, so points sharing a timestamp
    keep their input order.

    Returns a tuple ``(sorted_ts, polylines)``: the timestamps in ascending
    order and the matching ``[lon, lat]`` pairs converted to float.
    """
    ordered_points = sorted(
        zip(timestamps, longitudes, latitudes),
        key=lambda point: point[0],
    )
    sorted_ts = [stamp for stamp, _, _ in ordered_points]
    polylines = [[float(lon), float(lat)] for _, lon, lat in ordered_points]
    return sorted_ts, polylines


# Spark UDF around sort_by_time. Its result is a struct with the ascending
# timestamps ("sorted_ts") and the matching [lon, lat] pairs ("polylines").
sort_and_extract_udf = udf(
    sort_by_time,
    StructType(
        [
            StructField("sorted_ts", ArrayType(TimestampType())),
            StructField("polylines", ArrayType(ArrayType(DoubleType()))),
        ]
    ),
)

result_df_v2 = result_df.withColumn(
    "sorted_data",
    sort_and_extract_udf("timestamps", "longitudes", "latitudes"),
)

# Unpack the struct into top-level columns and drop the unsorted inputs.
result_df_v3 = (
    result_df_v2
    .withColumn("sorted_ts", col("sorted_data.sorted_ts"))
    .withColumn("wgs_seq", col("sorted_data.polylines"))
    .drop("sorted_data", "timestamps", "longitudes", "latitudes")
)
display(result_df_v3)

In [0]:
# result_df_v3 = result_df_v2.drop("timestamps", "longitudes", "latitudes")
# display(result_df_v3)

In [0]:
from pyspark.sql.types import IntegerType, ArrayType, DoubleType, BooleanType
from pyspark.sql.functions import udf, col

import math
def lonlat2meters(lon, lat):
    """Convert a longitude/latitude pair in degrees to planar metres.

    Uses a spherical Mercator-style formula with radius 6378137.0:
    ``x = R * lon_rad`` and ``y = (R / 2) * ln((1 + sin(lat_rad)) / (1 - sin(lat_rad)))``.
    A 1e-5 term is added to numerator and denominator so the logarithm stays
    finite when sin(lat) reaches +/-1.

    Returns a tuple ``(x, y)``.
    """
    deg_to_rad = 0.017453292519943295
    radius = 6378137.0
    sin_lat = math.sin(lat * deg_to_rad)
    x_m = radius * (lon * deg_to_rad)
    # 3189068.5 is radius / 2.
    y_m = 3189068.5 * math.log((1 + sin_lat + 1e-5) / (1 - sin_lat + 1e-5))
    return x_m, y_m


# Project every point of a trajectory: elements 0 and 1 of each point are
# passed to lonlat2meters as (lon, lat) and the resulting pair is stored as a
# two-element list.
lonlat2meters_udf = udf(
    lambda traj: [list(lonlat2meters(point[0], point[1])) for point in traj],
    ArrayType(ArrayType(DoubleType())),
)
result_df_v4 = result_df_v3.withColumn(
    "merc_seq",
    lonlat2meters_udf(col("wgs_seq")),
)
display(result_df_v4)

In [0]:
# result_df_v4.write.mode("overwrite").saveAsTable("main_prod.datascience_scratchpad.result_df_v4")

In [0]:
def filter_based_on_timestamps(ts_list):
    """Return True when the timestamps cover at least 7 distinct hours.

    Only the ``hour`` attribute of each element is inspected, so repeated
    readings within the same hour count once.
    """
    distinct_hours = {stamp.hour for stamp in ts_list}
    return len(distinct_hours) >= 7

filter_based_on_timestamps_udf = udf(filter_based_on_timestamps, BooleanType())

# Keep only the user-days whose sorted timestamps pass the hour-coverage check.
result_df_v5 = result_df_v4.where(
    filter_based_on_timestamps_udf(col("sorted_ts"))
)
display(result_df_v5)

In [0]:
from pyspark.sql.types import StructType, StructField, ArrayType, DoubleType
from pyspark.sql.functions import col


def udf_get_min_max_lat_lon(wgs_seq):
    """Compute the latitude/longitude extent of a sequence of (lon, lat) pairs.

    Returns ``(min_lat, max_lat, min_lon, max_lon)``. The running bounds start
    at the sentinels 100000 / -100000, so an empty sequence returns
    ``(100000, -100000, 100000, -100000)`` unchanged.
    """
    lat_lo, lat_hi = 100000, -100000
    lon_lo, lon_hi = 100000, -100000

    for lon, lat in wgs_seq:
        lat_lo = lat if lat < lat_lo else lat_lo
        lat_hi = lat if lat > lat_hi else lat_hi
        lon_lo = lon if lon < lon_lo else lon_lo
        lon_hi = lon if lon > lon_hi else lon_hi

    return lat_lo, lat_hi, lon_lo, lon_hi

# Spark UDF around udf_get_min_max_lat_lon; the result is a struct of four
# doubles describing the trajectory's latitude/longitude extent.
get_min_max_lat_lon_udf = udf(
    udf_get_min_max_lat_lon,
    StructType(
        [
            StructField("min_lat", DoubleType()),
            StructField("max_lat", DoubleType()),
            StructField("min_lon", DoubleType()),
            StructField("max_lon", DoubleType()),
        ]
    ),
)

# Compute the extent once per row, promote each struct field to its own
# column, then discard the intermediate struct.
result_df_v6 = (
    result_df_v5
    .withColumn("min_max_lat_lon", get_min_max_lat_lon_udf(col("wgs_seq")))
    .withColumn("min_lat", col("min_max_lat_lon.min_lat"))
    .withColumn("max_lat", col("min_max_lat_lon.max_lat"))
    .withColumn("min_lon", col("min_max_lat_lon.min_lon"))
    .withColumn("max_lon", col("min_max_lat_lon.max_lon"))
    .drop("min_max_lat_lon")
)

display(result_df_v6)

In [0]:
# Target bounding box in degrees.
# NOTE(review): the values look like the extent of the contiguous United
# States (the output path written at the end of the notebook contains "usa")
# — confirm the intended region.
target_min_lat = 25.11833
target_max_lat = 49.38447
target_min_lon = -124.73306
target_max_lon = -66.94978

# Keep only trajectories that lie entirely inside the box, then drop the
# helper extent columns.
# NOTE(review): this rebinds `df_fil`, which an earlier cell uses for the
# date-filtered raw locations.
df_fil = (
    result_df_v6
    .filter(
        (col("min_lat") >= target_min_lat)
        & (col("max_lat") <= target_max_lat)
        & (col("min_lon") >= target_min_lon)
        & (col("max_lon") <= target_max_lon)
    )
    .drop("min_lat", "max_lat", "min_lon", "max_lon")
)
display(df_fil)

In [0]:
# %sql
# MERGE INTO main_prod.datascience_scratchpad.traj_data AS target
# USING traj_data AS source
# ON target.userid = source.userid
#    AND target.traj_date = source.traj_date
# WHEN MATCHED THEN 
#   UPDATE SET *
# WHEN NOT MATCHED THEN
#   INSERT *

In [0]:
# Persist the filtered trajectories as parquet, split into 100 partitions.
# mode("overwrite") replaces whatever already exists at the target path.
df_fil.repartition(100).write.mode("overwrite").parquet("/Volumes/main_prod/datascience_scratchpad/jatin/trajcl_exp/usa/last_20_days")