In [0]:
# One-time extraction, kept commented out for provenance: selects every row of
# clientstate.full_locations that falls inside the latitude/longitude box below.
# NOTE(review): the box presumably covers New York City (the table written from
# this result in the next cell is named nyc_traj_data) — confirm.
# query = f"""SELECT *
# FROM clientstate.full_locations 
# WHERE latitude BETWEEN 40.4774 AND 40.9176 
#   AND longitude BETWEEN -74.2591 AND -73.7004"""

# df = spark.sql(query)

In [0]:
# One-time materialization of the extract as a Delta table, kept commented out;
# the cells below read main_prod.datascience_scratchpad.nyc_traj_data directly.
# df.write.format("delta").saveAsTable("main_prod.datascience_scratchpad.nyc_traj_data")

In [0]:
# Load the persisted location extract and expose it to Spark SQL as the
# temp view "temp_df" so later cells can query it by name.
base_table_sql = """
select * from main_prod.datascience_scratchpad.nyc_traj_data
"""
temp_df = spark.sql(base_table_sql)
temp_df.createOrReplaceTempView("temp_df")

display(temp_df)

In [0]:
# Attach a localized timestamp to every location point: location_timestamp is
# divided by 1000 (epoch milliseconds -> seconds) and shifted into the timezone
# found for the same userid in main_prod.datascience.userpiphistory.
# NOTE(review): a later cell registers a different query under the same view
# name "nyc_traj_data_loc_ts"; whichever cell ran last defines the view.
localize_sql = """
select distinct nyc_traj_data.userid, nyc_traj_data.latitude, nyc_traj_data.longitude, from_utc_timestamp(from_unixtime(nyc_traj_data.location_timestamp / 1000), tz.timezone) AS localized_timestamp from temp_df nyc_traj_data join main_prod.datascience.userpiphistory tz on nyc_traj_data.userid = tz.userid order by localized_timestamp"""

df = spark.sql(localize_sql)
df.createOrReplaceTempView("nyc_traj_data_loc_ts")

display(df)


In [0]:
# Build one row per (userid, local calendar day) holding that day's points as
# three parallel arrays: timestamps, latitudes, longitudes.
#
# collect_list() gives no ordering guarantee after the GROUP BY shuffle, so
# collecting the three columns independently does not yield chronological
# trajectories. Instead, each point is collected as a struct, the structs are
# sorted (timestamp is the first field, so the sort is chronological), and only
# then split into the output arrays. Index i of every array refers to the same
# point. Output columns and types are unchanged.
query = """
WITH daily_points AS (
    SELECT
        userid,
        DATE(localized_timestamp) AS date,
        sort_array(
            collect_list(
                named_struct(
                    'ts', localized_timestamp,
                    'lat', latitude,
                    'lon', longitude
                )
            )
        ) AS points
    FROM
        nyc_traj_data_loc_ts
    WHERE
        userid IS NOT NULL
        AND localized_timestamp IS NOT NULL
        AND latitude IS NOT NULL
        AND longitude IS NOT NULL
    GROUP BY
        userid, DATE(localized_timestamp)
)
SELECT
    userid,
    date,
    transform(points, p -> p.ts) AS timestamps,
    transform(points, p -> p.lat) AS latitudes,
    transform(points, p -> p.lon) AS longitudes
FROM
    daily_points
ORDER BY
    userid, date
"""

result_df = spark.sql(query)
display(result_df)

In [0]:
# Persist the per-user, per-day arrays, replacing any previous version of the table.
(
    result_df.write
    .mode("overwrite")
    .saveAsTable("main_prod.datascience_scratchpad.nyc_userid_traj_ts_data")
)

In [0]:
# Select the 50,000 users with the most location rows in the NYC extract.
#
# userid is added as a secondary sort key: with ORDER BY frequency alone, users
# tied at the cut-off are picked arbitrarily, and because this DataFrame is
# lazy and re-evaluated by several later cells, those cells could otherwise see
# different user sets.
# NOTE(review): userid 0 is excluded — presumably a placeholder id; confirm.
query = """
SELECT 
    userid, 
    COUNT(*) AS frequency
FROM 
    main_prod.datascience_scratchpad.nyc_traj_data
where userid != 0
GROUP BY 
    userid
ORDER BY 
    frequency DESC,
    userid
limit 50000
"""

frequent_users_df = spark.sql(query)
display(frequent_users_df)

In [0]:
# Make the frequent-user list queryable from SQL under the same name as the DataFrame.
frequent_users_view = "frequent_users_df"
frequent_users_df.createOrReplaceTempView(frequent_users_view)

In [0]:
# Keep only the location rows that belong to the frequent users (inner join on
# userid), and register the result as the temp view "nyc_frequent_users_df".
frequent_join_sql = """
SELECT nyc_traj_data.* FROM frequent_users_df frequent_users_df join main_prod.datascience_scratchpad.nyc_traj_data nyc_traj_data on nyc_traj_data.userid = frequent_users_df.userid
"""

nyc_frequent_users_df = spark.sql(frequent_join_sql)
nyc_frequent_users_df.createOrReplaceTempView("nyc_frequent_users_df")

display(nyc_frequent_users_df)

In [0]:
# Sanity check: largest and smallest userid among the frequent users.
userid_range_sql = """
SELECT MAX(userid) AS max_userid, MIN(userid) AS min_userid FROM frequent_users_df
"""

max_min_df = spark.sql(userid_range_sql)
display(max_min_df)

In [0]:
%sql
-- Number of location rows kept after restricting to the frequent users.
SELECT count(*)
FROM nyc_frequent_users_df

In [0]:
%sql
-- Total of the per-user row counts across the selected frequent users.
SELECT sum(frequency)
FROM frequent_users_df

In [0]:
# Localize timestamps for the frequent users' points using a fixed
# 'America/New_York' timezone, keeping only points whose location_timestamp is
# later than 1546300800000 (epoch milliseconds; 2019-01-01 00:00:00 UTC).
# This re-registers the temp view "nyc_traj_data_loc_ts" with the new result.
frequent_localize_sql = """
select distinct userid, latitude, longitude, from_utc_timestamp(from_unixtime(location_timestamp / 1000), 'America/New_York') AS localized_timestamp from nyc_frequent_users_df where location_timestamp>1546300800000 order by localized_timestamp"""

df = spark.sql(frequent_localize_sql)
df.createOrReplaceTempView("nyc_traj_data_loc_ts")

display(df)

In [0]:
# Tag every point with a trajectory id of the form "<userid>_<local date>", so
# that one trajectory is one user's points on one local calendar day.
traj_id_sql = """
select userid, latitude, longitude, localized_timestamp, concat(userid, '_', Date(localized_timestamp)) as traj_id from nyc_traj_data_loc_ts
"""

nyc_data_df_traj_id = spark.sql(traj_id_sql)
nyc_data_df_traj_id.createOrReplaceTempView("nyc_data_df_traj_id")

display(nyc_data_df_traj_id)


In [0]:
%sql
-- Preview the trajectory-tagged points from the temp view registered by the
-- preceding cell. The persisted table of the same name is only written by the
-- following cell, so querying it here would fail on a fresh run or show rows
-- left over from an earlier run.
SELECT *
FROM nyc_data_df_traj_id
LIMIT 100

In [0]:
# Persist the trajectory-tagged points, replacing any previous version of the table.
(
    nyc_data_df_traj_id.write
    .mode("overwrite")
    .saveAsTable("main_prod.datascience_scratchpad.nyc_data_df_traj_id")
)

In [0]:
%sql
-- Number of localized points currently behind the nyc_traj_data_loc_ts view.
SELECT count(*)
FROM nyc_traj_data_loc_ts

In [0]:
%sql
-- Spot check: all points of a single trajectory in chronological order.
SELECT *
FROM main_prod.datascience_scratchpad.nyc_data_df_traj_id
WHERE traj_id = '19846421_2025-03-19'
ORDER BY localized_timestamp

In [0]:
%sql
-- Spot check: all points of a second trajectory in chronological order.
SELECT *
FROM main_prod.datascience_scratchpad.nyc_data_df_traj_id
WHERE traj_id = '618612_2025-07-04'
ORDER BY
  traj_id,
  localized_timestamp

In [0]:
# Collapse the per-point table into one row per trajectory with:
#   polylines  - array of [longitude, latitude] pairs (doubles), in time order
#   timestamps - array of the matching localized timestamps, in the same order
#
# The points are collected and sorted once in a CTE and both output arrays are
# projected from that single sorted list, so they cannot drift apart. The
# struct fields are named explicitly instead of relying on the auto-generated
# `col2` field name.
query = """
WITH ordered AS (
  SELECT
    userid,
    traj_id,
    -- localized_timestamp is the first struct field, so sort_array orders
    -- the points chronologically
    sort_array(
      collect_list(
        named_struct(
          'ts', localized_timestamp,
          'point', array(
            cast(longitude as double),
            cast(latitude as double)
          )
        )
      )
    ) AS points
  FROM main_prod.datascience_scratchpad.nyc_data_df_traj_id
  GROUP BY traj_id, userid
)
SELECT
  userid,
  traj_id,
  transform(points, p -> p.point) AS polylines,
  transform(points, p -> p.ts) AS timestamps
FROM ordered
"""

df = spark.sql(query)
display(df)

In [0]:
# Number of trajectories (one row per traj_id/userid group).
df.count()

In [0]:
# Distinct users that own at least one trajectory; the train/test split is done on these.
userids = df.select("userid").dropDuplicates()

In [0]:
# Split users (not rows) roughly 80/20 so that no user ends up in both sets.
# randomSplit with a fixed seed yields disjoint splits that together cover all
# of `userids` and are repeatable for the same input; the previous unseeded
# sample() followed by subtract() produced a different split on every run.
SPLIT_SEED = 42
train_userids, test_userids = userids.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
display(train_userids.count())

In [0]:
# Number of users assigned to the test split.
test_user_count = test_userids.count()
display(test_user_count)

In [0]:
# Trajectories whose userid is in the training user set (inner join on userid).
train_df = df.join(train_userids, "userid", "inner")

display(train_df)

In [0]:
# Trajectories whose userid is in the test user set (inner join on userid).
test_df = df.join(test_userids, "userid", "inner")

display(test_df)

In [0]:
# Distinct users actually present in the training trajectories.
(
    train_df
    .select("userid")
    .distinct()
    .count()
)


In [0]:
# Distinct users actually present in the test trajectories.
(
    test_df
    .select("userid")
    .distinct()
    .count()
)

In [0]:
# Persist both splits as tables, train first and then test, replacing any previous versions.
for split_frame, split_table in (
    (train_df, "main_prod.datascience_scratchpad.nyc_train_df_v2"),
    (test_df, "main_prod.datascience_scratchpad.nyc_test_df_v2"),
):
    split_frame.write.mode("overwrite").saveAsTable(split_table)

In [0]:
# Trajectory counts of the two splits, shown as (train, test).
train_rows = train_df.count()
test_rows = test_df.count()
(train_rows, test_rows)

In [0]:

# Export the training trajectories as CSV.
#
# Spark's CSV writer does not support array columns, and both `polylines` and
# `timestamps` are arrays, so writing train_df directly raises an
# AnalysisException. They are serialized to JSON strings first, the same way
# the later export cells do it.
from pyspark.sql.functions import col, to_json

s3_train_path = "s3://ml-datasets-datalakeprod-us-west-2-sagemaker/jatin-trajcl-exp/train.csv"

(
    train_df
    .withColumn("polylines", to_json(col("polylines")))
    .withColumn("timestamps", to_json(col("timestamps")))
    .write.mode("overwrite")
    .csv(s3_train_path)
)



In [0]:
from pyspark.sql.functions import to_json, col

# NOTE(review): this path is reassigned by the following cell before any write uses it.
s3_test_path = "s3://ml-datasets-datalakeprod-us-west-2-sagemaker/jatin-trajcl-exp/test.csv"

# JSON-encode the two array columns in place (same column names and positions)
# so the frame can be written as CSV.
test_df_clean = test_df
for array_column in ("polylines", "timestamps"):
    test_df_clean = test_df_clean.withColumn(array_column, to_json(col(array_column)))

display(test_df_clean)

In [0]:

# Export the JSON-encoded test split as CSV, replacing any previous export at this path.
s3_test_path = "s3://earnin-prod-datalake-us-west-2-dl-scratchpad/jatin/trajcl-exp/test.csv"
(
    test_df_clean.write
    .mode("overwrite")
    .csv(s3_test_path)
)

In [0]:
# Reload the persisted training split and JSON-encode its array columns so it
# can be written as CSV.
#
# Reads the `_v2` table that this notebook writes, rather than the older
# `nyc_train_df`, so the exported training set comes from the same user split
# as the exported test set.
# NOTE(review): if `nyc_train_df` (non-v2) was intended, confirm that it was
# produced by the same split as the test export; otherwise users may overlap.
train_df = spark.table("main_prod.datascience_scratchpad.nyc_train_df_v2")

train_df_clean = train_df.withColumn("polylines", to_json(col("polylines"))) \
             .withColumn("timestamps", to_json(col("timestamps")))

display(train_df_clean)

In [0]:
# Export the JSON-encoded training split as CSV, replacing any previous export at this path.
s3_train_path = "s3://earnin-prod-datalake-us-west-2-dl-scratchpad/jatin/trajcl-exp/train.csv"
(
    train_df_clean.write
    .mode("overwrite")
    .csv(s3_train_path)
)