In [0]:
# Load the full trajectory test table into `df` and register it as the temp view
# `all_traj_test_data` so later SQL cells can reference it by name.
query = """
select * from main_prod.datascience_scratchpad.all_traj_test_v2
"""
df = spark.sql(query)
df.createOrReplaceTempView("all_traj_test_data")
display(df)

In [0]:
# Add a `localized_ts` column: `location_timestamp` is divided by 1000 (so presumably
# epoch milliseconds — TODO confirm) and shifted to America/Los_Angeles.
# This rebinds `df`, replacing the frame loaded in the first cell.
# NOTE(review): `la_traj_data` is not registered anywhere in the visible notebook (the
# first cell registers `all_traj_test_data`) — confirm the view/table exists in the
# session, otherwise this cell fails on a fresh kernel.
# NOTE(review): from_unixtime renders in the session time zone before
# from_utc_timestamp is applied — this presumably assumes a UTC session; verify.
query = """
select *, from_utc_timestamp(from_unixtime(location_timestamp/1000), 'America/Los_Angeles') as localized_ts from la_traj_data
"""
df = spark.sql(query)
display(df)


In [0]:
# Keep rows whose localized_ts falls in the half-open window
# [2022-01-01, 2025-07-23): on/after 1 Jan 2022 and strictly before 23 Jul 2025.
df = df.filter((df.localized_ts >= '2022-01-01') & (df.localized_ts < '2025-07-23'))
display(df)

In [0]:
# df.count()

In [0]:
# train_df = spark.read.table("main_prod.datascience_scratchpad.nyc_train_filtered")
# display(train_df)

In [0]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf, col
# def get_userid(traj_id):
#     userid, traj_date = traj_id.split("_")
#     return int(userid)


# udf_get_userid = udf(get_userid, IntegerType())
# train_df_v2 = train_df.withColumn('userid', udf_get_userid(col('traj_id')))
# display(train_df_v2)

In [0]:
# df_filtered = df.join(train_df_v2, on='userid', how='left_anti')

# display(df_filtered)

In [0]:
df.select(col('userid')).distinct().count()

In [0]:
query = """
select * from main_prod.earnings_analysis.fact_user_earnings_daily
"""
earnings_df = spark.sql(query)
display(earnings_df)

In [0]:
earnings_df_fil = earnings_df.join(df, on='userid', how='inner')
display(earnings_df_fil)

In [0]:
earnings_df_fil.createOrReplaceTempView("earnings_df_fil")

In [0]:
# Collapse earnings to one row per (userid, employername, paydate), taking the
# largest non-null total_pck_amt for that day as `pck_amt`. Only paydates in
# [2022-01-01, 2025-07-23) are kept.
query = """
select userid, employername, paydate, max(total_pck_amt) as pck_amt from earnings_df_fil where paydate >= '2022-01-01' and paydate < '2025-07-23' and total_pck_amt is not null group by userid, employername, paydate
"""
earnings_df_agg = spark.sql(query)
display(earnings_df_agg)

In [0]:
# earnings_df_agg.select(col('userid')).distinct().count()
earnings_df_agg.createOrReplaceTempView("earnings_df_agg")

In [0]:
# earnings_df_agg.select(col('userid')).distinct().count()

In [0]:

from pyspark.sql.functions import collect_list, struct, col, expr
# Build one row per (userid, employername) holding two parallel arrays:
# `paydates` and the matching `pck_amts`.
user_emp_paydate_df = (
    earnings_df_agg
    # Drop rows with any null key or amount before collecting.
    .filter("userid IS NOT NULL AND employername IS NOT NULL AND paydate IS NOT NULL AND pck_amt IS NOT NULL")
    # Pack (paydate, pck_amt) into a struct so both values are sorted together.
    .select("userid", "employername", struct("paydate", "pck_amt").alias("entry"))
    .groupBy("userid", "employername")
    .agg(collect_list("entry").alias("entries"))
    # array_sort on structs presumably compares field by field, i.e. by paydate
    # first and pck_amt second — confirm against the Spark SQL docs.
    .withColumn("sorted_entries", expr("array_sort(entries)"))
    # Unpack the sorted structs back into two index-aligned arrays.
    .withColumn("paydates", expr("transform(sorted_entries, x -> x.paydate)"))
    .withColumn("pck_amts", expr("transform(sorted_entries, x -> x.pck_amt)"))
    .select("userid", "employername", "paydates", "pck_amts")
)

In [0]:
display(user_emp_paydate_df)

In [0]:
import pandas as pd
def partition_6months(paydates, pck_amts, window_days=180):
    """Split a sorted pay history into consecutive windows of about six months.

    A new window starts at the first paydate that lies ``window_days`` or more
    after the first paydate of the current window.

    Args:
        paydates: Paydates in ascending order (dates or timestamps that support
            subtraction).
        pck_amts: Paycheck amounts, index-aligned with ``paydates``.
        window_days: Window length in days; defaults to 180.

    Returns:
        A list of ``(paydates_slice, pck_amts_slice)`` tuples, one per window.
        Histories with fewer than two paydates yield ``[]``; the notebook
        filters such rows out afterwards.
    """
    if paydates is None or len(paydates) < 2:
        return []

    window = pd.Timedelta(days=window_days)
    partitions = []
    start_idx = 0
    for i in range(1, len(paydates)):
        if paydates[i] - paydates[start_idx] >= window:
            partitions.append((paydates[start_idx:i], pck_amts[start_idx:i]))
            start_idx = i
    # Always flush the trailing window. Previously, when the last paydate opened
    # a new window, that final paydate was silently dropped from the output.
    partitions.append((paydates[start_idx:], pck_amts[start_idx:]))
    return partitions
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, IntegerType, DateType, DoubleType, DecimalType

# Wrap partition_6months as a Spark UDF. Each returned (paydates, pck_amts) tuple
# is mapped positionally onto the struct fields declared below.
partitions_udf = udf(partition_6months, ArrayType(StructType([
    StructField("paydates", ArrayType(DateType())),
    StructField("pck_amts", ArrayType(DecimalType(19, 2)))
    ]))) 

# Add a `partitions` column: the per-(user, employer) pay history split into windows.
user_emp_paydates_partitioned_df = user_emp_paydate_df.withColumn("partitions", partitions_udf(col("paydates"), col("pck_amts")))
display(user_emp_paydates_partitioned_df)

In [0]:
# remove data with no partitions
user_emp_paydates_partitioned_df_fil = user_emp_paydates_partitioned_df.filter("size(partitions) > 0")
display(user_emp_paydates_partitioned_df_fil)

In [0]:
user_emp_paydates_partitioned_df_fil.write.mode("overwrite").saveAsTable("main_prod.datascience_scratchpad.user_emp_paydates_partitioned_df_fil_all")

In [0]:
# Reload the partitioned pay history from the scratchpad table.
# NOTE(review): the preceding cell writes `..._partitioned_df_fil_all`, but this
# cell reads `..._partitioned_df_fil` (no `_all` suffix) — confirm which table is
# intended; as written the data saved above is not what gets read back.
user_emp_paydates_partitioned_df_fil = spark.table("main_prod.datascience_scratchpad.user_emp_paydates_partitioned_df_fil")
display(user_emp_paydates_partitioned_df_fil)


In [0]:
def filter_less_than_3_pos_paycycle(partitions):
    """Keep only partitions that contain at least three positive paycheck amounts.

    Args:
        partitions: Iterable of mappings/rows exposing a ``"pck_amts"`` sequence.

    Returns:
        A new list with the qualifying partitions, in their original order.
    """
    def positive_paychecks(amounts):
        # Number of strictly positive amounts in one partition.
        return sum(1 for amount in amounts if amount > 0)

    return [
        part
        for part in partitions
        if positive_paychecks(part["pck_amts"]) >= 3
    ]
from pyspark.sql.types import ArrayType, StructType, StructField, DateType, DecimalType
from pyspark.sql.functions import col
filter_less_than_3_pos_paycycle_udf = udf(filter_less_than_3_pos_paycycle, ArrayType(StructType([

    StructField("paydates", ArrayType(DateType())),
    StructField("pck_amts", ArrayType(DecimalType(19, 2)))
    ]))) 

user_emp_paydates_partitioned_df_fil_valid = user_emp_paydates_partitioned_df_fil.withColumn("partitions", filter_less_than_3_pos_paycycle_udf(col("partitions")))
display(user_emp_paydates_partitioned_df_fil_valid)

In [0]:
# Drop user/employer rows whose partition list is empty after the previous cell's
# ">= 3 positive paychecks" UDF. The filter must run on the UDF output
# (`..._fil_valid`); filtering the input frame `..._fil` would throw that result
# away and leave the partitions unfiltered.
user_emp_paydates_partitioned_df_fil_valid = user_emp_paydates_partitioned_df_fil_valid.filter("size(partitions) > 0")
display(user_emp_paydates_partitioned_df_fil_valid)

In [0]:
user_emp_paydates_partitioned_df_fil_valid.count()

In [0]:
user_emp_paydates_df_pd = user_emp_paydates_partitioned_df_fil_valid.toPandas()

In [0]:
display(user_emp_paydates_df_pd.head())

In [0]:
len(user_emp_paydates_df_pd['userid'].unique())

In [0]:
# save pandas df as parquet file
user_emp_paydates_df_pd.to_parquet("/Volumes/main_prod/datascience_scratchpad/jatin/trajcl_exp/usa/test/user_emp_paydates_pck_df_all.parquet")

In [0]:
import pandas as pd

# Read back the parquet file saved by the cell above. The path must be absolute
# (leading "/") to match the location it was written to; a relative "Volumes/..."
# path would be resolved against the current working directory instead.
user_emp_paydates_df_pd = pd.read_parquet("/Volumes/main_prod/datascience_scratchpad/jatin/trajcl_exp/usa/test/user_emp_paydates_pck_df_all.parquet")


In [0]:
df.createOrReplaceTempView("all_traj_data_loc_ts")

In [0]:
df.count()

In [0]:
# Keep only rows with a usable user id ('0' is treated as invalid), timestamp and
# coordinates, then count them.
# NOTE(review): this reads the view `la_traj_data_loc_ts`, but the cell above
# registers `df` as `all_traj_data_loc_ts`, and `la_traj_data_loc_ts` is only
# (re)registered from `df_fil` in the next cell. On a fresh kernel this query has
# no source view — confirm which view name is intended.
query = """
select * from la_traj_data_loc_ts where userid != '0' and userid is not null and localized_ts is not null and latitude is not null and longitude is not null
"""
df_fil = spark.sql(query)

df_fil.count()

In [0]:
df_fil.createOrReplaceTempView("la_traj_data_loc_ts")

In [0]:

# Build one row per (userid, calendar day) with three collected arrays:
# timestamps, latitudes and longitudes of that user's points on that day.
# NOTE(review): COLLECT_LIST does not promise any element order; the arrays are
# re-sorted by timestamp in a later cell (sort_by_time), which presumably relies on
# the three lists being index-aligned with each other — verify.
# The null checks in WHERE repeat the filter already applied when the view was built.
query = """
SELECT 
    userid,
    DATE(localized_ts) AS traj_date,
    COLLECT_LIST(localized_ts) AS timestamps,
    COLLECT_LIST(latitude) AS latitudes,
    COLLECT_LIST(longitude) AS longitudes
FROM 
    la_traj_data_loc_ts
WHERE
    userid IS NOT NULL 
    AND localized_ts IS NOT NULL 
    AND latitude IS NOT NULL 
    AND longitude IS NOT NULL
GROUP BY 
    userid, DATE(localized_ts)
ORDER BY 
    userid, traj_date
"""

result_df = spark.sql(query)
display(result_df)

In [0]:
display(df.where("userid = '14653656'").select("userid", "localized_ts","longitude", "latitude"))

In [0]:
result_df_fil = result_df.where("userid != '0'")
display(result_df_fil)


In [0]:
display(result_df_fil.where("userid = '118'"))

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, ArrayType, DoubleType, DateType, TimestampType
def sort_by_time(timestamps, longitudes, latitudes):
    """Order a trajectory's points chronologically.

    Args:
        timestamps: Per-point timestamps (any mutually comparable values).
        longitudes: Longitudes, index-aligned with ``timestamps``.
        latitudes: Latitudes, index-aligned with ``timestamps``.

    Returns:
        ``(sorted_ts, polylines)``: the timestamps in ascending order and the
        matching ``[lon, lat]`` float pairs. Points with equal timestamps keep
        their input order, because only the timestamp is used as the sort key.
    """
    ordered_points = sorted(
        zip(timestamps, longitudes, latitudes),
        key=lambda point: point[0],
    )
    sorted_ts = [ts for ts, _, _ in ordered_points]
    polylines = [[float(lon), float(lat)] for _, lon, lat in ordered_points]
    return sorted_ts, polylines


sort_and_extract_udf = udf(sort_by_time, 
                           StructType([
                               StructField("sorted_ts", ArrayType(TimestampType())),
                               StructField("polylines", ArrayType(ArrayType(DoubleType())))
                           ]))


result_df_v2 = result_df_fil.withColumn("sorted_data", 
                                         sort_and_extract_udf("timestamps", "longitudes", "latitudes"))

result_df_v2 = result_df_v2.withColumn("sorted_ts", col("sorted_data.sorted_ts")) \
                             .withColumn("polylines", col("sorted_data.polylines")) \
                             .drop("sorted_data")
display(result_df_v2)

In [0]:
result_df_v3 = result_df_v2.drop("timestamps", "longitudes", "latitudes")
display(result_df_v3)

In [0]:
from pyspark.sql.functions import unix_timestamp, col
from pyspark.sql.types import IntegerType

def get_distant_ts(timestamps, min_gap_seconds=600):
    """Pick indices of points spaced more than ``min_gap_seconds`` apart.

    The first point is always kept. After that, the elapsed time between
    consecutive points is accumulated, and a point is kept (and the accumulator
    reset) once the accumulated time exceeds ``min_gap_seconds``.

    Args:
        timestamps: Chronologically sorted datetimes.
        min_gap_seconds: Minimum accumulated gap, in seconds, before the next
            point is kept; defaults to 600 (10 minutes).

    Returns:
        List of kept indices into ``timestamps``. An empty or missing input
        yields ``[]``; returning ``[0]`` there would point at a non-existent
        element.
    """
    if not timestamps:
        return []

    final_indices = [0]
    elapsed = 0
    for i in range(1, len(timestamps)):
        elapsed += (timestamps[i] - timestamps[i - 1]).total_seconds()
        if elapsed > min_gap_seconds:
            final_indices.append(i)
            elapsed = 0
    return final_indices

get_distant_ts_udf = udf(lambda ts: get_distant_ts(ts), ArrayType(IntegerType()))

result_df_v4 = result_df_v3.withColumn("final_indices", get_distant_ts_udf(col("sorted_ts")))
display(result_df_v4)

In [0]:
result_df_v4.createOrReplaceTempView("la_traj_data_loc_ts_v4")

In [0]:
result_df_v4.write.mode("overwrite").saveAsTable("main_prod.datascience_scratchpad.la_traj_data_v4")

In [0]:
query = """
select * from la_traj_data_loc_ts_v4 where size(final_indices) >= 15
"""
result_df_v5 = spark.sql(query)
display(result_df_v5)

In [0]:
result_df_v5.write.mode("overwrite").saveAsTable("main_prod.datascience_scratchpad.la_traj_data_v5")

In [0]:
result_df_v5 = spark.read.table("main_prod.datascience_scratchpad.la_traj_data_v5")

In [0]:
display(result_df_v5)

In [0]:
result_df_v5.select('userid').distinct().count()

In [0]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StructType, StructField, ArrayType, DoubleType, DateType, TimestampType
def apply_filter_index(ts,polylines, final_indices):
    """Select the timestamps and points located at ``final_indices``.

    Args:
        ts: Sequence of timestamps.
        polylines: Sequence of points, index-aligned with ``ts``.
        final_indices: Positions to keep, in output order.

    Returns:
        ``(final_ts, final_polylines)`` containing the selected elements.
    """
    final_ts = [ts[idx] for idx in final_indices]
    final_polylines = [polylines[idx] for idx in final_indices]
    return final_ts, final_polylines
apply_filter_index_udf = udf(lambda ts,polylines, final_indices: apply_filter_index(ts,polylines, final_indices), 
                           StructType([
                               StructField("final_ts", ArrayType(TimestampType())),
                               StructField("final_polylines", ArrayType(ArrayType(DoubleType())))
                           ]))
result_df_v6 = result_df_v5.withColumn("final_data", 
                                         apply_filter_index_udf(col("sorted_ts"), col("polylines"), col("final_indices")))
result_df_v6 = result_df_v6.withColumn("timestamps", col("final_data.final_ts")) \
                             .withColumn("polylines_final", col("final_data.final_polylines")) \
                             .drop("final_data","final_indices", "sorted_ts","polylines")
display(result_df_v6)

In [0]:
result_df_v6 = result_df_v6.withColumnRenamed("polylines_final", "wgs_seq")
result_df_v6 = result_df_v6.filter(col("wgs_seq").isNotNull())

In [0]:
import math
def lonlat2meters(lon, lat):
    """Project a longitude/latitude pair (degrees) with the spherical Mercator formula.

    Args:
        lon: Longitude in degrees.
        lat: Latitude in degrees.

    Returns:
        ``(x, y)`` in metres, where ``x = R * lon_rad`` and
        ``y = (R / 2) * ln((1 + sin(lat_rad)) / (1 - sin(lat_rad)))`` with
        ``R = 6378137.0``.
    """
    deg_to_rad = 0.017453292519943295  # pi / 180
    semimajoraxis = 6378137.0
    sin_lat = math.sin(lat * deg_to_rad)
    x = semimajoraxis * (lon * deg_to_rad)
    # 3189068.5 is semimajoraxis / 2.
    y = 3189068.5 * math.log((1 + sin_lat) / (1 - sin_lat))
    return x, y

lonlat2meters_udf = udf(lambda traj: [list(lonlat2meters(p[0], p[1])) for p in traj], ArrayType(ArrayType(DoubleType())))
result_df_v7 = result_df_v6.withColumn("merc_seq", lonlat2meters_udf(col("wgs_seq")))
display(result_df_v7)

In [0]:
la_test_df = spark.read.table("main_prod.datascience_scratchpad.la_traj_data_v9_part1")
display(la_test_df)

In [0]:
# remove rows from result_df_v7 where userid matches with la_test_df

result_df_v8 = result_df_v7.join(la_test_df, on='userid', how='left_anti')
display(result_df_v8)

In [0]:
nyc_df = spark.read.table("main_prod.datascience_scratchpad.nyc_test_filtered")
display(nyc_df)
# remove rows from result_df_v7 where userid matches with nyc_df


In [0]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StructType, StructField, ArrayType, DoubleType, DateType, TimestampType, StringType
def get_traj_id(userid, traj_date):
    """Build the trajectory key ``<userid>_<YYYY-MM-DD>``.

    Args:
        userid: User identifier; converted with ``str()``.
        traj_date: Object with ``strftime`` (e.g. a ``datetime.date``).

    Returns:
        The user id and ISO-formatted date joined by an underscore.
    """
    date_part = traj_date.strftime("%Y-%m-%d")
    return "_".join((str(userid), date_part))
get_traj_id_udf = udf(lambda userid, traj_date: get_traj_id(userid, traj_date), StringType())
result_df_v9 = result_df_v8.withColumn("traj_id", get_traj_id_udf(col("userid"), col("traj_date")))
display(result_df_v9)

In [0]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StructType, StructField, ArrayType, DoubleType, DateType, TimestampType, IntegerType
def get_traj_len(traj):
    """Return the number of points in a trajectory sequence."""
    n_points = len(traj)
    return n_points
get_traj_len_udf = udf(lambda traj: get_traj_len(traj), IntegerType())
result_df_v10 = result_df_v9.withColumn("traj_len", get_traj_len_udf(col("merc_seq")))
display(result_df_v10)

In [0]:
la_df = result_df_v10.drop("traj_date")
# rename columns
la_df = la_df.withColumnRenamed("timestamps", "timestamps_filtered")
la_df = la_df.withColumnRenamed("merc_seq", "merc_seq_filtered")
la_df = la_df.withColumnRenamed("wgs_seq", "wgs_seq_filtered")
la_df = la_df.withColumnRenamed("traj_len", "trajlen")
display(la_df)

In [0]:
# Stack the LA trajectories underneath the NYC ones.
# `la_df` is first narrowed/reordered to a fixed six-column layout.
la_df = la_df.select("userid", "traj_id","timestamps_filtered","wgs_seq_filtered", "merc_seq_filtered", "trajlen" )

# NOTE(review): DataFrame.union matches columns by POSITION, not by name, so this
# presumably relies on `nyc_df` having exactly these six columns in this order —
# verify the nyc_test_filtered schema (or consider unionByName).
combined_df = nyc_df.union(la_df)
display(combined_df)

In [0]:
combined_df.count()

In [0]:
combined_df.write.mode("overwrite").saveAsTable("main_prod.datascience_scratchpad.la_nyc_traj_data_v1")

In [0]:
display(user_emp_paydates_df_pd.head())

In [0]:
display(df)

In [0]:
import bisect
from pyspark.sql.types import IntegerType, StringType, StructType, StructField, DoubleType, DecimalType
def get_emp_pck_partitionid(userid, traj_date):
    """Look up the employer and paycheck that a user's trajectory date belongs to.

    Reads the notebook-global pandas frame ``user_emp_paydates_df_pd`` (columns
    used: ``userid``, ``employername``, ``paydates``, ``pck_amts``,
    ``partitions``).

    The first employer row is chosen whose pay history covers ``traj_date``: the
    date must be strictly before the last paydate, and either on/after the first
    paydate or less than 7 days before it. Within that row, the paycheck is the
    first paydate strictly after ``traj_date``.

    Args:
        userid: User identifier, compared with ``==`` against the ``userid`` column.
        traj_date: Date of the trajectory; must be comparable with the paydates.

    Returns:
        ``(employername, partition_id, pck_amt, idx_in_partition)``.
        ``partition_id`` is -1 and ``idx_in_partition`` is None when the paydate
        is not found in any partition. All four values are None when no employer
        row matches or when the lookup raises.
    """
    no_match = (None, None, None, None)
    try:
        # Named `user_rows` so it does not shadow the notebook-level Spark `df`.
        user_rows = user_emp_paydates_df_pd[user_emp_paydates_df_pd['userid'] == userid].reset_index(drop=True)
        paydates_list = list(user_rows['paydates'])

        idx = -1
        for i, paydates in enumerate(paydates_list):
            first_paydate = paydates[0]
            last_paydate = paydates[-1]
            if traj_date < last_paydate and (traj_date >= first_paydate or (first_paydate - traj_date).days < 7):
                idx = i
                break
        if idx == -1:
            return no_match

        row = user_rows.iloc[idx]
        employername = row['employername']
        partition = row['partitions']
        paydates = paydates_list[idx]
        # First paydate strictly after traj_date; in range because
        # traj_date < last_paydate was checked above.
        paydate_idx = bisect.bisect_right(paydates, traj_date)
        paydate = paydates[paydate_idx]
        pck_amt = row['pck_amts'][paydate_idx]

        idx_in_partition = None
        partition_id = -1
        for i in range(len(partition)):
            if paydate in partition[i]['paydates']:
                partition_id = i
                idx_in_partition = bisect.bisect_left(partition[i]['paydates'], paydate)

        return employername, partition_id, pck_amt, idx_in_partition
    except Exception:
        # Best-effort lookup: malformed rows produce "no match" instead of failing
        # the whole Spark job. `Exception` (not a bare except) lets
        # KeyboardInterrupt/SystemExit propagate.
        return no_match


# Wrap the lookup as a Spark UDF returning a struct with the four looked-up fields.
# NOTE(review): get_emp_pck_partitionid reads the global pandas frame
# `user_emp_paydates_df_pd`, so that frame presumably gets shipped to the executors
# with the UDF — confirm it is small enough for that.
get_emp_pck_partitionid_udf = udf(lambda userid, traj_date: get_emp_pck_partitionid(userid, traj_date), 
                           StructType([
                               StructField("employername", StringType()),
                               StructField("partition_id", IntegerType()),
                               StructField("pck_amt", DecimalType(19,2)),
                               StructField("idx_in_partition", IntegerType())
                           ]))
# Attach the struct as `emp_pck_partitionid`; it is unpacked into columns in a later cell.
# This rebinds `result_df_v7`, which an earlier cell already used for a different frame.
# NOTE(review): requires `df` to have `userid` and `traj_date` columns — confirm
# `df` is the intended input here.
result_df_v7 = df.withColumn("emp_pck_partitionid", 
                                         get_emp_pck_partitionid_udf(col("userid"), col("traj_date")))
# result_df_v7 = result_df_v7.withColumn("employername", col("emp_pck_partitionid.employername")) \
#                              .withColumn("partition_id", col("emp_pck_partitionid.partition_id")) \
#                              .withColumn("pck_amt", col("emp_pck_partitionid.pck_amt")) \
#                              .withColumn("set_type", col("emp_pck_partitionid.set_type")) \
#                              .drop("emp_pck_partitionid")
# dummy_v2 = dummy.withColumn("emp_pck_partitionid", 
#                                           get_emp_pck_partitionid_udf(col("userid"), col("traj_date")))
# display(dummy_v2)

# dummy_pd = dummy.toPandas()
# for i,row in dummy_pd.iterrows():
#     userid = row['userid']
#     traj_date = row['traj_date']
#     employername, partition_id, pck_amt, set_type = get_emp_pck_partitionid(userid, traj_date)
#     print(traj_date, employername, partition_id, pck_amt, set_type)
# display(result_df_v7)


In [0]:
display(result_df_v7)

In [0]:
result_df_v8 = result_df_v7.withColumn("employername", col("emp_pck_partitionid.employername")) \
                             .withColumn("partition_id", col("emp_pck_partitionid.partition_id")) \
                             .withColumn("pck_amt", col("emp_pck_partitionid.pck_amt")) \
                             .withColumn("idx_in_partition", col("emp_pck_partitionid.idx_in_partition")) \
                             .drop("emp_pck_partitionid")

In [0]:
display(result_df_v8)

In [0]:
result_df_v9 = result_df_v8.where("employername is not NULL and pck_amt is not NULL and idx_in_partition is not NULL and partition_id is not NULL")
display(result_df_v9)

In [0]:
result_df_v9.select("userid").distinct().count()

In [0]:
# get the list of distinct userids

userid_list = result_df_v9.select("userid").distinct().toPandas()['userid'].tolist()


In [0]:
import numpy as np
# Randomly select up to 1000 distinct userids (seeded so the sample is reproducible).
np.random.seed(123)
# Without replacement, np.random.choice raises if asked for more items than exist,
# so cap the sample size at the number of available users.
sample_size = min(1000, len(userid_list))
# .tolist() yields plain Python values rather than numpy scalars for isin().
random_sample = np.random.choice(userid_list, sample_size, replace=False).tolist()

# Keep only the trajectories that belong to the sampled users.
result_df_v9_subset = result_df_v9.filter(col("userid").isin(random_sample))


In [0]:
display(result_df_v9_subset)

In [0]:
def pck_0_1(pck_amt):
    """Binarise a paycheck amount: 1 when strictly positive, otherwise 0."""
    return 1 if pck_amt > 0 else 0
    
pck_0_1_udf = udf(lambda pck_amt: pck_0_1(pck_amt), IntegerType())
result_df_v9_v2 = result_df_v9_subset.withColumn("pck_0_1", pck_0_1_udf(col("pck_amt")))

result_df_v9_v2 = result_df_v9_v2.drop("pck_amt")
display(result_df_v9_v2)

In [0]:
result_df_v9_v2.count()

In [0]:
# Persist the 0/1-labelled subset as parquet.
# NOTE(review): the target path ends in ".pkl" although the data is written in
# parquet format — the suffix is misleading; confirm before any reader relies on it.
result_df_v9_v2.write.mode("overwrite").parquet("/Volumes/main_prod/datascience_scratchpad/jatin/trajcl_exp/la/result_df_v9.pkl")

In [0]:
result_df_v9_v2.write.mode("overwrite").saveAsTable("main_prod.datascience_scratchpad.la_traj_data_v9_subset_1k")

In [0]:
result_df_v9.write.mode("overwrite").saveAsTable("main_prod.datascience_scratchpad.all_traj_test_v9")

In [0]:
result_df_v9.createOrReplaceTempView("result_df_v9")


In [0]:
result_df_v9

In [0]:
col_list_1 = ["userid", "traj_date", "timestamps", "merc_seq", "wgs_seq"]
col_list_2 = ["userid", "traj_date", "employername", "partition_id", "pck_amt", "idx_in_partition"]

result_df_part1 = result_df_v9.select(col_list_1)
result_df_part2 = result_df_v9.select(col_list_2)

display(result_df_part1)


In [0]:
display(result_df_part2)

In [0]:
result_df_part2.createOrReplaceTempView("all_traj_data_v9_part2")

In [0]:
from pyspark.sql import functions as F
result_df_v10_part2 = (
    result_df_part2.select(
        "userid",
        "employername",
        F.col("partition_id"),
        F.col("idx_in_partition").alias("idx_in_partition")
    )
    .groupBy("userid", "employername", "partition_id")
    .agg(
        F.max("idx_in_partition").alias("max_idx_in_partition")
    )
)
display(result_df_v10_part2)

In [0]:
result_df_v10_part2.createOrReplaceTempView("result_df_v10_part2")

In [0]:
query = """
select all_traj_data_v9_part2.*, result_df_v10_part2.max_idx_in_partition as max_idx_in_partition from all_traj_data_v9_part2 as all_traj_data_v9_part2 join result_df_v10_part2 on all_traj_data_v9_part2.userid = result_df_v10_part2.userid and all_traj_data_v9_part2.partition_id = result_df_v10_part2.partition_id and all_traj_data_v9_part2.employername = result_df_v10_part2.employername
"""
result_df_v11 = spark.sql(query)
display(result_df_v11)

In [0]:
def get_set_type(idx_in_partition, max_idx_in_partition):
    """Label a row "test" when it sits at the partition's highest index, else "train"."""
    is_last_in_partition = idx_in_partition == max_idx_in_partition
    return "test" if is_last_in_partition else "train"
    
result_df_v12 = result_df_v11.withColumn("set_type", F.udf(get_set_type, returnType=StringType())(F.col("idx_in_partition"), F.col("max_idx_in_partition")))
display(result_df_v12)


In [0]:
from pyspark.sql import functions as F

# Count rows per (userid, employername, partition_id, set_type) and keep only the
# groups that are large enough: at least 12 rows for "train", at least 4 for "test".
filtered_df = (
    result_df_v12.groupBy("userid", "employername", "partition_id", "set_type")
      .agg(F.count("*").alias("cnt"))
      .filter(((F.col("cnt") >= 12) & (F.col('set_type')=="train")) | ((F.col('set_type')=="test") & (F.col("cnt") >= 4)))  # `cnt` stays in the output
)
display(filtered_df)
# Disabled DataFrame-API variant of the join back onto result_df_v12.
# NOTE(review): it references "partitionid", but the column is named "partition_id".
# result_df_v13 = result_df_v12.join(filtered_df, on=["userid", "employername", "partitionid"], how="inner")


In [0]:
display(filtered_df.where("userid = '14789443'"))

In [0]:
filtered_df.count()

In [0]:
filtered_df.createOrReplaceTempView("filtered_df")
result_df_v12.createOrReplaceTempView("result_df_v12")

In [0]:
query = """
select result_df_v12.*, filtered_df.cnt as cnt from result_df_v12 join filtered_df on result_df_v12.userid = filtered_df.userid and result_df_v12.partition_id = filtered_df.partition_id and result_df_v12.employername = filtered_df.employername and result_df_v12.set_type = filtered_df.set_type
"""


result_df_v13 = spark.sql(query)
display(result_df_v13)

In [0]:
result_df_v13.createOrReplaceTempView("result_df_v13")

In [0]:
result_df_v12.write.mode("overwrite").saveAsTable("main_prod.datascience_scratchpad.all_traj_test_data_v12")

In [0]:
result_df_v12.createOrReplaceTempView("result_df_v12")

In [0]:
result_df_v13.write.mode("overwrite").saveAsTable("main_prod.datascience_scratchpad.la_traj_data_v13")

In [0]:
result_df_part1.createOrReplaceTempView("all_traj_data_v9_part1")

In [0]:
# Re-attach the trajectory payload (part1 columns) to the per-trajectory labels
# (employername, partition_id, pck_amt, set_type), matching on userid and traj_date.
# NOTE(review): this joins against `result_df_v12` (all groups), not the
# size-filtered `result_df_v13` — confirm that skipping the minimum train/test
# count filter here is intended.
query = """
select all_traj_data_v9_part1.*, result_df_v12.employername as employername, result_df_v12.partition_id as partition_id, result_df_v12.pck_amt as pck_amt, result_df_v12.set_type as set_type from all_traj_data_v9_part1 join result_df_v12 on all_traj_data_v9_part1.userid = result_df_v12.userid and all_traj_data_v9_part1.traj_date = result_df_v12.traj_date"""
result_df_v14 = spark.sql(query)
display(result_df_v14)

In [0]:

result_df_v14.write.mode("overwrite").saveAsTable("main_prod.datascience_scratchpad.nyc_traj_data_v14")

In [0]:
result_df_v14 = spark.read.table("main_prod.datascience_scratchpad.nyc_traj_data_v14")

In [0]:
result_df_v14 = result_df_v14.withColumnRenamed("polylines_final", "wgs_seq")
result_df_v14 = result_df_v14.filter(col("wgs_seq").isNotNull())
display(result_df_v14)

In [0]:
from pyspark.sql.functions import expr
result_df_v15 = result_df_v14.withColumn("trajlen", expr("size(wgs_seq)"))

display(result_df_v15)

In [0]:
import math
def lonlat2meters(lon, lat):
    """Convert degrees of longitude/latitude to spherical-Mercator metres.

    Args:
        lon: Longitude in degrees.
        lat: Latitude in degrees.

    Returns:
        ``(easting, northing)`` in metres for a sphere of radius 6378137.0.
    """
    radians_per_degree = 0.017453292519943295  # pi / 180
    semimajoraxis = 6378137.0
    lon_rad = lon * radians_per_degree
    lat_rad = lat * radians_per_degree
    sin_lat = math.sin(lat_rad)
    easting = semimajoraxis * lon_rad
    # 3189068.5 equals semimajoraxis / 2.
    northing = 3189068.5 * math.log((1 + sin_lat) / (1 - sin_lat))
    return easting, northing

In [0]:
lonlat2meters_udf = udf(lambda traj: [list(lonlat2meters(p[0], p[1])) for p in traj], ArrayType(ArrayType(DoubleType())))
result_df_v16 = result_df_v15.withColumn("merc_seq", lonlat2meters_udf(col("wgs_seq")))
display(result_df_v16)

In [0]:
result_df_v16.select("userid").distinct().count()

In [0]:
result_df_v16.select('employername').distinct().count()

In [0]:
# total partitions

result_df_v16.select('userid','employername','partition_id').distinct().count()


In [0]:
result_df_v16.write.mode("overwrite").saveAsTable("main_prod.datascience_scratchpad.nyc_traj_data_v16")

In [0]:
result_df_v16.count()

In [0]:
result_df_v16_pd = result_df_v16.toPandas()
result_df_v16_pd.head()

In [0]:
def get_weekday(traj_date):
    """Return ``traj_date.weekday()`` (Monday=0 ... Sunday=6 for ``datetime.date``)."""
    weekday_index = traj_date.weekday()
    return weekday_index


get_weekday_udf = udf(lambda traj_date: get_weekday(traj_date), IntegerType())
result_df_v17 = result_df_v14.withColumn("weekday", get_weekday_udf(col("traj_date")))
display(result_df_v17)

In [0]:
result_df_v17.write.mode("overwrite").saveAsTable("main_prod.datascience_scratchpad.all_traj_test_with_pck")

In [0]:
display(result_df_v17.where("userid  = '5240'"))

In [0]:
result_df_v17.write.mode("overwrite").parquet("/Volumes/main_prod/datascience_scratchpad/jatin/trajcl_exp/nyc/hyp_2/df_v17")

In [0]:
result_df_v17.count()

In [0]:
result_df_v17.select('userid').distinct().count()

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, IntegerType

def ts_to_min_index_list(ts_list):
    """Map each timestamp to its 10-minute slot of the day.

    The slot is ``(hour * 60 + minute) // 10``, i.e. 0 for 00:00-00:09 up to
    143 for 23:50-23:59.

    Args:
        ts_list: Iterable of objects exposing ``hour`` and ``minute``.

    Returns:
        List of integer slot indices, one per input timestamp.
    """
    return [(ts.hour * 60 + ts.minute) // 10 for ts in ts_list]


udf_ts_to_min_index_list = udf(ts_to_min_index_list, ArrayType(IntegerType()))
result_df_v18 = result_df_v17.withColumn('time_index_list', udf_ts_to_min_index_list('timestamps'))
display(result_df_v18)

In [0]:
result_df_v18.write.mode("overwrite").saveAsTable("main_prod.datascience_scratchpad.nyc_traj_data_v18")