# Static variables

In [1]:
MASTER_INTERNAL_ADDRESS = "10.104.0.2"

DAY_TYPE_WEEKDAY = 0
DAY_TYPE_WEEKEND = 1
DATA_ACTUAL_TIMEZONE = "America/Los_Angeles"

MAX_SECONDS_SINCE_REPORT = 300
MIN_VELOCITY = 1 # m/s
MAX_VELOCITY = 18 # m/s

STATIC_DATA_DIR = f"hdfs://{MASTER_INTERNAL_ADDRESS}:8020/ola/static_data/"
HISTORICAL_DATA_DIR = f"hdfs://{MASTER_INTERNAL_ADDRESS}:8020/ola/historical_data/"
AGGREGATED_DATA_DIR = f"hdfs://{MASTER_INTERNAL_ADDRESS}:8020/ola/aggregated_data/"

# Init PySpark

In [2]:
import findspark
findspark.init()

In [3]:
import os
import sys
import re

from pyspark.sql import SparkSession
from pyspark import  SparkContext, SparkConf

# Spark session & context
conf = SparkConf()
conf.setMaster("local")
conf.setAppName("batch-job")

conf.set("spark.cores.max", "1")
conf.set("spark.default.parallelism", "2")
conf.set("spark.driver.memory", "2g")

conf.set("spark.driver.extraClassPath", "/root/libs/spark-3.1.1-bin-hadoop3.2/third-party-jars/*")
conf.set("spark.executor.extraClassPath", "/root/libs/spark-3.1.1-bin-hadoop3.2/third-party-jars/*")

conf.set("spark.sql.caseSensitive", "true")
conf.set("spark.ui.port", "4041")

sc = SparkContext(conf=conf)
spark = SparkSession(sc)

In [4]:
%%html
<style>
div.output_area pre {
    white-space: pre;
}
.container { 
    width:95% !important; 
}
</style>

# Load static & aggregated data

### Functions

In [5]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import *

from datetime import datetime
from pytz import timezone
import math

In [6]:
# check if segment is not an empty segment (two vertex is the same)
def fis_not_the_same(from_latlon, to_latlon):
    return (from_latlon[0] != to_latlon[0]) | (from_latlon[1] != to_latlon[1])

def ffile_path_to_ts(file_path):
    return int(file_path[-18:-4])

def fget_day_type(epoch_seconds):
    dt = datetime.fromtimestamp(epoch_seconds)
    wd = dt.astimezone(timezone(DATA_ACTUAL_TIMEZONE)).weekday()
    return DAY_TYPE_WEEKDAY if wd < 5 else DAY_TYPE_WEEKEND

def fhaversine_meter(lat1, lon1, lat2, lon2):
    # distance between latitudes and longitudes
    dLat = (lat2 - lat1) * math.pi / 180.0
    dLon = (lon2 - lon1) * math.pi / 180.0

    # convert to radians
    lat1 = (lat1) * math.pi / 180.0
    lat2 = (lat2) * math.pi / 180.0
 
    # apply formulae
    a = (math.pow(math.sin(dLat / 2), 2) +
         math.pow(math.sin(dLon / 2), 2) *
             math.cos(lat1) * math.cos(lat2));
    rad = 6371
    c = 2 * math.asin(math.sqrt(a))
    return rad * c * 1000

def ffind_distance(lat1, lon1, flat1, flon1, slat1, slon1, cl1, lat2, lon2, flat2, flon2, slat2, slon2, cl2):
    if cl1 == cl2:
        return fhaversine_meter(lat1, lon1, lat2, lon2)
        
    if cl1 > cl2:
        return ffind_distance(lat2, lon2, flat2, flon2, slat2, slon2, cl2, lat1, lon1, flat1, flon1, slat1, slon1, cl1)

    return fhaversine_meter(lat1, lon1, slat1, slon1) + cl2 - cl1 - fhaversine_meter(flat2, flon2, slat2, slon2) + fhaversine_meter(flat2, flon2, lat2, lon2)

def ffind_velocity_sign(cl1, cl2, lat1, lon1, lat2, lon2, flat, flon, slat, slon):
    if cl1 == cl2:
        d1 = fhaversine_meter(lat1, lon1, flat, flon)
        d2 = fhaversine_meter(lat2, lon2, slat, slon)
        d = fhaversine_meter(flat, flon, slat, slon)
        return 1 if d1 + d2 < d else -1
    return 1 if cl2 > cl1 else -1

In [7]:
#udf
is_not_the_same = udf(lambda a, b: ffis_not_the_same(a, b), BooleanType())

file_path_to_ts = udf(lambda a: ffile_path_to_ts(a), LongType())

get_day_type = udf(lambda a: fget_day_type(a), IntegerType())

haversine_meter = udf(lambda a, b, c, d: fhaversine_meter(a, b, c, d), DoubleType())

find_distance = udf(lambda a, b, c, d, e, f, g, h, p, q, r, s, t, u: ffind_distance(a, b, c, d, e, f, g, h, p, q, r, s, t, u), DoubleType())

find_velocity_sign = udf(lambda a, b, c, d, e, f, g, h, p, q: ffind_velocity_sign(a, b, c, d, e, f, g, h, p, q), IntegerType())

### Load static data

In [8]:
routes_segments = spark\
    .read\
    .json(STATIC_DATA_DIR + "route_segments.json")\
    .withColumn("route_id", col("route_id").cast("int").alias("route_id"))

routes_segments.cache()
routes_segments.printSchema()

root
 |-- route_id: integer (nullable = true)
 |-- segment_cum_len: double (nullable = true)
 |-- segment_first_lat: double (nullable = true)
 |-- segment_first_lon: double (nullable = true)
 |-- segment_id: long (nullable = true)
 |-- segment_len_meter: double (nullable = true)
 |-- segment_second_lat: double (nullable = true)
 |-- segment_second_lon: double (nullable = true)
 |-- segment_sequence: long (nullable = true)



# Load and process historical data

In [9]:
# Load data
bus_positions = spark\
    .read\
    .json(HISTORICAL_DATA_DIR + "*")\
    .dropna("any")

# Cast fields to int
bus_positions = bus_positions\
    .select(
        (col("timestamp") - col("seconds_since_report")).alias("timestamp"),
        col("id").cast("int").alias("bus_id"),
        col("route_id").cast("int").alias("route_id"),
        "run_id", "latitude", "longitude"
    )\

# With positions having same bus_id, run_id, latitude, longitude, order them by timestamp then drop consecutive duplicate rows
bus_positions = bus_positions\
   .select(
        "bus_id", "latitude", "longitude", "route_id", "run_id", "timestamp",
        *[lag(c).over(Window.partitionBy("bus_id", "route_id", "run_id").orderBy("timestamp")).alias("prev_" + c) for c in ["latitude", "longitude", "timestamp"]]
    )\
    .where((col("prev_latitude").isNull()) | (col("latitude") != col("prev_latitude")) | (col("longitude") != col("prev_longitude")))\
    .select("bus_id", "latitude", "longitude", "route_id", "run_id", "timestamp")

# add day_type field, 0 if is weekday, 1 if is weekend or holiday
bus_positions = bus_positions\
    .withColumn("day_type", get_day_type("timestamp"))

# get segment of bus positions
bus_positions = bus_positions\
    .join(routes_segments, "route_id")\
    .withColumn("bus_segment_distance", haversine_meter("latitude", "longitude", "segment_first_lat", "segment_first_lon") + haversine_meter("latitude", "longitude", "segment_second_lat", "segment_second_lon") - col("segment_len_meter"))\
    .withColumn("row_number", row_number().over(Window.partitionBy("bus_id", "route_id", "latitude", "longitude", "run_id", "timestamp").orderBy(asc("bus_segment_distance"))))\
    .where("row_number = 1")\
    .drop("row_number")\

# drop rows with at least one null column
bus_positions = bus_positions.dropna("any")

bus_positions.printSchema()

root
 |-- route_id: integer (nullable = true)
 |-- bus_id: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- run_id: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- day_type: integer (nullable = true)
 |-- segment_cum_len: double (nullable = true)
 |-- segment_first_lat: double (nullable = true)
 |-- segment_first_lon: double (nullable = true)
 |-- segment_id: long (nullable = true)
 |-- segment_len_meter: double (nullable = true)
 |-- segment_second_lat: double (nullable = true)
 |-- segment_second_lon: double (nullable = true)
 |-- segment_sequence: long (nullable = true)
 |-- bus_segment_distance: double (nullable = true)



In [10]:
# with bus positions having same "bus_id", "route_id", "run_id", "day_type", sort them in ascending order by "timestamp",
# then create a segment with every 2 consecutive rows with same "run_id".
# This ensure that we only calculate velocity for bus positions in the same trip.
bus_segments = bus_positions\
    .select(
        "bus_id", "route_id", "day_type", "run_id", "timestamp", "latitude", "longitude", "segment_first_lat", "segment_first_lon", "segment_second_lat", "segment_second_lon", "segment_cum_len", "segment_id",
        *[lead(column_name).over(Window.partitionBy("bus_id", "route_id", "day_type", "run_id").orderBy("timestamp")).alias("next_" + column_name) for column_name in ["timestamp", "latitude", "longitude", "segment_first_lat", "segment_first_lon", "segment_second_lat", "segment_second_lon", "segment_cum_len"]]
    )\
    .dropna("any")\

bus_segments.cache()
bus_segments.printSchema()

root
 |-- bus_id: integer (nullable = true)
 |-- route_id: integer (nullable = true)
 |-- day_type: integer (nullable = true)
 |-- run_id: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- segment_first_lat: double (nullable = true)
 |-- segment_first_lon: double (nullable = true)
 |-- segment_second_lat: double (nullable = true)
 |-- segment_second_lon: double (nullable = true)
 |-- segment_cum_len: double (nullable = true)
 |-- segment_id: long (nullable = true)
 |-- next_timestamp: long (nullable = true)
 |-- next_latitude: double (nullable = true)
 |-- next_longitude: double (nullable = true)
 |-- next_segment_first_lat: double (nullable = true)
 |-- next_segment_first_lon: double (nullable = true)
 |-- next_segment_second_lat: double (nullable = true)
 |-- next_segment_second_lon: double (nullable = true)
 |-- next_segment_cum_len: double (nullable = true)



In [11]:
# with each segment, calculate distance and delta time
# then find the sign of velocity of this bus on this segment,
# 0 means the bus is running alongside with the segment sequence id
bus_segments_1 = bus_segments\
    .withColumn(
        "distance", 
        find_distance(
            "latitude", "longitude", "segment_first_lat", "segment_first_lon", 
            "segment_second_lat", "segment_second_lon", "segment_cum_len", 
            "next_latitude", "next_longitude", "next_segment_first_lat", 
            "next_segment_first_lon", "next_segment_second_lat", 
            "next_segment_second_lon", "next_segment_cum_len"
        )
    )\
    .where(col("distance") != 0)\
    .withColumn("delta_time", col("next_timestamp") - col("timestamp"))\
    .where(col("delta_time") <= MAX_SECONDS_SINCE_REPORT)\
    .where((col("distance") / col("delta_time") >= MIN_VELOCITY) & (col("distance") / col("delta_time") <= MAX_VELOCITY))\
    .withColumn("velocity_sign", find_velocity_sign("segment_cum_len", "next_segment_cum_len", "latitude", "longitude", "next_latitude", "next_longitude", "segment_first_lat", "segment_first_lon", "segment_second_lat", "segment_second_lon"))\
    
bus_segments_1.printSchema()

root
 |-- bus_id: integer (nullable = true)
 |-- route_id: integer (nullable = true)
 |-- day_type: integer (nullable = true)
 |-- run_id: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- segment_first_lat: double (nullable = true)
 |-- segment_first_lon: double (nullable = true)
 |-- segment_second_lat: double (nullable = true)
 |-- segment_second_lon: double (nullable = true)
 |-- segment_cum_len: double (nullable = true)
 |-- segment_id: long (nullable = true)
 |-- next_timestamp: long (nullable = true)
 |-- next_latitude: double (nullable = true)
 |-- next_longitude: double (nullable = true)
 |-- next_segment_first_lat: double (nullable = true)
 |-- next_segment_first_lon: double (nullable = true)
 |-- next_segment_second_lat: double (nullable = true)
 |-- next_segment_second_lon: double (nullable = true)
 |-- next_segment_cum_len: double (nullable = true)
 |-- distance: double (nul

In [12]:
bus_velocities = bus_segments_1\
    .withColumn("direction", substring("run_id", -1, 1))\
    .groupBy("route_id", "direction", "day_type", "velocity_sign")\
    .agg(sum("distance").alias("total_distance"), sum("delta_time").alias("total_delta_time"))\
    .withColumn("row_number", row_number().over(Window.partitionBy("route_id", "direction", "day_type").orderBy(asc("total_distance"))))\
    .where("row_number = 1")\
    .drop("row_number")\
    .withColumn("velocity", col("total_distance") / col("total_delta_time"))\
    
bus_velocities.printSchema()

root
 |-- route_id: integer (nullable = true)
 |-- direction: string (nullable = true)
 |-- day_type: integer (nullable = true)
 |-- velocity_sign: integer (nullable = true)
 |-- total_distance: double (nullable = true)
 |-- total_delta_time: long (nullable = true)
 |-- velocity: double (nullable = true)



In [13]:
# cache before write (speed the write function)
# use sql because it will run rightaway
bus_velocities.createOrReplaceTempView("bus_velocities")
spark.sql("cache table bus_velocities")

DataFrame[]

In [14]:
# write to aggregated_data
bus_velocities\
    .select("route_id", "direction", "day_type", "velocity", "velocity_sign")\
    .write.mode("overwrite")\
    .json(AGGREGATED_DATA_DIR + "bus_velocities.json")

In [15]:
spark.sql("uncache table bus_velocities")

In [17]:
bus_velocities\
    .orderBy("route_id", "direction", "day_type", "velocity_sign")\
    .show()

+--------+---------+--------+-------------+------------------+----------------+------------------+
|route_id|direction|day_type|velocity_sign|    total_distance|total_delta_time|          velocity|
+--------+---------+--------+-------------+------------------+----------------+------------------+
|       2|        0|       0|           -1|22826.397357083908|            6166|3.7019781636529205|
|       2|        1|       0|            1|37646.688184160725|            6167| 6.104538379140704|
|       4|        0|       0|            1| 17549.90810148129|            4225| 4.153824402717465|
|       4|        1|       0|           -1| 38726.57245688513|            8462| 4.576527116152816|
|      10|        0|       0|            1|14333.984739505211|            3479| 4.120145081777871|
|      10|        1|       0|           -1|10194.044030633042|            2123| 4.801716453430543|
|      14|        0|       0|           -1|  6208.33024270626|            1934| 3.210098367479969|
|      14|