# Define static variables

In [3]:
# Hostname of the cluster master; it is substituted into every HDFS URI below.
MASTER_ADDRESS = "master.internal.cloudapp.net"

# Integer codes for the two day types, and the timezone name attached to the data.
# NOTE(review): not referenced by the cells visible here - presumably used further down; confirm.
DAY_TYPE_WEEKDAY = 0
DAY_TYPE_WEEKEND = 1
DATA_ACTUAL_TIMEZONE = "America/Los_Angeles"

# HDFS directories (port 8020 on the master). Each ends with "/" so a file
# name can be appended with plain string concatenation.
STATIC_DATA_DIR = "hdfs://{}:8020/ola/static_data/".format(MASTER_ADDRESS)
HISTORICAL_DATA_DIR = "hdfs://{}:8020/ola/historical_data/".format(MASTER_ADDRESS)
AGGREGATED_DATA_DIR = "hdfs://{}:8020/ola/aggregated_data/".format(MASTER_ADDRESS)

# Init PySpark

In [None]:
# Initialise findspark ahead of the pyspark imports in the following cell.
# NOTE(review): this cell was saved without an execution count, so the recorded
# run apparently did not need it - confirm whether it is still required.
import findspark
findspark.init()

In [1]:
import os
import sys
import re

from pyspark.sql import SparkSession
from pyspark import  SparkContext, SparkConf

# Spark session & context
# Configuration for a single local driver: one core, default parallelism of 2
# and 2g of driver memory.
conf = (
    SparkConf()
    .setMaster("local")
    .setAppName("preprocessing-job")
    .set("spark.cores.max", "1")
    .set("spark.default.parallelism", "2")
    .set("spark.driver.memory", "2g")
    # Extra jars are put on both the driver and the executor class path.
    .set("spark.driver.extraClassPath", "/root/libs/spark-3.1.1-bin-hadoop3.2/third-party-jars/*")
    .set("spark.executor.extraClassPath", "/root/libs/spark-3.1.1-bin-hadoop3.2/third-party-jars/*")
    .set("spark.sql.caseSensitive", "true")
    .set("spark.ui.port", "4042")
)

# getOrCreate() reuses an already running session/context, so re-running this
# cell in a live kernel no longer fails with "Cannot run multiple SparkContexts
# at once" the way a second SparkContext(conf=conf) call does.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

In [2]:
%%html
<style>
/* Do not wrap preformatted cell output, so wide DataFrame tables keep one row per line. */
div.output_area pre {
    white-space: pre;
}
/* Let the notebook container use 95% of the available width. */
.container { 
    width:95% !important; 
}
</style>

# Define utilities

In [4]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import *

from datetime import datetime
from pytz import timezone
import math

## Functions

In [5]:
# utils
def fhaversine_meter(lat1, lon1, lat2, lon2):
    """Return the haversine (great-circle) distance in meters between two points.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.

    Returns:
        The distance in meters as a float, computed on a sphere of radius
        6371 km, or None when any coordinate is None (for example a SQL NULL
        handed over by a Spark UDF call).
    """
    # A missing coordinate has no distance; returning None instead of raising
    # TypeError keeps a UDF call from failing the whole job on one bad row.
    if lat1 is None or lon1 is None or lat2 is None or lon2 is None:
        return None

    # differences between latitudes and longitudes, in radians
    dLat = (lat2 - lat1) * math.pi / 180.0
    dLon = (lon2 - lon1) * math.pi / 180.0

    # convert the latitudes themselves to radians
    lat1 = (lat1) * math.pi / 180.0
    lat2 = (lat2) * math.pi / 180.0

    # haversine term
    a = (math.pow(math.sin(dLat / 2), 2) +
         math.pow(math.sin(dLon / 2), 2) *
             math.cos(lat1) * math.cos(lat2))
    # Clamp to [0, 1] so rounding error (or out-of-range latitudes) cannot
    # push sqrt/asin outside their domain and raise ValueError.
    a = min(1.0, max(0.0, a))

    rad = 6371  # sphere radius in kilometers
    c = 2 * math.asin(math.sqrt(a))
    return rad * c * 1000  # kilometers -> meters

## UDF functions

In [6]:
# Spark UDF around fhaversine_meter; the result column is a DoubleType.
haversine_meter = udf(
    f=lambda lat1, lon1, lat2, lon2: fhaversine_meter(lat1, lon1, lat2, lon2),
    returnType=DoubleType(),
)

# Process static data

In [7]:
# Load trips.txt from the static data directory; the first line holds the column names.
trips = spark.read.option("header", True).csv(STATIC_DATA_DIR + "trips.txt")
trips.show(n=10, truncate=False)

+---------+----------------------+-------------------------------+-------------+------------+--------+-------------+--------------+
|route_id |service_id            |trip_id                        |trip_headsign|direction_id|block_id|shape_id     |perm_trip_id  |
+---------+----------------------+-------------------------------+-------------+------------+--------+-------------+--------------+
|611-13139|DEC20-D02CAR-1_Weekday|52088401-DEC20-D02CAR-1_Weekday|null         |1           |6110300 |6110028_DEC20|10611000280539|
|611-13139|DEC20-D02CAR-1_Weekday|52088402-DEC20-D02CAR-1_Weekday|null         |1           |6110100 |6110029_DEC20|10611000290605|
|611-13139|DEC20-D02CAR-1_Weekday|52088403-DEC20-D02CAR-1_Weekday|null         |1           |6110200 |6110029_DEC20|10611000290705|
|611-13139|DEC20-D02CAR-1_Weekday|52088404-DEC20-D02CAR-1_Weekday|null         |1           |6110300 |6110029_DEC20|10611000290801|
|611-13139|DEC20-D02CAR-1_Weekday|52088405-DEC20-D02CAR-1_Weekday|null      

In [8]:
# Load shapes.txt from the static data directory; the first line holds the column names.
shapes = spark.read.option("header", True).csv(STATIC_DATA_DIR + "shapes.txt")
shapes.show(n=10, truncate=False)

+-----------+-------------+---------------+-----------------+
|shape_id   |shape_pt_lat |shape_pt_lon   |shape_pt_sequence|
+-----------+-------------+---------------+-----------------+
|20999_DEC20|34.0455868426|-118.2530497433|10001            |
|20999_DEC20|34.0466699681|-118.252038928 |10002            |
|20999_DEC20|34.0481153251|-118.2507294304|10003            |
|20999_DEC20|34.0482659327|-118.2505852918|10004            |
|20999_DEC20|34.0482659327|-118.2505852918|20001            |
|20999_DEC20|34.0495085066|-118.2493966237|20002            |
|20999_DEC20|34.0495250347|-118.2493768658|20003            |
|20999_DEC20|34.0509649284|-118.2480375502|20004            |
|20999_DEC20|34.0511799592|-118.2478318593|20005            |
|20999_DEC20|34.0511799592|-118.2478318593|30001            |
+-----------+-------------+---------------+-----------------+
only showing top 10 rows



In [9]:
# Load stops.txt from the static data directory; the first line holds the column names.
stops = spark.read.option("header", True).csv(STATIC_DATA_DIR + "stops.txt")
stops.show(n=10, truncate=False)

+-------+---------+-----------------------------------+---------+-----------+-----------+--------+-------------+--------------+---------+
|stop_id|stop_code|stop_name                          |stop_desc|stop_lat   |stop_lon   |stop_url|location_type|parent_station|tpis_name|
+-------+---------+-----------------------------------+---------+-----------+-----------+--------+-------------+--------------+---------+
|1      |1        |Paramount / Slauson                |null     |  33.973248|-118.113113|null    |null         |null          |null     |
|3      |3        |Jefferson / 10th                   |null     |  34.025471|-118.328402|null    |null         |null          |null     |
|6      |6        |120th / Augustus F Hawkins         |null     |  33.924696|-118.242222|null    |null         |null          |null     |
|7      |7        |120th / Martin Luther King Hospital|null     |  33.924505|-118.240369|null    |null         |null          |null     |
|12     |12       |15054 Sherman W

In [10]:
# Load stop_times.txt from the static data directory; the first line holds the column names.
stop_times = spark.read.option("header", True).csv(STATIC_DATA_DIR + "stop_times.txt")
stop_times.show(n=10, truncate=False)

+-------------------------------+------------+--------------+-------+-------------+--------------------+-----------+-------------+
|trip_id                        |arrival_time|departure_time|stop_id|stop_sequence|stop_headsign       |pickup_type|drop_off_type|
+-------------------------------+------------+--------------+-------+-------------+--------------------+-----------+-------------+
|52088401-DEC20-D02CAR-1_Weekday|05:39:00    |05:39:00      |10246  |1            |611 - Vernon Station|0          |0            |
|52088401-DEC20-D02CAR-1_Weekday|05:40:00    |05:40:00      |10248  |2            |611 - Vernon Station|0          |0            |
|52088401-DEC20-D02CAR-1_Weekday|05:41:00    |05:41:00      |9371   |3            |611 - Vernon Station|0          |0            |
|52088401-DEC20-D02CAR-1_Weekday|05:42:00    |05:42:00      |9350   |4            |611 - Vernon Station|0          |0            |
|52088401-DEC20-D02CAR-1_Weekday|05:43:00    |05:43:00      |9351   |5            |

In [12]:
# Number of trips for every (route_id, shape_id) combination.
route_shape_occurences = (
    trips
    .groupBy("route_id", "shape_id")
    .count()
)

route_shape_occurences.show(n=10)

+---------+-------------+-----+
| route_id|     shape_id|count|
+---------+-------------+-----+
|163-13139|1630129_DEC20|   57|
| 16-13139| 160379_DEC20|  138|
|117-13139|1170068_DEC20|   84|
|251-13139|2510148_DEC20|   23|
| 10-13139| 100567_DEC20|   45|
| 10-13139| 100613_DEC20|   41|
|200-13139|2000041_DEC20|   24|
|734-13139|7340034_DEC20|    2|
|217-13139|2170234_DEC20|    6|
| 20-13139| 200723_DEC20|  119|
+---------+-------------+-----+
only showing top 10 rows



In [20]:
# For every route keep the shape used by the most trips, then attach that
# shape's points with numeric coordinate and sequence columns.
routes_with_shape = (
    route_shape_occurences
    .withColumn(
        "rn",
        row_number().over(
            Window
            .partitionBy("route_id")
            # shape_id breaks ties between equally frequent shapes, so the
            # chosen shape is the same on every run.
            .orderBy(col("count").desc(), col("shape_id").asc())
        )
    )
    .where("rn = 1")
    .select("route_id", "shape_id")
    .join(shapes, "shape_id")
    .select(
        "route_id",
        "shape_id",
        col("shape_pt_lat").cast("double"),
        col("shape_pt_lon").cast("double"),
        col("shape_pt_sequence").cast("int")
    )
)

routes_with_shape.show(10)

+--------+-----------+-------------+---------------+-----------------+
|route_id|   shape_id| shape_pt_lat|   shape_pt_lon|shape_pt_sequence|
+--------+-----------+-------------+---------------+-----------------+
| 2-13139|21084_DEC20|34.0636651084|-118.4467287702|            10001|
| 2-13139|21084_DEC20|34.0636633497|-118.4466098971|            10002|
| 2-13139|21084_DEC20|34.0636739601|-118.4459694103|            10003|
| 2-13139|21084_DEC20| 34.063695534|-118.4453355852|            10004|
| 2-13139|21084_DEC20|34.0636989493|-118.4450054263|            10005|
| 2-13139|21084_DEC20|34.0636989493|-118.4450054263|            20001|
| 2-13139|21084_DEC20|  34.06369973|-118.4449426967|            20002|
| 2-13139|21084_DEC20|34.0636910845| -118.444305409|            20003|
| 2-13139|21084_DEC20|34.0636945852|-118.4441007178|            20004|
| 2-13139|21084_DEC20|34.0637298522|  -118.44347687|            20005|
+--------+-----------+-------------+---------------+-----------------+
only s

In [21]:
# Turn the ordered shape points of every route into segments (point -> next
# point) with a length in meters and a running length along the route.
route_segments = (
    routes_with_shape
    # Pair each point with its successor inside the same route, ordered by the
    # numeric point sequence. Partitioning by route keeps the last point of one
    # route from being paired with the first point of another, and ordering by
    # shape_pt_sequence does not depend on the physical row order left by the
    # upstream join.
    .select(
        "*",
        *[
            lead(col(c))
            .over(Window.partitionBy("route_id").orderBy("shape_pt_sequence"))
            .alias("next_" + c)
            for c in ["shape_pt_lat", "shape_pt_lon", "shape_pt_sequence"]
        ]
    )
    # The last point of a route has no successor (null next_* columns).
    .dropna("any")
    # Keep only pairs whose sequence numbers fall in the same block of 10000.
    .where(floor(col("shape_pt_sequence") / 10000) == floor(col("next_shape_pt_sequence") / 10000))
    .select(
        # route_id looks like "<number>-<suffix>"; keep the part before the dash.
        split("route_id", "-").getItem(0).alias("route_id"),
        col("shape_pt_lat").alias("segment_first_lat"),
        col("shape_pt_lon").alias("segment_first_lon"),
        col("next_shape_pt_lat").alias("segment_second_lat"),
        col("next_shape_pt_lon").alias("segment_second_lon"),
        col("shape_pt_sequence").alias("segment_sequence"),
        # NOTE(review): this maps two consecutive 10000-blocks to the same
        # segment_id (10001 and 20001 both give 1) - confirm that is intended.
        floor(col("shape_pt_sequence") / 20000 + 0.5).alias("segment_id"),
        haversine_meter("shape_pt_lat", "shape_pt_lon", "next_shape_pt_lat", "next_shape_pt_lon").alias("segment_len_meter")
    )
    # `sum` is the Spark SQL aggregate brought in by the star import, not the
    # Python builtin; the window accumulates lengths up to the current segment.
    .withColumn(
        "segment_cum_len",
        sum("segment_len_meter").over(
            Window
            .partitionBy("route_id")
            .orderBy("segment_sequence")
            .rangeBetween(Window.unboundedPreceding, 0)
        )
    )
)

route_segments.printSchema()
route_segments.show(10, False)

+--------+-----------------+-----------------+------------------+------------------+----------------+----------+------------------+------------------+
|route_id|segment_first_lat|segment_first_lon|segment_second_lat|segment_second_lon|segment_sequence|segment_id|segment_len_meter |segment_cum_len   |
+--------+-----------------+-----------------+------------------+------------------+----------------+----------+------------------+------------------+
|125     |33.9058578172    |-118.3960444204  |33.9067250829     |-118.3959476739   |10001           |1         |96.84797621031923 |96.84797621031923 |
|125     |33.9067250829    |-118.3959476739  |33.907843507      |-118.395933184    |10002           |1         |124.37027464152683|221.21825085184605|
|125     |33.907843507     |-118.395933184   |33.9078312067     |-118.3954948079   |10003           |1         |40.47850790950947 |261.6967587613555 |
|125     |33.9078312067    |-118.3954948079  |33.9078294424     |-118.3951948978   |10004     

In [22]:
# Number of stop_times rows (non-null stop_id) per trip.
stops_per_trip = (
    stop_times
    .groupBy("trip_id")
    .agg(count(col("stop_id")).alias("stops"))
)

stops_per_trip.show(n=10, truncate=False)

+--------------------------------+-----+
|trip_id                         |stops|
+--------------------------------+-----+
|52162231-DEC20-D03CAR-3_Sunday  |82   |
|52162565-DEC20-D03CAR-1_Weekday |82   |
|52162666-DEC20-D07CAR-1_Weekday |96   |
|52162715-DEC20-D03CAR-1_Weekday |96   |
|52180398-DEC20-D08CAR-1_Weekday |133  |
|52180440-DEC20-D08CAR-3_Sunday  |56   |
|52182286-DEC20-D98-1_Weekday    |11   |
|52188390-DEC20-D15CAR-2_Saturday|72   |
|52188396-DEC20-D15CAR-2_Saturday|121  |
|52189151-DEC20-D15CAR-3_Sunday  |83   |
+--------------------------------+-----+
only showing top 10 rows



In [23]:
# For every shape keep the single trip that serves the most stops.
longest_trip_per_shape = (
    stops_per_trip
    .join(trips, "trip_id")
    .select("trip_id", "shape_id", "stops")
    .withColumn(
        "rn",
        row_number().over(
            Window
            .partitionBy("shape_id")
            # trip_id breaks ties between trips with the same stop count, so
            # the chosen trip is the same on every run.
            .orderBy(desc("stops"), asc("trip_id"))
        )
    )
    .where("rn = 1")
    .drop("rn")
)

longest_trip_per_shape.show(10, False)

+--------------------------------+-------------+-----+
|trip_id                         |shape_id     |stops|
+--------------------------------+-------------+-----+
|52287954-DEC20-D03CAR-3_Sunday  |1800196_DEC20|94   |
|52220628-DEC20-D09CAR-1_Weekday |2600273_DEC20|79   |
|52305140-DEC20-D18CAR-1_Weekday |3440049_DEC20|43   |
|52264682-DEC20-D07CAR-1_Weekday |100597_DEC20 |66   |
|52281351-DEC20-D07CAR-2_Saturday|100608_DEC20 |62   |
|52224325-DEC20-D15CAR-2_Saturday|2240250_DEC20|32   |
|52282050-DEC20-D07CAR-2_Saturday|300797_DEC20 |59   |
|52162665-DEC20-D03CAR-1_Weekday |280614_DEC20 |46   |
|52266006-DEC20-D13CAR-1_Weekday |330541_DEC20 |78   |
|52202031-DEC20-D01CAR-3_Sunday  |620319_DEC20 |81   |
+--------------------------------+-------------+-----+
only showing top 10 rows



In [42]:
# Attach to every stop of a route the route segment it fits best.
route_stops = (
    longest_trip_per_shape
    # shape -> route, restricted to the shapes kept in routes_with_shape
    .join(routes_with_shape.select("route_id", "shape_id").distinct(), "shape_id")
    # stops served by the selected trip, in trip order
    .join(stop_times, "trip_id")
    .select("route_id", "stop_id", "stop_sequence")
    # stop coordinates
    .join(stops, "stop_id")
    .select(
        # same short route id as in route_segments (text before the dash)
        split("route_id", "-").getItem(0).alias("route_id"),
        "stop_id",
        col("stop_lat").cast(DoubleType()),
        col("stop_lon").cast(DoubleType()),
        col("stop_sequence").cast(IntegerType())
    )
    # every stop is paired with every segment of its route
    .join(route_segments, "route_id")
    # detour = d(stop, segment start) + d(stop, segment end) - segment length;
    # it is smallest for the segment the stop lies closest to being on
    .withColumn(
        "stop_segment_distance",
        haversine_meter("stop_lat", "stop_lon", "segment_first_lat", "segment_first_lon")
        + haversine_meter("stop_lat", "stop_lon", "segment_second_lat", "segment_second_lon")
        - col("segment_len_meter")
    )
    # keep the segment with the smallest detour per (route, stop)
    .withColumn(
        "row_number",
        row_number().over(
            Window
            .partitionBy("route_id", "stop_id")
            .orderBy(col("stop_segment_distance").asc())
        )
    )
    .where(col("row_number") == 1)
    .drop("row_number")
)

route_stops.printSchema()
route_stops.orderBy("route_id", "stop_sequence").show()

root
 |-- route_id: string (nullable = true)
 |-- stop_id: string (nullable = true)
 |-- stop_lat: double (nullable = true)
 |-- stop_lon: double (nullable = true)
 |-- stop_sequence: integer (nullable = true)
 |-- segment_first_lat: double (nullable = true)
 |-- segment_first_lon: double (nullable = true)
 |-- segment_second_lat: double (nullable = true)
 |-- segment_second_lon: double (nullable = true)
 |-- segment_sequence: integer (nullable = true)
 |-- segment_id: long (nullable = true)
 |-- segment_len_meter: double (nullable = true)
 |-- segment_cum_len: double (nullable = true)
 |-- stop_segment_distance: double (nullable = true)

+--------+-------+---------+-----------+-------------+-----------------+-----------------+------------------+------------------+----------------+----------+------------------+------------------+---------------------+
|route_id|stop_id| stop_lat|   stop_lon|stop_sequence|segment_first_lat|segment_first_lon|segment_second_lat|segment_second_lon|segment_

In [43]:
# write to static_data
route_segments.cache()
route_segments\
    .write\
    .mode("overwrite")\
    .json(STATIC_DATA_DIR + "route_segments.json")

In [45]:
# Mark route_stops for caching, then persist it as JSON in the static data
# directory, replacing any previous output.
route_stops.cache()
route_stops.write.json(STATIC_DATA_DIR + "route_stops.json", mode="overwrite")

In [48]:
# Print up to 10000 rows of route_stops in route / stop order for inspection.
route_stops.orderBy("route_id", "stop_sequence").show(n=10000)

+--------+--------+-----------+------------+-------------+-----------------+-----------------+------------------+------------------+----------------+----------+------------------+------------------+---------------------+
|route_id| stop_id|   stop_lat|    stop_lon|stop_sequence|segment_first_lat|segment_first_lon|segment_second_lat|segment_second_lon|segment_sequence|segment_id| segment_len_meter|   segment_cum_len|stop_segment_distance|
+--------+--------+-----------+------------+-------------+-----------------+-----------------+------------------+------------------+----------------+----------+------------------+------------------+---------------------+
|      10|   14915|  34.033504| -118.265139|            1|    34.0334863029|  -118.2651011163|     34.0331273498|   -118.2653884468|           10001|         1|47.897240942126935|47.897240942126935|    3.884916869835486|
|      10|    3121|  34.033659| -118.262347|            2|    34.0337311109|  -118.2624723293|     34.0343025744|   