# Init PySpark

In [2]:
import os
import sys
import re

from pyspark.sql import SparkSession, SQLContext
from pyspark import  SparkContext, SparkConf
from pyspark.streaming import StreamingContext

# Spark session & context
# Local-mode configuration ("local[4]" = run Spark in this process with 4 worker
# threads). Extra jars are picked up from the third-party-jars directory and SQL
# identifiers are treated as case sensitive.
conf = SparkConf()
conf.setMaster("local[4]").setAppName("preprocessing-job")
conf.set("spark.executor.memory", "4g")
conf.set("spark.executor.cores", "1")
conf.set("spark.cores.max", "4")
conf.set("spark.driver.memory", "4g")
conf.set("spark.driver.extraClassPath", "/usr/local/spark/third-party-jars/*")
conf.set("spark.executor.extraClassPath", "/usr/local/spark/third-party-jars/*")
conf.set("spark.sql.caseSensitive", "true")
conf.set("spark.ui.port", "4040")

# getOrCreate() keeps this cell re-runnable: calling SparkContext(conf=conf) a
# second time in the same kernel fails because only one SparkContext may be
# active per process. NOTE: when a context already exists it is returned as-is,
# so changes made to `conf` only take effect after a kernel restart.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)
sqlContext = SQLContext(sc)
# Streaming context with a batch interval of 1 second.
ssc = StreamingContext(sc, 1)

In [3]:
%%html
<style>
/* Preserve whitespace and disable line wrapping in cell output <pre> blocks. */
div.output_area pre {
    white-space: pre;
}
/* Let the notebook container use 95% of the available width. */
.container { 
    width:95% !important; 
}
</style>

# Define static variables

In [4]:
# Width of one aggregation time bin.
TIME_BIN_SECONDS = 600 # seconds

# Day-type codes. They must be distinct, otherwise weekday and weekend records
# cannot be told apart (both were previously 0).
DAY_TYPE_WEEKDAY = 0
DAY_TYPE_WEEKEND = 1

# Time zone name used for the data's local timestamps.
DATA_ACTUAL_TIMEZONE = "America/Los_Angeles"

# HDFS locations.
STATIC_DATA_DIR = "hdfs://namenode:8020/ola/static_data/"
HISTORICAL_DATA_DIR = "hdfs://namenode:8020/ola/historical_data/"
AGGREGATED_DATA_DIR = "hdfs://namenode:8020/ola/aggregated_data/"

# Matching directory layout under /home/data.
LOCAL_STATIC_DATA_DIR = "/home/data/static_data/"
LOCAL_HISTORICAL_DATA_DIR = "/home/data/historical_data/"
LOCAL_AGGREGATED_DATA_DIR = "/home/data/aggregated_data/"

# Define utilities

In [5]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import *

from datetime import datetime
from pytz import timezone
import math

## Functions

In [6]:
# utils
def fhaversine_meter(lat1, lon1, lat2, lon2):
    """Return the haversine (great-circle) distance between two points, in meters.

    Parameters
    ----------
    lat1, lon1 : float or None
        Latitude / longitude of the first point, in decimal degrees.
    lat2, lon2 : float or None
        Latitude / longitude of the second point, in decimal degrees.

    Returns
    -------
    float or None
        Distance in meters on a sphere of radius 6371 km, or ``None`` when any
        coordinate is ``None`` (so that a UDF wrapping this function yields a
        null instead of failing on null column values).
    """
    if lat1 is None or lon1 is None or lat2 is None or lon2 is None:
        return None

    earth_radius_km = 6371

    # differences between latitudes and longitudes, converted to radians
    dLat = (lat2 - lat1) * math.pi / 180.0
    dLon = (lon2 - lon1) * math.pi / 180.0

    # convert the latitudes themselves to radians
    lat1 = (lat1) * math.pi / 180.0
    lat2 = (lat2) * math.pi / 180.0

    # haversine term
    a = (math.pow(math.sin(dLat / 2), 2) +
         math.pow(math.sin(dLon / 2), 2) *
             math.cos(lat1) * math.cos(lat2))

    # Floating-point rounding (or out-of-range input) can push `a` marginally
    # outside [0, 1], which would make sqrt/asin raise a math domain error.
    # Plain comparisons are used on purpose: this notebook star-imports
    # pyspark.sql.functions, which shadows the min()/max() builtins.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0

    c = 2 * math.asin(math.sqrt(a))
    return earth_radius_km * c * 1000

## UDF functions

In [7]:
# Spark UDF wrapper around fhaversine_meter; the result column is a double.
haversine_meter = udf(
    f=lambda lat1, lon1, lat2, lon2: fhaversine_meter(lat1, lon1, lat2, lon2),
    returnType=DoubleType(),
)

# Process static data

In [8]:
# Load routes.txt from the static data directory; the first line is the header.
routes = spark.read.option("header", True).csv(STATIC_DATA_DIR + 'routes.txt')
routes.show(n=10)

+--------+----------------+----------------+--------------------+----------+-----------+----------------+---------+
|route_id|route_short_name| route_long_name|          route_desc|route_type|route_color|route_text_color|route_url|
+--------+----------------+----------------+--------------------+----------+-----------+----------------+---------+
| 2-13139|               2|Metro Local Line|DOWNTOWN LA - WES...|         3|       null|            null|     null|
| 4-13139|               4|Metro Local Line|DOWNTOWN LA - SAN...|         3|       null|            null|     null|
|10-13139|           10/48|Metro Local Line|W HOLLYWOOD-DTWN ...|         3|       null|            null|     null|
|14-13139|           14/37|Metro Local Line|BEVERLY HLLS-DTWN...|         3|       null|            null|     null|
|16-13139|              16|Metro Local Line|DOWNTOWN LA - CEN...|         3|       null|            null|     null|
|18-13139|              18|Metro Local Line|WILSHIRE WESTERN ...|       

In [9]:
# Load trips.txt from the static data directory; the first line is the header.
trips = spark.read.option("header", True).csv(STATIC_DATA_DIR + 'trips.txt')
trips.show(n=10, truncate=False)

+---------+----------------------+-------------------------------+-------------+------------+--------+-------------+--------------+
|route_id |service_id            |trip_id                        |trip_headsign|direction_id|block_id|shape_id     |perm_trip_id  |
+---------+----------------------+-------------------------------+-------------+------------+--------+-------------+--------------+
|611-13139|DEC20-D02CAR-1_Weekday|52088401-DEC20-D02CAR-1_Weekday|null         |1           |6110300 |6110028_DEC20|10611000280539|
|611-13139|DEC20-D02CAR-1_Weekday|52088402-DEC20-D02CAR-1_Weekday|null         |1           |6110100 |6110029_DEC20|10611000290605|
|611-13139|DEC20-D02CAR-1_Weekday|52088403-DEC20-D02CAR-1_Weekday|null         |1           |6110200 |6110029_DEC20|10611000290705|
|611-13139|DEC20-D02CAR-1_Weekday|52088404-DEC20-D02CAR-1_Weekday|null         |1           |6110300 |6110029_DEC20|10611000290801|
|611-13139|DEC20-D02CAR-1_Weekday|52088405-DEC20-D02CAR-1_Weekday|null      

In [10]:
# Load shapes.txt from the static data directory; the first line is the header.
shapes = spark.read.option("header", True).csv(STATIC_DATA_DIR + 'shapes.txt')
shapes.show(n=10, truncate=False)

+-----------+-------------+---------------+-----------------+
|shape_id   |shape_pt_lat |shape_pt_lon   |shape_pt_sequence|
+-----------+-------------+---------------+-----------------+
|20999_DEC20|34.0455868426|-118.2530497433|10001            |
|20999_DEC20|34.0466699681|-118.252038928 |10002            |
|20999_DEC20|34.0481153251|-118.2507294304|10003            |
|20999_DEC20|34.0482659327|-118.2505852918|10004            |
|20999_DEC20|34.0482659327|-118.2505852918|20001            |
|20999_DEC20|34.0495085066|-118.2493966237|20002            |
|20999_DEC20|34.0495250347|-118.2493768658|20003            |
|20999_DEC20|34.0509649284|-118.2480375502|20004            |
|20999_DEC20|34.0511799592|-118.2478318593|20005            |
|20999_DEC20|34.0511799592|-118.2478318593|30001            |
+-----------+-------------+---------------+-----------------+
only showing top 10 rows



In [11]:
# Load stops.txt from the static data directory; the first line is the header.
stops = spark.read.option("header", True).csv(STATIC_DATA_DIR + 'stops.txt')
stops.show(n=5)

+-------+---------+--------------------+---------+-----------+-----------+--------+-------------+--------------+---------+
|stop_id|stop_code|           stop_name|stop_desc|   stop_lat|   stop_lon|stop_url|location_type|parent_station|tpis_name|
+-------+---------+--------------------+---------+-----------+-----------+--------+-------------+--------------+---------+
|      1|        1| Paramount / Slauson|     null|  33.973248|-118.113113|    null|         null|          null|     null|
|      3|        3|    Jefferson / 10th|     null|  34.025471|-118.328402|    null|         null|          null|     null|
|      6|        6|120th / Augustus ...|     null|  33.924696|-118.242222|    null|         null|          null|     null|
|      7|        7|120th / Martin Lu...|     null|  33.924505|-118.240369|    null|         null|          null|     null|
|     12|       12|   15054 Sherman Way|     null|  34.201075|-118.461953|    null|         null|          null|     null|
+-------+-------

In [12]:
# Load calendar.txt from the static data directory; the first line is the header.
calendar = spark.read.option("header", True).csv(STATIC_DATA_DIR + 'calendar.txt')
calendar.show(n=5)

+--------------------+------+-------+---------+--------+------+--------+------+----------+--------+
|          service_id|monday|tuesday|wednesday|thursday|friday|saturday|sunday|start_date|end_date|
+--------------------+------+-------+---------+--------+------+--------+------+----------+--------+
|DEC20-D01CAR-3_Su...|     0|      0|        0|       0|     0|       0|     1|  20201213|20210620|
|DEC20-D01CAR-1_We...|     1|      1|        1|       1|     1|       0|     0|  20201214|20210625|
|DEC20-D01CAR-2_Sa...|     0|      0|        0|       0|     0|       1|     0|  20201219|20210626|
|DEC20-D02CAR-3_Su...|     0|      0|        0|       0|     0|       0|     1|  20201213|20210620|
|DEC20-D02CAR-1_We...|     1|      1|        1|       1|     1|       0|     0|  20201214|20210625|
+--------------------+------+-------+---------+--------+------+--------+------+----------+--------+
only showing top 5 rows



In [13]:
# Load calendar_dates.txt from the static data directory; the first line is the header.
calendar_dates = spark.read.option("header", True).csv(STATIC_DATA_DIR + 'calendar_dates.txt')
calendar_dates.show(n=5)

+--------------------+--------+--------------+
|          service_id|    date|exception_type|
+--------------------+--------+--------------+
|DEC20-D01CAR-3_Su...|20201225|             1|
|DEC20-D01CAR-3_Su...|20210101|             1|
|DEC20-D01CAR-3_Su...|20210531|             1|
|DEC20-D01CAR-1_We...|20201225|             2|
|DEC20-D01CAR-1_We...|20210101|             2|
+--------------------+--------+--------------+
only showing top 5 rows



In [14]:
# Load stop_times.txt from the static data directory; the first line is the header.
stop_times = spark.read.option("header", True).csv(STATIC_DATA_DIR + 'stop_times.txt')
stop_times.show(n=5)

+--------------------+------------+--------------+-------+-------------+--------------------+-----------+-------------+
|             trip_id|arrival_time|departure_time|stop_id|stop_sequence|       stop_headsign|pickup_type|drop_off_type|
+--------------------+------------+--------------+-------+-------------+--------------------+-----------+-------------+
|52088401-DEC20-D0...|    05:39:00|      05:39:00|  10246|            1|611 - Vernon Station|          0|            0|
|52088401-DEC20-D0...|    05:40:00|      05:40:00|  10248|            2|611 - Vernon Station|          0|            0|
|52088401-DEC20-D0...|    05:41:00|      05:41:00|   9371|            3|611 - Vernon Station|          0|            0|
|52088401-DEC20-D0...|    05:42:00|      05:42:00|   9350|            4|611 - Vernon Station|          0|            0|
|52088401-DEC20-D0...|    05:43:00|      05:43:00|   9351|            5|611 - Vernon Station|          0|            0|
+--------------------+------------+-----

In [15]:
# Number of trips recorded for each (route_id, shape_id) combination.
route_shape_occurences = (
    trips
    .groupBy("route_id", "shape_id")
    .count()
)

route_shape_occurences.show(n=10)

+---------+-------------+-----+
| route_id|     shape_id|count|
+---------+-------------+-----+
|163-13139|1630129_DEC20|   57|
| 16-13139| 160379_DEC20|  138|
|117-13139|1170068_DEC20|   84|
|251-13139|2510148_DEC20|   23|
| 10-13139| 100567_DEC20|   45|
| 10-13139| 100613_DEC20|   41|
|200-13139|2000041_DEC20|   24|
|734-13139|7340034_DEC20|    2|
|217-13139|2170234_DEC20|    6|
| 20-13139| 200723_DEC20|  119|
+---------+-------------+-----+
only showing top 10 rows



In [16]:
# For every route keep only the shape used by the most trips, then attach that
# shape's points with numeric types.
# shape_id is a secondary sort key so that the choice is deterministic when two
# shapes of a route have the same trip count (row_number() over tied rows is
# otherwise arbitrary and may differ between runs).
most_used_shape_window = (
    Window
    .partitionBy("route_id")
    .orderBy(col("count").desc(), col("shape_id").asc())
)

routes_with_shape = (
    route_shape_occurences
    .withColumn("rn", row_number().over(most_used_shape_window))
    .where(col("rn") == 1)
    .select("route_id", "shape_id")
    .join(shapes, "shape_id")
    .select(
        "route_id",
        col("shape_pt_lat").cast("double"),
        col("shape_pt_lon").cast("double"),
        col("shape_pt_sequence").cast("int"),
    )
)

routes_with_shape.show(10)

+--------+-------------+---------------+-----------------+
|route_id| shape_pt_lat|   shape_pt_lon|shape_pt_sequence|
+--------+-------------+---------------+-----------------+
| 2-13139|34.0636651084|-118.4467287702|            10001|
| 2-13139|34.0636633497|-118.4466098971|            10002|
| 2-13139|34.0636739601|-118.4459694103|            10003|
| 2-13139| 34.063695534|-118.4453355852|            10004|
| 2-13139|34.0636989493|-118.4450054263|            10005|
| 2-13139|34.0636989493|-118.4450054263|            20001|
| 2-13139|  34.06369973|-118.4449426967|            20002|
| 2-13139|34.0636910845| -118.444305409|            20003|
| 2-13139|34.0636945852|-118.4441007178|            20004|
| 2-13139|34.0637298522|  -118.44347687|            20005|
+--------+-------------+---------------+-----------------+
only showing top 10 rows



In [17]:
# Build route segments: every row pairs a shape point with the NEXT point of
# the same route and carries the segment length plus a running length total.

# "Next point" window: partitioned by route and ordered by the point sequence.
# A global Window.orderBy over monotonically_increasing_id() is avoided because
# it depends on the physical row order left by the preceding join (which Spark
# does not guarantee), can pair the last point of one route with the first
# point of another, and moves all rows into a single partition.
next_point_window = Window.partitionBy("route_id").orderBy("shape_pt_sequence")

# Running total of segment lengths along each route, in segment order.
cum_len_window = (
    Window
    .partitionBy("route_id")
    .orderBy("segment_sequence")
    .rangeBetween(Window.unboundedPreceding, 0)
)

route_segments = (
    routes_with_shape
    .select(
        "*",
        *[
            lead(col(c)).over(next_point_window).alias("next_" + c)
            for c in ["shape_pt_lat", "shape_pt_lon", "shape_pt_sequence"]
        ]
    )
    # The last point of each route has no successor -> null next_* columns.
    .dropna("any")
    # Keep only pairs whose sequence numbers fall into the same 10000-block.
    .where(floor(col("shape_pt_sequence") / 10000) == floor(col("next_shape_pt_sequence") / 10000))
    .select(
        # route_id keeps only the part before the first "-".
        split("route_id", "-").getItem(0).alias("route_id"),
        col("shape_pt_lat").alias("segment_first_lat"), col("shape_pt_lon").alias("segment_first_lon"),
        col("next_shape_pt_lat").alias("segment_second_lat"), col("next_shape_pt_lon").alias("segment_second_lon"),
        col("shape_pt_sequence").alias("segment_sequence"), floor(col("shape_pt_sequence") / 20000 + 0.5).alias("segment_id"),
        haversine_meter("shape_pt_lat", "shape_pt_lon", "next_shape_pt_lat", "next_shape_pt_lon").alias("segment_len_meter")
    )
    # `sum` here is pyspark.sql.functions.sum (star-imported), not the builtin.
    .withColumn("segment_cum_len", sum("segment_len_meter").over(cum_len_window))
)

route_segments.cache()
route_segments.printSchema()
route_segments.show(10)
route_segments.count()

root
 |-- route_id: string (nullable = true)
 |-- segment_first_lat: double (nullable = true)
 |-- segment_first_lon: double (nullable = true)
 |-- segment_second_lat: double (nullable = true)
 |-- segment_second_lon: double (nullable = true)
 |-- segment_sequence: integer (nullable = true)
 |-- segment_id: long (nullable = true)
 |-- segment_len_meter: double (nullable = true)
 |-- segment_cum_len: double (nullable = true)

+--------+-----------------+-----------------+------------------+------------------+----------------+----------+------------------+------------------+
|route_id|segment_first_lat|segment_first_lon|segment_second_lat|segment_second_lon|segment_sequence|segment_id| segment_len_meter|   segment_cum_len|
+--------+-----------------+-----------------+------------------+------------------+----------------+----------+------------------+------------------+
|     125|    33.9058578172|  -118.3960444204|     33.9067250829|   -118.3959476739|           10001|         1| 96.84

60198

In [18]:
# write to static_data
# Same JSON output is written twice, first to the local directory and then to
# the HDFS one; coalesce(1) reduces the data to a single partition before writing.
for target_dir in (LOCAL_STATIC_DATA_DIR, STATIC_DATA_DIR):
    route_segments.coalesce(1).write.mode("overwrite").json(target_dir + "route_segments.json")