# Init PySpark

In [1]:
import os
import sys
import re

from pyspark.sql import SparkSession, SQLContext
from pyspark import  SparkContext, SparkConf
from pyspark.streaming import StreamingContext

# Spark session & context
conf = SparkConf()
conf.setMaster("local[4]").setAppName("stream-job")
conf.set("spark.executor.memory", "4g")
conf.set("spark.executor.cores", "1")
conf.set("spark.cores.max", "4")
conf.set("spark.driver.memory",'4g')
conf.set("spark.driver.extraClassPath", "/usr/local/spark/third-party-jars/*")
conf.set("spark.executor.extraClassPath", "/usr/local/spark/third-party-jars/*")
conf.set("spark.sql.caseSensitive", "true")
conf.set("spark.ui.port", "4040")

sc = SparkContext(conf=conf)
spark = SparkSession(sc)
sqlContext = SQLContext(sc)
ssc = StreamingContext(sc, 1)

In [2]:
%%html
<style>
div.output_area pre {
    white-space: pre;
}
.container { 
    width:95% !important; 
}
</style>

# Define static variables

In [15]:
TIME_BIN_SECONDS = 600 # seconds
DAY_TYPE_WEEKDAY = 0
DAY_TYPE_WEEKEND = 0
DATA_ACTUAL_TIMEZONE = "America/Los_Angeles"

STATIC_DATA_DIR = "hdfs://namenode:8020/ola/static_data/"
HISTORICAL_DATA_DIR = "hdfs://namenode:8020/ola/historical_data/"
AGGREGATED_DATA_DIR = "hdfs://namenode:8020/ola/aggregated_data/"

LOCAL_STATIC_DATA_DIR = "/home/data/static_data/"
LOCAL_HISTORICAL_DATA_DIR = "/home/data/historical_data/"
LOCAL_AGGREGATED_DATA_DIR = "/home/data/aggregated_data/"

STATIC_DATA_DIR = LOCAL_STATIC_DATA_DIR
HISTORICAL_DATA_DIR = LOCAL_HISTORICAL_DATA_DIR
AGGREGATED_DATA_DIR = LOCAL_AGGREGATED_DATA_DIR

# Define utilities

In [4]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import *

from datetime import datetime
from pytz import timezone
import math

## Functions

In [5]:
# utils

# check if segment is not an empty segment (two vertex is the same)
def fis_not_the_same(from_latlon, to_latlon):
    return (from_latlon[0] != to_latlon[0]) | (from_latlon[1] != to_latlon[1])

def ffile_path_to_ts(file_path):
    return int(file_path[-18:-4])

def fget_time_bin(epoch_seconds):
    dt = datetime.fromtimestamp(epoch_seconds)
    dt = dt.astimezone(timezone(DATA_ACTUAL_TIMEZONE))
    return math.floor((dt.hour * 3600 + dt.minute * 60 + dt.second) / TIME_BIN_SECONDS)

def fget_day_type(epoch_seconds):
    dt = datetime.fromtimestamp(epoch_seconds)
    wd = dt.astimezone(timezone(DATA_ACTUAL_TIMEZONE)).weekday()
    return DAY_TYPE_WEEKDAY if wd < 5 else DAY_TYPE_WEEKEND

def fhaversine_meter(lat1, lon1, lat2, lon2):
    # distance between latitudes and longitudes
    dLat = (lat2 - lat1) * math.pi / 180.0
    dLon = (lon2 - lon1) * math.pi / 180.0

    # convert to radians
    lat1 = (lat1) * math.pi / 180.0
    lat2 = (lat2) * math.pi / 180.0
 
    # apply formulae
    a = (math.pow(math.sin(dLat / 2), 2) +
         math.pow(math.sin(dLon / 2), 2) *
             math.cos(lat1) * math.cos(lat2));
    rad = 6371
    c = 2 * math.asin(math.sqrt(a))
    return rad * c * 1000

def ffind_distance(lat1, lon1, flat1, flon1, slat1, slon1, cl1, lat2, lon2, flat2, flon2, slat2, slon2, cl2):
    if cl1 == cl2:
        return fhaversine_meter(lat1, lon1, lat2, lon2)
        
    if cl1 > cl2:
        return ffind_distance(lat2, lon2, flat2, flon2, slat2, slon2, cl2, lat1, lon1, flat1, flon1, slat1, slon1, cl1)

    return fhaversine_meter(lat1, lon1, slat1, slon1) + cl2 - cl1 - fhaversine_meter(flat2, flon2, slat2, slon2) + fhaversine_meter(flat2, flon2, lat2, lon2)

def readPostgreSQL(user, password, database, query):
    ip = "10.60.71.3"
    port = "5432"
    url = "jdbc:postgresql://" + ip + ":" + port + "/" + database 
    driver = "org.postgresql.Driver"
    
    return sqlContext.read\
        .format("jdbc")\
        .option("url", url)\
        .option("driver", driver)\
        .option("url", url)\
        .option("user", user)\
        .option("password", password)\
        .option("dbtable", "({}) as tmp".format(query))\
        .load()

In [6]:
print(fget_time_bin(1620795062))

131


In [7]:
print("ffind_distance:", ffind_distance(33.82962, -118.290314, 33.8292539188, -118.2902737058, 33.8296908268, -118.2902719251, 23232.5956910088, 33.794449, -118.290871, 33.7932864432, -118.2908173976, 33.7945806653, -118.2908159585, 19326.593863383834), "m")

ffind_distance: 3913.794621512769 m


## Udf functions

In [8]:
#udf
is_not_the_same = udf(lambda a, b: ffis_not_the_same(a, b), BooleanType())

file_path_to_ts = udf(lambda a: ffile_path_to_ts(a), LongType())

get_time_bin = udf(lambda a: fget_time_bin(a), IntegerType())

get_day_type = udf(lambda a: fget_day_type(a), IntegerType())

haversine_meter = udf(lambda a, b, c, d: fhaversine_meter(a, b, c, d), DoubleType())

find_distance = udf(lambda a, b, c, d, e, f, g, h, p, q, r, s, t, u: ffind_distance(a, b, c, d, e, f, g, h, p, q, r, s, t, u), DoubleType())

# Load static data

In [9]:
routes_segments = spark\
    .read\
    .json(STATIC_DATA_DIR + "route_segments.json")\

routes_segments.cache()
routes_segments.printSchema()
routes_segments.show(10)
routes_segments.count()

root
 |-- route_id: string (nullable = true)
 |-- segment_cum_len: double (nullable = true)
 |-- segment_first_lat: double (nullable = true)
 |-- segment_first_lon: double (nullable = true)
 |-- segment_id: long (nullable = true)
 |-- segment_len_meter: double (nullable = true)
 |-- segment_second_lat: double (nullable = true)
 |-- segment_second_lon: double (nullable = true)
 |-- segment_sequence: long (nullable = true)

+--------+------------------+-----------------+-----------------+----------+------------------+------------------+------------------+----------------+
|route_id|   segment_cum_len|segment_first_lat|segment_first_lon|segment_id| segment_len_meter|segment_second_lat|segment_second_lon|segment_sequence|
+--------+------------------+-----------------+-----------------+----------+------------------+------------------+------------------+----------------+
|     125| 96.84797621031923|    33.9058578172|  -118.3960444204|         1| 96.84797621031923|     33.9067250829|   -118

60198

# Load aggregated data

In [None]:
avg_velocities = spark\
    .read\
    .json(AGGREGATED_DATA_DIR + "avg_velocities.json")\

avg_velocities.cache()
avg_velocities.printSchema()
avg_velocities.show(10)
avg_velocities.count()

# Consume data from Kafka & process

In [None]:
# Define schema
schema = StructType([
    StructField("route_id", StringType(), True),
    StructField("id", StringType(), True),
    StructField("run_id", StringType(), True),
    StructField("predictable", BooleanType(), True),
    StructField("seconds_since_report", LongType(), True),
    StructField("heading", DoubleType(), True),
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True)
])

# Subscribe to 1 topic defaults to the earliest and latest offsets
bus_positions = spark \
    .read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:29092") \
    .option("subscribe", "buses-location") \
    .load()\
    .withColumn('key', df.key.cast(StringType()))\
    .withColumn('value', df.value.cast(StringType()))\
    .select(functions.from_json("value", schema).alias("bus_location"))\
    .select("bus_location.*")

# calculate timestamp of record as: req_time - seconds_since_report, then convert it to timezone('America/Los_Angeles')
bus_positions = bus_positions\
    .withColumn("timestamp", col("req_time") - col("seconds_since_report"))\
    .select(col("id").alias("bus_id"), "latitude", "longitude", "route_id", substring("run_id", -1, 1).alias("direction"), "timestamp")\

# add day_type field, 0 if is weekday, 1 if is weekend or holiday
bus_positions = bus_positions\
    .withColumn("day_type", get_day_type("timestamp"))

# add timebin field (each timebin is 10 minutes, so there will be 144 timebins a day)
bus_positions = bus_positions\
    .withColumn("time_bin", get_time_bin("timestamp"))

# get segment id of bus positions
bus_positions = bus_positions\
    .join(routes_segments, "route_id")\
    .withColumn("bus_segment_distance", haversine_meter("latitude", "longitude", "segment_first_lat", "segment_first_lon") + haversine_meter("latitude", "longitude", "segment_second_lat", "segment_second_lon") - col("segment_len_meter"))\
    .withColumn("row_number", row_number().over(Window.partitionBy("bus_id", "route_id", "latitude", "longitude", "direction").orderBy(asc("bus_segment_distance"))))\
    .where("row_number = 1")\
    .drop("row_number")\
    .orderBy("bus_id", "route_id", "timestamp")

# drop rows with at least one null column
bus_positions = bus_positions.dropna("any")

bus_positions.printSchema()

root
 |-- req_time: long (nullable = true)
 |-- heading: double (nullable = true)
 |-- id: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- predictable: boolean (nullable = true)
 |-- route_id: string (nullable = true)
 |-- run_id: string (nullable = true)
 |-- seconds_since_report: long (nullable = true)



In [None]:
# get velocity of bus
bus_velocities = bus_positions
    .join(avg_velocities, "route_id", "segment_id", "day_type", "time_bin", "direction")\
    
bus_velocities.show(100)