In [1]:
# Notebook-wide configuration. Everything a reader might need to tune lives here.
import os  # local to this cell so it runs before the main import cell

# Spark checkpoint location on HDFS.
CHECKPOINT_DIR = "hdfs://namenode:8020/spark/checkpoint"

# Day-type codes and the timezone name used for the data.
DAY_TYPE_WEEKDAY = 0
DAY_TYPE_WEEKEND = 1
DATA_ACTUAL_TIMEZONE = "America/Los_Angeles"

# Kafka source.
BOOTSTRAP_SERVER = "kafka:29092"
TOPIC = "buses-location"

# PostgreSQL connection and table names.
POSTGRES_URL = "jdbc:postgresql://timescaledb:5432/lametro"
POSTGRES_TABLE_BUS_VELOCITY = "bus_velocity"
POSTGRES_TABLE_BUS_ARRIVAL = "bus_arrival"
POSTGRES_USERNAME = "postgres"
# Secrets are taken from the environment when available so they do not have to
# be edited into (and committed with) the notebook.
# NOTE(review): the fallback keeps the previously hardcoded value so existing
# setups keep working; rotate it and drop the default once the environment
# variable is provisioned everywhere.
POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD", "8zr7E3SV")

# Redis connection. The port stays a string because it is passed to SparkConf.set().
REDIS_HOST = "redis"
REDIS_PORT = "6379"
# NOTE(review): same fallback caveat as POSTGRES_PASSWORD.
REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD", "8zr7E3SV")

# HDFS data directories.
STATIC_DATA_DIR = "hdfs://namenode:8020/ola/static_data/"
HISTORICAL_DATA_DIR = "hdfs://namenode:8020/ola/historical_data/"
AGGREGATED_DATA_DIR = "hdfs://namenode:8020/ola/aggregated_data/"
TEMP_DIR = "hdfs://namenode:8020/temp"

# Local-filesystem counterparts of the HDFS directories above.
LOCAL_STATIC_DATA_DIR = "/home/data/static_data/"
LOCAL_HISTORICAL_DATA_DIR = "/home/data/historical_data/"
LOCAL_AGGREGATED_DATA_DIR = "/home/data/aggregated_data/"
LOCAL_TEMP_DIR = "/home/data/temp"

In [2]:
import os
import sys
import re

from pyspark.sql import SparkSession, SQLContext
from pyspark import  SparkContext, SparkConf
from pyspark.streaming import StreamingContext

# Spark session & context
# Build the configuration in one chain; every setter on SparkConf returns the conf.
conf = (
    SparkConf()
    .setMaster("spark://0.0.0.0:7077")
    .setAppName("hello-world")
    .set("spark.cores.max", "1")
    # Extra jars for both the driver and the executors.
    .set("spark.driver.extraClassPath", "/usr/local/spark/third-party-jars/*")
    .set("spark.executor.extraClassPath", "/usr/local/spark/third-party-jars/*")
    .set("spark.sql.caseSensitive", "true")
    .set("spark.ui.port", "4040")
    # Redis connection settings, taken from the configuration cell.
    .set("spark.redis.host", REDIS_HOST)
    .set("spark.redis.port", REDIS_PORT)
    .set("spark.redis.auth", REDIS_PASSWORD)
)

# getOrCreate() makes this cell safe to re-run: constructing SparkContext(conf=conf)
# a second time in the same kernel raises because only one context may be active.
# Note that when a context already exists it is returned as-is and `conf` is ignored;
# restart the kernel to apply configuration changes.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Kept for code that still expects the older entry point.
sqlContext = SQLContext(sc)
# Streaming context with a batch interval of 1 second.
ssc = StreamingContext(sc, 1)

In [3]:
# Display the Spark version reported by the active SparkContext.
sc.version

'3.1.1'

In [4]:
%%html
<style>
/* Preserve whitespace and disable line wrapping in <pre> output blocks,
   so wide tables (e.g. DataFrame.show() output) stay aligned. */
div.output_area pre {
    white-space: pre;
}
/* Force .container elements to 95% width; !important overrides the default rule. */
.container { 
    width:95% !important; 
}
</style>

In [5]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import *

from datetime import datetime
from pytz import timezone
import math

# Read Kafka

In [6]:
# Batch-read a single Kafka topic. For batch queries the Kafka source defaults to
# the earliest starting offsets and the latest ending offsets, i.e. everything
# currently in the topic is read.
# The broker address and topic come from the configuration cell instead of being
# repeated here as literals. The key and value columns are cast to strings in the
# same expression, so `df` is assigned exactly once and the cell is idempotent.
df = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", BOOTSTRAP_SERVER)
    .option("subscribe", TOPIC)
    .load()
    .withColumn("key", col("key").cast(StringType()))
    .withColumn("value", col("value").cast(StringType()))
)

df.show(1)

+----+--------------------+--------------+---------+------+--------------------+-------------+
| key|               value|         topic|partition|offset|           timestamp|timestampType|
+----+--------------------+--------------+---------+------+--------------------+-------------+
|4132|{"route_id":"854"...|buses-location|        5|     0|2021-07-03 11:19:...|            0|
+----+--------------------+--------------+---------+------+--------------------+-------------+
only showing top 1 row



In [7]:
# Schema handed to from_json() when parsing the Kafka message value.
# Built field by field; every field is declared nullable.
schema = (
    StructType()
    .add("route_id", StringType(), True)
    .add("id", StringType(), True)
    .add("run_id", StringType(), True)
    .add("predictable", BooleanType(), True)
    .add("seconds_since_report", LongType(), True)
    .add("heading", DoubleType(), True)
    .add("latitude", DoubleType(), True)
    .add("longitude", DoubleType(), True)
)

In [8]:
# Parse the JSON payload and flatten it next to the Kafka timestamp:
#   1. cast the Kafka timestamp to a long and decode `value` with `schema`
#      into a struct column named "bus_location";
#   2. expand that struct so each JSON field becomes a top-level column.
df_1 = (
    df
    .select(
        col("timestamp").cast("long"),
        from_json(col("value"), schema).alias("bus_location"),
    )
    .select("timestamp", "bus_location.*")
)

# Show one untruncated sample row, then the resulting schema.
df_1.show(1, truncate=False)
df_1.printSchema()

+----------+--------+----+--------+-----------+--------------------+-------+---------+-----------+
|timestamp |route_id|id  |run_id  |predictable|seconds_since_report|heading|latitude |longitude  |
+----------+--------+----+--------+-----------+--------------------+-------+---------+-----------+
|1625311195|854     |4132|854_34_1|true       |23                  |281.0  |34.054241|-118.233482|
+----------+--------+----+--------+-----------+--------------------+-------+---------+-----------+
only showing top 1 row

root
 |-- timestamp: long (nullable = true)
 |-- route_id: string (nullable = true)
 |-- id: string (nullable = true)
 |-- run_id: string (nullable = true)
 |-- predictable: boolean (nullable = true)
 |-- seconds_since_report: long (nullable = true)
 |-- heading: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)



In [9]:
# df_1\
#     .groupBy("")
#     .write\
#     .format("org.apache.spark.sql.redis")\
#     .option("key.column", "route_id")\
#     .save()

In [14]:
def _merge_bus_maps(maps):
    """Merge a list of maps into a single dict.

    Maps are applied in list order, so when the same key occurs more than once
    the value from the later map wins.
    """
    merged = {}
    for bus_map in maps:
        merged.update(bus_map)
    return merged


# Spark UDF wrapper: array<map<string,string>> -> map<string,string>.
combine_maps = udf(_merge_bus_maps, MapType(StringType(), StringType()))

# For the first 100 rows, build one single-entry map per row
# ("<id>_<timestamp>" -> timestamp minus seconds_since_report, as a string),
# then collect and merge those maps per route into a single "buses" map.
df_2 = (
    df_1
    .limit(100)
    .select(
        "route_id",
        create_map(
            concat_ws("_", "id", "timestamp").alias("id"),
            (col("timestamp") - col("seconds_since_report")).cast("string").alias("report_time"),
        ).alias("bus"),
    )
    .groupBy("route_id")
    .agg(combine_maps(collect_list("bus")).alias("buses"))
)

In [15]:
# Call the JVM-side helper com.github.lhvubtqn.spark.Utils.toRedisHashes through
# the py4j gateway, passing the Java SparkContext (sc._jsc) and the Java
# DataFrame behind df_2 (df_2._jdf).
# NOTE(review): the helper is not defined in this notebook; judging by its name it
# writes the rows to Redis hashes — confirm against the helper's source.
sc._jvm.com.github.lhvubtqn.spark.Utils.toRedisHashes(sc._jsc, df_2._jdf)