In [0]:
# Show the application title ("AttractMe - Dublin") as a centered HTML heading at the top of the notebook.
displayHTML("""<center><font size="8" color="#39ac73" face="sans-serif">AttractMe - Dublin</font></center> """)

*Dear User,<br />
Please choose your origin bus stop using one of the options above.<br /><br />
For a stream source, enter your API (the Kafka bootstrap server address) in the "API" option.<br />
For a batch source, enter the path to your JSON file in the "Json path" option.<br />
For a single source, choose one of the bus stops listed in the "Source Bus Stop" option.*

In [0]:
from pyspark.sql.types import *
from elasticsearch import Elasticsearch, helpers
from pyspark.sql.functions import col, split, randn
import pandas as pd
import numpy as np
from json import dumps
import json
import pickle
import pyspark.sql.functions as F
from collections import defaultdict

# Elasticsearch connection settings. `es` is the module-level client that later cells query.
# NOTE(review): elasticsearch-py documents `timeout` in seconds, so 60000 is roughly 16.7 hours —
# confirm this was not meant to be milliseconds.
ES_HOST= '10.0.0.20'  #please change the ES_HOST to your VM server
es = Elasticsearch([{'host': ES_HOST}], timeout=60000)

In [0]:
# Schema applied to every incoming bus record (Kafka JSON payloads and JSON files alike).
# Each entry is (field name, Spark type); every field is declared nullable.
schema_structfields = [
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("_id", MapType(StringType(), StringType(), True)),
        ("actualDelay", LongType()),
        ("angle", DoubleType()),
        ("anomaly", BooleanType()),
        ("areaId", LongType()),
        ("areaId1", LongType()),
        ("areaId2", LongType()),
        ("areaId3", LongType()),
        ("atStop", BooleanType()),
        ("busStop", LongType()),
        ("calendar", MapType(StringType(), StringType(), True)),
        ("congestion", BooleanType()),
        ("currentHour", LongType()),
        ("dateType", LongType()),
        ("dateTypeEnum", StringType()),
        ("delay", LongType()),
        ("direction", LongType()),
        ("distanceCovered", DoubleType()),
        ("ellapsedTime", LongType()),
        ("filteredActualDelay", LongType()),
        ("gridID", StringType()),
        ("journeyPatternId", StringType()),
        ("justLeftStop", BooleanType()),
        ("justStopped", BooleanType()),
        ("latitude", DoubleType()),
        ("lineId", StringType()),
        # Nested struct: an array of doubles plus a type tag.
        ("loc", StructType([
            StructField('coordinates', ArrayType(DoubleType(), True), True),
            StructField('type', StringType(), True),
        ])),
        ("longitude", DoubleType()),
        ("poiId", LongType()),
        ("poiId2", LongType()),
        ("probability", DoubleType()),
        ("systemTimestamp", DoubleType()),
        ("timestamp", MapType(StringType(), StringType(), True)),
        ("vehicleId", LongType()),
        ("vehicleSpeed", LongType()),
    ]
]

schema = StructType(schema_structfields)

In [0]:
def read_stream_data(api,schema):
  """Open a Kafka structured stream and parse each message as a JSON record.

  Args:
    api: Kafka bootstrap server address (for example '10.0.0.30:9091').
    schema: Spark schema used to parse the JSON text of each message value.

  Returns:
    A streaming DataFrame with one column per top-level field of ``schema``.
  """
  # Subscribe to every topic matching the pattern, reading from the earliest offsets.
  raw_kafka_stream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", api)
    .option("subscribePattern", "vehicleId_.*")
    .option("startingOffsets", "earliest")
    .load()
  )

  # The Kafka value column is cast to a string before it is parsed as JSON.
  payload_text = raw_kafka_stream.selectExpr("CAST(value AS STRING)")
  parsed_record = F.from_json(F.col("value"), schema=schema).alias('json')

  # Flatten the parsed struct so its fields become top-level columns.
  return payload_text.select(parsed_record).select("json.*")

In [0]:
def read_static_data(path,schema):
  """Read the JSON data at ``path`` into a batch DataFrame, applying ``schema``."""
  return spark.read.json(path, schema=schema)

In [0]:
def fix_drop_cols_from_df(df):
  """Derive the columns the app needs from a raw bus-record DataFrame.

  Steps, in order:
    * ``line_num``: ``journeyPatternId.substr(0, 4)`` with leading zeros removed.
    * ``direction``: one character taken from ``journeyPatternId`` (computed but
      not part of the returned columns).
    * ``mins_delay``: ``delay`` divided by 60.
    * ``busStop`` is renamed to ``OriginStop``.
    * ``timestamp``: first value of the ``timestamp`` map, divided by 1000 and
      converted with ``from_unixtime``/``to_timestamp`` (presumably the raw value
      is epoch milliseconds — TODO confirm).

  Returns:
    A DataFrame with exactly the columns ``line_num``, ``mins_delay``,
    ``OriginStop`` and ``timestamp``.
  """
  return (
    df
    .withColumn('line_num', F.col('journeyPatternId').substr(0, 4))
    .withColumn('line_num', F.regexp_replace('line_num', '^0+', ''))
    .withColumn('direction', F.col('journeyPatternId').substr(5, 5).substr(0, 1))
    .withColumn('mins_delay', F.col('delay') / 60)
    .withColumnRenamed('busStop', 'OriginStop')
    .withColumn('timestamp', F.map_values(F.col('timestamp'))[0])
    .withColumn('timestamp', F.to_timestamp(F.from_unixtime(F.col('timestamp') / (1000))))
    .select("line_num", "mins_delay", "OriginStop", "timestamp")
  )

In [0]:
# def write_stream_to_elastic(df_to_elastic, index, settings_with_mapping, ES_HOST,checkpoints_location):
#   if not es.indices.exists(index):
#     es.indices.create(index=index, ignore=400, body=settings_with_mapping)
#   df_to_elastic.writeStream \
#       .outputMode("append") \
#       .queryName(f"{index}_to_es") \
#       .format("org.elasticsearch.spark.sql") \
#       .option("es.nodes.wan.only","true") \
#       .option("checkpointLocation", checkpoints_location) \
#       .option("es.resource", index) \
#       .option("es.nodes", ES_HOST) \
#       .option("es.port","9200") \
#       .start()

  
# index="lines_delay_time"
# # es.indices.delete(index=index)
# lines_delay_time_schema  = {
#     "settings": {
#         "number_of_shards": 1,
#         "number_of_replicas": 0,
#         "refresh_interval" : -1
#     },
#      "mappings": {
#       "properties": {
#           "mins_delay" : { "type": "long" },#*
#           "busStop" : { "type": "long" },#*
#           "timestamp" : { "type": "date"}, 
#           "line_num" : { 'type' : 'keyword'},#*
#       }
#      }
# }
# write_stream_to_elastic(input_stream_df, index, lines_delay_time_schema, ES_HOST,"/tmp/almog.gueta/streaming")


In [0]:
# Attraction lookup table: loaded through Spark, then copied to pandas for local filtering.
lines_att_df = spark.read.csv('atrractions_with_busStop.csv', header=True)
lines_att_pd = lines_att_df.toPandas()

# Distinct source stops in ascending order; used as the dropdown choices.
good_sources = sorted(set(lines_att_pd["sourceStop"]))

In [0]:
def main_app(input_stream_df,source_bus_stop):
  """Recommend an attraction reachable by one bus ride from an origin stop.

  The most recent ``mins_delay`` per ``line_num`` is fetched from the
  'lines_delay_time' Elasticsearch index. When several attractions are
  reachable from a stop, the one whose line has the smallest such delay wins.

  Args:
    input_stream_df: Spark DataFrame with an ``OriginStop`` column; only used
      when ``source_bus_stop`` is falsy.
    source_bus_stop: a single origin stop, or a falsy value to process
      ``input_stream_df`` instead.

  Returns:
    The recommendation text when ``source_bus_stop`` is truthy; otherwise
    ``input_stream_df`` with an added ``Output`` column holding the
    recommendation text for each row.
  """
  # One bucket per line (up to 300). Although the sub-aggregation is named
  # "avg_delay", it is a top_metrics sorted by timestamp descending, i.e. it
  # yields the latest mins_delay value for the line.
  latest_delay_query = {
    "aggs": {
      "all_lines": {
        "terms": {"field": "line_num", "size": 300},
        "aggs": {
          "avg_delay": {
            "top_metrics": {
              "metrics": {"field": "mins_delay"},
              "sort": {"timestamp": "desc"},
            }
          }
        },
      }
    }
  }
  response = es.search(index='lines_delay_time', body=latest_delay_query)
  buckets = response['aggregations']['all_lines']['buckets']
  lines_delay_df = pd.DataFrame({
    "line_num": [bucket['key'] for bucket in buckets],
    "last_delay": [bucket["avg_delay"]["top"][0]['metrics']["mins_delay"] for bucket in buckets],
  })

  def play_app(source):
    # Attractions whose source stop equals the requested stop (compared as strings).
    candidates = lines_att_pd[lines_att_pd.sourceStop == str(source)]
    # Only the distinction between 0, 1 and "more than 1" matters here.
    match_count = len(candidates.head(2))
    if match_count == 0:
      return "We are sorry, there is no recommended attraction that can be reached from your location with a single bus ride"
    if match_count == 1:
      pick = candidates.to_dict("records")[0]
    else:
      # Attach each line's latest delay and take the row with the smallest one.
      ranked = pd.merge(candidates, lines_delay_df, how="left", on=["line_num"]).sort_values(by=['last_delay'])
      pick = ranked.iloc[0]
    return f"Take line number {pick['line_num']}, to bus stop number {pick['destStop']} and enjoy the '{pick['attraction']}' attraction"

  if not source_bus_stop:
    # No single stop given: evaluate the recommendation per row through a UDF.
    recommend_udf = F.udf(play_app)
    return input_stream_df.withColumn('Output', recommend_udf(F.col('OriginStop')))
  return play_app(source_bus_stop)

In [0]:
# Current values of the three input widgets.
API = dbutils.widgets.get("API")
Path = dbutils.widgets.get("Json path")
Source = dbutils.widgets.get("Source Bus Stop")

# Each flag is 1 when its widget no longer holds the placeholder value, otherwise 0.
api_choosed = 1 if API != "Please enter your API here" else 0
path_choosed = 1 if Path != "Please enter your path here" else 0
source_choosed = 1 if Source != "Choose source" else 0

# Number of inputs the user filled in.
sum_cond = sum((api_choosed, path_choosed, source_choosed))
# Stays True until one of the input modes has been handled.
not_finished = True


In [0]:
# Handle exactly one input mode; anything other than a single filled-in widget is rejected.
if sum_cond != 1:
  print("Please Choose only one option")

elif api_choosed:
  # Stream mode: read from Kafka, derive the app columns, recommend per record.
  stream_raw_df = read_stream_data(API, schema)
  indput_df = fix_drop_cols_from_df(stream_raw_df)
  play_df = main_app(indput_df, None).select("OriginStop", "Output")
  display(play_df)
  not_finished = False

elif path_choosed:
  # Batch mode: same pipeline, but the records come from a JSON path.
  raw_df = read_static_data(Path, schema)
  input_df = fix_drop_cols_from_df(raw_df)
  play_df = main_app(input_df, None)
  not_finished = False
  display(play_df.select("OriginStop", "Output"))

elif source_choosed:
  # Single-stop mode: one recommendation, shown as a one-row pandas table.
  play_df = pd.DataFrame({'OriginStop': [Source], 'Output': [main_app(None, Source)]})
  not_finished = False
  display(play_df)


OriginStop,Output
1013,"Take line number 142, to bus stop number 1015 and enjoy the 'iveagh gardens' attraction"


In [0]:
# Spin for as long as `not_finished` is True. Nothing inside the loop changes the flag,
# so once entered the loop only ends if the cell is interrupted; when the flag is
# already False the body never runs.
# NOTE(review): presumably this keeps the widgets from being removed after an invalid
# selection — confirm the intent.
while not_finished:
  a=2
# Remove all widgets from the notebook.
dbutils.widgets.removeAll()

In [0]:
# Spin while `not_finished` is True; the loop body does not change the flag.
while not_finished:
  a = 2

# Create the three input widgets, each starting at its placeholder value.
dbutils.widgets.dropdown("Source Bus Stop", "Choose source", ["Choose source", *good_sources])
dbutils.widgets.text("API", "Please enter your API here")
dbutils.widgets.text("Json path", "Please enter your path here")


