In [0]:
from pyspark.sql.types import *
import pyspark.sql.functions as F
from elasticsearch import Elasticsearch, helpers
from pyspark.sql.functions import col, split, randn
from pyspark.ml import Pipeline
import pandas as pd 
from pyspark.sql.window import Window
from collections import defaultdict

In [0]:
# Reference for working with a dict of dicts (primary and secondary keys): http://python.omics.wiki/data-structures/dictionary/multiple-keys

In [0]:
# Column layout of the lines-and-stops CSV; every field is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("ID", LongType()),
        ("ShapeId", StringType()),
        ("Operator", StringType()),
        ("StopSequence", LongType()),
        ("RouteName", StringType()),
        ("Direction", StringType()),
        ("PlateCode", LongType()),
        ("RouteData", StringType()),
    ]
])

# Read the file, keep the five columns used below (renaming three of them),
# de-duplicate, drop rows containing any null, and drop rows whose busStop is 0.
df = (
    spark.read.csv('/data/lines_and_stops.csv', header=True, schema=schema)
    .select(
        'StopSequence',
        F.col('RouteName').alias('lineNum'),
        'Direction',
        F.col('PlateCode').alias('busStop'),
        F.col('RouteData').alias('allLines'),
    )
    .dropDuplicates()
    .sort('lineNum', 'StopSequence')
    .dropna(how='any')
    .filter(F.col('busStop') != 0)
)
# display(df)

In [0]:
# One row per (lineNum, Direction) with `allStops`, the list of that line's
# bus stops in StopSequence order.
#
# collect_list() gives no ordering guarantee after the groupBy shuffle, so the
# stops are collected together with their StopSequence, sorted as structs
# (sort_array orders structs by their first field, then the second), and only
# then reduced to the busStop values.
stops_per_line = (
    df.groupBy("lineNum", "Direction")
    .agg(
        F.sort_array(
            F.collect_list(F.struct("StopSequence", "busStop"))
        ).alias("orderedStops")
    )
    # Selecting a field of an array<struct> column yields an array of that field.
    .withColumn("allStops", F.col("orderedStops.busStop"))
    .drop("orderedStops")
    .sort(F.col("lineNum"))
)
# display(stops_per_line)

In [0]:
# display(stops_per_line[stops_per_line.lineNum==16])

In [0]:
# Two-level lookup of the lines that connect consecutive stops:
#   src_dest_lines[src][dest] -> set of (lineNum, Direction) tuples
# where `dest` is the stop that directly follows `src` in a line's allStops.
src_dest_lines = defaultdict(dict)
for line_row in stops_per_line.rdd.collect():
  stop_sequence = line_row.allStops
  served_by = (line_row.lineNum, line_row.Direction)
  # Pair every stop with its immediate successor.
  for src, dest in zip(stop_sequence, stop_sequence[1:]):
    # setdefault creates the empty set the first time a (src, dest) pair is seen.
    src_dest_lines[src].setdefault(dest, set()).add(served_by)

In [0]:
# for key, value in src_dest_lines.items(): 
#   print(key, value) 

In [0]:
# Expected column types of the attractions CSV.
# NOTE(review): this schema is not passed to the reader below. Spark applies a
# CSV schema by column position, so confirm the file's column order matches
# before enabling it. The numeric columns are cast by name instead.
schema = StructType([
                    StructField("loc_name",StringType(),True),
                    StructField("rating_name",StringType(),True),
                    StructField("category",StringType(),True),
                    StructField("Longitude",DoubleType(),True),
                    StructField("Latitude",DoubleType(),True),
                    StructField("agg_score",DoubleType(),True),
                    StructField("rating",DoubleType(),True)])

# Without a schema every CSV column is read as a string, which would make the
# later "sort by rating" steps compare text ("10" < "9"). Cast the coordinate
# and rating columns to double; values that are not numeric become null.
attractions_df = (
    spark.read.csv('/data/rating_loc_attractions.csv', header=True)
    .select(
        'loc_name',
        F.col('Longitude').cast(DoubleType()).alias('Longitude'),
        F.col('Latitude').cast(DoubleType()).alias('Latitude'),
        F.col('rating').cast(DoubleType()).alias('rating'),
        'category',
    )
)
attractions_with_rankings = attractions_df.withColumnRenamed('loc_name', 'name')
# display(attractions_with_rankings)

In [0]:
# Stop code extraction: take the second comma-separated piece of stop_name,
# split it on single spaces, and use the token at index 2.
stop_name_tokens = split(split(F.col("stop_name"), ",")[1], " ")

# Keep only rows whose extracted token is not the literal "No." (rows where the
# token is null are removed by the same comparison), then expose the stop
# coordinates and the extracted code under the names used by the join below.
stops_df = (
    spark.read.csv('/data/stops.txt', header=True)
    .filter(stop_name_tokens[2] != "No.")
    .select(
        F.col("stop_lat").alias("latitude"),
        F.col("stop_lon").alias("longitude"),
        stop_name_tokens[2].alias("busStop"),
    )
)
# display(stops_df)

In [0]:
stops_df.createOrReplaceTempView('bus_stops_names')
attractions_with_rankings.createOrReplaceTempView('attractions_rank_df')

# Candidate stops for every attraction: stops whose longitude and latitude are
# each within 0.00700 degrees of the attraction. The left join keeps
# attractions without any candidate (busStop is null for those rows).
attractions_with_busStops = spark.sql("""
    select A.name, A.Longitude, A.Latitude, A.rating, A.category, B.busStop,
           abs(A.Longitude - B.longitude) as long_dist,
           abs(A.Latitude - B.latitude) as lat_dist
    from attractions_rank_df as A
    left join bus_stops_names as B
      on abs(A.Longitude - B.longitude) <= 0.00700
     and abs(A.Latitude - B.latitude) <= 0.00700
""")

# calc distance and keep 3 nearest stops
# (distance is the Euclidean norm of the coordinate differences, in degrees)
attractions_with_busStops = attractions_with_busStops.withColumn(
    "distance",
    F.sqrt(F.pow(F.col("long_dist"), 2) + F.pow(F.col("lat_dist"), 2)),
)

# row_number() yields at most 3 rows per attraction; rank() would keep extra
# rows whenever distances tie. busStop is a secondary sort key so that the
# choice among tied stops is deterministic.
window = Window.partitionBy('name').orderBy('distance', 'busStop')
closest_stops_df = (
    attractions_with_busStops
    .withColumn('rank', F.row_number().over(window))
    .filter(F.col('rank') <= 3)
    .select("name", "rating", "category", "busStop")
)

In [0]:
# Discard every row that has a null in any column (this includes attractions
# for which the left join found no bus stop).
closest_stops_df = closest_stops_df.na.drop(how="any")

In [0]:
# display(closest_stops_df)

In [0]:
# att_dict[busStop] -> list of (attraction name, rating, category) tuples for
# the attractions that have this stop among their closest stops.
att_dict = defaultdict(list)
for stop_row in closest_stops_df.rdd.collect():
  att_dict[stop_row.busStop].append(
      (stop_row.name, stop_row.rating, stop_row.category)
  )

# Initializing N
N = 2
# Keep only the N highest-rated attractions per stop (sorted by the rating
# element, descending). Re-assigning existing keys while iterating is safe
# because the dict's size does not change.
for stop_id in att_dict:
  att_dict[stop_id] = sorted(
      att_dict[stop_id], key=lambda entry: entry[1], reverse=True
  )[:N]

In [0]:
# for key, value in att_dict.items(): 
#   print(key, value) 

In [0]:
# for key, value in src_dest_lines.items(): 
#   print(type(key), type(value.keys())) 

In [0]:
# table_ard[source stop] -> {(attraction, rating, dest stop, category): [lines]}
# listing, for each source stop, the attractions reachable at its directly
# following stops, ordered by ascending rating. Source stops with no such
# attraction get no entry.
table_ard = defaultdict(dict)
for src_stop, dest_map in src_dest_lines.items():
  candidates = {}
  for dest_stop, serving_lines in dest_map.items():
    # att_dict is keyed by the stop code as a string, hence str(dest_stop);
    # .get() avoids creating empty entries in the defaultdict.
    for name, rating, category in att_dict.get(str(dest_stop)) or ():
      # this value is list of lines
      candidates[(name, rating, dest_stop, category)] = list(serving_lines)
  if candidates:
    # Stable sort on the rating element of each key; dict keeps that order.
    table_ard[src_stop] = dict(
        sorted(candidates.items(), key=lambda item: item[0][1])
    )


In [0]:
# for key, value in table_ard.items(): 
#   print(key, value) 

In [0]:
# Ready[source stop][line number] -> (attraction, category, dest stop)
#
# Each table_ard[S] key is (attraction, rating, dest stop, category) and its
# value is a list of (line number, direction) tuples. Entries are visited in
# the order stored in table_ard[S], and every assignment overwrites the
# previous one for the same line, so the LAST entry that mentions a line wins.
# With table_ard[S] ordered by ascending rating this selects the highest-rated
# attraction per (source stop, line).
#
# The former guard `if line not in Ready.keys()` compared a (line, direction)
# tuple against source-stop keys and was therefore always true; it has been
# removed so the code states what actually happens.
Ready = defaultdict(dict)
for S, ranked_attractions in table_ard.items():
  for key, value in ranked_attractions.items():
    attraction_info = (key[0], key[3], key[2])
    for line in value:
      # line[0] is the line number; the direction (line[1]) is not part of the key.
      Ready[S][line[0]] = attraction_info


In [0]:
# Snapshot of Ready's keys and values in matching order.
Ready_keys = list(Ready)
Ready_values = [Ready[source] for source in Ready_keys]

# Flatten Ready into one (source stop, line, (attraction, category, dest))
# record per inner entry, then split the records into parallel column lists.
_ready_records = [
    (source, line, info)
    for source, inner_dict in zip(Ready_keys, Ready_values)
    for line, info in inner_dict.items()
]

sourceStop = [record[0] for record in _ready_records]
line_nums = [record[1] for record in _ready_records]

attractions = [record[2][0] for record in _ready_records]
catagorys = [record[2][1] for record in _ready_records]
dests = [record[2][2] for record in _ready_records]


In [0]:
# Assemble the parallel column lists into a pandas frame with a fixed column
# order, then hand it to Spark.
data = dict(
    sourceStop=sourceStop,
    line_num=line_nums,
    attraction=attractions,
    catagory=catagorys,
    destStop=dests,
)
_final_columns = ['sourceStop', 'line_num', 'attraction', "catagory", "destStop"]
atrractions_final_df = spark.createDataFrame(
    pd.DataFrame(data, columns=_final_columns)
)
# display(atrractions_final_df)

In [0]:
# Persist the result as CSV with a header row. mode="overwrite" replaces the
# output of a previous run; the default save mode raises when the path already
# exists, which made this cell fail on every re-run of the notebook.
atrractions_final_df.write.csv('atrractions_with_busStop.csv', header=True, mode="overwrite")

In [0]:
# Read the exported CSV back (first line treated as the header) to check it.
df3 = spark.read.option("header", True).csv('atrractions_with_busStop.csv')
# display(df3)