In [0]:
from pyspark.sql.types import *
import pyspark.sql.functions as F
from elasticsearch import Elasticsearch, helpers
from pyspark.sql.functions import col, split, randn
from pyspark.ml import Pipeline
import pandas as pd 
from pyspark.sql.window import Window
from collections import defaultdict

In [0]:
# link for working with dict of dicts (primary and secondary keys- http://python.omics.wiki/data-structures/dictionary/multiple-keys 

In [0]:
schema = StructType([
                    StructField("ID",LongType(),True),
                    StructField("ShapeId",StringType(),True),
                    StructField("Operator",StringType(),True),
                    StructField("StopSequence",LongType(),True),
                    StructField("RouteName",StringType(),True),
                    StructField("Direction",StringType(),True),
                    StructField("PlateCode",LongType(),True),
                    StructField("RouteData",StringType() ,True)])   


df = spark.read.csv('/FileStore/tables/lines_and_stops.csv', header=True, schema=schema)
df = df[['StopSequence', 'RouteName', "Direction", "PlateCode", "RouteData"]]
df = df.withColumnRenamed('RouteName', 'lineNum').withColumnRenamed('PlateCode', 'busStop').withColumnRenamed('RouteData', 'allLines')
df = df.dropDuplicates().sort('lineNum', 'StopSequence')
df = df.na.drop(how="any")
df = df[df.busStop != 0]
# display(df)

StopSequence,lineNum,Direction,busStop,allLines
1,1,Inbound,381,147
1,1,Outbound,226,1104
2,1,Outbound,228,1104
2,1,Inbound,382,147
3,1,Outbound,229,1104
3,1,Inbound,4451,147
4,1,Inbound,383,147
4,1,Outbound,227,1104
5,1,Inbound,384,147
5,1,Outbound,230,1104


In [0]:
stops_per_line = df.groupBy("lineNum", "Direction").agg(F.collect_list("busStop")).sort(F.col("lineNum")).withColumnRenamed('collect_list(busStop)', 'allStops')
# display(stops_per_line)

In [0]:
# display(stops_per_line[stops_per_line.lineNum==16])

lineNum,Direction,allStops
16,Inbound,"List(5171, 2976, 2977, 2978, 2979, 2980, 2981, 2991, 2992, 1325, 1326, 1327, 1328, 1329, 1330, 1331, 1332, 1333, 1334, 7293, 1335, 1336, 1337, 1338, 1339, 1340, 1341, 1342, 1343, 1344, 1345, 1347, 1348, 1349, 1350, 1352, 1353, 1355, 1357, 1359, 320, 278, 8, 12, 14, 15, 17, 18, 19, 21, 7602, 85, 203, 204, 205, 215, 216, 217, 218, 219, 220, 1622, 1623, 1624, 1625, 1626, 1627, 1628, 1629, 1630, 7347)"
16,Outbound,"List(7347, 208, 3669, 209, 7349, 210, 211, 1631, 1632, 212, 5053, 213, 1633, 214, 4432, 1634, 1635, 119, 1636, 44, 1637, 7603, 45, 1638, 1639, 46, 1640, 47, 48, 231, 49, 232, 233, 51, 52, 234, 262, 235, 270, 236, 336, 237, 213, 1279, 214, 1282, 4432, 4456, 119, 1284, 44, 1285, 7603, 45, 1287, 46, 1288, 47, 1289, 1290, 48, 1291, 49, 1292, 51, 1293, 52, 262, 1294, 270, 1295, 1296, 336, 1297, 1279, 1298, 1282, 1299, 4456, 1284, 1300, 1301, 1285, 1302, 1287, 1303, 1304, 1288, 1305, 1289, 1320, 1290, 1291, 1321, 1322, 1292, 1293, 1323, 1324, 1294, 7068, 1295, 1296, 2966, 2967, 1297, 1298, 2968, 2969, 1299, 1300, 2970, 1301, 2971, 1302, 5051, 5052, 1303, 1304, 2975, 1305, 1320, 1321, 1322, 1323, 1324, 7068, 2966, 2967, 2968, 2969, 2970, 2971, 5051, 5052, 2975)"


In [0]:
src_dest_lines = defaultdict(dict)
for row in stops_per_line.rdd.collect():
  line = row.lineNum 
  direction = row.Direction 
  allStops = row.allStops
  for src, dest in zip(allStops, allStops[1:]):
    if not (( src in src_dest_lines.keys() ) & ( dest in src_dest_lines[src].keys() )): 
      src_dest_lines[src][dest] = set()
    src_dest_lines[src][dest].add(tuple((line, direction)))

In [0]:
# for key, value in src_dest_lines.items(): 
#   print(key, value) 

In [0]:
schema = StructType([
                    StructField("loc_name",StringType(),True),
                    StructField("rating_name",StringType(),True),
                    StructField("category",StringType(),True),
                    StructField("Longitude",DoubleType(),True),
                    StructField("Latitude",DoubleType(),True),
                    StructField("agg_score",DoubleType(),True),
                    StructField("rating",DoubleType(),True)])

attractions_df = spark.read.csv('/FileStore/tables/rating_loc_attractions.csv', header=True)#, schema=schema)

attractions_df = attractions_df[['loc_name', "Longitude", 'Latitude','rating','category']]
attractions_with_rankings=attractions_df.withColumnRenamed('loc_name','name')
# display(attractions_with_rankings) 

name,Longitude,Latitude,rating,category
macau casino club,-6.2656956,53.3411714,506,CASINOS & GAMBLING
bull island,-6.137493,53.374989,74,NATURE & PARKS
dublin castle,-6.26741864418033,53.3428995946208,43,SIGHTS & LANDMARKS
dublin city hall,-6.2671491693115495,53.3439224313735,82,MUSEUMS
civic theatre,-6.372209,53.288411,161,CONCERTS & SHOWS
contemporary music centre,-6.269532,53.344173,569,CONCERTS & SHOWS
dublin zoo,-6.304145,53.355024,16,ZOOS & AQUARIUMS
dublinia,-6.27200028587913,53.34312274391829,33,MUSEUMS
gaiety theatre,-6.26163,53.340387,34,CONCERTS & SHOWS
irish jewish museum,-6.269414,53.330748,106,MUSEUMS


In [0]:
stops_df = spark.read.csv('/FileStore/tables/stops_csv.txt',header=True)
stops_df=stops_df.withColumn("busStop", split(F.col("stop_name"), ",")[1])
stops_df=stops_df.withColumn("busStop", split(F.col("busStop"), " "))
stops_df = stops_df.filter(F.col("busStop")[2] != "No.")
stops_df=stops_df.withColumn("busStop",F.col("busStop")[2])
stops_df=stops_df[["stop_lat","stop_lon","busStop"]]
stops_df = stops_df.withColumnRenamed('stop_lat', 'latitude').withColumnRenamed('stop_lon', 'longitude')
# display(stops_df)

latitude,longitude,busStop
53.3522443611407,-6.26372321891882,2
53.3523085514349,-6.26381074216821,3
53.3525745131874,-6.26417548603793,4
53.352749335707,-6.26445380396429,6
53.3528409105808,-6.26457026121744,7
53.3532722714846,-6.26518367657395,8
53.3533921036497,-6.26538912389949,10
53.3568152335319,-6.26467919507521,11
53.3571075162046,-6.26438186733525,12
53.3585312378782,-6.26277650570866,14


In [0]:
stops_df.createOrReplaceTempView('bus_stops_names')
attractions_with_rankings.createOrReplaceTempView('attractions_rank_df')
attractions_with_busStops=sqlContext.sql("select A.name, A.Longitude, A.Latitude,A.rating, A.category,B.busStop, abs(A.Longitude-B.longitude) as long_dist, abs(A.Latitude-B.latitude) as lat_dist from attractions_rank_df as A left join bus_stops_names as B on abs(A.Longitude-B.longitude)<=0.00700 and abs(A.Latitude-B.latitude)<=0.00700")

#calc distance and keep 3 nearest stops
attractions_with_busStops=attractions_with_busStops.withColumn("distance",F.sqrt(F.pow(F.col("long_dist"),2)+F.pow(F.col("lat_dist"),2)))

window = Window.partitionBy(attractions_with_busStops['Name']).orderBy(attractions_with_busStops["distance"])
closest_stops_df=attractions_with_busStops.select('*', F.rank().over(window).alias('rank')).filter(F.col('rank') <= 3)
closest_stops_df=closest_stops_df["name","rating","category","busStop"]

In [0]:
closest_stops_df=closest_stops_df.dropna()

In [0]:
# display(closest_stops_df)

name,rating,category,busStop
meeting place,319,SIGHTS & LANDMARKS,7622
meeting place,319,SIGHTS & LANDMARKS,313
meeting place,319,SIGHTS & LANDMARKS,329
ha penny laugh,344,SHOPPING,7622
ha penny laugh,344,SHOPPING,1358
ha penny laugh,344,SHOPPING,329
leisureplex blanchardstown,259,SHOPPING,7025
leisureplex blanchardstown,259,SHOPPING,2959
leisureplex blanchardstown,259,SHOPPING,4747
national print museum,84,MUSEUMS,413


In [0]:
att_dict = defaultdict(list)
for row in closest_stops_df.rdd.collect():
  dest = row.busStop
  attraction = row.name 
  rank = row.rating
  category = row.category
  att_dict[dest].append((attraction,rank,category))
  
# Initializing N  
N = 2
for key, value in att_dict.items():
  res = sorted(value, key = lambda x: x[1], reverse = True)[:N] 
  att_dict[key]=res

In [0]:
# for key, value in att_dict.items(): 
#   print(key, value) 

In [0]:
# for key, value in src_dest_lines.items(): 
#   print(type(key), type(value.keys())) 

In [0]:
table_ard = defaultdict(dict) 
for S in src_dest_lines.keys():
  temp = {}
  for D in src_dest_lines[S].keys():
    if att_dict.get(str(D)):
      for attraction in att_dict[str(D)]:
        a,r,c=attraction
        key = (a,r,D,c)
        temp[key] = list(src_dest_lines[S][D]) # this value is list of lines 
  res = {key: temp[key] for key in sorted(temp.keys(), key=lambda ele: ele[1])}
  if res:
    table_ard[S] =  res


In [0]:
# for key, value in table_ard.items(): 
#   print(key, value) 

In [0]:
Ready =  defaultdict(dict) 
for S in table_ard: 
  for key, value in table_ard[S].items(): 	# key=(a1, r1, d1), value=[lines ] #check if the keys are still sorted by rank. If not- think again 
    for line in value:
      if line not in Ready.keys():
        Ready[S][line[0]] = (key[0], key[3],key[2]) 


In [0]:
Ready_values=list(Ready.values())
Ready_keys=list(Ready.keys())

sourceStop=[source  for source, inner_dict in zip(Ready_keys,Ready_values) for line in list(inner_dict.keys())]
line_nums=[line  for inner_dict in Ready_values for line in list(inner_dict.keys())]

attractions=[data[0]  for inner_dict in Ready_values for data in list(inner_dict.values())]
catagorys=[data[1]  for inner_dict in Ready_values for data in list(inner_dict.values())]
dests=[data[2]  for inner_dict in Ready_values for data in list(inner_dict.values())]


In [0]:
data = {'sourceStop':  sourceStop,
        'line_num': line_nums,
        'attraction': attractions,
       "catagory":catagorys,
       "destStop":dests}
atrractions_final_df = pd.DataFrame (data, columns = ['sourceStop','line_num','attraction',"catagory","destStop"])
atrractions_final_df=spark.createDataFrame(atrractions_final_df)
# display(atrractions_final_df)

sourceStop,line_num,attraction,catagory,destStop
371,47,shelbourne park greyhound stadium,SIGHTS & LANDMARKS,395
391,47,shelbourne park greyhound stadium,SIGHTS & LANDMARKS,392
391,1,shelbourne park greyhound stadium,SIGHTS & LANDMARKS,392
392,1,shelbourne park greyhound stadium,SIGHTS & LANDMARKS,395
396,47,shelbourne park greyhound stadium,SIGHTS & LANDMARKS,395
397,833,jeanie johnston,MUSEUMS,7397
397,1,dublin city archives,MUSEUMS,398
397,56a,dublin city archives,MUSEUMS,398
397,47,trinity college dublin,SIGHTS & LANDMARKS,400
397,15d,dublin city archives,MUSEUMS,398


In [0]:
atrractions_final_df.write.csv('atrractions_with_busStop.csv',header=True)

In [0]:
df3=spark.read.csv('atrractions_with_busStop.csv',header=True)

In [0]:
# display(df3)

sourceStop,line_num,attraction,catagory,destStop
371,47,shelbourne park greyhound stadium,SIGHTS & LANDMARKS,395
391,47,shelbourne park greyhound stadium,SIGHTS & LANDMARKS,392
391,1,shelbourne park greyhound stadium,SIGHTS & LANDMARKS,392
392,1,shelbourne park greyhound stadium,SIGHTS & LANDMARKS,395
396,47,shelbourne park greyhound stadium,SIGHTS & LANDMARKS,395
397,833,jeanie johnston,MUSEUMS,7397
397,1,dublin city archives,MUSEUMS,398
397,56a,dublin city archives,MUSEUMS,398
397,47,trinity college dublin,SIGHTS & LANDMARKS,400
397,15d,dublin city archives,MUSEUMS,398
