In [0]:
import pyspark.sql.functions as F
from elasticsearch import Elasticsearch, helpers
from pyspark.sql.types import *
from pyspark.sql.functions import col, split, randn
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np 
from pyspark.sql.window import Window

In [0]:
schema = StructType([
                    StructField("line_num",StringType(),True),
                    StructField("sourceAndDest",StringType(),True)])

lines_df = spark.read.csv('/data/bus_lines_source_destination.csv', header=True, schema=schema)

attralines_dfctions_df = lines_df.dropDuplicates()

display(lines_df) 

line_num,sourceAndDest
122,From Ashington To Drimnagh Rd.
70n,From Westmoreland St. To Dunboyne
33n,From Westmoreland St. To Balbriggan
88n,From Westmoreland St. To Ashbourne
1,From Santry (Shanard Rd.) To Sandymount (St. John's Church)
46a,From Phoenix Park To Dún Laoghaire
41c,From Lower Abbey St. To Swords Manor
155,From Ikea To Bray Rail Station
39n,From Westmoreland St. To Tyrrelstown
66n,From Westmoreland St. To Leixlip (Louisa Bridge) via Glen Easton


In [0]:
##Add source and dest cols 
lines_df = lines_df.withColumn("source", split(F.col("sourceAndDest"), "To")[0])
lines_df = lines_df.withColumn("source", split(F.col("source"), "From")[1]) 
lines_df = lines_df.withColumn("dest", split(F.col("sourceAndDest"), "To ")[1]) 
lines_df = lines_df.withColumn("line_num", F.upper(F.col("line_num")))
lines_df = lines_df.withColumn('direction', F.lit(0))
#display(lines_df) 

##duplicate df to change source-dest order (using direction)
other_direction_df = lines_df.withColumn('direction', F.lit(1))
other_direction_df = other_direction_df.withColumnRenamed('source', 'new_dest').withColumnRenamed('dest', 'new_source').withColumnRenamed('new_source', 'source').withColumnRenamed('new_dest', 'dest').select("line_num","sourceAndDest","source","dest", 'direction')

##Concat both source-dest directions 
other_direction_df = lines_df.withColumn('direction', F.lit(1))
other_direction_df = other_direction_df.withColumnRenamed('source', 'new_dest').withColumnRenamed('dest', 'new_source').withColumnRenamed('new_source', 'source').withColumnRenamed('new_dest', 'dest').select("line_num","sourceAndDest","source","dest", 'direction')

#Concat both source-dest directions 
source_dest_df = lines_df.union(other_direction_df)

In [0]:
display(source_dest_df)

line_num,sourceAndDest,source,dest,direction
122,From Ashington To Drimnagh Rd.,Ashington,Drimnagh Rd.,0
70N,From Westmoreland St. To Dunboyne,Westmoreland St.,Dunboyne,0
33N,From Westmoreland St. To Balbriggan,Westmoreland St.,Balbriggan,0
88N,From Westmoreland St. To Ashbourne,Westmoreland St.,Ashbourne,0
1,From Santry (Shanard Rd.) To Sandymount (St. John's Church),Santry (Shanard Rd.),Sandymount (St. John's Church),0
46A,From Phoenix Park To Dún Laoghaire,Phoenix Park,Dún Laoghaire,0
41C,From Lower Abbey St. To Swords Manor,Lower Abbey St.,Swords Manor,0
155,From Ikea To Bray Rail Station,Ikea,Bray Rail Station,0
39N,From Westmoreland St. To Tyrrelstown,Westmoreland St.,Tyrrelstown,0
66N,From Westmoreland St. To Leixlip (Louisa Bridge) via Glen Easton,Westmoreland St.,Leixlip (Louisa Bridge) via Glen Easton,0


In [0]:
source_dest_df.write.csv('source_dest_df_1.csv',header=True)

In [0]:
stops_df = spark.read.csv('/data/stops.txt',header=True)
stops_df=stops_df.withColumn("busStop", split(F.col("stop_name"), ",")[1])
stops_df=stops_df.withColumn("busStop", split(F.col("busStop"), " ")[2])
stops_df=stops_df[["stop_lat","stop_lon","busStop"]]
stops_df = stops_df.withColumnRenamed('stop_lat', 'latitude').withColumnRenamed('stop_lon', 'longitude')
display(stops_df)

latitude,longitude,busStop
53.3522443611407,-6.26372321891882,2
53.3523085514349,-6.26381074216821,3
53.3525745131874,-6.26417548603793,4
53.352749335707,-6.26445380396429,6
53.3528409105808,-6.26457026121744,7
53.3532722714846,-6.26518367657395,8
53.3533921036497,-6.26538912389949,10
53.3568152335319,-6.26467919507521,11
53.3571075162046,-6.26438186733525,12
53.3585312378782,-6.26277650570866,14


In [0]:
schema = StructType([
                    StructField("Name",StringType(),True),
                    StructField("Url",StringType(),True),
                    StructField("Telephone",StringType(),True),
                    StructField("Longitude",DoubleType(),True),
                    StructField("Latitude",DoubleType(),True),
                    StructField("AddressRegion",StringType(),True),
                    StructField("AddressLocality",StringType(),True),
                    StructField("AddressCountry",StringType(),True)])

attractions_df = spark.read.csv('/data/Attractions.csv', header=True, schema=schema)

attractions_df = attractions_df.dropDuplicates()
display(attractions_df) 

Name,Url,Telephone,Longitude,Latitude,AddressRegion,AddressLocality,AddressCountry
Tuar Ard,,+353(0)906482042,-7.725291,53.396056,Westmeath,Moate,Republic of Ireland
Belvedere House Gardens & Park,http://www.belvedere-house.ie,+353(0)449338960,-7.369546,53.475943,Westmeath,Mullingar,Republic of Ireland
Jim Robinson Angling Centre,,+353(0)61453808,-8.636336,52.669096,Limerick,Limerick City,Republic of Ireland
New Ross Visitor Information Point,,+353(0)51425239,-6.94781327116379,52.3935487756114,Wexford,New Ross,Republic of Ireland
St. Mary's Collegiate Church,http://www.youghal.cork.anglican.org,+353(0)2481814,-7.853541,51.955064,Cork,Youghal,Republic of Ireland
Lough Donnell Annagheeragh River,,,-9.40937584215867,52.8456204222959,Clare,Milltown Malbay,Republic of Ireland
Garavogue River System,,+353(0)719161201,-8.47429233699108,54.2725133181227,Sligo,Sligo Town,Republic of Ireland
Ballyhoura Mountain Bike Trail Network,http://www.visitballyhoura.com,+353(0)6391300,-8.46955729370205,52.3599994277146,Limerick,Kilfinane,Republic of Ireland
Theatre Royal,http://www.theatreroyal.ie,+353(0)51874402,-7.107075,52.259856,Waterford,Waterford City,Republic of Ireland
Donegal Bay Waterbus,http://www.donegalbaywaterbus.com,+353(0)749723666,-8.111453,54.652789,Donegal,Donegal Town,Republic of Ireland


In [0]:
attractions_df = attractions_df.withColumn("Loc_A",F.concat_ws(",",attractions_df["Longitude"], attractions_df["Latitude"]))
attractions_df = attractions_df.withColumn("Loc_A", split(F.col("Loc_A"), ",\s*").cast(ArrayType(DoubleType())).alias("Loc_A"))
attractions_df = attractions_df[attractions_df.AddressRegion=='Dublin']
attractions_df = attractions_df.drop('AddressCountry','AddressRegion')
attractions_df = attractions_df.dropDuplicates(['Name'])
attractions_df = attractions_df.filter((attractions_df.Name != 'Keel Blue Flag Beach 2019') & (attractions_df.Name != 'Cliffs of Moher Tour - Dublin Tour Company'))
attractions_df=attractions_df["Name","Longitude","Latitude","Loc_A","AddressLocality"]
display(attractions_df)

Name,Longitude,Latitude,Loc_A,AddressLocality
12 Day Giant Irish Adventure Tour - Vagabond Tours of Ireland,-6.26020515750121,53.3493056292934,"List(-6.26020515750121, 53.3493056292934)",Dublin City
"Cork, Blarney Castle & Queenstown Tour -Railtours Ireland First Class!",-6.250291,53.351003,"List(-6.250291, 53.351003)",Dublin City
Dublin Whiskey Tours,-6.25742364309996,53.3456614891671,"List(-6.25742364309996, 53.3456614891671)",Dublin City
St. Mary's Pro Cathedral,-6.258885,53.351138,"List(-6.258885, 53.351138)",Dublin City
Authentic Ireland,-6.21599705967776,53.2755896475813,"List(-6.21599705967776, 53.2755896475813)",Sandyford
ISI - Ireland International Study Institute,-6.269322,53.347634,"List(-6.269322, 53.347634)",Dublin City
Malahide Castle and Howth Tour - DoDublin,-6.261029,53.350462,"List(-6.261029, 53.350462)",Dublin City
Irish Coaches 32CC,-6.258899,53.347894,"List(-6.258899, 53.347894)",Dublin City
James Clarence Mangan,-6.259664,53.338458,"List(-6.259664, 53.338458)",Dublin
Seapoint Blue Flag Beach 2019,-6.163408,53.297929,"List(-6.163408, 53.297929)",Seapoint


In [0]:
stops_df.createOrReplaceTempView('bus_stops_names')
attractions_df.createOrReplaceTempView('attractions_df')

In [0]:
attractions_with_busStops=sqlContext.sql("select A.Name, A.Longitude, A.Latitude,A.Loc_A,A.AddressLocality,B.busStop, abs(A.Longitude-B.longitude) as long_dist, abs(A.Latitude-B.latitude) as lat_dist from attractions_df as A left join bus_stops_names as B on abs(A.Longitude-B.longitude)<=0.00120 and abs(A.Latitude-B.latitude)<=0.00120")

In [0]:
attractions_with_busStops=attractions_with_busStops.withColumn("distance",F.sqrt(F.pow(F.col("long_dist"),2)+F.pow(F.col("lat_dist"),2)))

window = Window.partitionBy(attractions_with_busStops['Name']).orderBy(attractions_with_busStops["distance"])
closest_stops_df=attractions_with_busStops.select('*', F.rank().over(window).alias('rank')).filter(F.col('rank') <= 3)

In [0]:
closest_stops_df=closest_stops_df.dropna()

In [0]:
att_with_stops_df=closest_stops_df.drop("Loc_A","long_dist","lat_dist")

In [0]:
display(att_with_stops_df)

Name,Longitude,Latitude,AddressLocality,busStop,distance,rank
12 Day Giant Irish Adventure Tour - Vagabond Tours of Ireland,-6.26020515750121,53.3493056292934,Dublin City,6059,0.0009190162411700044,2
12 Day Giant Irish Adventure Tour - Vagabond Tours of Ireland,-6.26020515750121,53.3493056292934,Dublin City,279,0.00091978278594194,3
"Cork, Blarney Castle & Queenstown Tour -Railtours Ireland First Class!",-6.250291,53.351003,Dublin City,1500,0.00013353284660488966,1
"Cork, Blarney Castle & Queenstown Tour -Railtours Ireland First Class!",-6.250291,53.351003,Dublin City,4415,0.000484124945274592,2
"Cork, Blarney Castle & Queenstown Tour -Railtours Ireland First Class!",-6.250291,53.351003,Dublin City,497,0.0006246141438642964,3
Dublin Whiskey Tours,-6.25742364309996,53.3456614891671,Dublin City,5192,0.0004825082678941167,1
Dublin Whiskey Tours,-6.25742364309996,53.3456614891671,Dublin City,342,0.000630941063800933,2
Dublin Whiskey Tours,-6.25742364309996,53.3456614891671,Dublin City,336,0.0006596956217142787,3
St. Mary's Pro Cathedral,-6.258885,53.351138,Dublin City,No.,0.0014555562871266,1
Authentic Ireland,-6.21599705967776,53.2755896475813,Sandyford,449,0.0011845012547731,1


In [0]:
att_with_stops_df.write.csv('atrractions_with_busStops.csv',header=True)